Identify Documents

The Identify operation automatically detects which type of document you have. Given a document and a list of possible types, DocuTray returns the best match with a confidence score and ranked alternatives.

Quick Start

from pathlib import Path
from docutray import Client

client = Client(api_key="YOUR_API_KEY")

# Type codes the document is matched against.
candidate_types = ["invoice", "receipt", "contract"]

result = client.identify.run(
    file=Path("document.pdf"),
    document_type_code_options=candidate_types,
)

# The best match exposes a display name and a confidence score.
best_match = result.document_type
print(f"Type: {best_match.name}")
print(f"Confidence: {best_match.confidence:.0%}")

import DocuTray from 'docutray';
import { readFileSync } from 'fs';

const client = new DocuTray({ apiKey: 'YOUR_API_KEY' });

// The same name is used to read the file and to label the upload.
const fileName = 'document.pdf';
const candidateTypes = ['invoice', 'receipt', 'contract'];

const result = await client.identify.run({
  file: readFileSync(fileName),
  filename: fileName,
  documentTypeCodeOptions: candidateTypes,
});

// Print the best match, with confidence as a whole-number percentage.
const bestMatch = result.document_type;
const percent = (bestMatch.confidence * 100).toFixed(0);
console.log(`Type: ${bestMatch.name}`);
console.log(`Confidence: ${percent}%`);

# Upload document.pdf and ask which of the three candidate types it matches.
curl --request POST https://app.docutray.com/api/identify \
  --header "Authorization: Bearer YOUR_API_KEY" \
  --form "image=@document.pdf" \
  --form 'document_type_code_options=["invoice", "receipt", "contract"]'

Response

# result is an IdentificationResult; document_type holds the best match
best_match = result.document_type
print(best_match.code)        # "invoice"
print(best_match.name)        # "Invoice"
print(best_match.confidence)  # 0.95

# The other candidates, ranked by confidence
for candidate in result.alternatives:
    print(f"  {candidate.name}: {candidate.confidence:.0%}")

// result is an IdentificationResult; document_type holds the best match
const identified = result.document_type;
console.log(identified.code);        // "invoice"
console.log(identified.name);        // "Invoice"
console.log(identified.confidence);  // 0.95

// The other candidates, ranked by confidence
for (const candidate of result.alternatives) {
  const percent = (candidate.confidence * 100).toFixed(0);
  console.log(`  ${candidate.name}: ${percent}%`);
}

{
  "document_type": {
    "code": "invoice",
    "name": "Invoice",
    "confidence": 0.95
  },
  "alternatives": [
    {
      "code": "receipt",
      "name": "Receipt",
      "confidence": 0.04
    },
    {
      "code": "contract",
      "name": "Contract",
      "confidence": 0.01
    }
  ]
}

Async Identification

For large documents, use async identification to process them in the background.

# Submit the document for background identification
candidate_types = ["invoice", "receipt"]
status = client.identify.run_async(
    file=Path("document.pdf"),
    document_type_code_options=candidate_types,
)

# Wait for the job to complete
result = status.wait()

# Read the detected type only when the job succeeded
if result.is_success():
    detected = result.document_type
    print(f"Type: {detected.code}")

// Submit the document for background identification
const fileName = 'document.pdf';
const status = await client.identify.runAsync({
  file: readFileSync(fileName),
  filename: fileName,
  documentTypeCodeOptions: ['invoice', 'receipt'],
});

// Wait for completion, passing a callback that logs the reported status
const logStatus = (s) => console.log(`Status: ${s.status}`);
const result = await status.wait({ onStatus: logStatus });

// Read the detected type only when the job succeeded
if (result.isSuccess()) {
  const detected = result.document_type;
  console.log(`Type: ${detected.code}`);
}

# Submit the document for background identification
curl --request POST https://app.docutray.com/api/identify-async \
  --header "Authorization: Bearer YOUR_API_KEY" \
  --form "image=@document.pdf" \
  --form 'document_type_code_options=["invoice", "receipt"]'

# Check on the job; replace IDENTIFICATION_ID with the real identifier
curl https://app.docutray.com/api/identify-async/status/IDENTIFICATION_ID \
  --header "Authorization: Bearer YOUR_API_KEY"

Identify Then Convert

A common pattern is to first identify a document, then convert it using the detected type. This is useful when you receive documents of unknown types.

from pathlib import Path
from docutray import Client

# Conversion runs only when the confidence is strictly above this value.
MIN_CONFIDENCE = 0.8

client = Client(api_key="YOUR_API_KEY")
document = Path("unknown_document.pdf")

# Step 1: ask which candidate type the document matches
identification = client.identify.run(
    file=document,
    document_type_code_options=["invoice", "receipt", "contract"],
)

best_match = identification.document_type
detected_type, confidence = best_match.code, best_match.confidence
print(f"Detected: {detected_type} ({confidence:.0%})")

# Step 2: convert with the detected type, or defer to a human
if confidence > MIN_CONFIDENCE:
    result = client.convert.run(file=document, document_type_code=detected_type)
    print(result.data)
else:
    print("Low confidence — review manually")

import DocuTray from 'docutray';
import { readFileSync } from 'fs';

const client = new DocuTray({ apiKey: 'YOUR_API_KEY' });
const documentPath = 'unknown_document.pdf';
const document = readFileSync(documentPath);

// Step 1: Identify the document type.
// A Buffer carries no file name, so pass it explicitly.
const identification = await client.identify.run({
  file: document,
  filename: documentPath,
  documentTypeCodeOptions: ['invoice', 'receipt', 'contract'],
});

const detectedType = identification.document_type.code;
const confidence = identification.document_type.confidence;
console.log(`Detected: ${detectedType} (${(confidence * 100).toFixed(0)}%)`);

// Step 2: Convert using the detected type, but only when the match is
// confident enough; otherwise leave the document for manual review.
// NOTE(review): convert.run may also accept `filename` for Buffer input —
// confirm against the SDK reference.
if (confidence > 0.8) {
  const result = await client.convert.run({
    file: document,
    documentTypeCode: detectedType,
  });
  console.log(result.data);
} else {
  console.log('Low confidence — review manually');
}

# Step 1: Identify the document type
IDENTIFY_RESULT=$(curl -s -X POST https://app.docutray.com/api/identify \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -F "image=@unknown_document.pdf" \
  -F 'document_type_code_options=["invoice", "receipt", "contract"]')

# Extract the detected type code.
# The variable is quoted so the JSON reaches jq verbatim (no word splitting
# or glob expansion), and printf is used because some shells' echo
# interprets backslash escapes inside the JSON.
DOC_TYPE=$(printf '%s\n' "$IDENTIFY_RESULT" | jq -r '.document_type.code')
echo "Detected type: $DOC_TYPE"

# Step 2: Convert using the detected type.
# jq -r prints the literal string "null" when the field is missing (for
# example on an error response), so skip the conversion in that case.
if [ -n "$DOC_TYPE" ] && [ "$DOC_TYPE" != "null" ]; then
  curl -X POST https://app.docutray.com/api/convert \
    -H "Authorization: Bearer YOUR_API_KEY" \
    -F "image=@unknown_document.pdf" \
    -F "document_type_code=$DOC_TYPE"
else
  echo "No document type detected — review manually" >&2
fi

Parameters

Parameter	Type	Required	Description
`file`	File	No	File to identify (path, bytes, or file object)
`url`	string	No	Public URL of the document to download and identify
`file_base64` / `base64`	string	No	Base64-encoded document content
`document_type_code_options`	string[]	Yes	List of document type codes to consider
`content_type`	string	No	MIME type of the document (auto-detected if not provided)
`document_metadata`	object	No	Custom metadata to attach to the identification

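You must provide exactly one of `file`, `url`, or `file_base64`/`base64`. Note that the cURL examples on this page upload the document as the multipart form field `image`.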

Complete Code

End-to-end example with the identify-then-convert pattern and error handling.

from pathlib import Path
from docutray import Client, NotFoundError, DocuTrayError

client = Client(api_key="YOUR_API_KEY")
DOCUMENT_TYPES = ["invoice", "receipt", "contract", "id_card"]

try:
    document = Path("incoming_document.pdf")

    # Ask which of the candidate types fits the document best
    identification = client.identify.run(
        file=document,
        document_type_code_options=DOCUMENT_TYPES,
    )

    best_match = identification.document_type
    score = best_match.confidence
    print(f"Identified as: {best_match.name} ({score:.0%})")

    # Below 90% confidence, also list the other candidates
    if score < 0.9:
        print("Alternatives:")
        for candidate in identification.alternatives:
            print(f"  - {candidate.name}: {candidate.confidence:.0%}")

    # Convert automatically only at 70% confidence or better
    if score >= 0.7:
        result = client.convert.run(
            file=document,
            document_type_code=best_match.code,
        )
        field_count = len(result.data)
        print(f"Extracted {field_count} fields")
    else:
        print("Confidence too low for automatic conversion")

except NotFoundError:
    print("One or more document types not found")
except DocuTrayError as err:
    print(f"Error: {err.message}")
finally:
    # Runs on success and on failure alike
    client.close()

import DocuTray, { NotFoundError, DocuTrayError } from 'docutray';
import { readFileSync } from 'fs';

const client = new DocuTray({ apiKey: 'YOUR_API_KEY' });
const DOCUMENT_TYPES = ['invoice', 'receipt', 'contract', 'id_card'];
const DOCUMENT_PATH = 'incoming_document.pdf';

try {
  const document = readFileSync(DOCUMENT_PATH);

  // Identify document type.
  // A Buffer carries no file name, so pass it explicitly.
  const identification = await client.identify.run({
    file: document,
    filename: DOCUMENT_PATH,
    documentTypeCodeOptions: DOCUMENT_TYPES,
  });

  const bestMatch = identification.document_type;
  console.log(`Identified as: ${bestMatch.name} (${(bestMatch.confidence * 100).toFixed(0)}%)`);

  // Show alternatives if confidence is moderate
  if (bestMatch.confidence < 0.9) {
    console.log('Alternatives:');
    for (const alt of identification.alternatives) {
      console.log(`  - ${alt.name}: ${(alt.confidence * 100).toFixed(0)}%`);
    }
  }

  // Convert if confidence is sufficient.
  // NOTE(review): convert.run may also accept `filename` for Buffer input —
  // confirm against the SDK reference.
  if (bestMatch.confidence >= 0.7) {
    const result = await client.convert.run({
      file: document,
      documentTypeCode: bestMatch.code,
    });
    console.log(`Extracted ${Object.keys(result.data).length} fields`);
  } else {
    console.log('Confidence too low for automatic conversion');
  }
} catch (error) {
  if (error instanceof NotFoundError) {
    console.error('One or more document types not found');
  } else if (error instanceof DocuTrayError) {
    console.error(`Error: ${error.message}`);
  } else {
    // Not an SDK error (for example, the file could not be read).
    // Rethrow so the failure is not silently swallowed.
    throw error;
  }
}

SDK Reference

For detailed class and method documentation:

Identify Documents

On this page