Skip to main content

Extract Data

Extract structured data from documents using AI-powered OCR and field detection. Upload a document and get back structured JSON with key-value pairs, tables, entities, and more.

POST /v1/extract

Request

Content-Type: multipart/form-data

| Parameter | Type | Required | Description |
| --- | --- | --- | --- |
| file | File(s) | Yes | Document file(s) to extract from. Supported formats: PDF, PNG, JPG, JPEG, BMP, TIFF |
| model | string | No | OCR model to use (see below) |
| schema | string | No | JSON array of target fields: [{"name":"field","type":"string"}] |
| output_format | string | No | json (default), csv, or xml |
| detail | boolean | No | Include full AI extraction detail with entities, summaries, and secondary data (default: false) |
| chunk_size | integer | No | Split OCR text into chunks of this many characters (for large documents) |
| chunk_overlap | integer | No | Number of overlapping characters between chunks (default: 0) |
| webhook_url | string | No | URL to receive results when processing completes |

OCR Models

| Model | Description |
| --- | --- |
| qomplement-OCR-v1 | Standard extraction — fast, good for most documents (default) |
| qomplement-OCR-XL-v1 | High precision — richer markdown output, better for complex layouts and tables |

Schema Field Types

When providing a schema, each field has a name and type:

| Type | Description | Example Value |
| --- | --- | --- |
| string | Text value | "INV-2025-001" |
| number | Numeric value | 1500.50 |
| date | Date value | "2025-02-26" |
| identifier | ID, reference number | "SSN-123-45-6789" |

Sync Response (200)

Returned when: single file, ≤5 pages, no webhook.

{
"id": "550e8400-e29b-41d4-a716-446655440000",
"status": "completed",
"result": {
"model": "qomplement-OCR-v1",
"document_type": "invoice",
"language": "en",
"confidence": 92,
"fields": {
"invoice_number": "INV-2025-001234",
"total_amount": "1500.50",
"vendor_name": "ABC Corp",
"invoice_date": "2025-02-26",
"due_date": "2025-03-26"
},
"tables": [
{
"headers": ["Item", "Quantity", "Price"],
"rows": [
["Widget A", "10", "$50.00"],
["Widget B", "5", "$200.00"]
]
}
],
"pages_processed": 2,
"processing_time_ms": 5234
}
}

Result Fields

| Field | Type | Description |
| --- | --- | --- |
| model | string | OCR model that was used |
| document_type | string | Detected type: invoice, receipt, contract, form, letter, report, id_card, tax_form, other, multiple |
| language | string | Detected document language (e.g., en, es, fr) |
| confidence | number | Confidence score 0–100 |
| fields | object | Extracted key-value pairs as a flat dictionary |
| tables | array | Detected tables with headers and rows |
| pages_processed | number | Number of pages processed |
| processing_time_ms | number | Processing time in milliseconds |

Detail Mode

When detail=true, the response includes additional extraction data:

{
"result": {
"fields": { ... },
"tables": [ ... ],
"detail": {
"primary_entity": {
"fields": {
"name": "ABC Corp",
"address": "123 Main St"
}
},
"secondary_entity": {
"fields": {
"name": "John Smith",
"email": "john@example.com"
}
},
"key_value_pairs": [
{ "key": "invoice_number", "value": "INV-001" }
],
"entities": {
"people": ["John Smith"],
"companies": ["ABC Corp"],
"locations": ["123 Main St, NY"],
"dates": ["2025-02-26"],
"amounts": [1500.50]
},
"summary": "Invoice from ABC Corp to John Smith for $1,500.50"
}
}
}

Chunking

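When chunk_size is provided, the response includes a chunks array for processing large documents in segments. The example below corresponds to chunk_size=1000 and chunk_overlap=200: each chunk covers 1000 characters, and the second chunk starts at offset 800, so consecutive chunks share 200 characters.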

{
"result": {
"fields": { ... },
"chunks": [
{
"index": 0,
"text": "First chunk of OCR text...",
"start": 0,
"end": 1000,
"char_count": 1000
},
{
"index": 1,
"text": "Second chunk of OCR text...",
"start": 800,
"end": 1800,
"char_count": 1000
}
]
}
}

Async Response (202)

Returned when: multiple files, >5 pages, or webhook provided.

{
"id": "550e8400-e29b-41d4-a716-446655440000",
"status": "processing",
"created_at": "2025-02-26T10:30:00Z",
"poll_url": "/v1/jobs/550e8400-e29b-41d4-a716-446655440000",
"estimated_time_seconds": 64
}

Poll GET /v1/jobs/{id} until status is completed or failed. See Jobs.


Examples

cURL

Basic extraction:

curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@invoice.pdf"

With schema:

curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@invoice.pdf" \
-F 'schema=[{"name":"invoice_number","type":"string"},{"name":"total","type":"number"},{"name":"date","type":"date"}]'

High precision model:

curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@complex_table.pdf" \
-F "model=qomplement-OCR-XL-v1"

With detail mode:

curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@contract.pdf" \
-F "detail=true"

Multiple files (async):

curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@page1.pdf" \
-F "file=@page2.pdf" \
-F "file=@page3.jpg"

Python

import requests
import time
import os

# API key is read from the environment; a missing QOMPLEMENT_API_KEY raises KeyError here.
API_KEY = os.environ["QOMPLEMENT_API_KEY"]
# Root of the v1 API; endpoint paths such as /extract and /jobs/<id> are appended to it.
BASE_URL = "https://developer-api.qomplement.com/v1"
# Bearer-token authorization header passed to each requests call in this example.
HEADERS = {"Authorization": f"Bearer {API_KEY}"}


def extract_document(file_path, schema=None, model=None, detail=False, request_timeout=120):
    """Extract structured data from a document.

    Uploads the file to POST /extract as multipart form data. If the API
    answers with a completed result it is returned directly; otherwise the
    job is polled with wait_for_job().

    Args:
        file_path: Path of the document to upload.
        schema: Optional list of {"name": ..., "type": ...} dicts; sent as a
            JSON-encoded string in the "schema" form field.
        model: Optional OCR model name for the "model" form field.
        detail: When true, sends detail="true" to request detail mode.
        request_timeout: Seconds passed to requests as the upload timeout,
            so a stalled connection cannot hang the call indefinitely.

    Returns:
        The "result" object of the completed extraction.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    # Local import: json is only needed here and is not among the file-level imports.
    import json

    data = {}
    if schema:
        data["schema"] = json.dumps(schema)
    if model:
        data["model"] = model
    if detail:
        data["detail"] = "true"

    # The file must stay open while requests streams it, so the POST lives
    # inside the with-block.
    with open(file_path, "rb") as f:
        response = requests.post(
            f"{BASE_URL}/extract",
            headers=HEADERS,
            files={"file": f},
            data=data,
            timeout=request_timeout,
        )

    response.raise_for_status()
    result = response.json()

    if result["status"] == "completed":
        return result["result"]

    # Async job — poll for results
    return wait_for_job(result["id"])


def wait_for_job(job_id, interval=3, timeout=300):
    """Poll a job until completion.

    Args:
        job_id: Identifier of the job to poll via GET /jobs/{job_id}.
        interval: Seconds to sleep between polls.
        timeout: Maximum total seconds to keep polling, measured on a
            monotonic clock so time spent inside HTTP requests counts too.

    Returns:
        The job's "result" object once its status is "completed".

    Raises:
        requests.HTTPError: If a poll request returns an error status.
        Exception: If the job reports status "failed".
        TimeoutError: If the job has not finished within `timeout` seconds.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        resp = requests.get(f"{BASE_URL}/jobs/{job_id}", headers=HEADERS, timeout=30)
        # Fail fast on 4xx/5xx instead of treating an error body as a job.
        resp.raise_for_status()
        job = resp.json()

        if job["status"] == "completed":
            return job["result"]
        elif job["status"] == "failed":
            # Tolerate a failed job that carries no error details.
            error = job.get("error") or {}
            raise Exception(f"Job failed: {error.get('message', 'unknown error')}")

        # Show progress if available
        if job.get("progress"):
            p = job["progress"]
            print(f"Progress: {p['current']}/{p['total']} {p['message']}")

        time.sleep(interval)

    raise TimeoutError("Job timed out")


# Basic extraction: print the detected type, language, and every extracted field.
result = extract_document("invoice.pdf")
print(f"Document type: {result['document_type']}")
print(f"Language: {result['language']}")
for key, value in result["fields"].items():
    print(f" {key}: {value}")

# With schema: restrict extraction to the listed target fields.
schema = [
    {"name": "invoice_number", "type": "string"},
    {"name": "total", "type": "number"},
    {"name": "date", "type": "date"},
]
result = extract_document("invoice.pdf", schema=schema)

# High precision model
result = extract_document("complex_table.pdf", model="qomplement-OCR-XL-v1")

# With detail mode: the result then carries an extra "detail" object.
result = extract_document("contract.pdf", detail=True)
print(result["detail"]["summary"])
print(result["detail"]["entities"])

JavaScript / TypeScript

const fs = require("fs");
const path = require("path");

// API key comes from the environment; it is undefined when QOMPLEMENT_API_KEY is not set.
const API_KEY = process.env.QOMPLEMENT_API_KEY;
// Root of the v1 API; endpoint paths such as /extract and /jobs/<id> are appended to it.
const BASE_URL = "https://developer-api.qomplement.com/v1";

/**
 * Upload a document to POST /extract and resolve with the extraction result.
 *
 * @param {string} filePath - Path of the document to upload.
 * @param {{schema?: object[], model?: string, detail?: boolean}} [options]
 *   Optional form fields: schema is JSON-encoded, detail is sent as "true".
 * @returns {Promise<object>} The `result` object, either straight from the
 *   response or from polling the job with waitForJob().
 * @throws {Error} When the HTTP response is not ok.
 */
async function extractDocument(filePath, options = {}) {
  const payload = new FormData();
  const contents = fs.readFileSync(filePath);
  payload.append("file", new Blob([contents]), path.basename(filePath));

  // Optional fields are only sent when the caller supplied a truthy value.
  const { schema, model, detail } = options;
  if (schema) payload.append("schema", JSON.stringify(schema));
  if (model) payload.append("model", model);
  if (detail) payload.append("detail", "true");

  const res = await fetch(`${BASE_URL}/extract`, {
    method: "POST",
    headers: { Authorization: `Bearer ${API_KEY}` },
    body: payload,
  });

  if (!res.ok) {
    const status = res.status;
    const errorBody = await res.text();
    throw new Error(`HTTP ${status}: ${errorBody}`);
  }

  const job = await res.json();

  // Any status other than "completed" is treated as an async job to poll.
  return job.status === "completed" ? job.result : await waitForJob(job.id);
}

/**
 * Poll GET /jobs/{jobId} until the job completes, fails, or the timeout elapses.
 *
 * @param {string} jobId - Identifier of the job to poll.
 * @param {number} [interval=3000] - Milliseconds to wait between polls.
 * @param {number} [timeout=300000] - Maximum milliseconds to keep polling.
 * @returns {Promise<object>} The job's `result` once status is "completed".
 * @throws {Error} On a non-ok HTTP response, a "failed" job, or timeout.
 */
async function waitForJob(jobId, interval = 3000, timeout = 300000) {
  const start = Date.now();
  while (Date.now() - start < timeout) {
    const resp = await fetch(`${BASE_URL}/jobs/${jobId}`, {
      headers: { Authorization: `Bearer ${API_KEY}` },
    });

    // Same check as extractDocument: surface HTTP errors instead of parsing
    // an error body as if it were a job.
    if (!resp.ok) {
      throw new Error(`HTTP ${resp.status}: ${await resp.text()}`);
    }
    const job = await resp.json();

    if (job.status === "completed") return job.result;
    if (job.status === "failed") {
      // A failed job without error details must not turn into a TypeError.
      throw new Error(job.error?.message ?? "Job failed");
    }

    await new Promise((r) => setTimeout(r, interval));
  }
  throw new Error("Job timed out");
}

// Usage
async function main() {
  // Basic extraction: log the detected type and the extracted fields.
  const basic = await extractDocument("invoice.pdf");
  console.log("Document type:", basic.document_type);
  console.log("Fields:", basic.fields);

  // With schema: restrict extraction to the listed target fields.
  const invoiceSchema = [
    { name: "invoice_number", type: "string" },
    { name: "total", type: "number" },
  ];
  const withSchema = await extractDocument("invoice.pdf", { schema: invoiceSchema });

  // High precision model
  const highPrecision = await extractDocument("complex_table.pdf", {
    model: "qomplement-OCR-XL-v1",
  });
}

main();

Go

package main

import (
"bytes"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"os"
"time"
)

// baseURL is the root of the v1 API; endpoint paths such as "/extract" and
// "/jobs/<id>" are appended to it.
const baseURL = "https://developer-api.qomplement.com/v1"

// extractDocument uploads the file at filePath to the /extract endpoint as
// multipart form data, authenticating with apiKey as a bearer token.
//
// If the response reports status "completed", its "result" object is returned
// directly. Otherwise the response is treated as an async job and its "id" is
// polled with waitForJob.
//
// An error is returned if the file cannot be read, the request cannot be built
// or sent, the server answers with a non-2xx status, or the response body does
// not have the expected shape.
func extractDocument(filePath string, apiKey string) (map[string]interface{}, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// Create multipart form
	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)

	part, err := writer.CreateFormFile("file", filePath)
	if err != nil {
		return nil, fmt.Errorf("creating form file: %w", err)
	}
	if _, err := io.Copy(part, file); err != nil {
		return nil, fmt.Errorf("reading %q: %w", filePath, err)
	}
	// Close writes the terminating boundary; without it the body is malformed.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("finalizing multipart body: %w", err)
	}

	// Send request
	req, err := http.NewRequest(http.MethodPost, baseURL+"/extract", body)
	if err != nil {
		return nil, fmt.Errorf("building extract request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+apiKey)
	req.Header.Set("Content-Type", writer.FormDataContentType())

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Any 2xx is accepted; anything else is reported together with a bounded
	// excerpt of the body so the caller can see the server's error message.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) // best effort: the status alone is still useful
		return nil, fmt.Errorf("extract request: unexpected HTTP status %s: %s", resp.Status, bytes.TrimSpace(excerpt))
	}

	var result map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, fmt.Errorf("decoding extract response: %w", err)
	}

	if result["status"] == "completed" {
		data, ok := result["result"].(map[string]interface{})
		if !ok {
			return nil, fmt.Errorf("completed response has no result object")
		}
		return data, nil
	}

	// Async — poll for results
	jobID, ok := result["id"].(string)
	if !ok {
		return nil, fmt.Errorf("response has no job id")
	}
	return waitForJob(jobID, apiKey)
}

// waitForJob polls the /jobs/<jobID> endpoint up to 100 times, sleeping three
// seconds after each unfinished poll, and returns the job's "result" object
// once its status is "completed".
//
// It returns an error if a poll request fails or answers with a non-2xx
// status, if the job reports status "failed", if a completed job carries no
// result object, or if the job is still unfinished after the last attempt.
func waitForJob(jobID string, apiKey string) (map[string]interface{}, error) {
	const (
		maxAttempts  = 100
		pollInterval = 3 * time.Second
	)

	for i := 0; i < maxAttempts; i++ {
		job, err := fetchJob(jobID, apiKey)
		if err != nil {
			return nil, err
		}

		switch job["status"] {
		case "completed":
			data, ok := job["result"].(map[string]interface{})
			if !ok {
				return nil, fmt.Errorf("completed job %s has no result object", jobID)
			}
			return data, nil
		case "failed":
			return nil, fmt.Errorf("job failed: %v", job["error"])
		}

		time.Sleep(pollInterval)
	}
	return nil, fmt.Errorf("job timed out")
}

// fetchJob performs one authenticated GET on the job's status URL and decodes
// the JSON body. The response body is closed before returning.
func fetchJob(jobID string, apiKey string) (map[string]interface{}, error) {
	req, err := http.NewRequest(http.MethodGet, baseURL+"/jobs/"+jobID, nil)
	if err != nil {
		return nil, fmt.Errorf("building job request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+apiKey)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Do not decode an error body as if it were a job description.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return nil, fmt.Errorf("polling job %s: unexpected HTTP status %s", jobID, resp.Status)
	}

	var job map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&job); err != nil {
		return nil, fmt.Errorf("decoding job %s: %w", jobID, err)
	}
	return job, nil
}

// main extracts "invoice.pdf" using the key from QOMPLEMENT_API_KEY and prints
// the detected document type and the extracted fields, or the error if the
// extraction fails.
func main() {
	key := os.Getenv("QOMPLEMENT_API_KEY")

	extracted, err := extractDocument("invoice.pdf", key)
	if err != nil {
		fmt.Printf("Error: %v\n", err)
		return
	}

	fmt.Printf("Document type: %v\n", extracted["document_type"])
	fmt.Printf("Fields: %v\n", extracted["fields"])
}