Extract Data
Extract structured data from documents using AI-powered OCR and field detection. Upload a document and get back structured JSON with key-value pairs, tables, entities, and more.
POST /v1/extract
Request
Content-Type: multipart/form-data
| Parameter | Type | Required | Description |
|---|---|---|---|
file | File(s) | Yes | Document file(s) to extract from. Supported formats: PDF, PNG, JPG, JPEG, BMP, TIFF. Repeat the field to upload multiple files. |
model | string | No | OCR model to use (see below) |
schema | string | No | JSON array of target fields: [{"name":"field","type":"string"}] |
output_format | string | No | json (default), csv, or xml |
detail | boolean | No | Include full AI extraction detail with entities, summaries, and secondary data (default: false) |
chunk_size | integer | No | Split OCR text into chunks of this many characters (for large documents) |
chunk_overlap | integer | No | Number of overlapping characters between chunks (default: 0) |
webhook_url | string | No | URL to receive results when processing completes |
OCR Models
| Model | Description |
|---|---|
qomplement-OCR-v1 | Standard extraction — fast, good for most documents (default) |
qomplement-OCR-XL-v1 | High precision — richer markdown output, better for complex layouts and tables |
Schema Field Types
When providing a schema, each field has a name and type:
| Type | Description | Example Value |
|---|---|---|
string | Text value | "INV-2025-001" |
number | Numeric value | 1500.50 |
date | Date value | "2025-02-26" |
identifier | ID, reference number | "SSN-123-45-6789" |
Sync Response (200)
Returned when all of the following hold: a single file is uploaded, it has at most 5 pages, and no webhook_url is provided.
{
"id": "550e8400-e29b-41d4-a716-446655440000",
"status": "completed",
"result": {
"model": "qomplement-OCR-v1",
"document_type": "invoice",
"language": "en",
"confidence": 92,
"fields": {
"invoice_number": "INV-2025-001234",
"total_amount": "1500.50",
"vendor_name": "ABC Corp",
"invoice_date": "2025-02-26",
"due_date": "2025-03-26"
},
"tables": [
{
"headers": ["Item", "Quantity", "Price"],
"rows": [
["Widget A", "10", "$50.00"],
["Widget B", "5", "$200.00"]
]
}
],
"pages_processed": 2,
"processing_time_ms": 5234
}
}
Result Fields
| Field | Type | Description |
|---|---|---|
model | string | OCR model that was used |
document_type | string | Detected type: invoice, receipt, contract, form, letter, report, id_card, tax_form, other, multiple |
language | string | Detected document language (e.g., en, es, fr) |
confidence | number | Confidence score 0–100 |
fields | object | Extracted key-value pairs as a flat dictionary |
tables | array | Detected tables with headers and rows |
pages_processed | number | Number of pages processed |
processing_time_ms | number | Processing time in milliseconds |
Detail Mode
When detail=true, the response includes additional extraction data:
{
"result": {
"fields": { ... },
"tables": [ ... ],
"detail": {
"primary_entity": {
"fields": {
"name": "ABC Corp",
"address": "123 Main St"
}
},
"secondary_entity": {
"fields": {
"name": "John Smith",
"email": "john@example.com"
}
},
"key_value_pairs": [
{ "key": "invoice_number", "value": "INV-001" }
],
"entities": {
"people": ["John Smith"],
"companies": ["ABC Corp"],
"locations": ["123 Main St, NY"],
"dates": ["2025-02-26"],
"amounts": [1500.50]
},
"summary": "Invoice from ABC Corp to John Smith for $1,500.50"
}
}
}
Chunking
When chunk_size is provided, the response includes a chunks array for processing large documents in segments:
{
"result": {
"fields": { ... },
"chunks": [
{
"index": 0,
"text": "First chunk of OCR text...",
"start": 0,
"end": 1000,
"char_count": 1000
},
{
"index": 1,
"text": "Second chunk of OCR text...",
"start": 800,
"end": 1800,
"char_count": 1000
}
]
}
}
Async Response (202)
Returned when any of the following holds: multiple files are uploaded, the document has more than 5 pages, or a webhook_url is provided.
{
"id": "550e8400-e29b-41d4-a716-446655440000",
"status": "processing",
"created_at": "2025-02-26T10:30:00Z",
"poll_url": "/v1/jobs/550e8400-e29b-41d4-a716-446655440000",
"estimated_time_seconds": 64
}
Poll GET /v1/jobs/{id} until status is completed or failed. See Jobs.
Examples
cURL
Basic extraction:
curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@invoice.pdf"
With schema:
curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@invoice.pdf" \
-F 'schema=[{"name":"invoice_number","type":"string"},{"name":"total","type":"number"},{"name":"date","type":"date"}]'
High precision model:
curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@complex_table.pdf" \
-F "model=qomplement-OCR-XL-v1"
With detail mode:
curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@contract.pdf" \
-F "detail=true"
Multiple files (async):
curl -X POST https://developer-api.qomplement.com/v1/extract \
-H "Authorization: Bearer $QOMPLEMENT_API_KEY" \
-F "file=@page1.pdf" \
-F "file=@page2.pdf" \
-F "file=@page3.jpg"
Python
import requests
import time
import os
# API key is read from the environment; a missing QOMPLEMENT_API_KEY raises KeyError here.
API_KEY = os.environ["QOMPLEMENT_API_KEY"]
# Root URL that the endpoint paths below are appended to.
BASE_URL = "https://developer-api.qomplement.com/v1"
# Bearer-token header passed to every request in this example.
HEADERS = {"Authorization": f"Bearer {API_KEY}"}
def extract_document(file_path, schema=None, model=None, detail=False):
    """Upload one document to the extract endpoint and return its result.

    Args:
        file_path: Path of the document to upload.
        schema: Optional list of target-field dicts; sent JSON-encoded.
        model: Optional OCR model name.
        detail: When true, sends ``detail=true`` with the request.

    Returns:
        The ``result`` object of the response. If the API answers with a
        status other than ``completed``, the job is polled via
        ``wait_for_job`` and its result is returned instead.

    Raises:
        requests.HTTPError: If the upload request returns an error status.
    """
    with open(file_path, "rb") as document:
        form_fields = {}
        if schema:
            import json

            form_fields["schema"] = json.dumps(schema)
        if model:
            form_fields["model"] = model
        if detail:
            form_fields["detail"] = "true"

        response = requests.post(
            f"{BASE_URL}/extract",
            headers=HEADERS,
            files={"file": document},
            data=form_fields,
        )

    response.raise_for_status()
    payload = response.json()

    if payload["status"] != "completed":
        # Async job — poll for results
        return wait_for_job(payload["id"])
    return payload["result"]
def wait_for_job(job_id, interval=3, timeout=300):
    """Poll a job until it completes, fails, or the timeout expires.

    Args:
        job_id: Identifier of the job to poll.
        interval: Seconds to sleep between polls.
        timeout: Maximum wall-clock seconds to keep polling.

    Returns:
        The ``result`` object of the completed job.

    Raises:
        requests.HTTPError: If a poll request returns an error status.
        Exception: If the job reports status ``failed``.
        TimeoutError: If the job has not finished within ``timeout`` seconds.
    """
    # Measure against a monotonic deadline so time spent inside the HTTP
    # requests counts toward the timeout, not only the time spent sleeping.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        resp = requests.get(f"{BASE_URL}/jobs/{job_id}", headers=HEADERS)
        # Surface HTTP errors directly instead of failing later with a
        # KeyError on an error payload that has no "status" field.
        resp.raise_for_status()
        job = resp.json()

        status = job.get("status")
        if status == "completed":
            return job["result"]
        if status == "failed":
            error = job.get("error") or {}
            raise Exception(f"Job failed: {error.get('message', 'unknown error')}")

        # Show progress if available
        if job.get("progress"):
            p = job["progress"]
            print(f"Progress: {p['current']}/{p['total']} — {p['message']}")

        time.sleep(interval)

    raise TimeoutError("Job timed out")
# Basic extraction: print the detected type and language, then each extracted field.
result = extract_document("invoice.pdf")
print(f"Document type: {result['document_type']}")
print(f"Language: {result['language']}")
fields = result["fields"]
for key in fields:
    value = fields[key]
    print(f" {key}: {value}")

# With schema: restrict extraction to the listed target fields.
schema = [
    {"name": name, "type": kind}
    for name, kind in (
        ("invoice_number", "string"),
        ("total", "number"),
        ("date", "date"),
    )
]
result = extract_document("invoice.pdf", schema=schema)

# High precision model
result = extract_document("complex_table.pdf", model="qomplement-OCR-XL-v1")

# With detail mode: the result carries an extra "detail" object.
result = extract_document("contract.pdf", detail=True)
detail = result["detail"]
print(detail["summary"])
print(detail["entities"])
JavaScript / TypeScript
const fs = require("fs");
const path = require("path");
// API key is read from the environment (undefined when QOMPLEMENT_API_KEY is not set).
const API_KEY = process.env.QOMPLEMENT_API_KEY;
// Root URL that the endpoint paths below are appended to.
const BASE_URL = "https://developer-api.qomplement.com/v1";
/**
 * Upload one document to the extract endpoint and return its result.
 *
 * @param {string} filePath - Path of the document to upload.
 * @param {object} [options] - Optional `schema` (sent JSON-encoded), `model`, and `detail`.
 * @returns {Promise<object>} The `result` object; when the response status is
 *   not "completed", the job is polled via `waitForJob` instead.
 * @throws {Error} If the upload request returns a non-OK HTTP status.
 */
async function extractDocument(filePath, options = {}) {
  const { schema, model, detail } = options;

  const form = new FormData();
  const contents = fs.readFileSync(filePath);
  form.append("file", new Blob([contents]), path.basename(filePath));
  if (schema) form.append("schema", JSON.stringify(schema));
  if (model) form.append("model", model);
  if (detail) form.append("detail", "true");

  const response = await fetch(`${BASE_URL}/extract`, {
    method: "POST",
    headers: { Authorization: `Bearer ${API_KEY}` },
    body: form,
  });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status}: ${await response.text()}`);
  }

  const { status, result, id } = await response.json();
  // Anything other than "completed" is an async job — poll for results.
  return status === "completed" ? result : waitForJob(id);
}
/**
 * Poll a job until it completes, fails, or the timeout expires.
 *
 * @param {string} jobId - Identifier of the job to poll.
 * @param {number} [interval=3000] - Milliseconds to wait between polls.
 * @param {number} [timeout=300000] - Maximum milliseconds to keep polling.
 * @returns {Promise<object>} The `result` object of the completed job.
 * @throws {Error} If a poll returns a non-OK HTTP status, the job reports
 *   status "failed", or the timeout expires.
 */
async function waitForJob(jobId, interval = 3000, timeout = 300000) {
  const start = Date.now();
  while (Date.now() - start < timeout) {
    const resp = await fetch(`${BASE_URL}/jobs/${jobId}`, {
      headers: { Authorization: `Bearer ${API_KEY}` },
    });
    // Fail fast on HTTP errors (same check as extractDocument); otherwise an
    // auth or server error would be polled silently until the timeout.
    if (!resp.ok) {
      throw new Error(`HTTP ${resp.status}: ${await resp.text()}`);
    }

    const job = await resp.json();
    if (job.status === "completed") return job.result;
    if (job.status === "failed") {
      // A failed job without an error object still yields a readable Error
      // rather than a TypeError from dereferencing undefined.
      throw new Error(job.error?.message ?? "Job failed");
    }

    await new Promise((resolve) => setTimeout(resolve, interval));
  }
  throw new Error("Job timed out");
}
// Usage: each call below uploads a document and waits for its result.
(async function runExamples() {
  // Basic extraction
  const invoice = await extractDocument("invoice.pdf");
  console.log("Document type:", invoice.document_type);
  console.log("Fields:", invoice.fields);

  // With schema: restrict extraction to the listed target fields.
  const targetFields = [
    { name: "invoice_number", type: "string" },
    { name: "total", type: "number" },
  ];
  const withSchema = await extractDocument("invoice.pdf", { schema: targetFields });

  // High precision model
  const highPrecision = await extractDocument("complex_table.pdf", {
    model: "qomplement-OCR-XL-v1",
  });
})();
Go
package main
import (
"bytes"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"os"
"time"
)
// baseURL is the root URL that the endpoint paths below are appended to.
const baseURL = "https://developer-api.qomplement.com/v1"
// extractDocument uploads the file at filePath to the /extract endpoint as a
// multipart form and returns the "result" object of the response.
//
// When the response status is not "completed", the job ID from the response
// is handed to waitForJob and that job's result is returned instead.
//
// An error is returned if the file cannot be read, the request cannot be
// built or sent, the server answers with a non-2xx status, or the response
// body is not the expected JSON shape.
func extractDocument(filePath string, apiKey string) (map[string]interface{}, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// FileInfo.Name is the base name, so directory components of filePath are
	// not sent to the server as part of the upload's filename.
	info, err := file.Stat()
	if err != nil {
		return nil, err
	}

	// Create multipart form
	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)
	part, err := writer.CreateFormFile("file", info.Name())
	if err != nil {
		return nil, err
	}
	if _, err := io.Copy(part, file); err != nil {
		return nil, fmt.Errorf("reading %q: %w", filePath, err)
	}
	// Close writes the terminating boundary; without it the body is malformed.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("finalizing multipart body: %w", err)
	}

	// Send request
	req, err := http.NewRequest(http.MethodPost, baseURL+"/extract", body)
	if err != nil {
		return nil, fmt.Errorf("building extract request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+apiKey)
	req.Header.Set("Content-Type", writer.FormDataContentType())

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		// Include a bounded slice of the body so the server's message is visible.
		msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return nil, fmt.Errorf("extract request failed: HTTP %d: %s", resp.StatusCode, bytes.TrimSpace(msg))
	}

	var result map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, fmt.Errorf("decoding extract response: %w", err)
	}

	if result["status"] == "completed" {
		data, ok := result["result"].(map[string]interface{})
		if !ok {
			return nil, fmt.Errorf("extract response has no result object")
		}
		return data, nil
	}

	// Async — poll for results
	id, ok := result["id"].(string)
	if !ok {
		return nil, fmt.Errorf("extract response has no job id")
	}
	return waitForJob(id, apiKey)
}
// waitForJob polls GET /jobs/{jobID} until the job reports status "completed"
// or "failed", making at most 100 attempts with a 3-second pause between them.
//
// It returns the job's "result" object on completion. An error is returned if
// the job fails, a poll cannot be sent or answers with a non-2xx status, the
// response is not the expected JSON shape, or the attempts are exhausted.
func waitForJob(jobID string, apiKey string) (map[string]interface{}, error) {
	const (
		maxAttempts  = 100
		pollInterval = 3 * time.Second
	)

	for attempt := 0; attempt < maxAttempts; attempt++ {
		// Pause between attempts only, so there is no wasted sleep after the
		// final poll before reporting the timeout.
		if attempt > 0 {
			time.Sleep(pollInterval)
		}

		job, err := fetchJob(jobID, apiKey)
		if err != nil {
			return nil, err
		}

		switch job["status"] {
		case "completed":
			data, ok := job["result"].(map[string]interface{})
			if !ok {
				return nil, fmt.Errorf("job %s completed without a result object", jobID)
			}
			return data, nil
		case "failed":
			return nil, fmt.Errorf("job failed: %v", job["error"])
		}
	}
	return nil, fmt.Errorf("job timed out")
}

// fetchJob performs a single poll of the job endpoint and returns the decoded
// JSON body, or an error for transport failures, non-2xx statuses, and bodies
// that are not valid JSON.
func fetchJob(jobID string, apiKey string) (map[string]interface{}, error) {
	req, err := http.NewRequest(http.MethodGet, baseURL+"/jobs/"+jobID, nil)
	if err != nil {
		return nil, fmt.Errorf("building job request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+apiKey)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		// Include a bounded slice of the body so the server's message is visible.
		msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return nil, fmt.Errorf("polling job %s: HTTP %d: %s", jobID, resp.StatusCode, bytes.TrimSpace(msg))
	}

	var job map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&job); err != nil {
		return nil, fmt.Errorf("decoding job response: %w", err)
	}
	return job, nil
}
// main extracts invoice.pdf using the API key from the QOMPLEMENT_API_KEY
// environment variable and prints the detected document type and fields.
// On failure it prints the error and returns.
func main() {
	doc, extractErr := extractDocument("invoice.pdf", os.Getenv("QOMPLEMENT_API_KEY"))
	if extractErr != nil {
		fmt.Println("Error:", extractErr)
		return
	}

	documentType, fields := doc["document_type"], doc["fields"]
	fmt.Println("Document type:", documentType)
	fmt.Println("Fields:", fields)
}