Skip to main content

Basic Usage

import json
from datalab_sdk import DatalabClient, ExtractOptions

client = DatalabClient()

# Describe the fields to pull out of the invoice as a JSON schema.
invoice_fields = {
    "invoice_number": {"type": "string", "description": "Invoice number"},
    "total": {"type": "number", "description": "Total amount due"},
    "vendor": {"type": "string", "description": "Vendor or company name"},
    "items": {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "description": {"type": "string"},
                "amount": {"type": "number"},
            },
        },
    },
}

result = client.extract(
    "invoice.pdf",
    options=ExtractOptions(page_schema=json.dumps(invoice_fields)),
)

# The extracted data comes back as a JSON string; decode it to a dict.
print(json.loads(result.extraction_schema_json))

Extract Options

Use ExtractOptions to configure extraction behavior:
| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `page_schema` | `str` | Required | JSON schema defining the fields to extract |
| `checkpoint_id` | `str` | `None` | Checkpoint ID from a previous `convert()` call |
| `mode` | `str` | `"fast"` | Processing mode: `"fast"`, `"balanced"`, `"accurate"` |
| `output_format` | `str` | `"markdown"` | Output format: `"markdown"`, `"html"`, `"json"`, `"chunks"` |
| `save_checkpoint` | `bool` | `False` | Save checkpoint for reuse with subsequent calls |
| `max_pages` | `int` | `None` | Maximum number of pages to process |
| `page_range` | `str` | `None` | Specific pages to process (e.g., `"0-5,10"`). For spreadsheets, filters by sheet index. |
| `skip_cache` | `bool` | `False` | Skip cached results, force reprocessing |
| `webhook_url` | `str` | `None` | Webhook URL for completion notification |

Checkpoint Reuse

Use checkpoints to avoid re-parsing a document when running extraction after conversion. First convert with save_checkpoint=True, then extract using the returned checkpoint_id:
import json
from datalab_sdk import DatalabClient, ConvertOptions, ExtractOptions

client = DatalabClient()

# Step 1: Convert with save_checkpoint=True so the parsed document is kept
# server-side for reuse.
convert_result = client.convert(
    "report.pdf",
    options=ConvertOptions(mode="accurate", save_checkpoint=True),
)
print(convert_result.markdown)

# Step 2: Run extraction against the saved checkpoint — no re-parsing needed.
metadata_schema = json.dumps({
    "title": {"type": "string", "description": "Document title"},
    "author": {"type": "string", "description": "Author name"},
    "date": {"type": "string", "description": "Publication date"},
    "summary": {"type": "string", "description": "Brief summary of the document"},
})

extract_result = client.extract(
    "report.pdf",
    options=ExtractOptions(
        page_schema=metadata_schema,
        checkpoint_id=convert_result.checkpoint_id,
    ),
)
print(json.loads(extract_result.extraction_schema_json))

Extraction Result

The result object contains the extracted data alongside standard conversion fields:
result = client.extract("invoice.pdf", options=options)

# extraction_schema_json holds the structured data as a JSON string.
data = json.loads(result.extraction_schema_json)
print(data["invoice_number"])
print(data["total"])

# The standard conversion fields are populated alongside the extraction.
print(result.success)
print(result.markdown)
print(result.page_count)
print(result.cost_breakdown)

Async Usage

import asyncio
import json
from datalab_sdk import AsyncDatalabClient, ExtractOptions

async def extract_data():
    """Extract title and author from document.pdf with the async client."""
    schema = json.dumps({
        "title": {"type": "string", "description": "Document title"},
        "author": {"type": "string", "description": "Author name"},
    })
    async with AsyncDatalabClient() as client:
        result = await client.extract(
            "document.pdf",
            options=ExtractOptions(page_schema=schema),
        )
        return json.loads(result.extraction_schema_json)

print(asyncio.run(extract_data()))

Next Steps

Extraction Recipe

Learn more about structured extraction patterns and best practices.

Document Segmentation

Segment documents into logical sections using schemas.

Document Conversion

Convert documents to Markdown, HTML, JSON, or chunks.

Batch Processing

Process multiple documents efficiently in parallel.