Skip to main content

Basic Usage

import json
from datalab_sdk import DatalabClient, SegmentOptions

client = DatalabClient()

# Each (name, description) pair becomes one entry in the schema's "sections" list
section_specs = [
    ("introduction", "Introduction and overview"),
    ("methodology", "Methods and approach"),
    ("results", "Findings and results"),
    ("conclusion", "Summary and conclusions"),
    ("references", "Bibliography and references"),
]

# The schema is passed to SegmentOptions as a JSON string, so serialize it here
segmentation_schema = json.dumps(
    {
        "sections": [
            {"name": name, "description": description}
            for name, description in section_specs
        ]
    }
)

options = SegmentOptions(segmentation_schema=segmentation_schema)
result = client.segment("research_paper.pdf", options=options)

# Print one line per segment: its name followed by its page range
segments = result.segmentation_results
for segment in segments:
    section_name = segment["name"]
    section_pages = segment["page_range"]
    print(f"{section_name}: pages {section_pages}")

Segment Options

Use SegmentOptions to configure segmentation behavior:
| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `segmentation_schema` | `str` | Required | JSON schema defining segment names and descriptions |
| `checkpoint_id` | `str` | `None` | Checkpoint ID from a previous `convert()` call |
| `mode` | `str` | `"fast"` | Processing mode: `"fast"`, `"balanced"`, `"accurate"` |
| `save_checkpoint` | `bool` | `False` | Save checkpoint for reuse with subsequent calls |
| `max_pages` | `int` | `None` | Maximum number of pages to process |
| `page_range` | `str` | `None` | Specific pages to process (e.g., `"0-5,10"`) |
| `skip_cache` | `bool` | `False` | Skip cached results, force reprocessing |
| `webhook_url` | `str` | `None` | Webhook URL for completion notification |

Checkpoint Workflow

Use checkpoints to avoid re-parsing a document when running segmentation after conversion. First convert with save_checkpoint=True, then segment using the returned checkpoint_id:
import json
from datalab_sdk import DatalabClient, ConvertOptions, SegmentOptions

client = DatalabClient()

# Step 1: Convert once and ask for a checkpoint to be saved
convert_options = ConvertOptions(mode="accurate", save_checkpoint=True)
convert_result = client.convert("report.pdf", options=convert_options)
print(convert_result.markdown)

# Step 2: Segment from the saved checkpoint (no re-parsing needed)
report_sections = {
    "sections": [
        {"name": "executive_summary", "description": "Executive summary"},
        {"name": "financials", "description": "Financial data and analysis"},
        {"name": "outlook", "description": "Future outlook and projections"},
    ]
}
segmentation_schema = json.dumps(report_sections)

# Passing the checkpoint_id from the convert result links the two calls
segment_options = SegmentOptions(
    segmentation_schema=segmentation_schema,
    checkpoint_id=convert_result.checkpoint_id,
)
segment_result = client.segment("report.pdf", options=segment_options)
print(segment_result.segmentation_results)

Segmentation Result

The result object contains the segmentation data alongside standard conversion fields:
# NOTE: `client` and `options` are not defined in this snippet; it assumes they
# were created as in the Basic Usage example.
result = client.segment("document.pdf", options=options)

# Segmentation results (list of segments with names and page ranges)
segments = result.segmentation_results
for segment in segments:
    # Each segment is read by key: "name" and "page_range"
    print(f"Section: {segment['name']}")
    print(f"  Pages: {segment['page_range']}")

# Standard conversion fields are also available on the same result object
print(result.success)
print(result.markdown)
print(result.page_count)
print(result.cost_breakdown)

Async Usage

import asyncio
import json
from datalab_sdk import AsyncDatalabClient, SegmentOptions

async def segment_document():
    """Segment ``document.pdf`` asynchronously and return its segments.

    Opens an ``AsyncDatalabClient`` as an async context manager, builds a
    three-section schema (introduction, body, conclusion), awaits
    ``client.segment`` and returns the ``segmentation_results`` attribute of
    the response.
    """
    async with AsyncDatalabClient() as sdk:
        schema = {
            "sections": [
                {"name": "introduction", "description": "Introduction"},
                {"name": "body", "description": "Main content"},
                {"name": "conclusion", "description": "Conclusion"},
            ]
        }
        # The schema is handed to SegmentOptions as a JSON string
        segment_options = SegmentOptions(segmentation_schema=json.dumps(schema))
        response = await sdk.segment("document.pdf", options=segment_options)
        return response.segmentation_results

# asyncio.run() runs the coroutine to completion on a new event loop and
# returns whatever segment_document() returned.
segments = asyncio.run(segment_document())
print(segments)

Next Steps