Skip to main content

Basic Usage

import json
from datalab_sdk import DatalabClient, SegmentOptions

# Construct the client with no arguments
client = DatalabClient()

# Sections to look for, as name -> description. SegmentOptions receives the
# schema as a JSON string, so the structure is serialized with json.dumps.
section_descriptions = {
    "introduction": "Introduction and overview",
    "methodology": "Methods and approach",
    "results": "Findings and results",
    "conclusion": "Summary and conclusions",
    "references": "Bibliography and references",
}
segmentation_schema = json.dumps({
    "sections": [
        {"name": name, "description": description}
        for name, description in section_descriptions.items()
    ]
})

options = SegmentOptions(segmentation_schema=segmentation_schema)
result = client.segment("research_paper.pdf", options=options)

# Print one line per segment: its name followed by its page range
segments = result.segmentation_results
for segment in segments:
    name, page_range = segment["name"], segment["page_range"]
    print(f"{name}: pages {page_range}")

Segment Options

Use SegmentOptions to configure segmentation behavior:
| Option | Type | Default | Description |
| --- | --- | --- | --- |
| `segmentation_schema` | `str` | Required | JSON schema defining segment names and descriptions |
| `checkpoint_id` | `str` | `None` | Checkpoint ID from a previous `convert()` call |
| `mode` | `str` | `"fast"` | Processing mode: `"fast"`, `"balanced"`, `"accurate"` |
| `save_checkpoint` | `bool` | `False` | Save checkpoint for reuse with subsequent calls |
| `max_pages` | `int` | `None` | Maximum number of pages to process |
| `page_range` | `str` | `None` | Specific pages to process (e.g., `"0-5,10"`). For spreadsheets, filters by sheet index. |
| `skip_cache` | `bool` | `False` | Skip cached results, force reprocessing |
| `webhook_url` | `str` | `None` | Webhook URL for completion notification |

Checkpoint Reuse

Use checkpoints to avoid re-parsing a document when running segmentation after conversion. First convert with save_checkpoint=True, then segment using the returned checkpoint_id:
import json
from datalab_sdk import DatalabClient, ConvertOptions, SegmentOptions

client = DatalabClient()

# Step 1: Convert the document, asking for a checkpoint to be saved
convert_options = ConvertOptions(mode="accurate", save_checkpoint=True)
convert_result = client.convert("report.pdf", options=convert_options)
print(convert_result.markdown)

# Step 2: Segment the same document, passing the checkpoint ID from step 1
# so the document does not have to be parsed again
report_sections = {
    "sections": [
        {"name": "executive_summary", "description": "Executive summary"},
        {"name": "financials", "description": "Financial data and analysis"},
        {"name": "outlook", "description": "Future outlook and projections"},
    ]
}
segmentation_schema = json.dumps(report_sections)

segment_options = SegmentOptions(
    segmentation_schema=segmentation_schema,
    checkpoint_id=convert_result.checkpoint_id,
)
segment_result = client.segment("report.pdf", options=segment_options)
print(segment_result.segmentation_results)

Segmentation Result

The result object contains the segmentation data alongside standard conversion fields:
# `client` and `options` are expected to be defined already
result = client.segment("document.pdf", options=options)

# Segmentation results: each entry is read by its "name" and "page_range" keys
segments = result.segmentation_results
for segment in segments:
    name = segment["name"]
    print(f"Section: {name}")
    page_range = segment["page_range"]
    print(f"  Pages: {page_range}")

# Standard conversion fields are also available as attributes of the result
for field_name in ("success", "markdown", "page_count", "cost_breakdown"):
    print(getattr(result, field_name))

Async Usage

import asyncio
import json
from datalab_sdk import AsyncDatalabClient, SegmentOptions

async def segment_document():
    """Segment "document.pdf" with the async client and return its segmentation results.

    The schema asks for three sections: introduction, body, and conclusion.
    """
    section_pairs = (
        ("introduction", "Introduction"),
        ("body", "Main content"),
        ("conclusion", "Conclusion"),
    )
    # Everything runs inside the client's async context, including the return
    async with AsyncDatalabClient() as client:
        schema = {
            "sections": [
                {"name": name, "description": description}
                for name, description in section_pairs
            ]
        }
        options = SegmentOptions(segmentation_schema=json.dumps(schema))
        result = await client.segment("document.pdf", options=options)
        return result.segmentation_results

# Drive the coroutine to completion and show what it returned
segments = asyncio.run(segment_document())
print(segments)

Next Steps

Segmentation Recipe

Learn more about document segmentation patterns and use cases.

Structured Extraction

Extract structured data from documents using JSON schemas.

Document Conversion

Convert documents to Markdown, HTML, JSON, or chunks.