Skip to main content
Automatically identify and split PDFs that contain multiple documents (like batch-scanned files) into their component parts.

SDK Usage

import json
from datalab_sdk import DatalabClient, ConvertOptions

# Client constructed with no explicit arguments.
# NOTE(review): presumably credentials come from the environment — confirm.
client = DatalabClient()

# Segmentation schema with no predefined segment types.
segmentation_schema = {"segments": []}

# The schema is handed to ConvertOptions as a JSON string, not as a dict.
options = ConvertOptions(
    mode="balanced",
    segmentation_schema=json.dumps(segmentation_schema),
)

result = client.convert("combined_documents.pdf", options=options)

# Report the name and page list of every segment in the result.
segments = result.segmentation_results["segments"]
for segment in segments:
    print(f"{segment['name']}: pages {segment['pages']}")

When to Use

Segmentation is useful when:
  • Batch-scanned documents are combined into a single PDF
  • Multiple document types are stapled together
  • You need to apply different processing to different sections

Response Format

{
  "segmentation_results": {
    "segments": [
      {
        "name": "Research Paper",
        "pages": [0, 1, 2],
        "confidence": "medium"
      },
      {
        "name": "Invoice",
        "pages": [3, 4],
        "confidence": "high"
      }
    ],
    "metadata": {
      "total_pages": 5,
      "segmentation_method": "auto_detected"
    }
  }
}

Process Each Segment

After segmentation, process each segment separately by passing its pages as the `page_range` of a second request:
import json
from datalab_sdk import DatalabClient, ConvertOptions

client = DatalabClient()

# Pass 1: request segmentation of the combined PDF.
seg_options = ConvertOptions(
    mode="balanced",
    segmentation_schema=json.dumps({"segments": []}),
)
result = client.convert("combined.pdf", options=seg_options)

# Extraction schemas (JSON Schema style), keyed by the segment name they
# apply to. Segments whose name is not listed here are skipped below.
extraction_schemas = {
    "Invoice": {
        "type": "object",
        "properties": {
            "invoice_number": {"type": "string"},
            "total": {"type": "number"},
        },
    },
    "Contract": {
        "type": "object",
        "properties": {
            "parties": {"type": "array", "items": {"type": "string"}},
            "effective_date": {"type": "string"},
        },
    },
}

extracted_data = {}

# Pass 2: convert the same file again, once per recognised segment,
# restricted to that segment's pages.
for segment in result.segmentation_results["segments"]:
    segment_name, pages = segment["name"], segment["pages"]

    schema = extraction_schemas.get(segment_name)
    if not schema:
        # No extraction schema registered under this name.
        continue

    options = ConvertOptions(
        page_schema=json.dumps(schema),
        # Comma-separated page list, e.g. [3, 4] -> "3,4".
        page_range=",".join(map(str, pages)),
        mode="balanced",
    )
    seg_result = client.convert("combined.pdf", options=options)

    # NOTE: results are keyed by segment name, so if two segments share a
    # name the later one replaces the earlier entry.
    extracted_data[segment_name] = json.loads(seg_result.extraction_schema_json)

print(extracted_data)

REST API

# Upload the combined PDF as multipart form data together with an empty
# segmentation schema. Long-form curl options are used for readability.
curl --request POST https://www.datalab.to/api/v1/marker \
  --header "X-API-Key: YOUR_API_KEY" \
  --form "file=@combined_documents.pdf" \
  --form "output_format=markdown" \
  --form "mode=balanced" \
  --form 'segmentation_schema={"segments": []}'

Python Example

import requests
import json
import time

API_KEY = "YOUR_API_KEY"
headers = {"X-API-Key": API_KEY}

# Seconds to wait on any single HTTP call before giving up; without a
# timeout, requests can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 60
# Polling budget: at most MAX_POLLS status checks, POLL_INTERVAL seconds apart.
MAX_POLLS = 300
POLL_INTERVAL = 2

# Submit segmentation request
with open("combined.pdf", "rb") as f:
    response = requests.post(
        "https://www.datalab.to/api/v1/marker",
        files={"file": ("combined.pdf", f, "application/pdf")},
        data={
            "output_format": "markdown",
            "mode": "balanced",
            "segmentation_schema": json.dumps({"segments": []}),
        },
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )

# Surface 4xx/5xx responses as an HTTPError here instead of failing later
# while reading "request_check_url" out of an error body.
response.raise_for_status()
check_url = response.json()["request_check_url"]

# Poll for results. The loop is bounded so a job that never reaches a
# terminal status cannot hang the script forever.
for _ in range(MAX_POLLS):
    poll_response = requests.get(
        check_url, headers=headers, timeout=REQUEST_TIMEOUT
    )
    poll_response.raise_for_status()
    result = poll_response.json()

    if result["status"] == "complete":
        segments = result["segmentation_results"]["segments"]
        for seg in segments:
            print(f"{seg['name']}: pages {seg['pages']}")
        break
    elif result["status"] == "failed":
        print(f"Error: {result.get('error')}")
        break

    time.sleep(POLL_INTERVAL)
else:
    # Reached only when the loop ran out of attempts without a break.
    raise TimeoutError(
        f"Segmentation did not finish after {MAX_POLLS} status checks"
    )

Custom Segmentation Schema

Define expected segment types for better accuracy:
# Expected segment types, each mapped to a short human-readable description.
segment_descriptions = {
    "invoice": "Invoice or billing document",
    "contract": "Legal contract or agreement",
    "receipt": "Payment receipt",
}

# One {"type", "description"} entry per expected segment type, in the
# order listed above.
segmentation_schema = {
    "segments": [
        {"type": kind, "description": text}
        for kind, text in segment_descriptions.items()
    ]
}

Try Datalab

Get started with our API in less than a minute. We include free credits.