> ## Documentation Index
> Fetch the complete documentation index at: https://documentation.datalab.to/llms.txt
> Use this file to discover all available pages before exploring further.

# Handling Long Documents

> Tips for structured extraction on documents with 50+ pages.

For long documents, use page ranges and document segmentation to improve speed and accuracy.

## Restrict to Specific Pages

If you know which pages contain the data you need, use `page_range`:

```python theme={null}
from datalab_sdk import DatalabClient, ConvertOptions
import json

# Create the SDK client with its default configuration.
client = DatalabClient()

# JSON Schema describing the single field to extract.
schema = {
    "type": "object",
    "properties": {
        "executive_summary": {
            "type": "string",
            "description": "Executive summary text",
        },
    },
}

# Limit the run to pages 0-5 (the first 6 pages); the schema is passed as a JSON string.
options = ConvertOptions(
    mode="balanced",
    page_range="0-5",
    page_schema=json.dumps(schema),
)

result = client.convert("long_document.pdf", options=options)
```

You're only charged for the pages you process.

## Segment and Chain Extractions

For documents with distinct sections (like financial reports or contracts), extract the table of contents first, then process each section separately.

### Step 1: Extract Table of Contents

```python theme={null}
import json
from datalab_sdk import DatalabClient, ConvertOptions

client = DatalabClient()

# JSON Schema for the table of contents: a list of {section_name, page_number} entries.
toc_schema = {
    "type": "object",
    "properties": {
        "table_of_contents": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "section_name": {"type": "string"},
                    "page_number": {"type": "number"},
                },
            },
        },
    },
}

# The TOC is looked for in the first few pages only (pages 0-5).
options = ConvertOptions(
    mode="balanced",
    page_range="0-5",
    page_schema=json.dumps(toc_schema),
)

result = client.convert("report.pdf", options=options)

# The extraction comes back as a JSON string; decode it into a dict.
toc = json.loads(result.extraction_schema_json)

print("Sections found:")
for entry in toc["table_of_contents"]:
    name, page = entry["section_name"], entry["page_number"]
    print(f"  {name}: page {page}")
```

### Step 2: Extract Each Section

```python theme={null}
# Define schemas for different sections, keyed by the section name as it
# appears in the extracted table of contents.
section_schemas = {
    "Financial Highlights": {
        "type": "object",
        "properties": {
            "revenue": {"type": "number"},
            "net_income": {"type": "number"},
            "eps": {"type": "number"}
        }
    },
    "Risk Factors": {
        "type": "object",
        "properties": {
            "risks": {
                "type": "array",
                "items": {"type": "string"}
            }
        }
    }
}

# Build page ranges from the TOC extracted in Step 1
sections = toc["table_of_contents"]
results = {}

for i, section in enumerate(sections):
    section_name = section["section_name"]

    # Skip sections we have no schema for before doing any other work
    schema = section_schemas.get(section_name)
    if not schema:
        continue

    # The TOC schema types page_number as "number", so it may be decoded as a
    # float (e.g. 12.0). Coerce to int so the range reads "12-15", not "12.0-15.0".
    start_page = int(section["page_number"])

    if i + 1 < len(sections):
        # The section ends on the page before the next section starts. Clamp to
        # start_page so two sections that begin on the same page never produce
        # an inverted range such as "12-11".
        end_page = max(start_page, int(sections[i + 1]["page_number"]) - 1)
        page_range = f"{start_page}-{end_page}"
    else:
        # Last TOC entry: the document's final page is not known here, so only
        # the section's first page is processed.
        page_range = str(start_page)

    options = ConvertOptions(
        page_schema=json.dumps(schema),
        page_range=page_range,
        mode="balanced"
    )

    result = client.convert("report.pdf", options=options)
    results[section_name] = json.loads(result.extraction_schema_json)

print(results)
```

## Use Document Segmentation

For documents without a clear table of contents, use [Document Segmentation](/docs/recipes/document-segmentation/auto-segmentation) to automatically split by section headers.

```python theme={null}
import json
from datalab_sdk import DatalabClient, ConvertOptions

client = DatalabClient()

# JSON Schema describing the segments to produce: each one has a title and
# one of a fixed set of section types.
segmentation_schema = {
    "type": "object",
    "properties": {
        "sections": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "type": {"type": "string", "enum": ["introduction", "methods", "results", "conclusion"]}
                }
            }
        }
    }
}

# The schema is passed as a JSON string, like page_schema in the examples above.
options = ConvertOptions(
    segmentation_schema=json.dumps(segmentation_schema),
    mode="balanced"
)

result = client.convert("paper.pdf", options=options)
# Access segmentation results
segments = result.segmentation_results
```

## Full Example

Complete workflow for processing a 100+ page financial report:

```python theme={null}
import json
from datalab_sdk import DatalabClient, ConvertOptions

# Shared client; extract_with_toc() below sends all of its convert calls through it.
client = DatalabClient()


def extract_with_toc(pdf_path: str, section_schemas: dict, *, page_offset: int = 0) -> dict:
    """Extract data from a long document using TOC-based segmentation.

    The table of contents is extracted from pages 0-6, then every TOC entry
    whose name has a schema in ``section_schemas`` is converted on its own
    page range.

    Args:
        pdf_path: Path of the document passed to ``client.convert``.
        section_schemas: Mapping of TOC section name to the JSON Schema (dict)
            used to extract that section. Sections without an entry are skipped.
        page_offset: Value added to every TOC page number before it is used in
            ``page_range``. Use it when the numbers printed in the TOC do not
            line up with the page indices ``page_range`` expects. Defaults to 0
            (use the TOC numbers as-is).

    Returns:
        Dict mapping each successfully extracted section name to its decoded
        extraction result. Sections that fail are reported with ``print`` and
        left out of the result.
    """

    # Step 1: Extract table of contents
    toc_schema = {
        "type": "object",
        "properties": {
            "table_of_contents": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "section_name": {"type": "string"},
                        "page_number": {"type": "number"}
                    }
                }
            }
        }
    }

    options = ConvertOptions(
        page_schema=json.dumps(toc_schema),
        page_range="0-6",
        mode="balanced"
    )

    result = client.convert(pdf_path, options=options)
    toc = json.loads(result.extraction_schema_json)
    # `or []` also covers a key that is present but null.
    sections = toc.get("table_of_contents") or []

    # Step 2: Extract each section with its schema
    results = {}

    for i, section in enumerate(sections):
        section_name = section.get("section_name")

        # Check for a schema first so skipped sections cost nothing and a
        # malformed entry we don't care about cannot abort the run.
        schema = section_schemas.get(section_name)
        if not schema:
            continue

        # page_number is typed "number" and may be a float, null, or missing.
        try:
            start_page = int(section["page_number"]) + page_offset
        except (KeyError, TypeError, ValueError):
            print(f"Error extracting {section_name}: missing or invalid page number")
            continue

        # Default to the section's first page. This is also what the last TOC
        # entry gets, because the document's final page is not known here.
        page_range = str(start_page)
        if i + 1 < len(sections):
            try:
                next_start = int(sections[i + 1]["page_number"]) + page_offset
            except (KeyError, TypeError, ValueError):
                next_start = None
            if next_start is not None:
                # Clamp so sections sharing a start page never yield an
                # inverted range such as "12-11".
                end_page = max(start_page, next_start - 1)
                page_range = f"{start_page}-{end_page}"

        options = ConvertOptions(
            page_schema=json.dumps(schema),
            page_range=page_range,
            mode="balanced"
        )

        # Best effort: one failing section is reported and the rest continue.
        try:
            result = client.convert(pdf_path, options=options)
            results[section_name] = json.loads(result.extraction_schema_json)
            print(f"Extracted: {section_name}")
        except Exception as e:
            print(f"Error extracting {section_name}: {e}")

    return results


# Schemas for the sections you care about, keyed by the section name
# as it appears in the table of contents
schemas = {
    "Financial Highlights": {
        "type": "object",
        "properties": {
            "total_revenue": {"type": "number", "description": "Total revenue"},
            "net_income": {"type": "number", "description": "Net income"},
            "year": {"type": "string", "description": "Fiscal year"},
        },
    },
    "Business Overview": {
        "type": "object",
        "properties": {
            "description": {"type": "string", "description": "Business description"},
            "products": {
                "type": "array",
                "items": {"type": "string"},
            },
        },
    },
}

# Run the TOC-driven extraction, then pretty-print the per-section results.
results = extract_with_toc(pdf_path="annual_report.pdf", section_schemas=schemas)
print(json.dumps(results, indent=2))
```

## Tips

1. **Process only the pages you need** - Use `page_range` to avoid processing unnecessary pages
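2. **Extract TOC first** - Build page ranges dynamically from the document structure; a TOC usually lists printed page numbers, so check them against the zero-based `page_range` indices and apply an offset if they differ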
3. **Use appropriate modes** - `balanced` is usually sufficient; use `accurate` for complex tables
4. **Handle errors** - Some sections may not match your schema exactly

## Next Steps

<CardGroup cols={2}>
  <Card title="Structured Extraction" icon="table" href="/docs/recipes/structured-extraction/api-overview">
    Learn the full structured extraction API and schema options.
  </Card>

  <Card title="Document Segmentation" icon="scissors" href="/docs/recipes/document-segmentation/auto-segmentation">
    Automatically split documents by section headers.
  </Card>

  <Card title="Batch Processing" icon="layer-group" href="/docs/recipes/conversion/batch-documents">
    Process multiple long documents efficiently in parallel.
  </Card>

  <Card title="Pipelines" icon="workflow" href="/docs/recipes/pipelines/pipeline-overview">
    Chain processors into versioned, reusable pipelines.
  </Card>
</CardGroup>
