import json
from datalab_sdk import DatalabClient, ConvertOptions
# Module-level client shared by every convert() call in extract_with_toc().
# NOTE(review): constructed with no arguments — presumably credentials are
# picked up from the environment; confirm against the SDK documentation.
client = DatalabClient()
def _toc_page_number(entry):
    """Return the entry's ``page_number`` as an int, or None if missing/invalid."""
    try:
        return int(entry["page_number"])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Missing key, null value, non-dict entry, non-numeric text, NaN/inf.
        return None


def _section_page_range(sections, index):
    """Return the ``page_range`` string for ``sections[index]``, or None.

    None means the entry has no usable start page and cannot be extracted.
    """
    start_page = _toc_page_number(sections[index])
    if start_page is None:
        return None

    if index + 1 < len(sections):
        next_start = _toc_page_number(sections[index + 1])
        if next_start is not None:
            # Two sections may start on the same page (or the TOC may be out
            # of order); clamp so the range is never inverted, e.g. "5-4".
            end_page = max(start_page, next_start - 1)
            return f"{start_page}-{end_page}"

    # Last entry, or the following entry has no usable page number: the
    # document length is not known here, so only the start page is requested.
    return str(start_page)


def extract_with_toc(
    pdf_path: str,
    section_schemas: dict,
    toc_page_range: str = "0-6",
) -> dict:
    """Extract data from a long document using TOC-based segmentation.

    The table of contents is extracted from ``toc_page_range`` first. Every
    TOC entry whose name has a schema in ``section_schemas`` is then extracted
    from the pages running from its own start page up to the page before the
    next TOC entry.

    Args:
        pdf_path: Path of the document, passed to ``client.convert``.
        section_schemas: Mapping of section name (exactly as it appears in
            the extracted TOC) to the JSON-schema dict used for that section.
        toc_page_range: Page range searched for the table of contents.
            Defaults to ``"0-6"``.

    Returns:
        Mapping of section name to the parsed extraction result. Sections
        without a schema, with an unusable TOC entry, or whose extraction
        raised are omitted.

    Raises:
        Whatever ``client.convert`` / ``json.loads`` raise while extracting
        the table of contents itself; per-section failures are reported with
        ``print`` and skipped.
    """
    # Step 1: Extract table of contents
    toc_schema = {
        "type": "object",
        "properties": {
            "table_of_contents": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "section_name": {"type": "string"},
                        "page_number": {"type": "number"},
                    },
                },
            },
        },
    }
    toc_options = ConvertOptions(
        page_schema=json.dumps(toc_schema),
        page_range=toc_page_range,
        mode="balanced",
    )
    toc_result = client.convert(pdf_path, options=toc_options)
    toc = json.loads(toc_result.extraction_schema_json)
    # The key can be absent or explicitly null; treat both as "no sections".
    sections = toc.get("table_of_contents") or []

    # Step 2: Extract each section with its schema
    # NOTE(review): TOC page numbers are used verbatim as page_range values.
    # The default TOC range starts at 0, which suggests page_range is 0-based,
    # while a printed TOC is usually 1-based — confirm whether an offset is
    # needed for your documents.
    results = {}
    for i, section in enumerate(sections):
        section_name = (
            section.get("section_name") if isinstance(section, dict) else None
        )

        # Check the schema first so entries we do not care about can never
        # abort the run because of a malformed page number.
        schema = section_schemas.get(section_name) if section_name else None
        if not schema:
            continue

        page_range = _section_page_range(sections, i)
        if page_range is None:
            print(f"Skipping {section_name}: no usable page number in TOC")
            continue

        options = ConvertOptions(
            page_schema=json.dumps(schema),
            page_range=page_range,
            mode="balanced",
        )
        try:
            result = client.convert(pdf_path, options=options)
            results[section_name] = json.loads(result.extraction_schema_json)
        except Exception as e:
            # Deliberate best-effort: one failing section must not discard
            # the sections that were already extracted.
            print(f"Error extracting {section_name}: {e}")
        else:
            print(f"Extracted: {section_name}")
    return results
# Schemas for the sections of interest, keyed by the section name expected in
# the extracted table of contents.
schemas = {}

schemas["Financial Highlights"] = {
    "type": "object",
    "properties": {
        "total_revenue": {"type": "number", "description": "Total revenue"},
        "net_income": {"type": "number", "description": "Net income"},
        "year": {"type": "string", "description": "Fiscal year"},
    },
}

schemas["Business Overview"] = {
    "type": "object",
    "properties": {
        "description": {"type": "string", "description": "Business description"},
        "products": {"type": "array", "items": {"type": "string"}},
    },
}

# Run the TOC-driven extraction on the report and pretty-print the outcome.
results = extract_with_toc("annual_report.pdf", schemas)
print(json.dumps(results, indent=2))