import json
import os
import time

import requests
# Endpoint that accepts marker conversion/extraction submissions.
# NOTE: the URL must be a bare string; surrounding "<...>" (a markdown
# autolink artifact) makes `requests` fail to parse the scheme.
API_URL = "https://www.datalab.to/api/v1/marker"

# API key sent with every request. Taken from the DATALAB_API_KEY environment
# variable when set, so the secret does not have to be committed to source;
# otherwise falls back to the placeholder, which must be replaced by hand.
API_KEY = os.environ.get("DATALAB_API_KEY", "YOUR_API_KEY")

# Authentication header shared by the submit and poll requests.
HEADERS = {"X-Api-Key": API_KEY}
# JSON-Schema-style description of the table-of-contents structure to extract.
# The result is an object with a single required key, "table_of_contents",
# holding a list of {"section_name", "page_range"} entries (both strings).
SAMPLE_TOC_EXTRACION_SCHEMA = {
    "type": "object",
    "title": "ToCExtractionSchema",
    "description": "Schema to pull out table of contents",
    "properties": {
        "table_of_contents": {
            "type": "array",
            # One entry per line of the table of contents.
            "items": {
                "type": "object",
                "properties": {
                    "section_name": {
                        "type": "string",
                        "description": "the name of the section from table of contents",
                    },
                    "page_range": {
                        "type": "string",
                        "description": "the page range or page number of the item from the table of contents",
                    },
                },
            },
        },
    },
    "required": ["table_of_contents"],
}
def run_marker_extraction(pdf_path, schema_json, page_range=None,
                          poll_interval=2, max_polls=300, timeout=60):
    """
    Submit a marker request with schema and optional page range.
    Poll until complete, then return the parsed extraction schema as a dict.

    Args:
        pdf_path: Path of the PDF file to upload.
        schema_json: Extraction schema, sent as the ``page_schema`` form
            field. A JSON string is sent as-is; any other object (e.g. a
            dict) is serialized with ``json.dumps`` first.
        page_range: Optional value for the ``page_range`` form field
            (e.g. ``"0-6"``). Omitted from the request when ``None`` or empty.
        poll_interval: Seconds to sleep before each status poll.
        max_polls: Maximum number of status polls before giving up.
        timeout: Per-request timeout (seconds) passed to ``requests``.

    Returns:
        The decoded ``extraction_schema_json`` payload of the completed job.

    Raises:
        requests.HTTPError: If the submit or a poll request returns an
            HTTP error status.
        RuntimeError: If the submission is rejected, the job reports
            ``failed``, or a completed job carries no extraction result.
        TimeoutError: If the job is not complete after ``max_polls`` polls.
    """
    # Multipart field values must be str/bytes; a raw dict cannot be encoded.
    if not isinstance(schema_json, (str, bytes)):
        schema_json = json.dumps(schema_json)

    # The file handle has to stay open until the upload has been sent.
    with open(pdf_path, "rb") as f:
        files = {
            'file': ('document.pdf', f, 'application/pdf'),
            'page_schema': (None, schema_json),
            'use_llm': (None, True)
        }
        # Explicit None/empty check so a falsy-but-valid value such as 0
        # is still forwarded.
        if page_range is not None and page_range != "":
            files['page_range'] = (None, str(page_range))

        # Submit request
        response = requests.post(API_URL, files=files, headers=HEADERS,
                                 timeout=timeout)

    response.raise_for_status()
    data = response.json()
    check_url = data.get("request_check_url")
    if not check_url:
        # Surface the API's own message instead of an opaque KeyError.
        raise RuntimeError(f"Extraction request was not accepted: {data.get('error') or data}")

    # Poll until complete
    for _ in range(max_polls):
        time.sleep(poll_interval)
        poll_response = requests.get(check_url, headers=HEADERS, timeout=timeout)
        poll_response.raise_for_status()
        poll = poll_response.json()
        status = poll.get("status")
        if status == "failed":
            raise RuntimeError(f"Extraction failed: {poll.get('error')}")
        if status == "complete":
            result = poll.get('extraction_schema_json')
            if result is None:
                # json.loads(None) would raise an unhelpful TypeError.
                raise RuntimeError(
                    f"Extraction completed without a result: {poll.get('error')}"
                )
            if isinstance(result, (str, bytes, bytearray)):
                return json.loads(result)
            # Already-decoded payload: hand it back unchanged.
            return result
    raise TimeoutError("Extraction job did not complete in time.")
def dynamic_page_range_extraction(pdf_path, toc_schema, schemas_by_section,
                                  toc_page_range="0-6"):
    """
    Extract a document section by section, driven by its table of contents.

    1. Extract TOC from the pages given by ``toc_page_range``.
    2. Parse TOC into section -> page_range mappings.
    3. Run marker again per section using its schema + page range.
    4. Merge results into a single dict.

    Args:
        pdf_path: Path of the PDF file to process.
        toc_schema: Schema used for the TOC extraction pass; either a JSON
            string or a JSON-serializable object such as a dict.
        schemas_by_section: Mapping of section name -> schema (JSON string
            or JSON-serializable object). Sections found in the TOC that
            have no (truthy) schema here are skipped.
        toc_page_range: Page range searched for the table of contents.
            Defaults to ``"0-6"``.

    Returns:
        Dict mapping each processed section name to its extraction result.

    NOTE(review): page ranges taken from the TOC are forwarded unchanged;
    printed page numbers may not line up with the page indices the API
    expects -- confirm and adjust in ``parse_toc`` if needed.
    """
    def _as_json(schema):
        # The submitter sends the schema as a form field, so it must be text.
        return schema if isinstance(schema, (str, bytes)) else json.dumps(schema)

    # Step 1: Extract TOC
    toc_result = run_marker_extraction(
        pdf_path, schema_json=_as_json(toc_schema), page_range=toc_page_range
    )

    # Step 2: Parse TOC into usable mapping (customize parser as needed)
    section_page_ranges = parse_toc(toc_result)

    # Step 3: Extract per-section
    all_results = {}
    for section, page_range in section_page_ranges.items():
        schema = schemas_by_section.get(section)
        if not schema:
            # No schema registered for this section: nothing to extract.
            continue
        all_results[section] = run_marker_extraction(
            pdf_path, schema_json=_as_json(schema), page_range=page_range
        )

    # Step 4: results are already keyed by section name.
    return all_results
def parse_toc(toc_dict):
    """
    Example TOC parser: converts the TOC dict into {section: page_range}.

    Reads the ``section_name`` / ``page_range`` keys of each entry under
    ``table_of_contents``. The older ``item`` / ``page number`` keys are
    still accepted as a fallback.
    In practice you'd implement parsing logic based on your schema design.

    Args:
        toc_dict: Dict with a ``table_of_contents`` list of entry dicts.
            ``None``, a missing key, or a ``None`` list yield an empty map.

    Returns:
        Dict mapping section name -> page range. Entries lacking a section
        name or a page value, and entries that are not dicts, are skipped.
        String values are stripped of surrounding whitespace.
    """
    page_map = {}
    entries = (toc_dict or {}).get("table_of_contents") or []
    for item in entries:
        if not isinstance(item, dict):
            continue

        section = item.get("section_name") or item.get("item")
        page = item.get("page_range")
        if page is None:
            page = item.get("page number")

        if isinstance(section, str):
            section = section.strip()
        if isinstance(page, str):
            page = page.strip()

        # Compare against None/"" explicitly so a numeric page 0 is kept.
        if section and page is not None and page != "":
            page_map[section] = page  # kept as given; could be expanded/normalized if needed
    return page_map