import os
import time
import requests
from pathlib import Path
from typing import Optional
# Datalab `marker` API endpoint.
API_URL = "https://www.datalab.to/api/v1/marker"
# API key taken from the DATALAB_API_KEY environment variable (None when unset).
API_KEY = os.environ.get("DATALAB_API_KEY")
def submit_and_poll_pdf_conversion(
    pdf_path: Path,
    output_format: Optional[str] = 'markdown',
    use_llm: Optional[bool] = True,
    *,
    max_polls: int = 300,
    poll_interval: float = 2,
    request_timeout: Optional[float] = 60,
):
    """Upload a PDF for conversion and poll until the result is ready.

    The file is sent as a multipart form to the marker endpoint with the
    ``X-Api-Key`` header set from the module-level ``API_KEY``. The JSON
    response must contain ``request_check_url``, which is then polled until
    the reported ``status`` is ``'complete'`` or ``'failed'``, or until
    ``max_polls`` checks have been made.

    Args:
        pdf_path: Path of the PDF file to upload.
        output_format: Name of the output format requested; also the key
            used to read the converted document from the final response.
        use_llm: Value forwarded as the ``use_llm`` form field.
        max_polls: Maximum number of status checks before giving up.
        poll_interval: Seconds to sleep between status checks.
        request_timeout: Timeout in seconds passed to each HTTP request;
            ``None`` disables the timeout.

    Returns:
        The value stored under ``output_format`` in the completed response,
        or ``None`` if the conversion failed or did not complete within
        ``max_polls`` checks.

    Raises:
        requests.HTTPError: If the submit or a status request returns an
            HTTP error status.
        requests.Timeout: If a request exceeds ``request_timeout``.
    """
    # Same endpoint as the module-level API_URL constant.
    url = "https://www.datalab.to/api/v1/marker"
    headers = {"X-Api-Key": API_KEY}
    pdf_path = Path(pdf_path)  # tolerate plain string paths as well

    #
    # Submit initial request
    #
    with open(pdf_path, 'rb') as f:
        form_data = {
            'file': (pdf_path.name, f, 'application/pdf'),
            "force_ocr": (None, False),
            "paginate": (None, False),
            'output_format': (None, output_format),
            "use_llm": (None, use_llm),
            "strip_existing_ocr": (None, False),
            "disable_image_extraction": (None, False),
        }
        # The upload must happen while the file handle is still open.
        response = requests.post(
            url, files=form_data, headers=headers, timeout=request_timeout
        )
    # Surface HTTP errors here instead of a KeyError on the lookup below.
    response.raise_for_status()
    check_url = response.json()["request_check_url"]

    #
    # Poll for completion
    #
    for _ in range(max_polls):
        # The status endpoint needs the API key header too.
        response = requests.get(
            check_url, headers=headers, timeout=request_timeout
        )
        response.raise_for_status()
        check_result = response.json()
        status = check_result['status']

        if status == 'complete':
            # The 'html', 'markdown', or 'json' field of the response holds
            # the converted document (it maps to the requested
            # `output_format`). Return immediately so polling stops.
            return check_result[output_format]
        if status == "failed":
            print("Failed to convert, uh oh...")
            return None

        print(f"Waiting {poll_interval} more seconds to re-check conversion status")
        time.sleep(poll_interval)

    # Poll budget exhausted without a terminal status.
    print(f"Conversion did not complete after {max_polls} status checks; giving up")
    return None