import os
import requests
import time
import json
# Endpoint that conversion requests are POSTed to.
API_URL = "https://www.datalab.to/api/v1/marker"

# Credentials are read from the environment once, at import time; each value
# is None when the corresponding variable is not set.
DATALAB_API_KEY = os.environ.get("DATALAB_API_KEY")
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
def extract_tracked_changes(docx_path, output_format='html', paginate=False,
                            poll_interval=2, max_polls=300, request_timeout=60):
    """Extract tracked changes from a Word document.

    Uploads the file to ``API_URL`` with the ``track_changes`` extra enabled,
    then polls the check URL returned by the service until the job reports
    ``complete`` or ``failed``.

    Args:
        docx_path: Path of the .docx file to upload.
        output_format: Value sent as the ``output_format`` form field.
        paginate: Value sent as the ``paginate`` form field.
        poll_interval: Seconds to sleep before each status check.
        max_polls: Maximum number of status checks before giving up.
        request_timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        The decoded JSON body of the status check whose ``status`` is
        ``"complete"``.

    Raises:
        RuntimeError: If ``DATALAB_API_KEY`` is not set, the submission
            response carries no ``request_check_url``, or the job reports
            ``failed``.
        requests.HTTPError: If any request returns an HTTP error status.
        TimeoutError: If the job is still unfinished after ``max_polls``
            checks.
    """
    if not DATALAB_API_KEY:
        raise RuntimeError("DATALAB_API_KEY environment variable is not set")

    headers = {"X-Api-Key": DATALAB_API_KEY}

    # The file handle only has to stay open while the multipart body is sent.
    with open(docx_path, 'rb') as f:
        form_data = {
            'file': (
                os.path.basename(docx_path),
                f,
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            ),
            'extras': (None, 'track_changes'),
            'output_format': (None, output_format),
            # Stringified explicitly so the value on the wire does not depend
            # on the HTTP library's implicit int-to-str conversion.
            'paginate': (None, str(paginate)),
        }
        response = requests.post(
            API_URL, files=form_data, headers=headers, timeout=request_timeout
        )

    # Surface auth/quota/server errors directly instead of failing later on a
    # missing key in the JSON body.
    response.raise_for_status()
    data = response.json()

    check_url = data.get("request_check_url")
    if not check_url:
        raise RuntimeError(
            f"Conversion request was not accepted: {data.get('error')}"
        )

    # Poll for completion
    for _ in range(max_polls):
        time.sleep(poll_interval)
        poll_response = requests.get(
            check_url, headers=headers, timeout=request_timeout
        )
        poll_response.raise_for_status()
        result = poll_response.json()

        status = result.get("status")
        if status == "complete":
            return result
        if status == "failed":
            raise RuntimeError(f"Conversion failed: {result.get('error')}")

    raise TimeoutError("Conversion did not complete in time")
def analyze_with_llm(content, prompt_template,
                     model="anthropic/claude-3.5-sonnet", timeout=300):
    """Send content to LLM for analysis via OpenRouter.

    Args:
        content: Text substituted into the prompt.
        prompt_template: ``str.format`` template containing a ``{content}``
            placeholder. Any other literal braces in the template must be
            doubled (``{{`` / ``}}``); braces inside ``content`` are safe.
        model: Model identifier sent in the request body.
        timeout: Timeout in seconds for the HTTP request.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        RuntimeError: If ``OPENROUTER_API_KEY`` is not set or the response
            body contains no choices.
        requests.HTTPError: If the request returns an HTTP error status.
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError("OPENROUTER_API_KEY environment variable is not set")

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt_template.format(content=content),
                }
            ],
        },
        timeout=timeout,
    )
    response.raise_for_status()
    payload = response.json()

    # A body without choices would otherwise fail with a bare
    # KeyError/IndexError that hides whatever the service reported.
    choices = payload.get("choices")
    if not choices:
        raise RuntimeError(f"LLM request failed: {payload.get('error')}")

    return choices[0]['message']['content']
def generate_legal_review(docx_path):
    """Run the full pipeline for one document.

    Converts the document with tracked changes preserved, then asks the LLM
    for an overall legal review and for a per-author breakdown of the
    changes. Progress messages are printed along the way.

    Returns a dict with the keys ``marked_up_document``, ``legal_review``
    and ``author_analysis``.
    """
    # Both prompts are plain templates; {content} is filled in by
    # analyze_with_llm.
    legal_template = """You are a legal reviewer analyzing a contract with tracked changes.
Please provide:
1. **Executive Summary**: Brief overview of the document and key changes
2. **Material Changes**: List substantive changes that affect rights, obligations, or liabilities
3. **Risk Assessment**: Identify any changes that increase risk exposure
4. **Comments Analysis**: Summarize unresolved comments and action items
5. **Recommendations**: Specific next steps for legal review
Document with tracked changes:
{content}"""
    per_author_template = """Analyze this document's tracked changes by author.
For each author who made changes:
- Total number of insertions and deletions
- Types of changes (substantive vs. editorial)
- Key themes in their revisions
- Any patterns in their negotiation strategy
Document:
{content}"""

    print(f"Processing {docx_path}...")

    # Step 1: conversion, keeping the tracked changes in the HTML output.
    conversion = extract_tracked_changes(
        docx_path, output_format='html', paginate=True
    )
    html_doc = conversion['html']
    print("Document converted with tracked changes preserved.")

    # Step 2: overall legal review.
    print("\nGenerating legal review with LLM...")
    legal_review = analyze_with_llm(html_doc, legal_template)

    # Step 3: analysis grouped by the author of each change.
    print("Generating per-author analysis...")
    per_author = analyze_with_llm(html_doc, per_author_template)

    report = {}
    report['marked_up_document'] = html_doc
    report['legal_review'] = legal_review
    report['author_analysis'] = per_author
    return report
if __name__ == "__main__":
    # Process a contract with tracked changes
    results = generate_legal_review('contract_redline_v3.docx')

    # Save results. The encoding is set explicitly because the review text and
    # the HTML can contain non-ASCII characters, which would raise
    # UnicodeEncodeError under a non-UTF-8 platform default encoding.
    with open('legal_review.txt', 'w', encoding='utf-8') as f:
        f.write("LEGAL REVIEW\n")
        f.write("="*80 + "\n\n")
        f.write(results['legal_review'])
        f.write("\n\n" + "="*80 + "\n\n")
        f.write("AUTHOR ANALYSIS\n")
        f.write("="*80 + "\n\n")
        f.write(results['author_analysis'])

    with open('marked_up_document.html', 'w', encoding='utf-8') as f:
        f.write(results['marked_up_document'])

    print("\nReview complete! Results saved to:")
    print(" - legal_review.txt")
    print(" - marked_up_document.html")