#!/usr/bin/env python3
"""
Extract structured data from PDFs using Claude API.
Supports multiple PDF processing methods and prompt caching for efficiency.
This script template needs to be customized with your specific extraction schema.
"""
import argparse
import base64
import json
import os
import time
from pathlib import Path
from typing import Dict, List, Optional
import re
from anthropic import Anthropic
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.messages.batch_create_params import Request
# Configuration for batch processing (used by the --method=batches path).
# NOTE(review): the loops that consume these constants are outside this chunk —
# the unit/meaning comments below are inferred from the names; confirm against usage.
BATCH_SIZE = 5  # PDFs grouped into one batch request — TODO confirm against batch builder
SIMULTANEOUS_BATCHES = 4  # max batches in flight at once — presumably; verify in submission loop
BATCH_CHECK_INTERVAL = 30  # interval between batch status polls, presumably seconds
BATCH_SUBMISSION_INTERVAL = 20  # delay between successive batch submissions, presumably seconds
def parse_args():
    """Build and parse the command-line interface for this extraction step.

    Returns:
        argparse.Namespace with: metadata, schema, output, method,
        use_caching, test, model, filter_results.
    """
    parser = argparse.ArgumentParser(
        description='Extract structured data from PDFs using Claude'
    )
    add = parser.add_argument

    # Required inputs.
    add('--metadata', required=True,
        help='Input metadata JSON file (from step 01 or 02)')
    add('--schema', required=True,
        help='JSON file defining extraction schema and prompts')

    # Output location and processing strategy.
    add('--output', default='extracted_data.json',
        help='Output JSON file with extraction results')
    add('--method', choices=['base64', 'files_api', 'batches'],
        default='batches',
        help='PDF processing method (default: batches)')

    # Behavior switches. Note: '%%' is argparse's escape for a literal '%'.
    add('--use-caching', action='store_true',
        help='Enable prompt caching (reduces costs by ~90%% for repeated queries)')
    add('--test', action='store_true',
        help='Run in test mode (process only 3 PDFs)')
    add('--model', default='claude-3-5-sonnet-20241022',
        help='Claude model to use')
    add('--filter-results',
        help='Optional: JSON file with filter results from step 02 (only process relevant papers)')

    return parser.parse_args()
def load_metadata(metadata_path: Path) -> List[Dict]:
    """Read the list of paper-metadata records produced by an earlier step.

    Args:
        metadata_path: Path to a UTF-8 JSON file holding a list of dicts.

    Returns:
        The decoded list of metadata dictionaries.
    """
    with open(metadata_path, 'r', encoding='utf-8') as fh:
        records = json.load(fh)
    return records
def load_schema(schema_path: Path) -> Dict:
    """Load the extraction-schema definition (prompts, output schema, examples).

    Args:
        schema_path: Path to a UTF-8 JSON file with the schema dict.

    Returns:
        The decoded schema dictionary.
    """
    with open(schema_path, 'r', encoding='utf-8') as fh:
        schema = json.load(fh)
    return schema
def load_filter_results(filter_path: Path) -> Dict:
    """Load relevance-filter results from step 02.

    Args:
        filter_path: Path to a UTF-8 JSON file mapping papers to filter verdicts.

    Returns:
        The decoded filter-results dictionary.
    """
    with open(filter_path, 'r', encoding='utf-8') as fh:
        results = json.load(fh)
    return results
def load_existing_results(output_path: Path) -> Dict:
    """Return previously saved extraction results, enabling resume-on-rerun.

    Args:
        output_path: Path where a prior run may have written results.

    Returns:
        The decoded results dict, or an empty dict when no file exists.
    """
    # Guard clause: a fresh run has nothing to resume from.
    if not output_path.exists():
        return {}
    with open(output_path, 'r', encoding='utf-8') as fh:
        return json.load(fh)
def save_results(results: Dict, output_path: Path):
    """Persist extraction results as pretty-printed UTF-8 JSON.

    Creates any missing parent directories first, so callers may pass a
    path in a directory that does not exist yet.

    Args:
        results: Extraction results to serialize.
        output_path: Destination file path.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII text (titles, author names) readable.
    payload = json.dumps(results, indent=2, ensure_ascii=False)
    with open(output_path, 'w', encoding='utf-8') as fh:
        fh.write(payload)
def create_extraction_prompt(schema: Dict) -> str:
"""
Create extraction prompt from schema definition.
The schema JSON should contain:
- system_context: Description of the analysis task
- instructions: Step-by-step analysis instructions
- output_schema: JSON schema for the output
- output_example: Example of desired output
TODO: Customize schema.json for your specific use case
"""
prompt_parts = []
# Add objective
if 'objective' in schema:
prompt_parts.append(f"Your objective is to {schema['objective']}\n")
# Add instructions
if 'instructions' in schema:
prompt_parts.append("Please follow these steps:\n")
for i, instruction in enumerate(schema['instructions'], 1):
prompt_parts.append(f"{i}. {instruction}")
prompt_parts.append("")
# Add analysis framework
if 'analysis_steps' in schema:
prompt_parts.append("")
for step in schema['analysis_steps']:
prompt_parts.append(f"- {step}")
prompt_parts.append("\n")
prompt_parts.append(
"Your analysis must be wrapped within tags. "
"Be thorough and explicit in your reasoning.\n"
)
# Add output schema explanation
if 'output_schema' in schema:
prompt_parts.append("")
prompt_parts.append(json.dumps(schema['output_schema'], indent=2))
prompt_parts.append("\n")
# Add output example
if 'output_example' in schema:
prompt_parts.append("")
prompt_parts.append(json.dumps(schema['output_example'], indent=2))
prompt_parts.append("\n")
# Add important notes
if 'important_notes' in schema:
prompt_parts.append("Important considerations:")
for note in schema['important_notes']:
prompt_parts.append(f"- {note}")
prompt_parts.append("")
# Add final instruction
prompt_parts.append(
"After your analysis, provide the final output in the following JSON format, "
"wrapped in