#!/usr/bin/env python3
"""
Extract structured data from PDFs using Claude API.
Supports base64 PDF upload and the Message Batches API; the prompt-caching
flag is defined but not yet wired into the requests in this template.
This script template needs to be customized with your specific extraction schema.
"""
import argparse
import base64
import json
import os
import re
import time
from pathlib import Path
from typing import Dict, List, Optional

from anthropic import Anthropic
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.messages.batch_create_params import Request

# Configuration
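# Each submission window sends SIMULTANEOUS_BATCHES batches of BATCH_SIZE
# requests (4 x 5 = 20 PDFs) before polling; both intervals are in seconds.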
BATCH_SIZE = 5
SIMULTANEOUS_BATCHES = 4
BATCH_CHECK_INTERVAL = 30
BATCH_SUBMISSION_INTERVAL = 20


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(
        description='Extract structured data from PDFs using Claude'
    )
    parser.add_argument(
        '--metadata',
        required=True,
        help='Input metadata JSON file (from step 01 or 02)'
    )
    parser.add_argument(
        '--schema',
        required=True,
        help='JSON file defining extraction schema and prompts'
    )
    parser.add_argument(
        '--output',
        default='extracted_data.json',
        help='Output JSON file with extraction results'
    )
    parser.add_argument(
        '--method',
        choices=['base64', 'files_api', 'batches'],
        default='batches',
        help='PDF processing method (default: batches)'
    )
    parser.add_argument(
        '--use-caching',
        action='store_true',
        help='Enable prompt caching (reduces costs by ~90%% for repeated queries; '
             'not yet wired into requests in this template)'
    )
    parser.add_argument(
        '--test',
        action='store_true',
        help='Run in test mode (process only 3 PDFs)'
    )
    parser.add_argument(
        '--model',
        default='claude-3-5-sonnet-20241022',
        help='Claude model to use'
    )
    parser.add_argument(
        '--filter-results',
        help='Optional: JSON file with filter results from step 02 (only process relevant papers)'
    )
    return parser.parse_args()


def load_metadata(metadata_path: Path) -> List[Dict]:
    """Load metadata from JSON file"""
    with open(metadata_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_schema(schema_path: Path) -> Dict:
    """Load extraction schema definition"""
    with open(schema_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_filter_results(filter_path: Path) -> Dict:
    """Load filter results from step 02"""
    with open(filter_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_existing_results(output_path: Path) -> Dict:
    """Load existing extraction results if available"""
    if output_path.exists():
        with open(output_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return {}


def save_results(results: Dict, output_path: Path):
    """Save extraction results to JSON file"""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)


def create_extraction_prompt(schema: Dict) -> str:
    """
    Create the extraction prompt from a schema definition.

    The schema JSON may contain any of these keys (all optional):
    - objective: One-sentence description of the extraction task
    - instructions: List of step-by-step analysis instructions
    - analysis_steps: List of steps for the <analysis_framework> block
    - output_schema: JSON schema describing the desired output
    - output_example: Example of the desired output
    - important_notes: List of caveats or special considerations
    (system_context, if present, is used as the system prompt when calling the API.)

    TODO: Customize schema.json for your specific use case
    """
    prompt_parts = []

    # Add objective
    if 'objective' in schema:
        prompt_parts.append(f"Your objective is to {schema['objective']}\n")

    # Add instructions
    if 'instructions' in schema:
        prompt_parts.append("Please follow these steps:\n")
        for i, instruction in enumerate(schema['instructions'], 1):
            prompt_parts.append(f"{i}. {instruction}")
        prompt_parts.append("")

    # Add analysis framework
    if 'analysis_steps' in schema:
        prompt_parts.append("<analysis_framework>")
        for step in schema['analysis_steps']:
            prompt_parts.append(f"- {step}")
        prompt_parts.append("</analysis_framework>\n")

    prompt_parts.append(
        "Your analysis must be wrapped within <analysis> tags. "
        "Be thorough and explicit in your reasoning.\n"
    )

    # Add output schema explanation
    if 'output_schema' in schema:
        prompt_parts.append("<output_schema>")
        prompt_parts.append(json.dumps(schema['output_schema'], indent=2))
        prompt_parts.append("</output_schema>\n")

    # Add output example
    if 'output_example' in schema:
        prompt_parts.append("<output_example>")
        prompt_parts.append(json.dumps(schema['output_example'], indent=2))
        prompt_parts.append("</output_example>\n")

    # Add important notes
    if 'important_notes' in schema:
        prompt_parts.append("Important considerations:")
        for note in schema['important_notes']:
            prompt_parts.append(f"- {note}")
        prompt_parts.append("")

    # Add final instruction
    prompt_parts.append(
        "After your analysis, provide the final output in the following JSON format, "
        "wrapped in <output> tags. The output must be valid, parseable JSON.\n"
    )
    return "\n".join(prompt_parts)


def extract_json_from_response(text: str) -> Optional[Dict]:
    """Extract JSON from XML output tags in Claude's response"""
    match = re.search(r'<output>\s*(\{.*?\})\s*</output>', text, re.DOTALL)
    if match:
        json_str = match.group(1)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
            return None
    return None


def extract_analysis_from_response(text: str) -> Optional[str]:
    """Extract analysis from XML tags in Claude's response"""
    match = re.search(r'<analysis>(.*?)</analysis>', text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None


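# The extraction prompt asks for responses shaped roughly like (illustrative):
#   <analysis> step-by-step reasoning ... </analysis>
#   <output> {"field": "value", ...} </output>
# The two extractors above parse those tags.
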
def process_pdf_base64(
    client: Anthropic,
    pdf_path: Path,
    schema: Dict,
    model: str
) -> Dict:
    """Process a single PDF using base64 encoding (direct upload)"""
    if not pdf_path.exists():
        return {
            'status': 'error',
            'error': f'PDF not found: {pdf_path}'
        }

    # Check file size (32MB limit)
    file_size = pdf_path.stat().st_size
    if file_size > 32 * 1024 * 1024:
        return {
            'status': 'error',
            'error': f'PDF exceeds 32MB limit: {file_size / 1024 / 1024:.1f}MB'
        }

    try:
        # Read and encode PDF
        with open(pdf_path, 'rb') as f:
            pdf_data = base64.b64encode(f.read()).decode('utf-8')

        # Create message
        response = client.messages.create(
            model=model,
            max_tokens=8192,  # output ceiling for the default Claude 3.5 Sonnet model
            temperature=0,
            system=schema.get('system_context', 'You are a scientific research assistant.'),
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": pdf_data
                        }
                    },
                    {
                        "type": "text",
                        "text": create_extraction_prompt(schema)
                    }
                ]
            }]
        )
        response_text = response.content[0].text
        return {
            'status': 'success',
            'extracted_data': extract_json_from_response(response_text),
            'analysis': extract_analysis_from_response(response_text),
            'input_tokens': response.usage.input_tokens,
            'output_tokens': response.usage.output_tokens
        }
    except Exception as e:
        return {
            'status': 'error',
            'error': str(e)
        }


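# The Message Batches API (used below) processes requests asynchronously and is
# billed at a discounted rate (about 50% at the time of writing), at the cost of
# waiting for batches to finish.
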
def process_pdfs_batch(
    client: Anthropic,
    records: List[tuple],
    schema: Dict,
    model: str
) -> Dict[str, Dict]:
    """Process multiple PDFs using the Batches API for efficiency"""
    all_results = {}
    for window_start in range(0, len(records), SIMULTANEOUS_BATCHES * BATCH_SIZE):
        window_records = records[window_start:window_start + (SIMULTANEOUS_BATCHES * BATCH_SIZE)]
        print(f"\nProcessing window starting at index {window_start} ({len(window_records)} PDFs)")

        active_batches = {}
        for batch_start in range(0, len(window_records), BATCH_SIZE):
            batch_records = window_records[batch_start:batch_start + BATCH_SIZE]
            requests = []
            for record_id, pdf_data in batch_records:
                requests.append(Request(
                    custom_id=record_id,
                    params=MessageCreateParamsNonStreaming(
                        model=model,
                        max_tokens=8192,  # output ceiling for the default Claude 3.5 Sonnet model
                        temperature=0,
                        system=schema.get('system_context', 'You are a scientific research assistant.'),
                        messages=[{
                            "role": "user",
                            "content": [
                                {
                                    "type": "document",
                                    "source": {
                                        "type": "base64",
                                        "media_type": "application/pdf",
                                        "data": pdf_data
                                    }
                                },
                                {
                                    "type": "text",
                                    "text": create_extraction_prompt(schema)
                                }
                            ]
                        }]
                    )
                ))
            try:
                message_batch = client.messages.batches.create(requests=requests)
                print(f"Created batch {message_batch.id} with {len(requests)} requests")
                # Request is a TypedDict, so index by key rather than attribute
                active_batches[message_batch.id] = {r["custom_id"] for r in requests}
                time.sleep(BATCH_SUBMISSION_INTERVAL)
            except Exception as e:
                print(f"Error creating batch: {e}")

        # Wait for this window's batches before submitting the next window
        window_results = wait_for_batches(client, list(active_batches.keys()), schema)
        all_results.update(window_results)
    return all_results


def wait_for_batches(
    client: Anthropic,
    batch_ids: List[str],
    schema: Dict
) -> Dict[str, Dict]:
    """Wait for batches to complete and return results"""
    print(f"\nWaiting for {len(batch_ids)} batches to complete...")
    incomplete = set(batch_ids)
    while incomplete:
        time.sleep(BATCH_CHECK_INTERVAL)
        for batch_id in list(incomplete):
            batch = client.messages.batches.retrieve(batch_id)
            if batch.processing_status != "in_progress":
                incomplete.remove(batch_id)
                print(f"Batch {batch_id} completed: {batch.processing_status}")

    # Collect results
    results = {}
    for batch_id in batch_ids:
        batch = client.messages.batches.retrieve(batch_id)
        if batch.processing_status == "ended":
            for result in client.messages.batches.results(batch_id):
                if result.result.type == "succeeded":
                    response_text = result.result.message.content[0].text
                    results[result.custom_id] = {
                        'status': 'success',
                        'extracted_data': extract_json_from_response(response_text),
                        'analysis': extract_analysis_from_response(response_text),
                        'input_tokens': result.result.message.usage.input_tokens,
                        'output_tokens': result.result.message.usage.output_tokens
                    }
                else:
                    results[result.custom_id] = {
                        'status': 'error',
                        'error': str(getattr(result.result, 'error', 'Unknown error'))
                    }
    return results


def main():
    args = parse_args()

    # Check for API key
    if not os.getenv('ANTHROPIC_API_KEY'):
        raise ValueError("Please set ANTHROPIC_API_KEY environment variable")
    client = Anthropic()

    # Load inputs
    metadata = load_metadata(Path(args.metadata))
    schema = load_schema(Path(args.schema))
    print(f"Loaded {len(metadata)} metadata records")

    # Filter by relevance if filter results provided
    if args.filter_results:
        filter_results = load_filter_results(Path(args.filter_results))
        relevant_ids = {
            record_id for record_id, result in filter_results.items()
            if result.get('status') == 'success'
            and result.get('filter_result', {}).get('has_relevant_data')
        }
        metadata = [r for r in metadata if r['id'] in relevant_ids]
        print(f"Filtered to {len(metadata)} relevant papers")

    # Apply test mode
    if args.test:
        metadata = metadata[:3]
        print(f"Test mode: processing {len(metadata)} PDFs")

    # Load existing results
    output_path = Path(args.output)
    results = load_existing_results(output_path)
    print(f"Loaded {len(results)} existing results")

    # Prepare PDFs to process
    to_process = []
    for record in metadata:
        if record['id'] in results:
            continue
        if not record.get('pdf_path'):
            print(f"Skipping {record['id']}: no PDF path")
            continue
        pdf_path = Path(record['pdf_path'])
        if not pdf_path.exists():
            print(f"Skipping {record['id']}: PDF not found")
            continue
        # Read and encode PDF
        try:
            with open(pdf_path, 'rb') as f:
                pdf_data = base64.b64encode(f.read()).decode('utf-8')
            to_process.append((record['id'], pdf_data))
        except Exception as e:
            print(f"Error reading {pdf_path}: {e}")

    print(f"PDFs to process: {len(to_process)}")
    if not to_process:
        print("All PDFs already processed!")
        return

    # Process PDFs
    if args.method == 'batches':
        print("Using Batches API...")
        batch_results = process_pdfs_batch(client, to_process, schema, args.model)
        results.update(batch_results)
    else:
        # 'base64' and 'files_api' both fall back to sequential base64
        # processing in this template
        print("Processing PDFs sequentially...")
        for record_id, pdf_data in to_process:
            print(f"Processing: {record_id}")
            # For sequential processing, reconstruct the Path from metadata
            record = next(r for r in metadata if r['id'] == record_id)
            result = process_pdf_base64(
                client, Path(record['pdf_path']), schema, args.model
            )
            results[record_id] = result
            save_results(results, output_path)
            time.sleep(2)

    # Save final results
    save_results(results, output_path)

    # Print summary
    total = len(results)
    successful = sum(1 for r in results.values() if r.get('status') == 'success')
    total_input_tokens = sum(
        r.get('input_tokens', 0) for r in results.values()
        if r.get('status') == 'success'
    )
    total_output_tokens = sum(
        r.get('output_tokens', 0) for r in results.values()
        if r.get('status') == 'success'
    )

    print(f"\n{'='*60}")
    print("Extraction Summary")
    print(f"{'='*60}")
    print(f"Total PDFs processed: {total}")
    print(f"Successful extractions: {successful}")
    print(f"Failed extractions: {total - successful}")
    print("\nToken usage:")
    print(f" Input tokens: {total_input_tokens:,}")
    print(f" Output tokens: {total_output_tokens:,}")
    print(f" Total tokens: {total_input_tokens + total_output_tokens:,}")
    print(f"\nResults saved to: {output_path}")
    print("\nNext step: Repair and validate JSON outputs")


if __name__ == '__main__':
    main()
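
# Illustrative shape of extracted_data.json (keys are record ids from the metadata):
# {
#   "paper_001": {
#     "status": "success",
#     "extracted_data": {...},
#     "analysis": "...",
#     "input_tokens": 12345,
#     "output_tokens": 678
#   },
#   "paper_002": {"status": "error", "error": "..."}
# }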