gh-brunoasm-my-claude-skill…/skills/extract_from_pdfs/scripts/05_validate_with_apis.py

#!/usr/bin/env python3
"""
Validate and enrich extracted data using external API databases.
Supports common scientific databases for taxonomy, geography, chemistry, etc.

This script template includes examples for common databases. Customize for your needs.
"""

import argparse
import json
import time
from pathlib import Path
from typing import Dict, List, Optional, Any
import requests
from urllib.parse import quote


def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Recognised options:
        --input            (required) JSON file with the cleaned extraction
                           results produced by step 04.
        --output           destination JSON file; defaults to
                           'validated_data.json'.
        --apis             (required) JSON configuration describing which API
                           to use for which field.
        --skip-validation  flag; when present no API calls are made.

    Returns:
        The ``argparse.Namespace`` holding the parsed values.
    """
    # Each entry is (flag, keyword arguments for add_argument), kept in the
    # order the options should appear in --help.
    option_table = (
        ('--input', {
            'required': True,
            'help': 'Input JSON file with cleaned extraction results from step 04',
        }),
        ('--output', {
            'default': 'validated_data.json',
            'help': 'Output JSON file with validated and enriched data',
        }),
        ('--apis', {
            'required': True,
            'help': 'JSON configuration file specifying which APIs to use and for which fields',
        }),
        ('--skip-validation', {
            'action': 'store_true',
            'help': 'Skip API calls, only load and structure data',
        }),
    )

    cli = argparse.ArgumentParser(
        description='Validate and enrich data with external APIs'
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()


def load_results(input_path: Path) -> Dict:
    """Read the extraction results stored as JSON at ``input_path``.

    The file is opened as UTF-8 text and the decoded JSON value is returned.
    """
    with open(input_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)


def load_api_config(config_path: Path) -> Dict:
    """Read the API configuration stored as JSON at ``config_path``.

    The file is opened as UTF-8 text and the decoded JSON value is returned.
    """
    with open(config_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)


def save_results(results: Dict, output_path: Path):
    """Write ``results`` to ``output_path`` as indented UTF-8 JSON.

    Missing parent directories of ``output_path`` are created first.
    Non-ASCII characters are written as-is rather than escaped.
    """
    destination_dir = output_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    with output_path.open(mode='w', encoding='utf-8') as sink:
        json.dump(results, sink, ensure_ascii=False, indent=2)


# ==============================================================================
# Taxonomy validation functions
# ==============================================================================

def validate_gbif_taxonomy(scientific_name: str) -> Optional[Dict]:
    """
    Look up a taxonomic name with the GBIF species-match endpoint.

    Returns a dict with the matched name, classification ranks, GBIF usage
    key, confidence, match type and status. Returns None when the response is
    not HTTP 200, when GBIF reports a match type of 'NONE', or when the
    request fails (the error is printed).
    """
    endpoint = "https://api.gbif.org/v1/species/match?name=" + quote(scientific_name)

    try:
        reply = requests.get(endpoint, timeout=10)
        if reply.status_code != 200:
            return None

        match = reply.json()
        if match.get('matchType') == 'NONE':
            return None

        # Fall back to the queried name when GBIF gives no canonical name.
        taxonomy = {
            'matched_name': match.get('canonicalName', scientific_name),
            'scientific_name': match.get('scientificName'),
        }
        # These keys are copied through under the same name.
        for rank_key in ('rank', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus'):
            taxonomy[rank_key] = match.get(rank_key)
        taxonomy['gbif_id'] = match.get('usageKey')
        taxonomy['confidence'] = match.get('confidence')
        taxonomy['match_type'] = match.get('matchType')
        taxonomy['status'] = match.get('status')
        return taxonomy
    except Exception as err:
        print(f"GBIF API error for '{scientific_name}': {err}")

    return None


def validate_wfo_plant(scientific_name: str) -> Optional[Dict]:
    """
    Look up a plant name with the World Flora Online search endpoint.

    Only the first entry of the response's 'results' list is used. Returns
    None when the response is not HTTP 200, when there are no results, or
    when the request fails (the error is printed).
    """
    # WFO requires name parsing - this is a simplified example
    endpoint = "http://www.worldfloraonline.org/api/1.0/search?query=" + quote(scientific_name)

    # (output key, key in the WFO result), in output order.
    field_map = (
        ('matched_name', 'name'),
        ('scientific_name', 'scientificName'),
        ('authors', 'authors'),
        ('family', 'family'),
        ('wfo_id', 'wfoId'),
        ('status', 'status'),
    )

    try:
        reply = requests.get(endpoint, timeout=10)
        if reply.status_code != 200:
            return None

        hits = reply.json().get('results')
        if not hits:
            return None

        top_hit = hits[0]
        return {out_key: top_hit.get(src_key) for out_key, src_key in field_map}
    except Exception as err:
        print(f"WFO API error for '{scientific_name}': {err}")

    return None


# ==============================================================================
# Geography validation functions
# ==============================================================================

def validate_geonames(location: str, country: Optional[str] = None) -> Optional[Dict]:
    """
    Validate location using GeoNames.
    Note: Requires free GeoNames account and username.
    Set GEONAMES_USERNAME environment variable.

    Args:
        location: Free-text place name sent as the search query.
        country: Optional two-letter ISO-3166 country code (case-insensitive)
            used to restrict the search. A value that is not a two-letter
            code (for example a full country name) is not sent as a filter.

    Returns:
        A dict describing the first GeoNames hit, or None when the username
        is not configured, nothing is found, or the request fails.
    """
    import os
    username = os.getenv('GEONAMES_USERNAME')
    if not username:
        print("Warning: GEONAMES_USERNAME not set. Skipping GeoNames validation.")
        return None

    # The username comes from the environment, so escape it like any other
    # query value instead of interpolating it raw.
    url = (
        "http://api.geonames.org/searchJSON"
        f"?q={quote(location)}&maxRows=1&username={quote(username, safe='')}"
    )
    if country:
        # GeoNames expects an ISO-3166 alpha-2 code here. Slicing the first
        # two letters of a country name ("Brazil" -> "Br") does not produce
        # one, so the filter is only added for a real two-letter code.
        country_code = country.strip().upper()
        if len(country_code) == 2 and all('A' <= ch <= 'Z' for ch in country_code):
            url += f"&country={country_code}"
        else:
            print(
                f"Warning: '{country}' is not a 2-letter ISO country code; "
                "searching GeoNames without a country filter."
            )

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            # GeoNames reports problems (unknown user, quota exceeded, ...) in
            # a "status" object inside an HTTP 200 body; surface it instead of
            # silently treating it as "no match".
            if data.get('status'):
                print(f"GeoNames API error for '{location}': {data['status']}")
            if data.get('geonames'):
                place = data['geonames'][0]
                return {
                    'matched_name': place.get('name'),
                    'country': place.get('countryName'),
                    'country_code': place.get('countryCode'),
                    'admin1': place.get('adminName1'),
                    'admin2': place.get('adminName2'),
                    'latitude': place.get('lat'),
                    'longitude': place.get('lng'),
                    'geonames_id': place.get('geonameId')
                }
    except Exception as e:
        print(f"GeoNames API error for '{location}': {e}")

    return None


def geocode_location(address: str) -> Optional[Dict]:
    """
    Geocode an address with OpenStreetMap Nominatim (free, no API key needed).

    Each call sleeps one second before the request to stay polite towards
    the service. Only the first search hit is used. Returns None when the
    response is not HTTP 200, when nothing is found, or when the request
    fails (the error is printed).
    """
    endpoint = (
        "https://nominatim.openstreetmap.org/search?q="
        + quote(address)
        + "&format=json&limit=1"
    )
    request_headers = {'User-Agent': 'Scientific-PDF-Extraction/1.0'}

    # (output key, key in the Nominatim result), in output order.
    field_map = (
        ('display_name', 'display_name'),
        ('latitude', 'lat'),
        ('longitude', 'lon'),
        ('osm_type', 'osm_type'),
        ('osm_id', 'osm_id'),
        ('place_rank', 'place_rank'),
    )

    try:
        time.sleep(1)  # Be nice to OSM
        reply = requests.get(endpoint, headers=request_headers, timeout=10)
        if reply.status_code != 200:
            return None

        hits = reply.json()
        if not hits:
            return None

        top_hit = hits[0]
        return {out_key: top_hit.get(src_key) for out_key, src_key in field_map}
    except Exception as err:
        print(f"Nominatim error for '{address}': {err}")

    return None


# ==============================================================================
# Chemistry validation functions
# ==============================================================================

def validate_pubchem_compound(compound_name: str) -> Optional[Dict]:
    """
    Validate chemical compound using PubChem.
    Returns standardized compound information.

    Args:
        compound_name: Compound name to look up through PUG REST.

    Returns:
        A dict with 'cid', 'molecular_formula' (None when the record carries
        no such property) and 'pubchem_url' for the first matching compound,
        or None when the response is not HTTP 200, no compound is returned,
        or the request fails (the error is printed).
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(compound_name)}/JSON"

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            compounds = data.get('PC_Compounds')
            if compounds:
                compound = compounds[0]
                cid = compound['id']['id']['cid']

                # 'props' is a list of properties identified by urn.label; the
                # first entry is not the formula, so pick it by label rather
                # than by position. A missing or empty list just leaves the
                # formula as None instead of discarding the whole match.
                molecular_formula = None
                for prop in compound.get('props') or []:
                    if prop.get('urn', {}).get('label') == 'Molecular Formula':
                        molecular_formula = prop.get('value', {}).get('sval')
                        break

                return {
                    'cid': cid,
                    'molecular_formula': molecular_formula,
                    'pubchem_url': f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
                }
    except Exception as e:
        print(f"PubChem API error for '{compound_name}': {e}")

    return None


# ==============================================================================
# Gene/Protein validation functions
# ==============================================================================

def validate_ncbi_gene(gene_symbol: str, organism: Optional[str] = None) -> Optional[Dict]:
    """
    Look up a gene in the NCBI Gene database through the E-utilities esearch
    endpoint.

    When ``organism`` is given, the search term is restricted with
    ``[Gene Name]`` and ``[Organism]`` field tags; otherwise the bare symbol
    is searched. Returns the first id of the result list together with its
    NCBI URL, or None when the response is not HTTP 200, the id list is
    empty, or the request fails (the error is printed).
    """
    if organism:
        term = gene_symbol + f"[Gene Name] AND {organism}[Organism]"
    else:
        term = gene_symbol

    search_url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term="
        + quote(term)
        + "&retmode=json"
    )

    try:
        reply = requests.get(search_url, timeout=10)
        if reply.status_code != 200:
            return None

        id_list = reply.json().get('esearchresult', {}).get('idlist')
        if not id_list:
            return None

        gene_id = id_list[0]
        return {
            'gene_id': gene_id,
            'ncbi_url': f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}",
        }
    except Exception as err:
        print(f"NCBI Gene API error for '{gene_symbol}': {err}")

    return None


# ==============================================================================
# Main validation orchestration
# ==============================================================================

# Registry used by validate_field: maps the API name given in the
# configuration to the function that performs the lookup.
API_VALIDATORS = dict(
    gbif_taxonomy=validate_gbif_taxonomy,
    wfo_plants=validate_wfo_plant,
    geonames=validate_geonames,
    geocode=geocode_location,
    pubchem=validate_pubchem_compound,
    ncbi_gene=validate_ncbi_gene,
)


def validate_field(value: Any, api_name: str, extra_params: Dict = None) -> Optional[Dict]:
    """
    Run one value through the validator registered under ``api_name``.

    Falsy values and the literal string 'none' are not looked up. An
    ``api_name`` missing from API_VALIDATORS is reported and yields None.
    ``extra_params``, when non-empty, is passed to the validator as keyword
    arguments. Any exception raised by the validator is printed and turned
    into None.
    """
    # Nothing to validate for missing / placeholder values.
    if not value:
        return None
    if value == 'none' or value == '':
        return None

    handler = API_VALIDATORS.get(api_name)
    if not handler:
        print(f"Unknown API: {api_name}")
        return None

    try:
        keyword_args = extra_params or {}
        return handler(value, **keyword_args)
    except Exception as err:
        print(f"Validation error for {api_name} with value '{value}': {err}")
        return None


def process_record(
    record_data: Dict,
    api_config: Dict,
    skip_validation: bool = False
) -> Dict:
    """
    Validate the configured fields of one record and attach the results.

    ``api_config['field_mappings']`` maps a field name to its settings:
    'api' (validator name), optional 'output_field' (defaults to
    'validated_<field name>') and optional 'extra_params', e.g.
    {
        "field_mappings": {
            "species": {"api": "gbif_taxonomy", "output_field": "validated_species"},
            "location": {"api": "geocode", "output_field": "geocoded_location"}
        }
    }

    ``record_data`` is modified in place and returned. With
    ``skip_validation`` set it is returned untouched. Field names containing
    a dot are skipped, as are fields whose value is missing or falsy.
    """
    if skip_validation:
        return record_data

    mappings = api_config.get('field_mappings', {})
    for field_name, settings in mappings.items():
        api_name = settings.get('api')
        target_key = settings.get('output_field', f'validated_{field_name}')
        extra_params = settings.get('extra_params', {})

        # Dotted names (e.g., 'records.species') denote nested fields, which
        # this simplified example does not resolve.
        if '.' in field_name:
            continue

        current_value = record_data.get(field_name)
        if not current_value:
            continue

        outcome = validate_field(current_value, api_name, extra_params)
        if outcome:
            record_data[target_key] = outcome

    return record_data


def main():
    """Command-line entry point.

    Loads the extraction results and the API configuration, runs
    process_record on the extracted data of every record whose status is
    'success' (other records are passed through unchanged and counted as
    failed), writes everything to the output file and prints a summary.
    """
    cli_args = parse_args()

    # Load inputs
    results = load_results(Path(cli_args.input))
    api_config = load_api_config(Path(cli_args.apis))
    print(f"Loaded {len(results)} extraction results")

    validated_results = {}
    stats = {'total': 0, 'validated': 0, 'failed': 0}

    for record_id, result in results.items():
        if result.get('status') == 'success':
            stats['total'] += 1

            # Work on a copy so the original extracted data stays as loaded.
            working_copy = result.get('extracted_data', {}).copy()
            result['validated_data'] = process_record(
                working_copy,
                api_config,
                cli_args.skip_validation
            )
            validated_results[record_id] = result
            stats['validated'] += 1

            # Rate limiting between records when APIs are being called
            if not cli_args.skip_validation:
                time.sleep(0.5)
        else:
            # Failed extractions are carried over without validation.
            validated_results[record_id] = result
            stats['failed'] += 1

    # Save results
    output_path = Path(cli_args.output)
    save_results(validated_results, output_path)

    # Print summary
    banner = '=' * 60
    print(f"\n{banner}")
    print("Validation and Enrichment Summary")
    print(banner)
    print(f"Total records: {len(results)}")
    print(f"Successfully validated: {stats['validated']}")
    print(f"Failed extractions: {stats['failed']}")
    print(f"\nResults saved to: {output_path}")
    print("\nNext step: Export to analysis format")

# Run the validation pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()