Initial commit
skills/extract_from_pdfs/scripts/05_validate_with_apis.py (new file, 390 lines)
@@ -0,0 +1,390 @@
#!/usr/bin/env python3
"""
Validate and enrich extracted data using external API databases.
Supports common scientific databases for taxonomy, geography, chemistry, etc.

This script template includes examples for common databases. Customize for your needs.
"""

import argparse
import json
import time
from pathlib import Path
from typing import Dict, List, Optional, Any
from urllib.parse import quote

import requests


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(
        description='Validate and enrich data with external APIs'
    )
    parser.add_argument(
        '--input',
        required=True,
        help='Input JSON file with cleaned extraction results from step 04'
    )
    parser.add_argument(
        '--output',
        default='validated_data.json',
        help='Output JSON file with validated and enriched data'
    )
    parser.add_argument(
        '--apis',
        required=True,
        help='JSON configuration file specifying which APIs to use and for which fields'
    )
    parser.add_argument(
        '--skip-validation',
        action='store_true',
        help='Skip API calls, only load and structure data'
    )
    return parser.parse_args()


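# A minimal sketch of the --apis configuration file. Only "field_mappings",
# each entry's "api", and the optional "output_field" / "extra_params" keys
# are read by this script (see process_record below); the field names
# "species" and "location" are placeholders for your own extracted fields:
#
# {
#   "field_mappings": {
#     "species": {"api": "gbif_taxonomy", "output_field": "validated_species"},
#     "location": {"api": "geocode", "output_field": "geocoded_location"}
#   }
# }

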
def load_results(input_path: Path) -> Dict:
    """Load extraction results from JSON file"""
    with open(input_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_api_config(config_path: Path) -> Dict:
    """Load API configuration"""
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_results(results: Dict, output_path: Path):
    """Save validated results to JSON file"""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)


# ==============================================================================
# Taxonomy validation functions
# ==============================================================================

def validate_gbif_taxonomy(scientific_name: str) -> Optional[Dict]:
    """
    Validate taxonomic name using GBIF (Global Biodiversity Information Facility).
    Returns standardized taxonomy if found.
    """
    url = f"https://api.gbif.org/v1/species/match?name={quote(scientific_name)}"

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get('matchType') != 'NONE':
                return {
                    'matched_name': data.get('canonicalName', scientific_name),
                    'scientific_name': data.get('scientificName'),
                    'rank': data.get('rank'),
                    'kingdom': data.get('kingdom'),
                    'phylum': data.get('phylum'),
                    'class': data.get('class'),
                    'order': data.get('order'),
                    'family': data.get('family'),
                    'genus': data.get('genus'),
                    'gbif_id': data.get('usageKey'),
                    'confidence': data.get('confidence'),
                    'match_type': data.get('matchType'),
                    'status': data.get('status')
                }
    except Exception as e:
        print(f"GBIF API error for '{scientific_name}': {e}")

    return None


def validate_wfo_plant(scientific_name: str) -> Optional[Dict]:
    """
    Validate plant name using World Flora Online.
    Returns standardized plant taxonomy if found.
    """
    # WFO requires name parsing - this is a simplified example
    url = f"http://www.worldfloraonline.org/api/1.0/search?query={quote(scientific_name)}"

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get('results'):
                first_result = data['results'][0]
                return {
                    'matched_name': first_result.get('name'),
                    'scientific_name': first_result.get('scientificName'),
                    'authors': first_result.get('authors'),
                    'family': first_result.get('family'),
                    'wfo_id': first_result.get('wfoId'),
                    'status': first_result.get('status')
                }
    except Exception as e:
        print(f"WFO API error for '{scientific_name}': {e}")

    return None


# ==============================================================================
# Geography validation functions
# ==============================================================================

def validate_geonames(location: str, country: Optional[str] = None) -> Optional[Dict]:
    """
    Validate location using GeoNames.
    Note: Requires a free GeoNames account and username.
    Set the GEONAMES_USERNAME environment variable.
    """
    import os
    username = os.getenv('GEONAMES_USERNAME')
    if not username:
        print("Warning: GEONAMES_USERNAME not set. Skipping GeoNames validation.")
        return None

    url = f"http://api.geonames.org/searchJSON?q={quote(location)}&maxRows=1&username={username}"
    if country:
        url += f"&country={country[:2]}"  # Two-letter ISO country code

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get('geonames'):
                place = data['geonames'][0]
                return {
                    'matched_name': place.get('name'),
                    'country': place.get('countryName'),
                    'country_code': place.get('countryCode'),
                    'admin1': place.get('adminName1'),
                    'admin2': place.get('adminName2'),
                    'latitude': place.get('lat'),
                    'longitude': place.get('lng'),
                    'geonames_id': place.get('geonameId')
                }
    except Exception as e:
        print(f"GeoNames API error for '{location}': {e}")

    return None


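# When called through the field-mapping config, the optional `country`
# argument can be supplied via "extra_params", e.g. (field name hypothetical):
#   "collection_locality": {"api": "geonames", "extra_params": {"country": "SE"}}

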
def geocode_location(address: str) -> Optional[Dict]:
    """
    Geocode an address using OpenStreetMap Nominatim (free, no API key needed).
    Please use responsibly - add delays between calls.
    """
    url = f"https://nominatim.openstreetmap.org/search?q={quote(address)}&format=json&limit=1"
    headers = {'User-Agent': 'Scientific-PDF-Extraction/1.0'}

    try:
        time.sleep(1)  # Be nice to OSM
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data:
                place = data[0]
                return {
                    'display_name': place.get('display_name'),
                    'latitude': place.get('lat'),
                    'longitude': place.get('lon'),
                    'osm_type': place.get('osm_type'),
                    'osm_id': place.get('osm_id'),
                    'place_rank': place.get('place_rank')
                }
    except Exception as e:
        print(f"Nominatim error for '{address}': {e}")

    return None


# ==============================================================================
# Chemistry validation functions
# ==============================================================================

def validate_pubchem_compound(compound_name: str) -> Optional[Dict]:
    """
    Validate chemical compound using PubChem.
    Returns standardized compound information.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(compound_name)}/JSON"

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if 'PC_Compounds' in data and data['PC_Compounds']:
                compound = data['PC_Compounds'][0]
                cid = compound['id']['id']['cid']
                # Look up the molecular formula among the compound properties
                # rather than assuming it is the first entry in 'props'
                molecular_formula = None
                for prop in compound.get('props', []):
                    if prop.get('urn', {}).get('label') == 'Molecular Formula':
                        molecular_formula = prop.get('value', {}).get('sval')
                        break
                return {
                    'cid': cid,
                    'molecular_formula': molecular_formula,
                    'pubchem_url': f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
                }
    except Exception as e:
        print(f"PubChem API error for '{compound_name}': {e}")

    return None


# ==============================================================================
# Gene/Protein validation functions
# ==============================================================================

def validate_ncbi_gene(gene_symbol: str, organism: Optional[str] = None) -> Optional[Dict]:
    """
    Validate gene using NCBI Gene database.
    """
    query = gene_symbol
    if organism:
        query += f"[Gene Name] AND {organism}[Organism]"

    search_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term={quote(query)}&retmode=json"

    try:
        response = requests.get(search_url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get('esearchresult', {}).get('idlist'):
                gene_id = data['esearchresult']['idlist'][0]
                return {
                    'gene_id': gene_id,
                    'ncbi_url': f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}"
                }
    except Exception as e:
        print(f"NCBI Gene API error for '{gene_symbol}': {e}")

    return None


# ==============================================================================
# Main validation orchestration
# ==============================================================================

API_VALIDATORS = {
    'gbif_taxonomy': validate_gbif_taxonomy,
    'wfo_plants': validate_wfo_plant,
    'geonames': validate_geonames,
    'geocode': geocode_location,
    'pubchem': validate_pubchem_compound,
    'ncbi_gene': validate_ncbi_gene
}


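# To support another database, add a function that takes the raw field value
# (plus any keyword arguments supplied via "extra_params") and returns
# Optional[Dict], then register it here. A hypothetical sketch for a Crossref
# DOI lookup (not part of the original validator set):
#
#   def validate_crossref_doi(doi: str) -> Optional[Dict]:
#       response = requests.get(f"https://api.crossref.org/works/{quote(doi)}", timeout=10)
#       if response.status_code == 200:
#           work = response.json().get('message', {})
#           return {'title': work.get('title'), 'doi': work.get('DOI')}
#       return None
#
#   API_VALIDATORS['crossref_doi'] = validate_crossref_doi

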
def validate_field(value: Any, api_name: str, extra_params: Dict = None) -> Optional[Dict]:
    """
    Validate a single field value using the specified API.
    """
    if not value or value == 'none':
        return None

    validator = API_VALIDATORS.get(api_name)
    if not validator:
        print(f"Unknown API: {api_name}")
        return None

    try:
        if extra_params:
            return validator(value, **extra_params)
        else:
            return validator(value)
    except Exception as e:
        print(f"Validation error for {api_name} with value '{value}': {e}")
        return None


def process_record(
    record_data: Dict,
    api_config: Dict,
    skip_validation: bool = False
) -> Dict:
    """
    Process a single record, validating specified fields.

    api_config should map field names to API names:
    {
        "field_mappings": {
            "species": {"api": "gbif_taxonomy", "output_field": "validated_species"},
            "location": {"api": "geocode", "output_field": "geocoded_location"}
        }
    }
    """
    if skip_validation:
        return record_data

    field_mappings = api_config.get('field_mappings', {})

    for field_name, field_config in field_mappings.items():
        api_name = field_config.get('api')
        output_field = field_config.get('output_field', f'validated_{field_name}')
        extra_params = field_config.get('extra_params', {})

        # Handle nested fields (e.g., 'records.species')
        if '.' in field_name:
            # This is a simplified example - you'd need to implement proper nested access
            continue

        value = record_data.get(field_name)
        if value:
            validated = validate_field(value, api_name, extra_params)
            if validated:
                record_data[output_field] = validated

    return record_data


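# Illustrative effect (using the docstring example above; the values are
# hypothetical): a record like {"species": "Quercus robur", "location":
# "Uppsala, Sweden"} would gain "validated_species" and "geocoded_location"
# entries while the original fields are left untouched.

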
def main():
    args = parse_args()

    # Load inputs
    results = load_results(Path(args.input))
    api_config = load_api_config(Path(args.apis))
    print(f"Loaded {len(results)} extraction results")

    # Process each result
    validated_results = {}
    stats = {'total': 0, 'validated': 0, 'failed': 0}

    for record_id, result in results.items():
        if result.get('status') != 'success':
            validated_results[record_id] = result
            stats['failed'] += 1
            continue

        stats['total'] += 1

        # Get extracted data
        extracted_data = result.get('extracted_data', {})

        # Process/validate the data
        validated_data = process_record(
            extracted_data.copy(),
            api_config,
            args.skip_validation
        )

        # Update result
        result['validated_data'] = validated_data
        validated_results[record_id] = result
        stats['validated'] += 1

        # Rate limiting
        if not args.skip_validation:
            time.sleep(0.5)

    # Save results
    output_path = Path(args.output)
    save_results(validated_results, output_path)

    # Print summary
    print(f"\n{'='*60}")
    print("Validation and Enrichment Summary")
    print(f"{'='*60}")
    print(f"Total records: {len(results)}")
    print(f"Successfully validated: {stats['validated']}")
    print(f"Failed extractions: {stats['failed']}")
    print(f"\nResults saved to: {output_path}")
    print("\nNext step: Export to analysis format")


if __name__ == '__main__':
    main()
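# Example invocation (input and config file names are illustrative; only
# --output has a real default, validated_data.json):
#   python 05_validate_with_apis.py \
#       --input cleaned_data.json \
#       --apis api_config.json \
#       --output validated_data.json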