391 lines
13 KiB
Python
391 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Validate and enrich extracted data using external API databases.
|
|
Supports common scientific databases for taxonomy, geography, chemistry, etc.
|
|
|
|
This script template includes examples for common databases. Customize for your needs.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
import requests
|
|
from urllib.parse import quote
|
|
|
|
|
|
def parse_args():
|
|
"""Parse command line arguments"""
|
|
parser = argparse.ArgumentParser(
|
|
description='Validate and enrich data with external APIs'
|
|
)
|
|
parser.add_argument(
|
|
'--input',
|
|
required=True,
|
|
help='Input JSON file with cleaned extraction results from step 04'
|
|
)
|
|
parser.add_argument(
|
|
'--output',
|
|
default='validated_data.json',
|
|
help='Output JSON file with validated and enriched data'
|
|
)
|
|
parser.add_argument(
|
|
'--apis',
|
|
required=True,
|
|
help='JSON configuration file specifying which APIs to use and for which fields'
|
|
)
|
|
parser.add_argument(
|
|
'--skip-validation',
|
|
action='store_true',
|
|
help='Skip API calls, only load and structure data'
|
|
)
|
|
return parser.parse_args()
|
|
|
|
|
|
def load_results(input_path: Path) -> Dict:
|
|
"""Load extraction results from JSON file"""
|
|
with open(input_path, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
|
|
|
|
def load_api_config(config_path: Path) -> Dict:
|
|
"""Load API configuration"""
|
|
with open(config_path, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
|
|
|
|
def save_results(results: Dict, output_path: Path):
|
|
"""Save validated results to JSON file"""
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
json.dump(results, f, indent=2, ensure_ascii=False)
|
|
|
|
|
|
# ==============================================================================
|
|
# Taxonomy validation functions
|
|
# ==============================================================================
|
|
|
|
def validate_gbif_taxonomy(scientific_name: str) -> Optional[Dict]:
|
|
"""
|
|
Validate taxonomic name using GBIF (Global Biodiversity Information Facility).
|
|
Returns standardized taxonomy if found.
|
|
"""
|
|
url = f"https://api.gbif.org/v1/species/match?name={quote(scientific_name)}"
|
|
|
|
try:
|
|
response = requests.get(url, timeout=10)
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
if data.get('matchType') != 'NONE':
|
|
return {
|
|
'matched_name': data.get('canonicalName', scientific_name),
|
|
'scientific_name': data.get('scientificName'),
|
|
'rank': data.get('rank'),
|
|
'kingdom': data.get('kingdom'),
|
|
'phylum': data.get('phylum'),
|
|
'class': data.get('class'),
|
|
'order': data.get('order'),
|
|
'family': data.get('family'),
|
|
'genus': data.get('genus'),
|
|
'gbif_id': data.get('usageKey'),
|
|
'confidence': data.get('confidence'),
|
|
'match_type': data.get('matchType'),
|
|
'status': data.get('status')
|
|
}
|
|
except Exception as e:
|
|
print(f"GBIF API error for '{scientific_name}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
def validate_wfo_plant(scientific_name: str) -> Optional[Dict]:
|
|
"""
|
|
Validate plant name using World Flora Online.
|
|
Returns standardized plant taxonomy if found.
|
|
"""
|
|
# WFO requires name parsing - this is a simplified example
|
|
url = f"http://www.worldfloraonline.org/api/1.0/search?query={quote(scientific_name)}"
|
|
|
|
try:
|
|
response = requests.get(url, timeout=10)
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
if data.get('results'):
|
|
first_result = data['results'][0]
|
|
return {
|
|
'matched_name': first_result.get('name'),
|
|
'scientific_name': first_result.get('scientificName'),
|
|
'authors': first_result.get('authors'),
|
|
'family': first_result.get('family'),
|
|
'wfo_id': first_result.get('wfoId'),
|
|
'status': first_result.get('status')
|
|
}
|
|
except Exception as e:
|
|
print(f"WFO API error for '{scientific_name}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
# ==============================================================================
|
|
# Geography validation functions
|
|
# ==============================================================================
|
|
|
|
def validate_geonames(location: str, country: Optional[str] = None) -> Optional[Dict]:
|
|
"""
|
|
Validate location using GeoNames.
|
|
Note: Requires free GeoNames account and username.
|
|
Set GEONAMES_USERNAME environment variable.
|
|
"""
|
|
import os
|
|
username = os.getenv('GEONAMES_USERNAME')
|
|
if not username:
|
|
print("Warning: GEONAMES_USERNAME not set. Skipping GeoNames validation.")
|
|
return None
|
|
|
|
url = f"http://api.geonames.org/searchJSON?q={quote(location)}&maxRows=1&username={username}"
|
|
if country:
|
|
url += f"&country={country[:2]}" # Country code
|
|
|
|
try:
|
|
response = requests.get(url, timeout=10)
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
if data.get('geonames'):
|
|
place = data['geonames'][0]
|
|
return {
|
|
'matched_name': place.get('name'),
|
|
'country': place.get('countryName'),
|
|
'country_code': place.get('countryCode'),
|
|
'admin1': place.get('adminName1'),
|
|
'admin2': place.get('adminName2'),
|
|
'latitude': place.get('lat'),
|
|
'longitude': place.get('lng'),
|
|
'geonames_id': place.get('geonameId')
|
|
}
|
|
except Exception as e:
|
|
print(f"GeoNames API error for '{location}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
def geocode_location(address: str) -> Optional[Dict]:
|
|
"""
|
|
Geocode an address using OpenStreetMap Nominatim (free, no API key needed).
|
|
Please use responsibly - add delays between calls.
|
|
"""
|
|
url = f"https://nominatim.openstreetmap.org/search?q={quote(address)}&format=json&limit=1"
|
|
headers = {'User-Agent': 'Scientific-PDF-Extraction/1.0'}
|
|
|
|
try:
|
|
time.sleep(1) # Be nice to OSM
|
|
response = requests.get(url, headers=headers, timeout=10)
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
if data:
|
|
place = data[0]
|
|
return {
|
|
'display_name': place.get('display_name'),
|
|
'latitude': place.get('lat'),
|
|
'longitude': place.get('lon'),
|
|
'osm_type': place.get('osm_type'),
|
|
'osm_id': place.get('osm_id'),
|
|
'place_rank': place.get('place_rank')
|
|
}
|
|
except Exception as e:
|
|
print(f"Nominatim error for '{address}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
# ==============================================================================
|
|
# Chemistry validation functions
|
|
# ==============================================================================
|
|
|
|
def validate_pubchem_compound(compound_name: str) -> Optional[Dict]:
|
|
"""
|
|
Validate chemical compound using PubChem.
|
|
Returns standardized compound information.
|
|
"""
|
|
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(compound_name)}/JSON"
|
|
|
|
try:
|
|
response = requests.get(url, timeout=10)
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
if 'PC_Compounds' in data and data['PC_Compounds']:
|
|
compound = data['PC_Compounds'][0]
|
|
return {
|
|
'cid': compound['id']['id']['cid'],
|
|
'molecular_formula': compound.get('props', [{}])[0].get('value', {}).get('sval'),
|
|
'pubchem_url': f"https://pubchem.ncbi.nlm.nih.gov/compound/{compound['id']['id']['cid']}"
|
|
}
|
|
except Exception as e:
|
|
print(f"PubChem API error for '{compound_name}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
# ==============================================================================
|
|
# Gene/Protein validation functions
|
|
# ==============================================================================
|
|
|
|
def validate_ncbi_gene(gene_symbol: str, organism: Optional[str] = None) -> Optional[Dict]:
|
|
"""
|
|
Validate gene using NCBI Gene database.
|
|
"""
|
|
query = gene_symbol
|
|
if organism:
|
|
query += f"[Gene Name] AND {organism}[Organism]"
|
|
|
|
search_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term={quote(query)}&retmode=json"
|
|
|
|
try:
|
|
response = requests.get(search_url, timeout=10)
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
if data.get('esearchresult', {}).get('idlist'):
|
|
gene_id = data['esearchresult']['idlist'][0]
|
|
return {
|
|
'gene_id': gene_id,
|
|
'ncbi_url': f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}"
|
|
}
|
|
except Exception as e:
|
|
print(f"NCBI Gene API error for '{gene_symbol}': {e}")
|
|
|
|
return None
|
|
|
|
|
|
# ==============================================================================
|
|
# Main validation orchestration
|
|
# ==============================================================================
|
|
|
|
API_VALIDATORS = {
|
|
'gbif_taxonomy': validate_gbif_taxonomy,
|
|
'wfo_plants': validate_wfo_plant,
|
|
'geonames': validate_geonames,
|
|
'geocode': geocode_location,
|
|
'pubchem': validate_pubchem_compound,
|
|
'ncbi_gene': validate_ncbi_gene
|
|
}
|
|
|
|
|
|
def validate_field(value: Any, api_name: str, extra_params: Dict = None) -> Optional[Dict]:
|
|
"""
|
|
Validate a single field value using the specified API.
|
|
"""
|
|
if not value or value == 'none' or value == '':
|
|
return None
|
|
|
|
validator = API_VALIDATORS.get(api_name)
|
|
if not validator:
|
|
print(f"Unknown API: {api_name}")
|
|
return None
|
|
|
|
try:
|
|
if extra_params:
|
|
return validator(value, **extra_params)
|
|
else:
|
|
return validator(value)
|
|
except Exception as e:
|
|
print(f"Validation error for {api_name} with value '{value}': {e}")
|
|
return None
|
|
|
|
|
|
def process_record(
|
|
record_data: Dict,
|
|
api_config: Dict,
|
|
skip_validation: bool = False
|
|
) -> Dict:
|
|
"""
|
|
Process a single record, validating specified fields.
|
|
|
|
api_config should map field names to API names:
|
|
{
|
|
"field_mappings": {
|
|
"species": {"api": "gbif_taxonomy", "output_field": "validated_species"},
|
|
"location": {"api": "geocode", "output_field": "geocoded_location"}
|
|
}
|
|
}
|
|
"""
|
|
if skip_validation:
|
|
return record_data
|
|
|
|
field_mappings = api_config.get('field_mappings', {})
|
|
|
|
for field_name, field_config in field_mappings.items():
|
|
api_name = field_config.get('api')
|
|
output_field = field_config.get('output_field', f'validated_{field_name}')
|
|
extra_params = field_config.get('extra_params', {})
|
|
|
|
# Handle nested fields (e.g., 'records.species')
|
|
if '.' in field_name:
|
|
# This is a simplified example - you'd need to implement proper nested access
|
|
continue
|
|
|
|
value = record_data.get(field_name)
|
|
if value:
|
|
validated = validate_field(value, api_name, extra_params)
|
|
if validated:
|
|
record_data[output_field] = validated
|
|
|
|
return record_data
|
|
|
|
|
|
def main():
|
|
args = parse_args()
|
|
|
|
# Load inputs
|
|
results = load_results(Path(args.input))
|
|
api_config = load_api_config(Path(args.apis))
|
|
print(f"Loaded {len(results)} extraction results")
|
|
|
|
# Process each result
|
|
validated_results = {}
|
|
stats = {'total': 0, 'validated': 0, 'failed': 0}
|
|
|
|
for record_id, result in results.items():
|
|
if result.get('status') != 'success':
|
|
validated_results[record_id] = result
|
|
stats['failed'] += 1
|
|
continue
|
|
|
|
stats['total'] += 1
|
|
|
|
# Get extracted data
|
|
extracted_data = result.get('extracted_data', {})
|
|
|
|
# Process/validate the data
|
|
validated_data = process_record(
|
|
extracted_data.copy(),
|
|
api_config,
|
|
args.skip_validation
|
|
)
|
|
|
|
# Update result
|
|
result['validated_data'] = validated_data
|
|
validated_results[record_id] = result
|
|
stats['validated'] += 1
|
|
|
|
# Rate limiting
|
|
if not args.skip_validation:
|
|
time.sleep(0.5)
|
|
|
|
# Save results
|
|
output_path = Path(args.output)
|
|
save_results(validated_results, output_path)
|
|
|
|
# Print summary
|
|
print(f"\n{'='*60}")
|
|
print("Validation and Enrichment Summary")
|
|
print(f"{'='*60}")
|
|
print(f"Total records: {len(results)}")
|
|
print(f"Successfully validated: {stats['validated']}")
|
|
print(f"Failed extractions: {stats['failed']}")
|
|
print(f"\nResults saved to: {output_path}")
|
|
print(f"\nNext step: Export to analysis format")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|