#!/usr/bin/env python3
"""
Batch gene lookup using the NCBI E-utilities (ESearch and ESummary).
Processes multiple gene symbols or Gene IDs in one run, with request
rate limiting and basic error handling.
"""
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request
from typing import Optional, List, Dict, Any


def read_gene_list(filepath: str) -> List[str]:
"""
Read gene identifiers from a file (one per line).
Args:
filepath: Path to file containing gene symbols or IDs
Returns:
List of gene identifiers
"""
try:
with open(filepath, 'r') as f:
genes = [line.strip() for line in f if line.strip()]
return genes
except FileNotFoundError:
print(f"Error: File '{filepath}' not found", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Error reading file: {e}", file=sys.stderr)
sys.exit(1)


def batch_esearch(queries: List[str], organism: Optional[str] = None,
api_key: Optional[str] = None) -> Dict[str, str]:
"""
Search for multiple gene symbols and return their IDs.
Args:
queries: List of gene symbols
organism: Optional organism filter
api_key: Optional NCBI API key
Returns:
Dictionary mapping gene symbol to Gene ID (or 'NOT_FOUND')
"""
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
results = {}
# Rate limiting
delay = 0.1 if api_key else 0.34 # 10 req/sec with key, 3 req/sec without
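    # Exceeding these limits typically results in HTTP 429 responses from NCBI.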
for query in queries:
        # Build search term; the [gene] field tag restricts the match to the gene name field
search_term = f"{query}[gene]"
if organism:
search_term += f" AND {organism}[organism]"
params = {
'db': 'gene',
'term': search_term,
'retmax': 1,
'retmode': 'json'
}
if api_key:
params['api_key'] = api_key
url = f"{base_url}esearch.fcgi?{urllib.parse.urlencode(params)}"
try:
            with urllib.request.urlopen(url, timeout=30) as response:
data = json.loads(response.read().decode())
if 'esearchresult' in data and 'idlist' in data['esearchresult']:
id_list = data['esearchresult']['idlist']
results[query] = id_list[0] if id_list else 'NOT_FOUND'
else:
results[query] = 'ERROR'
except Exception as e:
print(f"Error searching for {query}: {e}", file=sys.stderr)
results[query] = 'ERROR'
time.sleep(delay)
return results


def batch_esummary(gene_ids: List[str], api_key: Optional[str] = None,
chunk_size: int = 200) -> Dict[str, Dict[str, Any]]:
"""
Get summaries for multiple genes in batches.
Args:
gene_ids: List of Gene IDs
api_key: Optional NCBI API key
chunk_size: Number of IDs per request (max 500)
Returns:
Dictionary mapping Gene ID to summary data
"""
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
all_results = {}
# Rate limiting
delay = 0.1 if api_key else 0.34
# Process in chunks
for i in range(0, len(gene_ids), chunk_size):
chunk = gene_ids[i:i + chunk_size]
params = {
'db': 'gene',
'id': ','.join(chunk),
'retmode': 'json'
}
if api_key:
params['api_key'] = api_key
url = f"{base_url}esummary.fcgi?{urllib.parse.urlencode(params)}"
try:
            with urllib.request.urlopen(url, timeout=30) as response:
data = json.loads(response.read().decode())
if 'result' in data:
for gene_id in chunk:
if gene_id in data['result']:
all_results[gene_id] = data['result'][gene_id]
except Exception as e:
print(f"Error fetching summaries for chunk: {e}", file=sys.stderr)
time.sleep(delay)
return all_results


def batch_lookup_by_ids(gene_ids: List[str], api_key: Optional[str] = None) -> List[Dict[str, Any]]:
"""
Lookup genes by IDs and return structured data.
Args:
gene_ids: List of Gene IDs
api_key: Optional NCBI API key
Returns:
List of gene information dictionaries
"""
summaries = batch_esummary(gene_ids, api_key=api_key)
results = []
for gene_id in gene_ids:
if gene_id in summaries:
gene = summaries[gene_id]
results.append({
'gene_id': gene_id,
'symbol': gene.get('name', 'N/A'),
'description': gene.get('description', 'N/A'),
'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
'chromosome': gene.get('chromosome', 'N/A'),
'map_location': gene.get('maplocation', 'N/A'),
'type': gene.get('geneticsource', 'N/A')
})
else:
results.append({
'gene_id': gene_id,
'error': 'Not found or error fetching'
})
return results


def batch_lookup_by_symbols(gene_symbols: List[str], organism: str,
api_key: Optional[str] = None) -> List[Dict[str, Any]]:
"""
Lookup genes by symbols and return structured data.
Args:
gene_symbols: List of gene symbols
organism: Organism name
api_key: Optional NCBI API key
Returns:
List of gene information dictionaries
"""
# First, search for IDs
print(f"Searching for {len(gene_symbols)} gene symbols...", file=sys.stderr)
symbol_to_id = batch_esearch(gene_symbols, organism=organism, api_key=api_key)
# Filter to valid IDs
    valid_ids = [gid for gid in symbol_to_id.values() if gid not in ('NOT_FOUND', 'ERROR')]
if not valid_ids:
print("No genes found", file=sys.stderr)
return []
print(f"Found {len(valid_ids)} genes, fetching details...", file=sys.stderr)
# Fetch summaries
summaries = batch_esummary(valid_ids, api_key=api_key)
# Build results
results = []
for symbol, gene_id in symbol_to_id.items():
if gene_id == 'NOT_FOUND':
results.append({
'query_symbol': symbol,
'status': 'not_found'
})
elif gene_id == 'ERROR':
results.append({
'query_symbol': symbol,
'status': 'error'
})
elif gene_id in summaries:
gene = summaries[gene_id]
results.append({
'query_symbol': symbol,
'gene_id': gene_id,
'symbol': gene.get('name', 'N/A'),
'description': gene.get('description', 'N/A'),
'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
'chromosome': gene.get('chromosome', 'N/A'),
'map_location': gene.get('maplocation', 'N/A'),
'type': gene.get('geneticsource', 'N/A')
})
return results


def main():
parser = argparse.ArgumentParser(
description='Batch gene lookup using NCBI APIs',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Lookup by gene IDs
%(prog)s --ids 672,7157,5594
# Lookup by symbols from a file
%(prog)s --file genes.txt --organism human
# Lookup with API key and save to file
%(prog)s --ids 672,7157,5594 --api-key YOUR_KEY --output results.json
"""
)
parser.add_argument('--ids', '-i', help='Comma-separated Gene IDs')
parser.add_argument('--file', '-f', help='File containing gene symbols (one per line)')
parser.add_argument('--organism', '-o', help='Organism name (required with --file)')
parser.add_argument('--output', '-O', help='Output file path (JSON format)')
parser.add_argument('--api-key', '-k', help='NCBI API key')
parser.add_argument('--pretty', '-p', action='store_true',
help='Pretty-print JSON output')
args = parser.parse_args()
if not args.ids and not args.file:
parser.error("Either --ids or --file must be provided")
if args.file and not args.organism:
parser.error("--organism is required when using --file")
# Process genes
if args.ids:
        gene_ids = [gid.strip() for gid in args.ids.split(',')]
results = batch_lookup_by_ids(gene_ids, api_key=args.api_key)
else:
gene_symbols = read_gene_list(args.file)
results = batch_lookup_by_symbols(gene_symbols, args.organism, api_key=args.api_key)
# Output results
indent = 2 if args.pretty else None
json_output = json.dumps(results, indent=indent)
if args.output:
try:
with open(args.output, 'w') as f:
f.write(json_output)
print(f"Results written to {args.output}", file=sys.stderr)
except Exception as e:
print(f"Error writing output file: {e}", file=sys.stderr)
sys.exit(1)
else:
print(json_output)


if __name__ == '__main__':
main()