Initial commit
This commit is contained in:
298
skills/gene-database/scripts/batch_gene_lookup.py
Normal file
298
skills/gene-database/scripts/batch_gene_lookup.py
Normal file
@@ -0,0 +1,298 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch gene lookup using NCBI APIs.
|
||||
|
||||
This script efficiently processes multiple gene queries with proper
|
||||
rate limiting and error handling.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
|
||||
def read_gene_list(filepath: str) -> List[str]:
    """
    Read gene identifiers from a file (one per line).

    Surrounding whitespace is stripped from every line and blank lines are
    skipped. The file is decoded as UTF-8; a leading byte-order mark (as
    written by some spreadsheet/Windows editors) is discarded so that it
    does not become part of the first identifier.

    Args:
        filepath: Path to file containing gene symbols or IDs

    Returns:
        List of gene identifiers, in file order

    Raises:
        SystemExit: With exit status 1 (after printing a message to stderr)
            when the file is missing, unreadable, or not valid UTF-8.
    """
    try:
        # 'utf-8-sig' behaves like UTF-8 but transparently drops a BOM.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            return [stripped for stripped in (line.strip() for line in f) if stripped]
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found", file=sys.stderr)
        sys.exit(1)
    except (OSError, ValueError) as e:
        # OSError: permissions, directories, I/O failures.
        # ValueError: undecodable bytes (UnicodeDecodeError) or a bad path.
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
def batch_esearch(queries: List[str], organism: Optional[str] = None,
                  api_key: Optional[str] = None,
                  timeout: float = 30.0) -> Dict[str, str]:
    """
    Search for multiple gene symbols and return their IDs.

    One ESearch request is issued per distinct symbol, using the term
    ``<symbol>[gene]`` (plus ``AND <organism>[organism]`` when an organism
    is given) and keeping only the first ID returned. Requests are spaced
    out to respect the NCBI rate limit.

    Args:
        queries: List of gene symbols
        organism: Optional organism filter
        api_key: Optional NCBI API key
        timeout: Seconds to wait on each HTTP request before treating it
            as failed

    Returns:
        Dictionary mapping gene symbol to Gene ID, 'NOT_FOUND' when the
        search returned no IDs, or 'ERROR' when the request failed or the
        response did not have the expected structure
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    results: Dict[str, str] = {}

    # Rate limiting
    delay = 0.1 if api_key else 0.34  # 10 req/sec with key, 3 req/sec without

    for query in queries:
        # The result is keyed by symbol, so repeating a request for a symbol
        # that was already looked up only burns rate-limited calls.
        if query in results:
            continue

        # Build search term
        search_term = f"{query}[gene]"
        if organism:
            search_term += f" AND {organism}[organism]"

        params = {
            'db': 'gene',
            'term': search_term,
            'retmax': 1,
            'retmode': 'json',
        }
        if api_key:
            params['api_key'] = api_key

        url = f"{base_url}esearch.fcgi?{urllib.parse.urlencode(params)}"

        try:
            # Without a timeout a single stalled connection would block the
            # whole batch indefinitely.
            with urllib.request.urlopen(url, timeout=timeout) as response:
                data = json.loads(response.read().decode())

            if 'esearchresult' in data and 'idlist' in data['esearchresult']:
                id_list = data['esearchresult']['idlist']
                results[query] = id_list[0] if id_list else 'NOT_FOUND'
            else:
                results[query] = 'ERROR'
        except Exception as e:
            # Deliberately broad: one failing lookup (network, HTTP, JSON or
            # unexpected payload) must not abort the rest of the batch.
            print(f"Error searching for {query}: {e}", file=sys.stderr)
            results[query] = 'ERROR'

        # Sleep after every request (including the last) so that a caller
        # issuing further E-utilities requests right away stays in the limit.
        time.sleep(delay)

    return results
|
||||
|
||||
|
||||
def batch_esummary(gene_ids: List[str], api_key: Optional[str] = None,
                   chunk_size: int = 200,
                   timeout: float = 30.0) -> Dict[str, Dict[str, Any]]:
    """
    Get summaries for multiple genes in batches.

    IDs are de-duplicated (keeping first-seen order), split into chunks and
    fetched with one ESummary request per chunk. A chunk whose request fails
    is reported on stderr and skipped; the remaining chunks are still
    fetched.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key
        chunk_size: Number of IDs per request (max 500; larger values are
            reduced to 500)
        timeout: Seconds to wait on each HTTP request before treating it
            as failed

    Returns:
        Dictionary mapping Gene ID to summary data. IDs that are absent from
        the response, or whose record carries an 'error' field, are omitted.

    Raises:
        ValueError: If chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        # A step of 0 would make range() fail with an unrelated message and
        # a negative step would silently fetch nothing.
        raise ValueError("chunk_size must be a positive integer")
    chunk_size = min(chunk_size, 500)

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    all_results: Dict[str, Dict[str, Any]] = {}

    # Rate limiting
    delay = 0.1 if api_key else 0.34

    # Results are keyed by ID, so requesting the same ID twice is wasted work.
    unique_ids = list(dict.fromkeys(gene_ids))

    # Process in chunks
    for start in range(0, len(unique_ids), chunk_size):
        chunk = unique_ids[start:start + chunk_size]

        params = {
            'db': 'gene',
            'id': ','.join(chunk),
            'retmode': 'json',
        }
        if api_key:
            params['api_key'] = api_key

        url = f"{base_url}esummary.fcgi?{urllib.parse.urlencode(params)}"

        try:
            # Without a timeout a single stalled connection would block the
            # whole batch indefinitely.
            with urllib.request.urlopen(url, timeout=timeout) as response:
                data = json.loads(response.read().decode())

            if 'result' in data:
                records = data['result']
                for gene_id in chunk:
                    if gene_id not in records:
                        continue
                    record = records[gene_id]
                    # NOTE(review): ESummary appears to report per-ID
                    # failures as a record with an 'error' field instead of
                    # omitting the ID - confirm against the E-utilities docs.
                    if isinstance(record, dict) and 'error' in record:
                        print(f"Error fetching summary for {gene_id}: {record['error']}",
                              file=sys.stderr)
                        continue
                    all_results[gene_id] = record
            else:
                print("Error fetching summaries for chunk: response has no 'result' field",
                      file=sys.stderr)
        except Exception as e:
            # Deliberately broad: one failing chunk (network, HTTP, JSON or
            # unexpected payload) must not abort the remaining chunks.
            print(f"Error fetching summaries for chunk: {e}", file=sys.stderr)

        time.sleep(delay)

    return all_results
|
||||
|
||||
|
||||
def batch_lookup_by_ids(gene_ids: List[str], api_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Resolve a list of Gene IDs into flat, report-ready records.

    All summaries are fetched through batch_esummary, then one output entry
    is produced per requested ID, in the order the IDs were given.

    Args:
        gene_ids: Gene IDs to look up
        api_key: NCBI API key, if available

    Returns:
        One dictionary per requested ID. IDs with a summary yield the gene's
        symbol, description, organism, chromosome, map location and type;
        IDs without one yield the ID together with an 'error' message.
    """
    fetched = batch_esummary(gene_ids, api_key=api_key)
    records = []

    for requested_id in gene_ids:
        # Guard clause: no summary came back for this ID.
        if requested_id not in fetched:
            records.append({
                'gene_id': requested_id,
                'error': 'Not found or error fetching'
            })
            continue

        info = fetched[requested_id]
        organism_info = info.get('organism', {})
        record = {'gene_id': requested_id}
        record['symbol'] = info.get('name', 'N/A')
        record['description'] = info.get('description', 'N/A')
        record['organism'] = organism_info.get('scientificname', 'N/A')
        record['chromosome'] = info.get('chromosome', 'N/A')
        record['map_location'] = info.get('maplocation', 'N/A')
        record['type'] = info.get('geneticsource', 'N/A')
        records.append(record)

    return records
|
||||
|
||||
|
||||
def batch_lookup_by_symbols(gene_symbols: List[str], organism: str,
                            api_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Lookup genes by symbols and return structured data.

    Symbols are first resolved to Gene IDs with batch_esearch, then the
    summaries of all resolved IDs are fetched with batch_esummary. Progress
    messages are written to stderr.

    Args:
        gene_symbols: List of gene symbols
        organism: Organism name
        api_key: Optional NCBI API key

    Returns:
        List of gene information dictionaries, one per distinct queried
        symbol. Every entry has 'query_symbol'. Symbols with no match carry
        status 'not_found'; symbols whose search or summary fetch failed
        carry status 'error'; resolved symbols carry the gene details.
    """
    # First, search for IDs
    print(f"Searching for {len(gene_symbols)} gene symbols...", file=sys.stderr)
    symbol_to_id = batch_esearch(gene_symbols, organism=organism, api_key=api_key)

    # Filter to valid IDs; several symbols may resolve to the same gene, so
    # de-duplicate while keeping first-seen order.
    unresolved = ('NOT_FOUND', 'ERROR')
    valid_ids = list(dict.fromkeys(
        gene_id for gene_id in symbol_to_id.values() if gene_id not in unresolved
    ))

    if valid_ids:
        print(f"Found {len(valid_ids)} genes, fetching details...", file=sys.stderr)
        # Fetch summaries
        summaries = batch_esummary(valid_ids, api_key=api_key)
    else:
        # Nothing to fetch, but still fall through so that every symbol gets
        # its not_found/error entry instead of vanishing from the output.
        print("No genes found", file=sys.stderr)
        summaries = {}

    # Build results
    results = []
    for symbol, gene_id in symbol_to_id.items():
        if gene_id == 'NOT_FOUND':
            results.append({
                'query_symbol': symbol,
                'status': 'not_found'
            })
        elif gene_id == 'ERROR':
            results.append({
                'query_symbol': symbol,
                'status': 'error'
            })
        elif gene_id in summaries:
            gene = summaries[gene_id]
            results.append({
                'query_symbol': symbol,
                'gene_id': gene_id,
                'symbol': gene.get('name', 'N/A'),
                'description': gene.get('description', 'N/A'),
                'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
                'chromosome': gene.get('chromosome', 'N/A'),
                'map_location': gene.get('maplocation', 'N/A'),
                'type': gene.get('geneticsource', 'N/A')
            })
        else:
            # The ID resolved but no summary came back (e.g. the summary
            # request failed); report it rather than dropping the symbol.
            results.append({
                'query_symbol': symbol,
                'gene_id': gene_id,
                'status': 'error'
            })

    return results
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point.

    Parses the arguments, looks genes up either by Gene ID (--ids) or by
    symbol read from a file (--file, which requires --organism), and emits
    the results as JSON on stdout or into the file given by --output.
    When both --ids and --file are given, --ids takes precedence.

    Exits with status 2 on invalid arguments (via argparse) and status 1
    when the output file cannot be written.
    """
    parser = argparse.ArgumentParser(
        description='Batch gene lookup using NCBI APIs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Lookup by gene IDs
  %(prog)s --ids 672,7157,5594

  # Lookup by symbols from a file
  %(prog)s --file genes.txt --organism human

  # Lookup with API key and save to file
  %(prog)s --ids 672,7157,5594 --api-key YOUR_KEY --output results.json
"""
    )

    parser.add_argument('--ids', '-i', help='Comma-separated Gene IDs')
    parser.add_argument('--file', '-f', help='File containing gene symbols (one per line)')
    parser.add_argument('--organism', '-o', help='Organism name (required with --file)')
    parser.add_argument('--output', '-O', help='Output file path (JSON format)')
    parser.add_argument('--api-key', '-k', help='NCBI API key')
    parser.add_argument('--pretty', '-p', action='store_true',
                        help='Pretty-print JSON output')

    args = parser.parse_args()

    if not args.ids and not args.file:
        parser.error("Either --ids or --file must be provided")

    if args.file and not args.organism:
        parser.error("--organism is required when using --file")

    # Process genes
    if args.ids:
        # Drop empty tokens left by stray commas ("672,,7157", "672,") so
        # that blank IDs are never sent to NCBI.
        gene_ids = [token.strip() for token in args.ids.split(',') if token.strip()]
        if not gene_ids:
            parser.error("--ids does not contain any Gene IDs")
        if args.file:
            print("Warning: --file is ignored because --ids was provided", file=sys.stderr)
        results = batch_lookup_by_ids(gene_ids, api_key=args.api_key)
    else:
        gene_symbols = read_gene_list(args.file)
        results = batch_lookup_by_symbols(gene_symbols, args.organism, api_key=args.api_key)

    # Output results
    indent = 2 if args.pretty else None
    json_output = json.dumps(results, indent=indent)

    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(json_output)
        except OSError as e:
            print(f"Error writing output file: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"Results written to {args.output}", file=sys.stderr)
    else:
        print(json_output)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user