#!/usr/bin/env python3
"""
Fetch gene data from NCBI using the Datasets API.

This script provides access to the NCBI Datasets API for retrieving
comprehensive gene information including metadata and sequences.
"""

import argparse
import json
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import Optional, Dict, Any, List


DATASETS_API_BASE = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/gene"

# Upper bound (seconds) on a single HTTP request so the CLI cannot hang
# forever on an unresponsive connection.
_REQUEST_TIMEOUT_SECONDS = 30

# Lower-cased common/scientific organism names mapped to NCBI taxon IDs.
_COMMON_TAXA = {
    'human': '9606',
    'homo sapiens': '9606',
    'mouse': '10090',
    'mus musculus': '10090',
    'rat': '10116',
    'rattus norvegicus': '10116',
    'zebrafish': '7955',
    'danio rerio': '7955',
    'fruit fly': '7227',
    'drosophila melanogaster': '7227',
    'c. elegans': '6239',
    'caenorhabditis elegans': '6239',
    'yeast': '4932',
    'saccharomyces cerevisiae': '4932',
    'arabidopsis': '3702',
    'arabidopsis thaliana': '3702',
    'e. coli': '562',
    'escherichia coli': '562',
}


def get_taxon_id(taxon_name: str) -> Optional[str]:
    """
    Convert taxon name to NCBI taxon ID.

    The lookup is case-insensitive and ignores surrounding whitespace. Only
    the small built-in table of common organisms is consulted; no network
    request is made.

    Args:
        taxon_name: Common or scientific name (e.g., "human", "Homo sapiens")

    Returns:
        Taxon ID as string, or None if not found
    """
    return _COMMON_TAXA.get(taxon_name.lower().strip())


def _quote_path_segment(value: Any) -> str:
    """Percent-encode a value for use as a single URL path segment."""
    # ',' stays literal so a comma-separated list passed as one segment is
    # sent exactly as before; everything else unsafe (spaces, '/', '?', '#')
    # is escaped so it cannot alter the request path.
    return urllib.parse.quote(str(value), safe=',')


def _request_json(url: str,
                  api_key: Optional[str] = None,
                  payload: Optional[Dict[str, Any]] = None,
                  not_found_message: Optional[str] = None) -> Dict[str, Any]:
    """
    Perform one API request and decode the JSON response.

    Args:
        url: Fully built request URL
        api_key: Optional NCBI API key, sent in the 'api-key' header
        payload: If given, it is JSON-encoded and sent as a POST body;
            otherwise a GET request is issued
        not_found_message: Extra line printed to stderr on HTTP 404

    Returns:
        Decoded JSON response, or an empty dict if the request failed
        (the failure is reported on stderr)
    """
    headers = {}
    body = None
    method = 'GET'
    if payload is not None:
        body = json.dumps(payload).encode('utf-8')
        headers['Content-Type'] = 'application/json'
        method = 'POST'
    if api_key:
        headers['api-key'] = api_key

    try:
        req = urllib.request.Request(url, data=body, headers=headers, method=method)
        with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECONDS) as response:
            return json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        if e.code == 404 and not_found_message:
            print(not_found_message, file=sys.stderr)
        return {}
    except Exception as e:
        # Deliberate best-effort boundary: network failures, timeouts and
        # malformed JSON are reported and turned into an empty result so
        # callers only need to check for a falsy return value.
        print(f"Error: {e}", file=sys.stderr)
        return {}


def fetch_gene_by_id(gene_id: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch gene data by Gene ID.

    Args:
        gene_id: NCBI Gene ID
        api_key: Optional NCBI API key

    Returns:
        Gene data as dictionary; empty dict on any error
    """
    url = f"{DATASETS_API_BASE}/id/{_quote_path_segment(gene_id)}"
    return _request_json(
        url,
        api_key=api_key,
        not_found_message=f"Gene ID {gene_id} not found",
    )


def fetch_gene_by_symbol(symbol: str, taxon: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch gene data by gene symbol and taxon.

    Args:
        symbol: Gene symbol (e.g., "BRCA1")
        taxon: Organism name or taxon ID
        api_key: Optional NCBI API key

    Returns:
        Gene data as dictionary; empty dict on any error
    """
    # Known names are mapped to their taxon ID; anything else is passed
    # through unchanged (it might already be a taxon ID).
    taxon_id = get_taxon_id(taxon) or taxon

    url = (
        f"{DATASETS_API_BASE}/symbol/{_quote_path_segment(symbol)}"
        f"/taxon/{_quote_path_segment(taxon_id)}"
    )
    return _request_json(
        url,
        api_key=api_key,
        not_found_message=f"Gene symbol '{symbol}' not found for taxon {taxon}",
    )


def fetch_multiple_genes(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch data for multiple genes by ID.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key

    Returns:
        Combined gene data as dictionary; empty dict on any error or when
        gene_ids is empty
    """
    if not gene_ids:
        # Nothing to ask for; skip the network round trip.
        print("Error: no gene IDs provided", file=sys.stderr)
        return {}

    # For multiple genes, use POST request.
    # NOTE(review): endpoint path and payload shape are kept exactly as
    # originally written -- confirm against the current Datasets API reference.
    url = f"{DATASETS_API_BASE}/id"
    return _request_json(url, api_key=api_key, payload={"gene_ids": gene_ids})


def display_gene_info(data: Dict[str, Any], verbose: bool = False) -> None:
    """
    Display gene information in human-readable format.

    Args:
        data: Gene data dictionary from API; gene records are read from its
            'genes' list, each record holding its details under 'gene'
        verbose: Show detailed information (type, genomic locations and up
            to five transcripts)
    """
    # NOTE(review): only the 'genes' key is recognised here -- confirm it
    # matches the response schema of the API version in DATASETS_API_BASE.
    if 'genes' not in data:
        print("No gene data found in response")
        return

    for gene in data['genes']:
        # A present-but-null 'gene' entry is treated like a missing one.
        gene_info = gene.get('gene') or {}

        print(f"Gene ID: {gene_info.get('gene_id', 'N/A')}")
        print(f"Symbol: {gene_info.get('symbol', 'N/A')}")
        print(f"Description: {gene_info.get('description', 'N/A')}")

        if 'tax_name' in gene_info:
            print(f"Organism: {gene_info['tax_name']}")

        if 'chromosomes' in gene_info:
            chromosomes = ', '.join(gene_info['chromosomes'])
            print(f"Chromosome(s): {chromosomes}")

        # Nomenclature
        if 'nomenclature_authority' in gene_info:
            auth = gene_info['nomenclature_authority'] or {}
            print(f"Nomenclature: {auth.get('authority', 'N/A')}")

        # Synonyms
        if gene_info.get('synonyms'):
            print(f"Synonyms: {', '.join(gene_info['synonyms'])}")

        if verbose:
            # Gene type
            if 'type' in gene_info:
                print(f"Type: {gene_info['type']}")

            # Genomic locations
            if 'genomic_ranges' in gene_info:
                print("\nGenomic Locations:")
                for range_info in gene_info['genomic_ranges']:
                    accession = range_info.get('accession_version', 'N/A')
                    # Fall back to an empty span when 'range' is missing,
                    # null or an empty list, instead of raising IndexError.
                    first_span = (range_info.get('range') or [{}])[0]
                    start = first_span.get('begin', 'N/A')
                    end = first_span.get('end', 'N/A')
                    strand = range_info.get('orientation', 'N/A')
                    print(f" {accession}: {start}-{end} ({strand})")

            # Transcripts
            if 'transcripts' in gene_info:
                print(f"\nTranscripts: {len(gene_info['transcripts'])}")
                for transcript in gene_info['transcripts'][:5]:  # Show first 5
                    print(f" {transcript.get('accession_version', 'N/A')}")

        # Blank line between gene records.
        print()


def main() -> None:
    """
    Command-line entry point.

    Parses the arguments, fetches gene data by ID(s) or by symbol and taxon,
    and prints it as formatted text or JSON. Exits with status 1 when the
    fetch produced no data.
    """
    parser = argparse.ArgumentParser(
        description='Fetch gene data from NCBI Datasets API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Fetch by Gene ID
  %(prog)s --gene-id 672

  # Fetch by gene symbol and organism
  %(prog)s --symbol BRCA1 --taxon human

  # Fetch multiple genes
  %(prog)s --gene-id 672,7157,5594

  # Get JSON output
  %(prog)s --symbol TP53 --taxon "Homo sapiens" --output json

  # Verbose output with details
  %(prog)s --gene-id 672 --verbose
        """
    )

    parser.add_argument('--gene-id', '-g', help='Gene ID(s), comma-separated')
    parser.add_argument('--symbol', '-s', help='Gene symbol')
    parser.add_argument('--taxon', '-t', help='Organism name or taxon ID (required with --symbol)')
    parser.add_argument('--output', '-o', choices=['pretty', 'json'], default='pretty',
                        help='Output format (default: pretty)')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed information')
    parser.add_argument('--api-key', '-k', help='NCBI API key')

    args = parser.parse_args()

    if not args.gene_id and not args.symbol:
        parser.error("Either --gene-id or --symbol must be provided")

    if args.symbol and not args.taxon:
        parser.error("--taxon is required when using --symbol")

    # Fetch data (--gene-id takes precedence when both selectors are given)
    if args.gene_id:
        # Drop empty tokens produced by stray commas such as "672,,7157,".
        gene_ids = [token.strip() for token in args.gene_id.split(',') if token.strip()]
        if not gene_ids:
            parser.error("--gene-id must contain at least one Gene ID")
        if len(gene_ids) == 1:
            data = fetch_gene_by_id(gene_ids[0], api_key=args.api_key)
        else:
            data = fetch_multiple_genes(gene_ids, api_key=args.api_key)
    else:
        data = fetch_gene_by_symbol(args.symbol, args.taxon, api_key=args.api_key)

    if not data:
        sys.exit(1)

    # Output
    if args.output == 'json':
        print(json.dumps(data, indent=2))
    else:
        display_gene_info(data, verbose=args.verbose)


if __name__ == '__main__':
    main()