Initial commit
This commit is contained in:
277
skills/gene-database/scripts/fetch_gene_data.py
Normal file
277
skills/gene-database/scripts/fetch_gene_data.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fetch gene data from NCBI using the Datasets API.
|
||||
|
||||
This script provides access to the NCBI Datasets API for retrieving
|
||||
comprehensive gene information including metadata and sequences.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
|
||||
DATASETS_API_BASE = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/gene"
|
||||
|
||||
|
||||
def get_taxon_id(taxon_name: str) -> Optional[str]:
|
||||
"""
|
||||
Convert taxon name to NCBI taxon ID.
|
||||
|
||||
Args:
|
||||
taxon_name: Common or scientific name (e.g., "human", "Homo sapiens")
|
||||
|
||||
Returns:
|
||||
Taxon ID as string, or None if not found
|
||||
"""
|
||||
# Common mappings
|
||||
common_taxa = {
|
||||
'human': '9606',
|
||||
'homo sapiens': '9606',
|
||||
'mouse': '10090',
|
||||
'mus musculus': '10090',
|
||||
'rat': '10116',
|
||||
'rattus norvegicus': '10116',
|
||||
'zebrafish': '7955',
|
||||
'danio rerio': '7955',
|
||||
'fruit fly': '7227',
|
||||
'drosophila melanogaster': '7227',
|
||||
'c. elegans': '6239',
|
||||
'caenorhabditis elegans': '6239',
|
||||
'yeast': '4932',
|
||||
'saccharomyces cerevisiae': '4932',
|
||||
'arabidopsis': '3702',
|
||||
'arabidopsis thaliana': '3702',
|
||||
'e. coli': '562',
|
||||
'escherichia coli': '562',
|
||||
}
|
||||
|
||||
taxon_lower = taxon_name.lower().strip()
|
||||
return common_taxa.get(taxon_lower)
|
||||
|
||||
|
||||
def fetch_gene_by_id(gene_id: str, api_key: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch gene data by Gene ID.
|
||||
|
||||
Args:
|
||||
gene_id: NCBI Gene ID
|
||||
api_key: Optional NCBI API key
|
||||
|
||||
Returns:
|
||||
Gene data as dictionary
|
||||
"""
|
||||
url = f"{DATASETS_API_BASE}/id/{gene_id}"
|
||||
|
||||
headers = {}
|
||||
if api_key:
|
||||
headers['api-key'] = api_key
|
||||
|
||||
try:
|
||||
req = urllib.request.Request(url, headers=headers)
|
||||
with urllib.request.urlopen(req) as response:
|
||||
return json.loads(response.read().decode())
|
||||
except urllib.error.HTTPError as e:
|
||||
print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
|
||||
if e.code == 404:
|
||||
print(f"Gene ID {gene_id} not found", file=sys.stderr)
|
||||
return {}
|
||||
except Exception as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return {}
|
||||
|
||||
|
||||
def fetch_gene_by_symbol(symbol: str, taxon: str, api_key: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch gene data by gene symbol and taxon.
|
||||
|
||||
Args:
|
||||
symbol: Gene symbol (e.g., "BRCA1")
|
||||
taxon: Organism name or taxon ID
|
||||
api_key: Optional NCBI API key
|
||||
|
||||
Returns:
|
||||
Gene data as dictionary
|
||||
"""
|
||||
# Convert taxon name to ID if needed
|
||||
taxon_id = get_taxon_id(taxon)
|
||||
if not taxon_id:
|
||||
# Try to use as-is (might already be a taxon ID)
|
||||
taxon_id = taxon
|
||||
|
||||
url = f"{DATASETS_API_BASE}/symbol/{symbol}/taxon/{taxon_id}"
|
||||
|
||||
headers = {}
|
||||
if api_key:
|
||||
headers['api-key'] = api_key
|
||||
|
||||
try:
|
||||
req = urllib.request.Request(url, headers=headers)
|
||||
with urllib.request.urlopen(req) as response:
|
||||
return json.loads(response.read().decode())
|
||||
except urllib.error.HTTPError as e:
|
||||
print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
|
||||
if e.code == 404:
|
||||
print(f"Gene symbol '{symbol}' not found for taxon {taxon}", file=sys.stderr)
|
||||
return {}
|
||||
except Exception as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return {}
|
||||
|
||||
|
||||
def fetch_multiple_genes(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch data for multiple genes by ID.
|
||||
|
||||
Args:
|
||||
gene_ids: List of Gene IDs
|
||||
api_key: Optional NCBI API key
|
||||
|
||||
Returns:
|
||||
Combined gene data as dictionary
|
||||
"""
|
||||
# For multiple genes, use POST request
|
||||
url = f"{DATASETS_API_BASE}/id"
|
||||
|
||||
data = json.dumps({"gene_ids": gene_ids}).encode('utf-8')
|
||||
headers = {'Content-Type': 'application/json'}
|
||||
|
||||
if api_key:
|
||||
headers['api-key'] = api_key
|
||||
|
||||
try:
|
||||
req = urllib.request.Request(url, data=data, headers=headers, method='POST')
|
||||
with urllib.request.urlopen(req) as response:
|
||||
return json.loads(response.read().decode())
|
||||
except urllib.error.HTTPError as e:
|
||||
print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
|
||||
return {}
|
||||
except Exception as e:
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return {}
|
||||
|
||||
|
||||
def display_gene_info(data: Dict[str, Any], verbose: bool = False) -> None:
|
||||
"""
|
||||
Display gene information in human-readable format.
|
||||
|
||||
Args:
|
||||
data: Gene data dictionary from API
|
||||
verbose: Show detailed information
|
||||
"""
|
||||
if 'genes' not in data:
|
||||
print("No gene data found in response")
|
||||
return
|
||||
|
||||
for gene in data['genes']:
|
||||
gene_info = gene.get('gene', {})
|
||||
|
||||
print(f"Gene ID: {gene_info.get('gene_id', 'N/A')}")
|
||||
print(f"Symbol: {gene_info.get('symbol', 'N/A')}")
|
||||
print(f"Description: {gene_info.get('description', 'N/A')}")
|
||||
|
||||
if 'tax_name' in gene_info:
|
||||
print(f"Organism: {gene_info['tax_name']}")
|
||||
|
||||
if 'chromosomes' in gene_info:
|
||||
chromosomes = ', '.join(gene_info['chromosomes'])
|
||||
print(f"Chromosome(s): {chromosomes}")
|
||||
|
||||
# Nomenclature
|
||||
if 'nomenclature_authority' in gene_info:
|
||||
auth = gene_info['nomenclature_authority']
|
||||
print(f"Nomenclature: {auth.get('authority', 'N/A')}")
|
||||
|
||||
# Synonyms
|
||||
if 'synonyms' in gene_info and gene_info['synonyms']:
|
||||
print(f"Synonyms: {', '.join(gene_info['synonyms'])}")
|
||||
|
||||
if verbose:
|
||||
# Gene type
|
||||
if 'type' in gene_info:
|
||||
print(f"Type: {gene_info['type']}")
|
||||
|
||||
# Genomic locations
|
||||
if 'genomic_ranges' in gene_info:
|
||||
print("\nGenomic Locations:")
|
||||
for range_info in gene_info['genomic_ranges']:
|
||||
accession = range_info.get('accession_version', 'N/A')
|
||||
start = range_info.get('range', [{}])[0].get('begin', 'N/A')
|
||||
end = range_info.get('range', [{}])[0].get('end', 'N/A')
|
||||
strand = range_info.get('orientation', 'N/A')
|
||||
print(f" {accession}: {start}-{end} ({strand})")
|
||||
|
||||
# Transcripts
|
||||
if 'transcripts' in gene_info:
|
||||
print(f"\nTranscripts: {len(gene_info['transcripts'])}")
|
||||
for transcript in gene_info['transcripts'][:5]: # Show first 5
|
||||
print(f" {transcript.get('accession_version', 'N/A')}")
|
||||
|
||||
print()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Fetch gene data from NCBI Datasets API',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Fetch by Gene ID
|
||||
%(prog)s --gene-id 672
|
||||
|
||||
# Fetch by gene symbol and organism
|
||||
%(prog)s --symbol BRCA1 --taxon human
|
||||
|
||||
# Fetch multiple genes
|
||||
%(prog)s --gene-id 672,7157,5594
|
||||
|
||||
# Get JSON output
|
||||
%(prog)s --symbol TP53 --taxon "Homo sapiens" --output json
|
||||
|
||||
# Verbose output with details
|
||||
%(prog)s --gene-id 672 --verbose
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument('--gene-id', '-g', help='Gene ID(s), comma-separated')
|
||||
parser.add_argument('--symbol', '-s', help='Gene symbol')
|
||||
parser.add_argument('--taxon', '-t', help='Organism name or taxon ID (required with --symbol)')
|
||||
parser.add_argument('--output', '-o', choices=['pretty', 'json'], default='pretty',
|
||||
help='Output format (default: pretty)')
|
||||
parser.add_argument('--verbose', '-v', action='store_true',
|
||||
help='Show detailed information')
|
||||
parser.add_argument('--api-key', '-k', help='NCBI API key')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.gene_id and not args.symbol:
|
||||
parser.error("Either --gene-id or --symbol must be provided")
|
||||
|
||||
if args.symbol and not args.taxon:
|
||||
parser.error("--taxon is required when using --symbol")
|
||||
|
||||
# Fetch data
|
||||
if args.gene_id:
|
||||
gene_ids = [id.strip() for id in args.gene_id.split(',')]
|
||||
if len(gene_ids) == 1:
|
||||
data = fetch_gene_by_id(gene_ids[0], api_key=args.api_key)
|
||||
else:
|
||||
data = fetch_multiple_genes(gene_ids, api_key=args.api_key)
|
||||
else:
|
||||
data = fetch_gene_by_symbol(args.symbol, args.taxon, api_key=args.api_key)
|
||||
|
||||
if not data:
|
||||
sys.exit(1)
|
||||
|
||||
# Output
|
||||
if args.output == 'json':
|
||||
print(json.dumps(data, indent=2))
|
||||
else:
|
||||
display_gene_info(data, verbose=args.verbose)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user