Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""
Fetch gene data from NCBI using the Datasets API.
This script provides access to the NCBI Datasets API for retrieving
comprehensive gene information including metadata and sequences.
"""
import argparse
import json
import sys
import urllib.error
import urllib.parse
import urllib.request
from typing import Optional, Dict, Any, List
# Base URL of the NCBI Datasets REST API (v2alpha) gene endpoints; the fetch
# functions below append their request paths to it.
DATASETS_API_BASE = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/gene"
# Lower-cased common and scientific organism names mapped to NCBI taxon IDs.
# Built once at import time instead of on every get_taxon_id() call.
_COMMON_TAXA = {
    'human': '9606',
    'homo sapiens': '9606',
    'mouse': '10090',
    'mus musculus': '10090',
    'rat': '10116',
    'rattus norvegicus': '10116',
    'zebrafish': '7955',
    'danio rerio': '7955',
    'fruit fly': '7227',
    'drosophila melanogaster': '7227',
    'c. elegans': '6239',
    'caenorhabditis elegans': '6239',
    'yeast': '4932',
    'saccharomyces cerevisiae': '4932',
    'arabidopsis': '3702',
    'arabidopsis thaliana': '3702',
    'e. coli': '562',
    'escherichia coli': '562',
}


def get_taxon_id(taxon_name: str) -> Optional[str]:
    """
    Convert taxon name to NCBI taxon ID.

    Only the built-in table of common organisms is consulted; no network
    lookup is performed. Matching ignores case, surrounding whitespace and
    the amount/kind of whitespace between words.

    Args:
        taxon_name: Common or scientific name (e.g., "human", "Homo sapiens")

    Returns:
        Taxon ID as string, or None if not found (including empty input)
    """
    if not taxon_name:
        return None
    # split()/join collapses tabs and repeated spaces so "Homo  sapiens"
    # resolves the same way as "homo sapiens".
    normalized = ' '.join(taxon_name.split()).lower()
    return _COMMON_TAXA.get(normalized)
def fetch_gene_by_id(gene_id: str, api_key: Optional[str] = None,
                     timeout: float = 30.0) -> Dict[str, Any]:
    """
    Fetch gene data by Gene ID.

    Args:
        gene_id: NCBI Gene ID
        api_key: Optional NCBI API key, sent in the 'api-key' request header
        timeout: Seconds to wait on the network before giving up

    Returns:
        Parsed JSON response as a dictionary, or an empty dict on any
        failure (the error is reported on stderr instead of being raised)
    """
    # Percent-encode the ID so stray characters (spaces, '/', '?') cannot
    # change the request path or make the URL invalid.
    encoded_id = urllib.parse.quote(str(gene_id), safe='')
    url = f"{DATASETS_API_BASE}/id/{encoded_id}"
    headers = {}
    if api_key:
        headers['api-key'] = api_key
    try:
        req = urllib.request.Request(url, headers=headers)
        # Without a timeout urlopen can block indefinitely on a stalled socket.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        if e.code == 404:
            print(f"Gene ID {gene_id} not found", file=sys.stderr)
        return {}
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on {} meaning
        # "nothing fetched", so network/decoding errors are reported, not raised.
        print(f"Error: {e}", file=sys.stderr)
        return {}
def fetch_gene_by_symbol(symbol: str, taxon: str, api_key: Optional[str] = None,
                         timeout: float = 30.0) -> Dict[str, Any]:
    """
    Fetch gene data by gene symbol and taxon.

    Args:
        symbol: Gene symbol (e.g., "BRCA1")
        taxon: Organism name or taxon ID
        api_key: Optional NCBI API key, sent in the 'api-key' request header
        timeout: Seconds to wait on the network before giving up

    Returns:
        Parsed JSON response as a dictionary, or an empty dict on any
        failure (the error is reported on stderr instead of being raised)
    """
    # Known names are translated to taxon IDs; anything else is passed
    # through unchanged (it might already be a taxon ID).
    taxon_id = get_taxon_id(taxon) or taxon
    # Percent-encode both path segments: an unrecognised name such as
    # "Canis lupus" contains a space, which would otherwise yield an invalid URL.
    encoded_symbol = urllib.parse.quote(str(symbol), safe='')
    encoded_taxon = urllib.parse.quote(str(taxon_id), safe='')
    url = f"{DATASETS_API_BASE}/symbol/{encoded_symbol}/taxon/{encoded_taxon}"
    headers = {}
    if api_key:
        headers['api-key'] = api_key
    try:
        req = urllib.request.Request(url, headers=headers)
        # Without a timeout urlopen can block indefinitely on a stalled socket.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        if e.code == 404:
            print(f"Gene symbol '{symbol}' not found for taxon {taxon}", file=sys.stderr)
        return {}
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on {} meaning
        # "nothing fetched", so network/decoding errors are reported, not raised.
        print(f"Error: {e}", file=sys.stderr)
        return {}
def fetch_multiple_genes(gene_ids: List[str], api_key: Optional[str] = None,
                         timeout: float = 30.0) -> Dict[str, Any]:
    """
    Fetch data for multiple genes by ID.

    The IDs are sent in a single POST request as the JSON body
    {"gene_ids": [...]}.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key, sent in the 'api-key' request header
        timeout: Seconds to wait on the network before giving up

    Returns:
        Parsed JSON response as a dictionary, or an empty dict when no IDs
        were given or on any failure (errors are reported on stderr)
    """
    ids = list(gene_ids)
    if not ids:
        # Nothing to ask for: skip the network round trip entirely.
        print("Error: no gene IDs provided", file=sys.stderr)
        return {}
    # For multiple genes, use POST request
    url = f"{DATASETS_API_BASE}/id"
    data = json.dumps({"gene_ids": ids}).encode('utf-8')
    headers = {'Content-Type': 'application/json'}
    if api_key:
        headers['api-key'] = api_key
    try:
        req = urllib.request.Request(url, data=data, headers=headers, method='POST')
        # Without a timeout urlopen can block indefinitely on a stalled socket.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return {}
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on {} meaning
        # "nothing fetched", so network/decoding errors are reported, not raised.
        print(f"Error: {e}", file=sys.stderr)
        return {}
def display_gene_info(data: Dict[str, Any], verbose: bool = False) -> None:
    """
    Display gene information in human-readable format.

    Prints one record per entry of data['genes'], followed by a blank line.
    Fields that are absent from a record are skipped or shown as 'N/A'.

    NOTE(review): the record layout read here ('genes', 'tax_name',
    'genomic_ranges', 'transcripts') should be confirmed against the
    response schema of the API version in use.

    Args:
        data: Gene data dictionary from API
        verbose: Show detailed information (type, genomic locations and
            up to five transcript accessions)
    """
    if 'genes' not in data:
        print("No gene data found in response")
        return
    # `or []` tolerates an explicit null under 'genes'.
    for gene in data['genes'] or []:
        gene_info = gene.get('gene', {})
        print(f"Gene ID: {gene_info.get('gene_id', 'N/A')}")
        print(f"Symbol: {gene_info.get('symbol', 'N/A')}")
        print(f"Description: {gene_info.get('description', 'N/A')}")
        if 'tax_name' in gene_info:
            print(f"Organism: {gene_info['tax_name']}")
        if 'chromosomes' in gene_info:
            chromosomes = ', '.join(gene_info['chromosomes'])
            print(f"Chromosome(s): {chromosomes}")
        # Nomenclature
        if 'nomenclature_authority' in gene_info:
            auth = gene_info['nomenclature_authority']
            print(f"Nomenclature: {auth.get('authority', 'N/A')}")
        # Synonyms
        if 'synonyms' in gene_info and gene_info['synonyms']:
            print(f"Synonyms: {', '.join(gene_info['synonyms'])}")
        if verbose:
            # Gene type
            if 'type' in gene_info:
                print(f"Type: {gene_info['type']}")
            # Genomic locations
            if 'genomic_ranges' in gene_info:
                print("\nGenomic Locations:")
                for range_info in gene_info['genomic_ranges']:
                    accession = range_info.get('accession_version', 'N/A')
                    # `or [{}]` covers a missing key, a null and an empty
                    # list alike, so indexing [0] can never raise.
                    first_range = (range_info.get('range') or [{}])[0]
                    start = first_range.get('begin', 'N/A')
                    end = first_range.get('end', 'N/A')
                    strand = range_info.get('orientation', 'N/A')
                    print(f" {accession}: {start}-{end} ({strand})")
            # Transcripts
            if 'transcripts' in gene_info:
                print(f"\nTranscripts: {len(gene_info['transcripts'])}")
                for transcript in gene_info['transcripts'][:5]:  # Show first 5
                    print(f" {transcript.get('accession_version', 'N/A')}")
        # Blank line separates consecutive gene records.
        print()
def main():
    """
    Command-line entry point: parse arguments, fetch gene data and print it.

    --gene-id takes precedence over --symbol when both are given. A single
    Gene ID uses fetch_gene_by_id; several comma-separated IDs use
    fetch_multiple_genes. Exits with status 1 when the fetch returned no
    data; argument errors are reported through parser.error().
    """
    parser = argparse.ArgumentParser(
        description='Fetch gene data from NCBI Datasets API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Fetch by Gene ID
%(prog)s --gene-id 672
# Fetch by gene symbol and organism
%(prog)s --symbol BRCA1 --taxon human
# Fetch multiple genes
%(prog)s --gene-id 672,7157,5594
# Get JSON output
%(prog)s --symbol TP53 --taxon "Homo sapiens" --output json
# Verbose output with details
%(prog)s --gene-id 672 --verbose
"""
    )
    parser.add_argument('--gene-id', '-g', help='Gene ID(s), comma-separated')
    parser.add_argument('--symbol', '-s', help='Gene symbol')
    parser.add_argument('--taxon', '-t', help='Organism name or taxon ID (required with --symbol)')
    parser.add_argument('--output', '-o', choices=['pretty', 'json'], default='pretty',
                        help='Output format (default: pretty)')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed information')
    parser.add_argument('--api-key', '-k', help='NCBI API key')
    args = parser.parse_args()
    if not args.gene_id and not args.symbol:
        parser.error("Either --gene-id or --symbol must be provided")
    if args.symbol and not args.taxon:
        parser.error("--taxon is required when using --symbol")
    # Fetch data
    if args.gene_id:
        # Drop empty fragments so input like "672," or "672,,7157" does not
        # send blank Gene IDs to the API.
        fragments = (part.strip() for part in args.gene_id.split(','))
        gene_ids = [gene_id for gene_id in fragments if gene_id]
        if not gene_ids:
            parser.error("--gene-id must contain at least one non-empty Gene ID")
        if len(gene_ids) == 1:
            data = fetch_gene_by_id(gene_ids[0], api_key=args.api_key)
        else:
            data = fetch_multiple_genes(gene_ids, api_key=args.api_key)
    else:
        data = fetch_gene_by_symbol(args.symbol, args.taxon, api_key=args.api_key)
    if not data:
        # The fetch helpers have already reported the failure on stderr.
        sys.exit(1)
    # Output
    if args.output == 'json':
        print(json.dumps(data, indent=2))
    else:
        display_gene_info(data, verbose=args.verbose)


if __name__ == '__main__':
    main()