Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,298 @@
#!/usr/bin/env python3
"""
Batch gene lookup using NCBI APIs.
This script efficiently processes multiple gene queries with proper
rate limiting and error handling.
"""
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request
from typing import Optional, List, Dict, Any
def read_gene_list(filepath: str) -> List[str]:
    """
    Load gene identifiers from a text file that holds one identifier per line.

    Surrounding whitespace is removed from every line, and lines that are
    empty after stripping are skipped.

    Args:
        filepath: Path to file containing gene symbols or IDs

    Returns:
        List of gene identifiers, in file order

    Note:
        On a missing or unreadable file an error is printed to stderr and
        the process exits with status 1.
    """
    try:
        with open(filepath, 'r') as handle:
            stripped = (raw.strip() for raw in handle)
            # Consume the generator while the file is still open
            return [entry for entry in stripped if entry]
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)
def batch_esearch(queries: List[str], organism: Optional[str] = None,
                  api_key: Optional[str] = None,
                  timeout: float = 30.0) -> Dict[str, str]:
    """
    Search for multiple gene symbols and return their IDs.

    One ESearch request is issued per distinct symbol; only the first hit
    (retmax=1) is kept. A pause follows every request so the caller stays
    within the request rate chosen below.

    Args:
        queries: List of gene symbols
        organism: Optional organism filter
        api_key: Optional NCBI API key
        timeout: Seconds to wait for each HTTP request before giving up

    Returns:
        Dictionary mapping gene symbol to Gene ID, 'NOT_FOUND' when the
        search returned no hits, or 'ERROR' when the request failed or the
        response had an unexpected shape
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    results: Dict[str, str] = {}

    # Rate limiting
    delay = 0.1 if api_key else 0.34  # 10 req/sec with key, 3 req/sec without

    for query in queries:
        # A repeated symbol would yield the same dictionary key, so skip the
        # redundant network round-trip.
        if query in results:
            continue

        # Build search term
        search_term = f"{query}[gene]"
        if organism:
            search_term += f" AND {organism}[organism]"

        params = {
            'db': 'gene',
            'term': search_term,
            'retmax': 1,
            'retmode': 'json'
        }
        if api_key:
            params['api_key'] = api_key

        url = f"{base_url}esearch.fcgi?{urllib.parse.urlencode(params)}"

        try:
            # Without a timeout a single stalled connection blocks the batch forever
            with urllib.request.urlopen(url, timeout=timeout) as response:
                data = json.loads(response.read().decode())

            if 'esearchresult' in data and 'idlist' in data['esearchresult']:
                id_list = data['esearchresult']['idlist']
                results[query] = id_list[0] if id_list else 'NOT_FOUND'
            else:
                results[query] = 'ERROR'
        except Exception as e:
            # Best effort: one failed symbol must not abort the whole batch
            print(f"Error searching for {query}: {e}", file=sys.stderr)
            results[query] = 'ERROR'

        time.sleep(delay)

    return results
def batch_esummary(gene_ids: List[str], api_key: Optional[str] = None,
                   chunk_size: int = 200,
                   timeout: float = 30.0) -> Dict[str, Dict[str, Any]]:
    """
    Get summaries for multiple genes in batches.

    The IDs are split into chunks and one ESummary request is sent per
    chunk. A chunk whose request fails is reported on stderr and skipped,
    so the returned dictionary may lack some of the requested IDs.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key
        chunk_size: Number of IDs per request (max 500; the upper bound is
            not enforced here)
        timeout: Seconds to wait for each HTTP request before giving up

    Returns:
        Dictionary mapping Gene ID to summary data

    Raises:
        ValueError: If chunk_size is smaller than 1
    """
    # A negative step would make range() empty and silently return nothing;
    # zero would raise an opaque range() error. Reject both up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    all_results: Dict[str, Dict[str, Any]] = {}

    # Rate limiting
    delay = 0.1 if api_key else 0.34

    # Process in chunks
    for i in range(0, len(gene_ids), chunk_size):
        chunk = gene_ids[i:i + chunk_size]

        params = {
            'db': 'gene',
            'id': ','.join(chunk),
            'retmode': 'json'
        }
        if api_key:
            params['api_key'] = api_key

        url = f"{base_url}esummary.fcgi?{urllib.parse.urlencode(params)}"

        try:
            # Without a timeout a single stalled connection blocks the batch forever
            with urllib.request.urlopen(url, timeout=timeout) as response:
                data = json.loads(response.read().decode())

            if 'result' in data:
                for gene_id in chunk:
                    if gene_id in data['result']:
                        all_results[gene_id] = data['result'][gene_id]
        except Exception as e:
            # Best effort: keep going with the remaining chunks
            print(f"Error fetching summaries for chunk: {e}", file=sys.stderr)

        time.sleep(delay)

    return all_results
def batch_lookup_by_ids(gene_ids: List[str], api_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Resolve Gene IDs to flat, JSON-friendly records.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key

    Returns:
        One dictionary per requested ID, in input order. IDs for which no
        summary came back get a record holding only 'gene_id' and 'error'.
    """
    summaries = batch_esummary(gene_ids, api_key=api_key)

    def describe(gene_id: str) -> Dict[str, Any]:
        # Turn one summary (or its absence) into an output record
        if gene_id not in summaries:
            return {
                'gene_id': gene_id,
                'error': 'Not found or error fetching'
            }
        record = summaries[gene_id]
        organism_info = record.get('organism', {})
        return {
            'gene_id': gene_id,
            'symbol': record.get('name', 'N/A'),
            'description': record.get('description', 'N/A'),
            'organism': organism_info.get('scientificname', 'N/A'),
            'chromosome': record.get('chromosome', 'N/A'),
            'map_location': record.get('maplocation', 'N/A'),
            'type': record.get('geneticsource', 'N/A')
        }

    return [describe(gene_id) for gene_id in gene_ids]
def batch_lookup_by_symbols(gene_symbols: List[str], organism: str,
                            api_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Lookup genes by symbols and return structured data.

    Symbols are first resolved to Gene IDs with batch_esearch, then the
    summaries for all resolved IDs are fetched with batch_esummary.

    Args:
        gene_symbols: List of gene symbols
        organism: Organism name
        api_key: Optional NCBI API key

    Returns:
        List of gene information dictionaries, one per distinct query
        symbol. Entries that could not be resolved carry a 'status' of
        'not_found' or 'error' instead of gene details. An empty list is
        returned when no symbol at all could be resolved to a Gene ID.
    """
    # First, search for IDs
    print(f"Searching for {len(gene_symbols)} gene symbols...", file=sys.stderr)
    symbol_to_id = batch_esearch(gene_symbols, organism=organism, api_key=api_key)

    # Filter to valid IDs; dict.fromkeys drops duplicates (two symbols may
    # resolve to the same gene) while keeping first-seen order.
    valid_ids = list(dict.fromkeys(
        gene_id for gene_id in symbol_to_id.values()
        if gene_id not in ('NOT_FOUND', 'ERROR')
    ))

    if not valid_ids:
        print("No genes found", file=sys.stderr)
        return []

    print(f"Found {len(valid_ids)} genes, fetching details...", file=sys.stderr)

    # Fetch summaries
    summaries = batch_esummary(valid_ids, api_key=api_key)

    # Build results
    results = []
    for symbol, gene_id in symbol_to_id.items():
        if gene_id == 'NOT_FOUND':
            results.append({
                'query_symbol': symbol,
                'status': 'not_found'
            })
        elif gene_id == 'ERROR':
            results.append({
                'query_symbol': symbol,
                'status': 'error'
            })
        elif gene_id in summaries:
            gene = summaries[gene_id]
            results.append({
                'query_symbol': symbol,
                'gene_id': gene_id,
                'symbol': gene.get('name', 'N/A'),
                'description': gene.get('description', 'N/A'),
                'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
                'chromosome': gene.get('chromosome', 'N/A'),
                'map_location': gene.get('maplocation', 'N/A'),
                'type': gene.get('geneticsource', 'N/A')
            })
        else:
            # The ID was resolved but its summary never arrived (e.g. the
            # chunk request failed). Report it rather than dropping the
            # symbol from the output without a trace.
            results.append({
                'query_symbol': symbol,
                'gene_id': gene_id,
                'status': 'error'
            })

    return results
def main():
    """
    Command-line entry point for batch gene lookup.

    Parses the arguments, runs the lookup either by Gene IDs (--ids) or by
    symbols read from a file (--file together with --organism), and emits
    the results as JSON on stdout or into the file named by --output.
    When both --ids and --file are given, --ids takes precedence.
    """
    parser = argparse.ArgumentParser(
        description='Batch gene lookup using NCBI APIs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Lookup by gene IDs
%(prog)s --ids 672,7157,5594
# Lookup by symbols from a file
%(prog)s --file genes.txt --organism human
# Lookup with API key and save to file
%(prog)s --ids 672,7157,5594 --api-key YOUR_KEY --output results.json
"""
    )
    parser.add_argument('--ids', '-i', help='Comma-separated Gene IDs')
    parser.add_argument('--file', '-f', help='File containing gene symbols (one per line)')
    parser.add_argument('--organism', '-o', help='Organism name (required with --file)')
    parser.add_argument('--output', '-O', help='Output file path (JSON format)')
    parser.add_argument('--api-key', '-k', help='NCBI API key')
    parser.add_argument('--pretty', '-p', action='store_true',
                        help='Pretty-print JSON output')
    args = parser.parse_args()

    if not args.ids and not args.file:
        parser.error("Either --ids or --file must be provided")
    if args.file and not args.organism:
        parser.error("--organism is required when using --file")

    # Process genes
    if args.ids:
        # Drop empty entries produced by stray or trailing commas so that no
        # blank ID is sent to the API.
        gene_ids = [part.strip() for part in args.ids.split(',') if part.strip()]
        if not gene_ids:
            parser.error("--ids must contain at least one Gene ID")
        results = batch_lookup_by_ids(gene_ids, api_key=args.api_key)
    else:
        gene_symbols = read_gene_list(args.file)
        results = batch_lookup_by_symbols(gene_symbols, args.organism, api_key=args.api_key)

    # Output results
    indent = 2 if args.pretty else None
    json_output = json.dumps(results, indent=indent)

    if args.output:
        try:
            with open(args.output, 'w') as f:
                f.write(json_output)
            print(f"Results written to {args.output}", file=sys.stderr)
        except Exception as e:
            print(f"Error writing output file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(json_output)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""
Fetch gene data from NCBI using the Datasets API.
This script provides access to the NCBI Datasets API for retrieving
comprehensive gene information including metadata and sequences.
"""
import argparse
import json
import sys
import urllib.parse
import urllib.request
from typing import Optional, Dict, Any, List
# Base URL of the NCBI Datasets gene endpoints; the fetch helpers in this
# script append "/id/<gene_id>", "/symbol/<symbol>/taxon/<taxon>" or "/id" to it.
DATASETS_API_BASE = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/gene"
def get_taxon_id(taxon_name: str) -> Optional[str]:
    """
    Translate a well-known organism name into its NCBI taxon ID.

    Matching ignores case and surrounding whitespace. Only the names in the
    built-in table are recognised; no remote lookup is made.

    Args:
        taxon_name: Common or scientific name (e.g., "human", "Homo sapiens")

    Returns:
        Taxon ID as string, or None if the name is not in the table
    """
    # Each taxon ID paired with the (lower-case) names accepted for it
    names_by_taxon = (
        ('9606', ('human', 'homo sapiens')),
        ('10090', ('mouse', 'mus musculus')),
        ('10116', ('rat', 'rattus norvegicus')),
        ('7955', ('zebrafish', 'danio rerio')),
        ('7227', ('fruit fly', 'drosophila melanogaster')),
        ('6239', ('c. elegans', 'caenorhabditis elegans')),
        ('4932', ('yeast', 'saccharomyces cerevisiae')),
        ('3702', ('arabidopsis', 'arabidopsis thaliana')),
        ('562', ('e. coli', 'escherichia coli')),
    )

    wanted = taxon_name.lower().strip()
    for taxon_id, accepted_names in names_by_taxon:
        if wanted in accepted_names:
            return taxon_id
    return None
def fetch_gene_by_id(gene_id: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Retrieve the record of a single gene from the Datasets API.

    Args:
        gene_id: NCBI Gene ID
        api_key: Optional NCBI API key, sent in the 'api-key' header

    Returns:
        Parsed JSON response as a dictionary; an empty dictionary when the
        request fails (the failure is reported on stderr)
    """
    endpoint = "{}/id/{}".format(DATASETS_API_BASE, gene_id)
    request_headers = {'api-key': api_key} if api_key else {}

    try:
        request = urllib.request.Request(endpoint, headers=request_headers)
        with urllib.request.urlopen(request) as response:
            body = response.read().decode()
            return json.loads(body)
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        # Give a more specific hint for the common "unknown ID" case
        if e.code == 404:
            print(f"Gene ID {gene_id} not found", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}
def fetch_gene_by_symbol(symbol: str, taxon: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch gene data by gene symbol and taxon.

    The taxon is first looked up with get_taxon_id; when it is not one of
    the names known there, it is passed on as given.

    Args:
        symbol: Gene symbol (e.g., "BRCA1")
        taxon: Organism name or taxon ID
        api_key: Optional NCBI API key, sent in the 'api-key' header

    Returns:
        Parsed JSON response as a dictionary; an empty dictionary when the
        request fails (the failure is reported on stderr)
    """
    # Convert taxon name to ID if needed
    taxon_id = get_taxon_id(taxon)
    if not taxon_id:
        # Try to use as-is (might already be a taxon ID)
        taxon_id = taxon

    # Percent-encode both path segments: an unmapped taxon such as
    # "Pan troglodytes" contains a space, and a symbol may contain '/' or
    # other reserved characters, either of which would corrupt the URL.
    safe_symbol = urllib.parse.quote(str(symbol), safe='')
    safe_taxon = urllib.parse.quote(str(taxon_id), safe='')
    url = f"{DATASETS_API_BASE}/symbol/{safe_symbol}/taxon/{safe_taxon}"

    headers = {}
    if api_key:
        headers['api-key'] = api_key

    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        if e.code == 404:
            print(f"Gene symbol '{symbol}' not found for taxon {taxon}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}
def fetch_multiple_genes(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Retrieve the records of several genes with a single POST request.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key, sent in the 'api-key' header

    Returns:
        Parsed JSON response as a dictionary; an empty dictionary when the
        request fails (the failure is reported on stderr)
    """
    endpoint = "{}/id".format(DATASETS_API_BASE)

    # The ID list travels in a JSON body rather than in the URL
    payload = json.dumps({"gene_ids": gene_ids}).encode('utf-8')

    request_headers = {'Content-Type': 'application/json'}
    if api_key:
        request_headers['api-key'] = api_key

    try:
        request = urllib.request.Request(
            endpoint, data=payload, headers=request_headers, method='POST'
        )
        with urllib.request.urlopen(request) as response:
            body = response.read().decode()
            return json.loads(body)
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}
def display_gene_info(data: Dict[str, Any], verbose: bool = False) -> None:
    """
    Display gene information in human-readable format.

    Prints one block of lines per entry of data['genes'], followed by a
    blank line. When the 'genes' key is absent a short notice is printed
    instead.

    Args:
        data: Gene data dictionary from API
        verbose: Show detailed information (gene type, genomic locations
            and up to five transcript accessions)
    """
    if 'genes' not in data:
        print("No gene data found in response")
        return

    for gene in data['genes']:
        gene_info = gene.get('gene', {})

        print(f"Gene ID: {gene_info.get('gene_id', 'N/A')}")
        print(f"Symbol: {gene_info.get('symbol', 'N/A')}")
        print(f"Description: {gene_info.get('description', 'N/A')}")

        if 'tax_name' in gene_info:
            print(f"Organism: {gene_info['tax_name']}")

        if 'chromosomes' in gene_info:
            chromosomes = ', '.join(gene_info['chromosomes'])
            print(f"Chromosome(s): {chromosomes}")

        # Nomenclature
        if 'nomenclature_authority' in gene_info:
            auth = gene_info['nomenclature_authority']
            print(f"Nomenclature: {auth.get('authority', 'N/A')}")

        # Synonyms
        if 'synonyms' in gene_info and gene_info['synonyms']:
            print(f"Synonyms: {', '.join(gene_info['synonyms'])}")

        if verbose:
            # Gene type
            if 'type' in gene_info:
                print(f"Type: {gene_info['type']}")

            # Genomic locations
            if 'genomic_ranges' in gene_info:
                print("\nGenomic Locations:")
                for range_info in gene_info['genomic_ranges']:
                    accession = range_info.get('accession_version', 'N/A')
                    # Only the first range is shown. Fall back to an empty
                    # record when 'range' is missing *or* present but empty;
                    # indexing [0] on an empty list would raise IndexError.
                    first_range = (range_info.get('range') or [{}])[0]
                    start = first_range.get('begin', 'N/A')
                    end = first_range.get('end', 'N/A')
                    strand = range_info.get('orientation', 'N/A')
                    print(f" {accession}: {start}-{end} ({strand})")

            # Transcripts
            if 'transcripts' in gene_info:
                print(f"\nTranscripts: {len(gene_info['transcripts'])}")
                for transcript in gene_info['transcripts'][:5]:  # Show first 5
                    print(f" {transcript.get('accession_version', 'N/A')}")

        print()
def main():
    """
    Command-line entry point for fetching gene data from the Datasets API.

    Fetches by Gene ID(s) (--gene-id) or by symbol and organism (--symbol
    together with --taxon) and prints the result either as JSON or in a
    human-readable layout. Exits with status 1 when the fetch returned no
    data. When both --gene-id and --symbol are given, --gene-id wins.
    """
    parser = argparse.ArgumentParser(
        description='Fetch gene data from NCBI Datasets API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Fetch by Gene ID
%(prog)s --gene-id 672
# Fetch by gene symbol and organism
%(prog)s --symbol BRCA1 --taxon human
# Fetch multiple genes
%(prog)s --gene-id 672,7157,5594
# Get JSON output
%(prog)s --symbol TP53 --taxon "Homo sapiens" --output json
# Verbose output with details
%(prog)s --gene-id 672 --verbose
"""
    )
    parser.add_argument('--gene-id', '-g', help='Gene ID(s), comma-separated')
    parser.add_argument('--symbol', '-s', help='Gene symbol')
    parser.add_argument('--taxon', '-t', help='Organism name or taxon ID (required with --symbol)')
    parser.add_argument('--output', '-o', choices=['pretty', 'json'], default='pretty',
                        help='Output format (default: pretty)')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed information')
    parser.add_argument('--api-key', '-k', help='NCBI API key')
    args = parser.parse_args()

    if not args.gene_id and not args.symbol:
        parser.error("Either --gene-id or --symbol must be provided")
    if args.symbol and not args.taxon:
        parser.error("--taxon is required when using --symbol")

    # Fetch data
    if args.gene_id:
        # Drop empty entries produced by stray or trailing commas; otherwise
        # "672," would count as two IDs and take the multi-gene path with a
        # blank ID.
        gene_ids = [part.strip() for part in args.gene_id.split(',') if part.strip()]
        if not gene_ids:
            parser.error("--gene-id must contain at least one Gene ID")
        if len(gene_ids) == 1:
            data = fetch_gene_by_id(gene_ids[0], api_key=args.api_key)
        else:
            data = fetch_multiple_genes(gene_ids, api_key=args.api_key)
    else:
        data = fetch_gene_by_symbol(args.symbol, args.taxon, api_key=args.api_key)

    # The fetch helpers return an empty dict on failure (already reported)
    if not data:
        sys.exit(1)

    # Output
    if args.output == 'json':
        print(json.dumps(data, indent=2))
    else:
        display_gene_info(data, verbose=args.verbose)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,251 @@
#!/usr/bin/env python3
"""
Query NCBI Gene database using E-utilities.
This script provides access to ESearch, ESummary, and EFetch functions
for searching and retrieving gene information.
"""
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request
from typing import Optional, Dict, List, Any
from xml.etree import ElementTree as ET
# Root of the NCBI E-utilities endpoints; each helper in this script appends
# its own script name ("esearch.fcgi", "esummary.fcgi", "efetch.fcgi").
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# Entrez database passed as the 'db' parameter of every request
DB = "gene"
def esearch(query: str, retmax: int = 20, api_key: Optional[str] = None) -> List[str]:
    """
    Run an ESearch query against the gene database.

    Args:
        query: Search query (e.g., "BRCA1[gene] AND human[organism]")
        retmax: Maximum number of results to return
        api_key: Optional NCBI API key for higher rate limits

    Returns:
        List of Gene IDs as strings; an empty list when the request fails
        or the response lacks the expected ID list (reported on stderr)
    """
    request_args = {'db': DB, 'term': query, 'retmax': retmax, 'retmode': 'json'}
    if api_key:
        request_args['api_key'] = api_key

    endpoint = "{}esearch.fcgi?{}".format(BASE_URL, urllib.parse.urlencode(request_args))

    try:
        with urllib.request.urlopen(endpoint) as response:
            payload = json.loads(response.read().decode())

        has_id_list = 'esearchresult' in payload and 'idlist' in payload['esearchresult']
        if not has_id_list:
            print(f"Error: Unexpected response format", file=sys.stderr)
            return []
        return payload['esearchresult']['idlist']
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return []
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return []
def esummary(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Request ESummary document summaries for the given Gene IDs.

    All IDs are sent comma-joined in a single request.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key

    Returns:
        The parsed JSON response as a dictionary; an empty dictionary when
        the request fails (reported on stderr)
    """
    request_args = {'db': DB, 'id': ','.join(gene_ids), 'retmode': 'json'}
    if api_key:
        request_args['api_key'] = api_key

    endpoint = "{}esummary.fcgi?{}".format(BASE_URL, urllib.parse.urlencode(request_args))

    try:
        with urllib.request.urlopen(endpoint) as response:
            raw_body = response.read().decode()
            return json.loads(raw_body)
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}
def efetch(gene_ids: List[str], retmode: str = 'xml', api_key: Optional[str] = None) -> str:
    """
    Download full gene records with EFetch.

    All IDs are sent comma-joined in a single request.

    Args:
        gene_ids: List of Gene IDs
        retmode: Return format ('xml', 'text', 'asn.1')
        api_key: Optional NCBI API key

    Returns:
        The decoded response body in the requested format; an empty string
        when the request fails (reported on stderr)
    """
    request_args = {'db': DB, 'id': ','.join(gene_ids), 'retmode': retmode}
    if api_key:
        request_args['api_key'] = api_key

    endpoint = "{}efetch.fcgi?{}".format(BASE_URL, urllib.parse.urlencode(request_args))

    try:
        with urllib.request.urlopen(endpoint) as response:
            raw_bytes = response.read()
            return raw_bytes.decode()
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return ""
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return ""
def search_and_summarize(query: str, organism: Optional[str] = None,
                         max_results: int = 20, api_key: Optional[str] = None) -> None:
    """
    Run a gene search and print a short summary of every hit.

    Args:
        query: Gene search query
        organism: Optional organism filter; appended to the query unless the
            query already contains an "[organism]" term
        max_results: Maximum number of results
        api_key: Optional NCBI API key
    """
    # Add organism filter if provided and not already part of the query
    if organism and '[organism]' not in query.lower():
        query = f"{query} AND {organism}[organism]"

    print(f"Searching for: {query}")
    print("-" * 80)

    # Search for gene IDs
    found_ids = esearch(query, retmax=max_results, api_key=api_key)
    if not found_ids:
        print("No results found.")
        return

    print(f"Found {len(found_ids)} gene(s)")
    print()

    # Get summaries
    summaries = esummary(found_ids, api_key=api_key)
    if 'result' in summaries:
        by_id = summaries['result']
        for gene_id in found_ids:
            if gene_id not in by_id:
                continue
            record = by_id[gene_id]
            organism_info = record.get('organism', {})
            print(f"Gene ID: {gene_id}")
            print(f" Symbol: {record.get('name', 'N/A')}")
            print(f" Description: {record.get('description', 'N/A')}")
            print(f" Organism: {organism_info.get('scientificname', 'N/A')}")
            print(f" Chromosome: {record.get('chromosome', 'N/A')}")
            print(f" Map Location: {record.get('maplocation', 'N/A')}")
            print(f" Type: {record.get('geneticsource', 'N/A')}")
            print()

    # Respect rate limits
    time.sleep(0.34)  # ~3 requests per second
def fetch_by_id(gene_ids: List[str], output_format: str = 'json',
                api_key: Optional[str] = None) -> None:
    """
    Print gene information for the given IDs in the requested format.

    Args:
        gene_ids: List of Gene IDs
        output_format: Output format ('json', 'xml', 'text'); 'json' prints
            ESummary output, anything else prints the EFetch records
        api_key: Optional NCBI API key
    """
    if output_format != 'json':
        # Fetch full records
        records = efetch(gene_ids, retmode=output_format, api_key=api_key)
        print(records)
    else:
        # Get summaries in JSON format
        summary_doc = esummary(gene_ids, api_key=api_key)
        print(json.dumps(summary_doc, indent=2))

    # Respect rate limits
    time.sleep(0.34)
def main():
    """
    Command-line entry point for querying the gene database via E-utilities.

    Either fetches records for the Gene IDs given with --id (in the format
    chosen by --format) or runs the --search query and prints a summary of
    each hit. When both --id and --search are given, --id takes precedence.
    """
    parser = argparse.ArgumentParser(
        description='Query NCBI Gene database using E-utilities',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Search for gene by symbol
%(prog)s --search "BRCA1" --organism "human"
# Fetch gene by ID
%(prog)s --id 672 --format json
# Complex search query
%(prog)s --search "insulin[gene] AND diabetes[disease]"
# Multiple gene IDs
%(prog)s --id 672,7157,5594
"""
    )
    parser.add_argument('--search', '-s', help='Search query')
    parser.add_argument('--organism', '-o', help='Organism filter')
    parser.add_argument('--id', '-i', help='Gene ID(s), comma-separated')
    parser.add_argument('--format', '-f', default='json',
                        choices=['json', 'xml', 'text'],
                        help='Output format (default: json)')
    parser.add_argument('--max-results', '-m', type=int, default=20,
                        help='Maximum number of search results (default: 20)')
    parser.add_argument('--api-key', '-k', help='NCBI API key for higher rate limits')
    args = parser.parse_args()

    if not args.search and not args.id:
        parser.error("Either --search or --id must be provided")

    if args.id:
        # Fetch by ID. Drop empty entries produced by stray or trailing
        # commas so that no blank ID is sent to the API.
        gene_ids = [part.strip() for part in args.id.split(',') if part.strip()]
        if not gene_ids:
            parser.error("--id must contain at least one Gene ID")
        fetch_by_id(gene_ids, output_format=args.format, api_key=args.api_key)
    else:
        # Search and summarize
        search_and_summarize(args.search, organism=args.organism,
                             max_results=args.max_results, api_key=args.api_key)


if __name__ == '__main__':
    main()