Initial commit
This commit is contained in:
298
skills/gene-database/scripts/batch_gene_lookup.py
Normal file
298
skills/gene-database/scripts/batch_gene_lookup.py
Normal file
@@ -0,0 +1,298 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch gene lookup using NCBI APIs.
|
||||
|
||||
This script efficiently processes multiple gene queries with proper
|
||||
rate limiting and error handling.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
|
||||
def read_gene_list(filepath: str) -> List[str]:
    """
    Read gene identifiers from a file (one per line).

    Surrounding whitespace is stripped from every line and blank lines are
    skipped. The file is decoded as UTF-8; a leading byte-order mark (as
    written by some spreadsheet and Windows tools) is discarded so that it
    cannot end up glued to the first identifier.

    Args:
        filepath: Path to file containing gene symbols or IDs

    Returns:
        List of gene identifiers

    Raises:
        SystemExit: With status 1 when the file is missing or cannot be
            read (an error message is printed to stderr first).
    """
    try:
        # 'utf-8-sig' also reads plain UTF-8, but drops a BOM when present.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            stripped_lines = (line.strip() for line in f)
            return [entry for entry in stripped_lines if entry]
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report any other read/decode failure and stop.
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
def batch_esearch(queries: List[str], organism: Optional[str] = None,
                  api_key: Optional[str] = None) -> Dict[str, str]:
    """
    Search for multiple gene symbols and return their IDs.

    One ESearch request is issued per query, with a pause after each request
    to stay within the rate limit. A query that already has a resolved
    result (for example a duplicate line in the input) is not requested
    again; a query whose previous attempt ended in 'ERROR' is retried.

    Args:
        queries: List of gene symbols
        organism: Optional organism filter
        api_key: Optional NCBI API key

    Returns:
        Dictionary mapping gene symbol to Gene ID, 'NOT_FOUND' when the
        search returned no IDs, or 'ERROR' when the request failed or the
        response had an unexpected shape
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    results: Dict[str, str] = {}

    # Rate limiting
    delay = 0.1 if api_key else 0.34  # 10 req/sec with key, 3 req/sec without

    for query in queries:
        # Skip duplicates that were already answered; retry earlier failures.
        if results.get(query, 'ERROR') != 'ERROR':
            continue

        # Build search term
        search_term = f"{query}[gene]"
        if organism:
            search_term += f" AND {organism}[organism]"

        params = {
            'db': 'gene',
            'term': search_term,
            'retmax': 1,
            'retmode': 'json'
        }

        if api_key:
            params['api_key'] = api_key

        url = f"{base_url}esearch.fcgi?{urllib.parse.urlencode(params)}"

        try:
            # Bound the wait so one stalled connection cannot hang the batch.
            with urllib.request.urlopen(url, timeout=30) as response:
                data = json.loads(response.read().decode())

            if 'esearchresult' in data and 'idlist' in data['esearchresult']:
                id_list = data['esearchresult']['idlist']
                results[query] = id_list[0] if id_list else 'NOT_FOUND'
            else:
                results[query] = 'ERROR'

        except Exception as e:
            # Best effort: record the failure and continue with the next query.
            print(f"Error searching for {query}: {e}", file=sys.stderr)
            results[query] = 'ERROR'

        time.sleep(delay)

    return results
|
||||
|
||||
|
||||
def batch_esummary(gene_ids: List[str], api_key: Optional[str] = None,
                   chunk_size: int = 200) -> Dict[str, Dict[str, Any]]:
    """
    Get summaries for multiple genes in batches.

    The IDs are split into chunks of at most ``chunk_size`` and one ESummary
    request is issued per chunk, with a pause after each request. A chunk
    whose request fails is reported on stderr and skipped, so its IDs are
    simply absent from the result.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key
        chunk_size: Number of IDs per request (max 500)

    Returns:
        Dictionary mapping Gene ID to summary data

    Raises:
        ValueError: If chunk_size is smaller than 1
    """
    if chunk_size < 1:
        # A zero step would fail inside range(); a negative one would
        # silently fetch nothing. Reject both with a clear message.
        raise ValueError("chunk_size must be at least 1")

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    all_results: Dict[str, Dict[str, Any]] = {}

    # Rate limiting
    delay = 0.1 if api_key else 0.34

    # Process in chunks
    for i in range(0, len(gene_ids), chunk_size):
        chunk = gene_ids[i:i + chunk_size]

        params = {
            'db': 'gene',
            'id': ','.join(chunk),
            'retmode': 'json'
        }

        if api_key:
            params['api_key'] = api_key

        url = f"{base_url}esummary.fcgi?{urllib.parse.urlencode(params)}"

        try:
            # Bound the wait so one stalled connection cannot hang the batch.
            with urllib.request.urlopen(url, timeout=30) as response:
                data = json.loads(response.read().decode())

            if 'result' in data:
                # Copy only the entries for the IDs that were requested.
                for gene_id in chunk:
                    if gene_id in data['result']:
                        all_results[gene_id] = data['result'][gene_id]

        except Exception as e:
            print(f"Error fetching summaries for chunk: {e}", file=sys.stderr)

        time.sleep(delay)

    return all_results
|
||||
|
||||
|
||||
def batch_lookup_by_ids(gene_ids: List[str], api_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Resolve Gene IDs to flat, structured records.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key

    Returns:
        One dictionary per requested ID, in request order. IDs without a
        summary get a record holding only 'gene_id' and 'error'.
    """
    fetched = batch_esummary(gene_ids, api_key=api_key)
    records = []

    for gid in gene_ids:
        # Guard clause: no summary came back for this ID.
        if gid not in fetched:
            records.append({
                'gene_id': gid,
                'error': 'Not found or error fetching'
            })
            continue

        summary = fetched[gid]
        record = {'gene_id': gid}
        record['symbol'] = summary.get('name', 'N/A')
        record['description'] = summary.get('description', 'N/A')
        record['organism'] = summary.get('organism', {}).get('scientificname', 'N/A')
        record['chromosome'] = summary.get('chromosome', 'N/A')
        record['map_location'] = summary.get('maplocation', 'N/A')
        record['type'] = summary.get('geneticsource', 'N/A')
        records.append(record)

    return records
|
||||
|
||||
|
||||
def batch_lookup_by_symbols(gene_symbols: List[str], organism: str,
                            api_key: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Lookup genes by symbols and return structured data.

    Symbols are first resolved to Gene IDs with batch_esearch; the summaries
    of the resolved IDs are then fetched with batch_esummary. Progress
    messages are written to stderr.

    Args:
        gene_symbols: List of gene symbols
        organism: Organism name
        api_key: Optional NCBI API key

    Returns:
        List of gene information dictionaries, one per distinct query
        symbol. Symbols that could not be resolved, or whose summary could
        not be fetched, carry a 'status' of 'not_found' or 'error' instead
        of gene details. An empty list is returned when no symbol resolved
        to a Gene ID.
    """
    # First, search for IDs
    print(f"Searching for {len(gene_symbols)} gene symbols...", file=sys.stderr)
    symbol_to_id = batch_esearch(gene_symbols, organism=organism, api_key=api_key)

    # Filter to valid IDs
    valid_ids = [gene_id for gene_id in symbol_to_id.values()
                 if gene_id not in ('NOT_FOUND', 'ERROR')]

    if not valid_ids:
        print("No genes found", file=sys.stderr)
        return []

    print(f"Found {len(valid_ids)} genes, fetching details...", file=sys.stderr)

    # Fetch summaries
    summaries = batch_esummary(valid_ids, api_key=api_key)

    # Build results
    results = []
    for symbol, gene_id in symbol_to_id.items():
        if gene_id == 'NOT_FOUND':
            results.append({
                'query_symbol': symbol,
                'status': 'not_found'
            })
        elif gene_id == 'ERROR':
            results.append({
                'query_symbol': symbol,
                'status': 'error'
            })
        elif gene_id in summaries:
            gene = summaries[gene_id]
            results.append({
                'query_symbol': symbol,
                'gene_id': gene_id,
                'symbol': gene.get('name', 'N/A'),
                'description': gene.get('description', 'N/A'),
                'organism': gene.get('organism', {}).get('scientificname', 'N/A'),
                'chromosome': gene.get('chromosome', 'N/A'),
                'map_location': gene.get('maplocation', 'N/A'),
                'type': gene.get('geneticsource', 'N/A')
            })
        else:
            # The ID resolved but its summary is missing (e.g. the summary
            # request failed). Report it rather than dropping the symbol
            # from the output without a trace.
            results.append({
                'query_symbol': symbol,
                'gene_id': gene_id,
                'status': 'error'
            })

    return results
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point for batch gene lookup.

    Parses the arguments, looks genes up either by ID (--ids) or by symbol
    (--file together with --organism), and emits the results as JSON to
    stdout or to the file given with --output. When both --ids and --file
    are supplied, --ids takes precedence.
    """
    parser = argparse.ArgumentParser(
        description='Batch gene lookup using NCBI APIs',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Lookup by gene IDs
  %(prog)s --ids 672,7157,5594

  # Lookup by symbols from a file
  %(prog)s --file genes.txt --organism human

  # Lookup with API key and save to file
  %(prog)s --ids 672,7157,5594 --api-key YOUR_KEY --output results.json
"""
    )

    parser.add_argument('--ids', '-i', help='Comma-separated Gene IDs')
    parser.add_argument('--file', '-f', help='File containing gene symbols (one per line)')
    parser.add_argument('--organism', '-o', help='Organism name (required with --file)')
    parser.add_argument('--output', '-O', help='Output file path (JSON format)')
    parser.add_argument('--api-key', '-k', help='NCBI API key')
    parser.add_argument('--pretty', '-p', action='store_true',
                        help='Pretty-print JSON output')

    args = parser.parse_args()

    if not args.ids and not args.file:
        parser.error("Either --ids or --file must be provided")

    if args.file and not args.organism:
        parser.error("--organism is required when using --file")

    # Process genes
    if args.ids:
        # Drop empty entries left by stray or trailing commas ("672,,7157,").
        gene_ids = [part.strip() for part in args.ids.split(',') if part.strip()]
        if not gene_ids:
            parser.error("--ids does not contain any Gene ID")
        results = batch_lookup_by_ids(gene_ids, api_key=args.api_key)
    else:
        gene_symbols = read_gene_list(args.file)
        results = batch_lookup_by_symbols(gene_symbols, args.organism, api_key=args.api_key)

    # Output results
    indent = 2 if args.pretty else None
    json_output = json.dumps(results, indent=indent)

    if args.output:
        try:
            with open(args.output, 'w') as f:
                f.write(json_output)
            print(f"Results written to {args.output}", file=sys.stderr)
        except OSError as e:
            print(f"Error writing output file: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(json_output)
|
||||
|
||||
|
||||
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
||||
277
skills/gene-database/scripts/fetch_gene_data.py
Normal file
277
skills/gene-database/scripts/fetch_gene_data.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fetch gene data from NCBI using the Datasets API.
|
||||
|
||||
This script provides access to the NCBI Datasets API for retrieving
|
||||
comprehensive gene information including metadata and sequences.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Optional, Dict, Any, List
|
||||
|
||||
|
||||
# Base URL that every request path built in this script is appended to.
# NOTE(review): "v2alpha" looks like a pre-release API version — confirm the
# service still answers on this path.
DATASETS_API_BASE = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/gene"
|
||||
|
||||
|
||||
def get_taxon_id(taxon_name: str) -> Optional[str]:
    """
    Translate a well-known organism name into its NCBI taxon ID.

    Matching ignores case and surrounding whitespace. Only the names listed
    in the built-in table are recognised.

    Args:
        taxon_name: Common or scientific name (e.g., "human", "Homo sapiens")

    Returns:
        Taxon ID as string, or None if not found
    """
    # Each row: taxon ID followed by the names that resolve to it.
    known_taxa = (
        ('9606', ('human', 'homo sapiens')),
        ('10090', ('mouse', 'mus musculus')),
        ('10116', ('rat', 'rattus norvegicus')),
        ('7955', ('zebrafish', 'danio rerio')),
        ('7227', ('fruit fly', 'drosophila melanogaster')),
        ('6239', ('c. elegans', 'caenorhabditis elegans')),
        ('4932', ('yeast', 'saccharomyces cerevisiae')),
        ('3702', ('arabidopsis', 'arabidopsis thaliana')),
        ('562', ('e. coli', 'escherichia coli')),
    )

    wanted = taxon_name.lower().strip()
    for tax_id, aliases in known_taxa:
        if wanted in aliases:
            return tax_id
    return None
|
||||
|
||||
|
||||
def fetch_gene_by_id(gene_id: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch gene data by Gene ID.

    Args:
        gene_id: NCBI Gene ID
        api_key: Optional NCBI API key

    Returns:
        Gene data as dictionary; an empty dictionary when the ID is blank or
        the request fails (the reason is printed to stderr)
    """
    gene_id = str(gene_id).strip()
    if not gene_id:
        # Without an ID the request path would be malformed; skip the call.
        print("Error: Gene ID is empty", file=sys.stderr)
        return {}

    # Percent-encode the ID so stray characters cannot alter the URL path.
    url = f"{DATASETS_API_BASE}/id/{urllib.parse.quote(gene_id, safe=',')}"

    headers = {}
    if api_key:
        headers['api-key'] = api_key

    try:
        req = urllib.request.Request(url, headers=headers)
        # Bound the wait so a stalled connection cannot hang the script.
        with urllib.request.urlopen(req, timeout=30) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        if e.code == 404:
            print(f"Gene ID {gene_id} not found", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}
|
||||
|
||||
|
||||
def fetch_gene_by_symbol(symbol: str, taxon: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch gene data by gene symbol and taxon.

    Args:
        symbol: Gene symbol (e.g., "BRCA1")
        taxon: Organism name or taxon ID
        api_key: Optional NCBI API key

    Returns:
        Gene data as dictionary; an empty dictionary when the symbol or
        taxon is blank or the request fails (the reason is printed to stderr)
    """
    symbol = str(symbol).strip()
    taxon = str(taxon).strip()
    if not symbol or not taxon:
        # Either part missing would yield a malformed request path.
        print("Error: Gene symbol and taxon must not be empty", file=sys.stderr)
        return {}

    # Convert taxon name to ID if needed
    taxon_id = get_taxon_id(taxon)
    if not taxon_id:
        # Try to use as-is (might already be a taxon ID)
        taxon_id = taxon

    # Percent-encode both path segments: a taxon name that is not in the
    # built-in table may contain spaces, which are not valid in a URL.
    url = (f"{DATASETS_API_BASE}/symbol/{urllib.parse.quote(symbol, safe='')}"
           f"/taxon/{urllib.parse.quote(taxon_id, safe='')}")

    headers = {}
    if api_key:
        headers['api-key'] = api_key

    try:
        req = urllib.request.Request(url, headers=headers)
        # Bound the wait so a stalled connection cannot hang the script.
        with urllib.request.urlopen(req, timeout=30) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        if e.code == 404:
            print(f"Gene symbol '{symbol}' not found for taxon {taxon}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}
|
||||
|
||||
|
||||
def fetch_multiple_genes(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch data for multiple genes by ID.

    All IDs are sent in a single POST request with a JSON body.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key

    Returns:
        Combined gene data as dictionary; an empty dictionary when no IDs
        are given or the request fails (the reason is printed to stderr)
    """
    if not gene_ids:
        # Nothing to look up; avoid posting an empty ID list.
        print("Error: No Gene IDs provided", file=sys.stderr)
        return {}

    # For multiple genes, use POST request
    # NOTE(review): confirm this path and the "gene_ids" payload key against
    # the API reference for the version named in DATASETS_API_BASE.
    url = f"{DATASETS_API_BASE}/id"

    data = json.dumps({"gene_ids": gene_ids}).encode('utf-8')
    headers = {'Content-Type': 'application/json'}

    if api_key:
        headers['api-key'] = api_key

    try:
        req = urllib.request.Request(url, data=data, headers=headers, method='POST')
        # Bound the wait so a stalled connection cannot hang the script.
        with urllib.request.urlopen(req, timeout=30) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}
|
||||
|
||||
|
||||
def display_gene_info(data: Dict[str, Any], verbose: bool = False) -> None:
    """
    Display gene information in human-readable format.

    Each entry of ``data['genes']`` is printed as a block of labelled lines
    followed by an empty line. Optional fields are printed only when they
    are present in the entry.

    Args:
        data: Gene data dictionary from API
        verbose: Show detailed information (gene type, genomic locations
            and up to five transcripts)
    """
    if 'genes' not in data:
        print("No gene data found in response")
        return

    for gene in data['genes']:
        gene_info = gene.get('gene', {})

        print(f"Gene ID: {gene_info.get('gene_id', 'N/A')}")
        print(f"Symbol: {gene_info.get('symbol', 'N/A')}")
        print(f"Description: {gene_info.get('description', 'N/A')}")

        if 'tax_name' in gene_info:
            print(f"Organism: {gene_info['tax_name']}")

        if 'chromosomes' in gene_info:
            chromosomes = ', '.join(gene_info['chromosomes'])
            print(f"Chromosome(s): {chromosomes}")

        # Nomenclature
        if 'nomenclature_authority' in gene_info:
            auth = gene_info['nomenclature_authority']
            print(f"Nomenclature: {auth.get('authority', 'N/A')}")

        # Synonyms
        if 'synonyms' in gene_info and gene_info['synonyms']:
            print(f"Synonyms: {', '.join(gene_info['synonyms'])}")

        if verbose:
            # Gene type
            if 'type' in gene_info:
                print(f"Type: {gene_info['type']}")

            # Genomic locations
            if 'genomic_ranges' in gene_info:
                print("\nGenomic Locations:")
                for range_info in gene_info['genomic_ranges']:
                    accession = range_info.get('accession_version', 'N/A')
                    # Fall back to an empty record when 'range' is missing
                    # OR present but empty, so indexing cannot fail.
                    first_range = (range_info.get('range') or [{}])[0]
                    start = first_range.get('begin', 'N/A')
                    end = first_range.get('end', 'N/A')
                    strand = range_info.get('orientation', 'N/A')
                    print(f" {accession}: {start}-{end} ({strand})")

            # Transcripts
            if 'transcripts' in gene_info:
                print(f"\nTranscripts: {len(gene_info['transcripts'])}")
                for transcript in gene_info['transcripts'][:5]:  # Show first 5
                    print(f" {transcript.get('accession_version', 'N/A')}")

        print()
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point for fetching gene data.

    Parses the arguments, fetches gene data either by Gene ID(s) or by
    symbol plus taxon, and prints it as JSON or in human-readable form.
    Exits with status 1 when no data could be fetched. When both --gene-id
    and --symbol are supplied, --gene-id takes precedence.
    """
    parser = argparse.ArgumentParser(
        description='Fetch gene data from NCBI Datasets API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Fetch by Gene ID
  %(prog)s --gene-id 672

  # Fetch by gene symbol and organism
  %(prog)s --symbol BRCA1 --taxon human

  # Fetch multiple genes
  %(prog)s --gene-id 672,7157,5594

  # Get JSON output
  %(prog)s --symbol TP53 --taxon "Homo sapiens" --output json

  # Verbose output with details
  %(prog)s --gene-id 672 --verbose
"""
    )

    parser.add_argument('--gene-id', '-g', help='Gene ID(s), comma-separated')
    parser.add_argument('--symbol', '-s', help='Gene symbol')
    parser.add_argument('--taxon', '-t', help='Organism name or taxon ID (required with --symbol)')
    parser.add_argument('--output', '-o', choices=['pretty', 'json'], default='pretty',
                        help='Output format (default: pretty)')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed information')
    parser.add_argument('--api-key', '-k', help='NCBI API key')

    args = parser.parse_args()

    if not args.gene_id and not args.symbol:
        parser.error("Either --gene-id or --symbol must be provided")

    if args.symbol and not args.taxon:
        parser.error("--taxon is required when using --symbol")

    # Fetch data
    if args.gene_id:
        # Drop empty entries left by stray or trailing commas ("672,,7157,").
        gene_ids = [part.strip() for part in args.gene_id.split(',') if part.strip()]
        if not gene_ids:
            parser.error("--gene-id does not contain any Gene ID")
        if len(gene_ids) == 1:
            data = fetch_gene_by_id(gene_ids[0], api_key=args.api_key)
        else:
            data = fetch_multiple_genes(gene_ids, api_key=args.api_key)
    else:
        data = fetch_gene_by_symbol(args.symbol, args.taxon, api_key=args.api_key)

    if not data:
        # The fetch helpers already reported the reason on stderr.
        sys.exit(1)

    # Output
    if args.output == 'json':
        print(json.dumps(data, indent=2))
    else:
        display_gene_info(data, verbose=args.verbose)
|
||||
|
||||
|
||||
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
||||
251
skills/gene-database/scripts/query_gene.py
Normal file
251
skills/gene-database/scripts/query_gene.py
Normal file
@@ -0,0 +1,251 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Query NCBI Gene database using E-utilities.
|
||||
|
||||
This script provides access to ESearch, ESummary, and EFetch functions
|
||||
for searching and retrieving gene information.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Optional, Dict, List, Any
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
|
||||
# Common URL prefix; the endpoint name (esearch.fcgi, esummary.fcgi,
# efetch.fcgi) and the query string are appended to it.
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
||||
# Value sent as the 'db' request parameter by every query in this script.
DB = "gene"
|
||||
|
||||
|
||||
def esearch(query: str, retmax: int = 20, api_key: Optional[str] = None) -> List[str]:
    """
    Search NCBI Gene database and return list of Gene IDs.

    Args:
        query: Search query (e.g., "BRCA1[gene] AND human[organism]")
        retmax: Maximum number of results to return
        api_key: Optional NCBI API key for higher rate limits

    Returns:
        List of Gene IDs as strings; an empty list when the query is blank,
        the request fails or the response has an unexpected shape
    """
    if not query or not query.strip():
        # A blank search term cannot match anything; skip the request.
        return []

    params = {
        'db': DB,
        'term': query,
        'retmax': retmax,
        'retmode': 'json'
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}esearch.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Bound the wait so a stalled connection cannot hang the script.
        with urllib.request.urlopen(url, timeout=30) as response:
            data = json.loads(response.read().decode())

        if 'esearchresult' in data and 'idlist' in data['esearchresult']:
            return data['esearchresult']['idlist']
        else:
            print("Error: Unexpected response format", file=sys.stderr)
            return []

    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return []
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return []
|
||||
|
||||
|
||||
def esummary(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Get document summaries for Gene IDs.

    Args:
        gene_ids: List of Gene IDs
        api_key: Optional NCBI API key

    Returns:
        Dictionary of gene summaries (the decoded JSON response); an empty
        dictionary when no IDs are given or the request fails
    """
    if not gene_ids:
        # Nothing to look up; avoid a request with an empty 'id' parameter.
        print("Error: No Gene IDs provided", file=sys.stderr)
        return {}

    params = {
        'db': DB,
        'id': ','.join(gene_ids),
        'retmode': 'json'
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}esummary.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Bound the wait so a stalled connection cannot hang the script.
        with urllib.request.urlopen(url, timeout=30) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return {}
|
||||
|
||||
|
||||
def efetch(gene_ids: List[str], retmode: str = 'xml', api_key: Optional[str] = None) -> str:
    """
    Fetch full gene records.

    Args:
        gene_ids: List of Gene IDs
        retmode: Return format ('xml', 'text', 'asn.1')
        api_key: Optional NCBI API key

    Returns:
        Gene records as string in requested format; an empty string when no
        IDs are given or the request fails
    """
    if not gene_ids:
        # Nothing to fetch; avoid a request with an empty 'id' parameter.
        print("Error: No Gene IDs provided", file=sys.stderr)
        return ""

    params = {
        'db': DB,
        'id': ','.join(gene_ids),
        'retmode': retmode
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}efetch.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Bound the wait so a stalled connection cannot hang the script.
        with urllib.request.urlopen(url, timeout=30) as response:
            return response.read().decode()
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return ""
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return ""
|
||||
|
||||
|
||||
def search_and_summarize(query: str, organism: Optional[str] = None,
                         max_results: int = 20, api_key: Optional[str] = None) -> None:
    """
    Run a gene search and print a short summary for every hit.

    Args:
        query: Gene search query
        organism: Optional organism filter; appended to the query unless the
            query already contains an [organism] term
        max_results: Maximum number of results
        api_key: Optional NCBI API key
    """
    # Append the organism restriction only when one is wanted and absent.
    if organism and '[organism]' not in query.lower():
        query = f"{query} AND {organism}[organism]"

    print(f"Searching for: {query}")
    print("-" * 80)

    hits = esearch(query, retmax=max_results, api_key=api_key)
    if not hits:
        print("No results found.")
        return

    print(f"Found {len(hits)} gene(s)")
    print()

    response = esummary(hits, api_key=api_key)
    summary_table = response['result'] if 'result' in response else {}

    for gid in hits:
        # Hits without a summary entry are skipped.
        if gid not in summary_table:
            continue
        entry = summary_table[gid]
        organism_name = entry.get('organism', {}).get('scientificname', 'N/A')
        print(f"Gene ID: {gid}")
        print(f" Symbol: {entry.get('name', 'N/A')}")
        print(f" Description: {entry.get('description', 'N/A')}")
        print(f" Organism: {organism_name}")
        print(f" Chromosome: {entry.get('chromosome', 'N/A')}")
        print(f" Map Location: {entry.get('maplocation', 'N/A')}")
        print(f" Type: {entry.get('geneticsource', 'N/A')}")
        print()

    # Pause before returning to respect rate limits (~3 requests per second)
    time.sleep(0.34)
|
||||
|
||||
|
||||
def fetch_by_id(gene_ids: List[str], output_format: str = 'json',
                api_key: Optional[str] = None) -> None:
    """
    Print gene information for the given IDs.

    Args:
        gene_ids: List of Gene IDs
        output_format: Output format ('json', 'xml', 'text'); 'json' prints
            the summaries, any other value prints the full records
        api_key: Optional NCBI API key
    """
    if output_format != 'json':
        # Full records, in whatever format was requested
        records = efetch(gene_ids, retmode=output_format, api_key=api_key)
        print(records)
    else:
        # Summaries, pretty-printed as JSON
        print(json.dumps(esummary(gene_ids, api_key=api_key), indent=2))

    # Pause before returning to respect rate limits
    time.sleep(0.34)
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point for querying the Gene database.

    Parses the arguments and either fetches records for the given Gene
    ID(s) or runs a search and prints the summaries. When both --id and
    --search are supplied, --id takes precedence.
    """
    parser = argparse.ArgumentParser(
        description='Query NCBI Gene database using E-utilities',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Search for gene by symbol
  %(prog)s --search "BRCA1" --organism "human"

  # Fetch gene by ID
  %(prog)s --id 672 --format json

  # Complex search query
  %(prog)s --search "insulin[gene] AND diabetes[disease]"

  # Multiple gene IDs
  %(prog)s --id 672,7157,5594
"""
    )

    parser.add_argument('--search', '-s', help='Search query')
    parser.add_argument('--organism', '-o', help='Organism filter')
    parser.add_argument('--id', '-i', help='Gene ID(s), comma-separated')
    parser.add_argument('--format', '-f', default='json',
                        choices=['json', 'xml', 'text'],
                        help='Output format (default: json)')
    parser.add_argument('--max-results', '-m', type=int, default=20,
                        help='Maximum number of search results (default: 20)')
    parser.add_argument('--api-key', '-k', help='NCBI API key for higher rate limits')

    args = parser.parse_args()

    if not args.search and not args.id:
        parser.error("Either --search or --id must be provided")

    if args.id:
        # Fetch by ID; drop empty entries left by stray or trailing commas.
        gene_ids = [part.strip() for part in args.id.split(',') if part.strip()]
        if not gene_ids:
            parser.error("--id does not contain any Gene ID")
        fetch_by_id(gene_ids, output_format=args.format, api_key=args.api_key)
    else:
        # Search and summarize
        search_and_summarize(args.search, organism=args.organism,
                             max_results=args.max_results, api_key=args.api_key)
|
||||
|
||||
|
||||
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user