Files
2025-11-30 08:30:10 +08:00

252 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Query NCBI Gene database using E-utilities.
This script provides access to ESearch, ESummary, and EFetch functions
for searching and retrieving gene information.
"""
import argparse
import json
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Optional, Dict, List, Any
from xml.etree import ElementTree as ET
# Common URL prefix for the E-utilities endpoints; each request function
# appends its own script name (esearch.fcgi, esummary.fcgi, efetch.fcgi).
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# Value sent as the `db` request parameter by every request function in this script.
DB = "gene"
def esearch(query: str, retmax: int = 20, api_key: Optional[str] = None,
            timeout: float = 30.0) -> List[str]:
    """
    Search NCBI Gene database and return list of Gene IDs.

    Failures are reported on stderr and yield an empty list instead of
    raising, so callers can treat "no IDs" as the single failure signal.

    Args:
        query: Search query (e.g., "BRCA1[gene] AND human[organism]")
        retmax: Maximum number of results to return
        api_key: Optional NCBI API key for higher rate limits
        timeout: Seconds to wait on the network before giving up

    Returns:
        List of Gene IDs as strings; empty when the query is blank, the
        request fails, or the response does not contain an ID list
    """
    # A blank term cannot match anything; skip the network round-trip.
    if not query or not query.strip():
        print("Error: Empty search query", file=sys.stderr)
        return []

    params = {
        'db': DB,
        'term': query,
        'retmax': retmax,
        'retmode': 'json',
    }
    if api_key:
        params['api_key'] = api_key
    url = f"{BASE_URL}esearch.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Without a timeout urlopen can block forever on a stalled connection.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            data = json.loads(response.read().decode('utf-8'))
        if 'esearchresult' in data and 'idlist' in data['esearchresult']:
            return data['esearchresult']['idlist']
        print("Error: Unexpected response format", file=sys.stderr)
        return []
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return []
    except Exception as e:
        # Best-effort CLI helper: timeouts, DNS failures and malformed JSON
        # are all reported the same way and mapped to an empty result.
        print(f"Error: {e}", file=sys.stderr)
        return []
def esummary(gene_ids: List[str], api_key: Optional[str] = None,
             timeout: float = 30.0) -> Dict[str, Any]:
    """
    Get document summaries for Gene IDs.

    Failures are reported on stderr and yield an empty dictionary instead
    of raising.

    Args:
        gene_ids: List of Gene IDs; blank entries are ignored
        api_key: Optional NCBI API key
        timeout: Seconds to wait on the network before giving up

    Returns:
        The decoded JSON response as a dictionary; empty when no usable
        ID was supplied or the request fails
    """
    # Normalise to stripped strings so stray blanks (or integer IDs) do not
    # produce a malformed `id` parameter or a TypeError in the join below.
    ids = [str(gene_id).strip() for gene_id in (gene_ids or [])]
    ids = [gene_id for gene_id in ids if gene_id]
    if not ids:
        print("Error: No gene IDs provided", file=sys.stderr)
        return {}

    params = {
        'db': DB,
        'id': ','.join(ids),
        'retmode': 'json',
    }
    if api_key:
        params['api_key'] = api_key
    url = f"{BASE_URL}esummary.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Without a timeout urlopen can block forever on a stalled connection.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return {}
    except Exception as e:
        # Best-effort: any other failure is reported and mapped to {}.
        print(f"Error: {e}", file=sys.stderr)
        return {}
def efetch(gene_ids: List[str], retmode: str = 'xml', api_key: Optional[str] = None,
           timeout: float = 30.0) -> str:
    """
    Fetch full gene records.

    Failures are reported on stderr and yield an empty string instead of
    raising.

    Args:
        gene_ids: List of Gene IDs; blank entries are ignored
        retmode: Return format ('xml', 'text', 'asn.1')
        api_key: Optional NCBI API key
        timeout: Seconds to wait on the network before giving up

    Returns:
        Gene records as string in requested format; empty when no usable
        ID was supplied or the request fails
    """
    # Normalise to stripped strings so stray blanks (or integer IDs) do not
    # produce a malformed `id` parameter or a TypeError in the join below.
    ids = [str(gene_id).strip() for gene_id in (gene_ids or [])]
    ids = [gene_id for gene_id in ids if gene_id]
    if not ids:
        print("Error: No gene IDs provided", file=sys.stderr)
        return ""

    params = {
        'db': DB,
        'id': ','.join(ids),
        'retmode': retmode,
    }
    if api_key:
        params['api_key'] = api_key
    url = f"{BASE_URL}efetch.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Without a timeout urlopen can block forever on a stalled connection.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return ""
    except Exception as e:
        # Best-effort: any other failure is reported and mapped to "".
        print(f"Error: {e}", file=sys.stderr)
        return ""
def search_and_summarize(query: str, organism: Optional[str] = None,
                         max_results: int = 20, api_key: Optional[str] = None) -> None:
    """
    Run a gene search and print a short summary of every hit.

    Args:
        query: Gene search query
        organism: Optional organism filter; appended to the query unless
            the query already carries an [organism] field tag
        max_results: Maximum number of results
        api_key: Optional NCBI API key
    """
    # Only append the organism clause when the query does not already have one.
    needs_organism_clause = bool(organism) and '[organism]' not in query.lower()
    if needs_organism_clause:
        query = f"{query} AND {organism}[organism]"

    print(f"Searching for: {query}")
    print("-" * 80)

    # Step 1: resolve the query to Gene IDs.
    found_ids = esearch(query, retmax=max_results, api_key=api_key)
    if not found_ids:
        print("No results found.")
        return

    print(f"Found {len(found_ids)} gene(s)")
    print()

    # Step 2: pull the summaries and print them in search-result order.
    response = esummary(found_ids, api_key=api_key)
    records = response['result'] if 'result' in response else {}
    for uid in found_ids:
        if uid not in records:
            continue
        entry = records[uid]
        print(f"Gene ID: {uid}")
        print(f" Symbol: {entry.get('name', 'N/A')}")
        print(f" Description: {entry.get('description', 'N/A')}")
        print(f" Organism: {entry.get('organism', {}).get('scientificname', 'N/A')}")
        print(f" Chromosome: {entry.get('chromosome', 'N/A')}")
        print(f" Map Location: {entry.get('maplocation', 'N/A')}")
        print(f" Type: {entry.get('geneticsource', 'N/A')}")
        print()

    # Pause so back-to-back invocations stay near 3 requests per second.
    time.sleep(0.34)
def fetch_by_id(gene_ids: List[str], output_format: str = 'json',
                api_key: Optional[str] = None) -> None:
    """
    Retrieve the given genes and print them to stdout.

    Args:
        gene_ids: List of Gene IDs
        output_format: Output format ('json', 'xml', 'text')
        api_key: Optional NCBI API key
    """
    wants_json = output_format == 'json'
    if not wants_json:
        # Any non-JSON format is passed to efetch as its retmode.
        raw_records = efetch(gene_ids, retmode=output_format, api_key=api_key)
        print(raw_records)
    else:
        # JSON output is served from the summary endpoint and pretty-printed.
        summary_doc = esummary(gene_ids, api_key=api_key)
        print(json.dumps(summary_doc, indent=2))

    # Pause so back-to-back invocations stay within the rate limit.
    time.sleep(0.34)
def main(argv: Optional[List[str]] = None) -> None:
    """
    Command-line entry point: parse arguments and dispatch.

    When --id is given the genes are fetched by ID (this takes precedence
    over --search); otherwise the --search query is run and summarized.

    Args:
        argv: Argument list to parse; None makes argparse read sys.argv[1:]

    Raises:
        SystemExit: Raised by argparse on invalid usage (exit status 2),
            including an --id value that contains no usable Gene ID
    """
    parser = argparse.ArgumentParser(
        description='Query NCBI Gene database using E-utilities',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Search for gene by symbol
%(prog)s --search "BRCA1" --organism "human"
# Fetch gene by ID
%(prog)s --id 672 --format json
# Complex search query
%(prog)s --search "insulin[gene] AND diabetes[disease]"
# Multiple gene IDs
%(prog)s --id 672,7157,5594
"""
    )
    parser.add_argument('--search', '-s', help='Search query')
    parser.add_argument('--organism', '-o', help='Organism filter')
    parser.add_argument('--id', '-i', help='Gene ID(s), comma-separated')
    parser.add_argument('--format', '-f', default='json',
                        choices=['json', 'xml', 'text'],
                        help='Output format (default: json)')
    parser.add_argument('--max-results', '-m', type=int, default=20,
                        help='Maximum number of search results (default: 20)')
    parser.add_argument('--api-key', '-k', help='NCBI API key for higher rate limits')
    args = parser.parse_args(argv)

    if not args.search and not args.id:
        parser.error("Either --search or --id must be provided")

    if args.id:
        # Drop blank fragments (e.g. a trailing comma) so empty IDs are never
        # sent to the API; the loop variable avoids shadowing builtin `id`.
        gene_ids = [part.strip() for part in args.id.split(',') if part.strip()]
        if not gene_ids:
            parser.error("--id must contain at least one non-empty Gene ID")
        fetch_by_id(gene_ids, output_format=args.format, api_key=args.api_key)
    else:
        search_and_summarize(args.search, organism=args.organism,
                             max_results=args.max_results, api_key=args.api_key)


if __name__ == '__main__':
    main()