252 lines
7.2 KiB
Python
252 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
Query NCBI Gene database using E-utilities.

This script provides access to ESearch, ESummary, and EFetch functions
for searching and retrieving gene information.
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
import time
|
|
import urllib.parse
|
|
import urllib.request
|
|
from typing import Optional, Dict, List, Any
|
|
from xml.etree import ElementTree as ET
|
|
|
|
|
|
# Entrez database that every request in this script targets.
DB = "gene"

# Common prefix of the E-utilities endpoints; each helper appends its own
# "<tool>.fcgi" name plus the URL-encoded query string.
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
|
|
|
|
|
def esearch(query: str, retmax: int = 20, api_key: Optional[str] = None) -> List[str]:
    """
    Search NCBI Gene database and return list of Gene IDs.

    Args:
        query: Search query (e.g., "BRCA1[gene] AND human[organism]")
        retmax: Maximum number of results to return
        api_key: Optional NCBI API key for higher rate limits

    Returns:
        List of Gene IDs as strings. An empty list is returned when the
        query is blank, when the request fails, or when the response does
        not contain an ID list; the reason is printed to stderr.
    """
    # A blank term cannot match anything, so skip the network round trip.
    if not query or not query.strip():
        print("Error: Empty search query", file=sys.stderr)
        return []

    params = {
        'db': DB,
        'term': query,
        'retmax': retmax,
        'retmode': 'json',
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}esearch.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Bound the wait so an unresponsive server cannot hang the caller;
        # a timeout surfaces through the generic handler below.
        with urllib.request.urlopen(url, timeout=30) as response:
            data = json.loads(response.read().decode())

        if 'esearchresult' in data and 'idlist' in data['esearchresult']:
            return data['esearchresult']['idlist']

        print("Error: Unexpected response format", file=sys.stderr)
        return []

    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return []
    except Exception as e:
        # Best-effort CLI helper: report any other failure (network error,
        # malformed JSON, ...) and fall back to "no results".
        print(f"Error: {e}", file=sys.stderr)
        return []
|
|
|
|
|
|
def esummary(gene_ids: List[str], api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Get document summaries for Gene IDs.

    Args:
        gene_ids: List of Gene IDs; blank entries are ignored
        api_key: Optional NCBI API key

    Returns:
        Dictionary decoded from the JSON response. An empty dictionary is
        returned when no usable ID is given or when the request fails; the
        reason is printed to stderr.
    """
    # Drop blank entries (e.g. from a trailing comma) and avoid sending a
    # request that has no IDs at all.
    ids = [gid.strip() for gid in (gene_ids or []) if gid and gid.strip()]
    if not ids:
        print("Error: No Gene IDs provided", file=sys.stderr)
        return {}

    params = {
        'db': DB,
        'id': ','.join(ids),
        'retmode': 'json',
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}esummary.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Bound the wait so an unresponsive server cannot hang the caller.
        with urllib.request.urlopen(url, timeout=30) as response:
            data = json.loads(response.read().decode())
        return data
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return {}
    except Exception as e:
        # Best-effort: report any other failure and return an empty result.
        print(f"Error: {e}", file=sys.stderr)
        return {}
|
|
|
|
|
|
def efetch(gene_ids: List[str], retmode: str = 'xml', api_key: Optional[str] = None) -> str:
    """
    Fetch full gene records.

    Args:
        gene_ids: List of Gene IDs; blank entries are ignored
        retmode: Return format ('xml', 'text', 'asn.1')
        api_key: Optional NCBI API key

    Returns:
        Gene records as string in requested format. An empty string is
        returned when no usable ID is given or when the request fails; the
        reason is printed to stderr.
    """
    # Drop blank entries (e.g. from a trailing comma) and avoid sending a
    # request that has no IDs at all.
    ids = [gid.strip() for gid in (gene_ids or []) if gid and gid.strip()]
    if not ids:
        print("Error: No Gene IDs provided", file=sys.stderr)
        return ""

    params = {
        'db': DB,
        'id': ','.join(ids),
        'retmode': retmode,
    }

    if api_key:
        params['api_key'] = api_key

    url = f"{BASE_URL}efetch.fcgi?{urllib.parse.urlencode(params)}"

    try:
        # Bound the wait so an unresponsive server cannot hang the caller.
        with urllib.request.urlopen(url, timeout=30) as response:
            return response.read().decode()
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
        return ""
    except Exception as e:
        # Best-effort: report any other failure and return an empty result.
        print(f"Error: {e}", file=sys.stderr)
        return ""
|
|
|
|
|
|
def search_and_summarize(query: str, organism: Optional[str] = None,
                         max_results: int = 20, api_key: Optional[str] = None) -> None:
    """
    Search for genes and display summaries.

    Prints the effective query, the number of matching Gene IDs, and one
    block of summary fields per gene to stdout. If the summaries cannot be
    retrieved, an error is printed to stderr instead of the per-gene blocks.

    Args:
        query: Gene search query
        organism: Optional organism filter
        max_results: Maximum number of results
        api_key: Optional NCBI API key
    """
    # Add organism filter if provided, unless the query already has one
    if organism and '[organism]' not in query.lower():
        query = f"{query} AND {organism}[organism]"

    print(f"Searching for: {query}")
    print("-" * 80)

    # Search for gene IDs
    gene_ids = esearch(query, retmax=max_results, api_key=api_key)

    if not gene_ids:
        print("No results found.")
        return

    print(f"Found {len(gene_ids)} gene(s)")
    print()

    # Get summaries
    summaries = esummary(gene_ids, api_key=api_key)

    if 'result' in summaries:
        results = summaries['result']
        for gene_id in gene_ids:
            # IDs without an entry in the response are skipped.
            if gene_id not in results:
                continue
            gene = results[gene_id]
            print(f"Gene ID: {gene_id}")
            print(f" Symbol: {gene.get('name', 'N/A')}")
            print(f" Description: {gene.get('description', 'N/A')}")
            print(f" Organism: {gene.get('organism', {}).get('scientificname', 'N/A')}")
            print(f" Chromosome: {gene.get('chromosome', 'N/A')}")
            print(f" Map Location: {gene.get('maplocation', 'N/A')}")
            print(f" Type: {gene.get('geneticsource', 'N/A')}")
            print()
    else:
        # Without this the user would see "Found N gene(s)" followed by
        # nothing, with no hint that the summary lookup failed.
        print("Error: Could not retrieve gene summaries", file=sys.stderr)

    # Respect rate limits
    time.sleep(0.34)  # ~3 requests per second
|
|
|
|
|
|
def fetch_by_id(gene_ids: List[str], output_format: str = 'json',
                api_key: Optional[str] = None) -> None:
    """
    Fetch and display gene information by ID.

    For 'json' the ESummary response is pretty-printed; for any other
    format the raw EFetch payload is printed as returned.

    Args:
        gene_ids: List of Gene IDs
        output_format: Output format ('json', 'xml', 'text')
        api_key: Optional NCBI API key
    """
    wants_json = output_format == 'json'

    if wants_json:
        # Summaries are already JSON; re-serialize them with indentation.
        rendered = json.dumps(esummary(gene_ids, api_key=api_key), indent=2)
    else:
        # Full records in the requested retmode.
        rendered = efetch(gene_ids, retmode=output_format, api_key=api_key)

    print(rendered)

    # Pause after the request to respect rate limits.
    time.sleep(0.34)
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Parses the arguments and either fetches records for the Gene IDs given
    with --id or runs a search for the query given with --search. When both
    are supplied, --id takes precedence. Exits through ``parser.error``
    when neither option is given or when --id contains no usable ID.
    """
    parser = argparse.ArgumentParser(
        description='Query NCBI Gene database using E-utilities',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Search for gene by symbol
%(prog)s --search "BRCA1" --organism "human"

# Fetch gene by ID
%(prog)s --id 672 --format json

# Complex search query
%(prog)s --search "insulin[gene] AND diabetes[disease]"

# Multiple gene IDs
%(prog)s --id 672,7157,5594
"""
    )

    parser.add_argument('--search', '-s', help='Search query')
    parser.add_argument('--organism', '-o', help='Organism filter')
    parser.add_argument('--id', '-i', help='Gene ID(s), comma-separated')
    parser.add_argument('--format', '-f', default='json',
                        choices=['json', 'xml', 'text'],
                        help='Output format (default: json)')
    parser.add_argument('--max-results', '-m', type=int, default=20,
                        help='Maximum number of search results (default: 20)')
    parser.add_argument('--api-key', '-k', help='NCBI API key for higher rate limits')

    args = parser.parse_args()

    if not args.search and not args.id:
        parser.error("Either --search or --id must be provided")

    if args.id:
        # Fetch by ID. Blank pieces (trailing or doubled commas) are dropped
        # so no empty Gene ID is ever sent downstream.
        gene_ids = [part.strip() for part in args.id.split(',') if part.strip()]
        if not gene_ids:
            parser.error("--id must contain at least one Gene ID")
        fetch_by_id(gene_ids, output_format=args.format, api_key=args.api_key)
    else:
        # Search and summarize
        search_and_summarize(args.search, organism=args.organism,
                             max_results=args.max_results, api_key=args.api_key)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|