Files
gh-k-dense-ai-claude-scient…/skills/ensembl-database/scripts/ensembl_query.py
2025-11-30 08:30:10 +08:00

428 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Ensembl REST API Query Script
Reusable functions for common Ensembl database queries with built-in rate limiting and error handling.
Usage:
python ensembl_query.py --gene BRCA2 --species human
python ensembl_query.py --variant rs699 --species human
python ensembl_query.py --region "7:140424943-140624564" --species human
"""
import requests
import time
import json
import argparse
from typing import Dict, List, Optional, Any
class EnsemblAPIClient:
"""Client for querying the Ensembl REST API with rate limiting and error handling."""
def __init__(self, server: str = "https://rest.ensembl.org", rate_limit: int = 15):
"""
Initialize the Ensembl API client.
Args:
server: Base URL for the Ensembl REST API
rate_limit: Maximum requests per second (default 15 for anonymous users)
"""
self.server = server
self.rate_limit = rate_limit
self.request_count = 0
self.last_request_time = 0
def _rate_limit_check(self):
"""Enforce rate limiting before making requests."""
current_time = time.time()
time_since_last = current_time - self.last_request_time
if time_since_last < 1.0:
if self.request_count >= self.rate_limit:
sleep_time = 1.0 - time_since_last
time.sleep(sleep_time)
self.request_count = 0
self.last_request_time = time.time()
else:
self.request_count = 0
self.last_request_time = current_time
def _make_request(
self,
endpoint: str,
params: Optional[Dict] = None,
max_retries: int = 3,
method: str = "GET",
data: Optional[Dict] = None
) -> Any:
"""
Make an API request with error handling and retries.
Args:
endpoint: API endpoint path
params: Query parameters
max_retries: Maximum number of retry attempts
method: HTTP method (GET or POST)
data: JSON data for POST requests
Returns:
JSON response data
Raises:
Exception: If request fails after max retries
"""
headers = {"Content-Type": "application/json"}
url = f"{self.server}{endpoint}"
for attempt in range(max_retries):
self._rate_limit_check()
self.request_count += 1
try:
if method == "POST":
response = requests.post(url, headers=headers, json=data)
else:
response = requests.get(url, headers=headers, params=params)
if response.status_code == 200:
return response.json()
elif response.status_code == 429:
# Rate limited - wait and retry
retry_after = int(response.headers.get('Retry-After', 1))
print(f"Rate limited. Waiting {retry_after} seconds...")
time.sleep(retry_after)
elif response.status_code == 404:
raise Exception(f"Resource not found: {endpoint}")
else:
response.raise_for_status()
except requests.exceptions.RequestException as e:
if attempt == max_retries - 1:
raise Exception(f"Request failed after {max_retries} attempts: {e}")
time.sleep(2 ** attempt) # Exponential backoff
raise Exception(f"Failed after {max_retries} attempts")
def lookup_gene_by_symbol(self, species: str, symbol: str, expand: bool = True) -> Dict:
"""
Look up gene information by symbol.
Args:
species: Species name (e.g., 'human', 'mouse')
symbol: Gene symbol (e.g., 'BRCA2', 'TP53')
expand: Include transcript information
Returns:
Gene information dictionary
"""
endpoint = f"/lookup/symbol/{species}/{symbol}"
params = {"expand": 1} if expand else {}
return self._make_request(endpoint, params=params)
def lookup_by_id(self, ensembl_id: str, expand: bool = False) -> Dict:
"""
Look up object by Ensembl ID.
Args:
ensembl_id: Ensembl identifier (e.g., 'ENSG00000139618')
expand: Include child objects
Returns:
Object information dictionary
"""
endpoint = f"/lookup/id/{ensembl_id}"
params = {"expand": 1} if expand else {}
return self._make_request(endpoint, params=params)
def get_sequence(
self,
ensembl_id: str,
seq_type: str = "genomic",
format: str = "json"
) -> Any:
"""
Retrieve sequence by Ensembl ID.
Args:
ensembl_id: Ensembl identifier
seq_type: Sequence type ('genomic', 'cds', 'cdna', 'protein')
format: Output format ('json', 'fasta', 'text')
Returns:
Sequence data
"""
endpoint = f"/sequence/id/{ensembl_id}"
params = {"type": seq_type}
if format == "fasta":
headers = {"Content-Type": "text/x-fasta"}
url = f"{self.server}{endpoint}"
response = requests.get(url, headers=headers, params=params)
return response.text
return self._make_request(endpoint, params=params)
def get_region_sequence(
self,
species: str,
region: str,
format: str = "json"
) -> Any:
"""
Get genomic sequence for a region.
Args:
species: Species name
region: Region string (e.g., '7:140424943-140624564')
format: Output format ('json', 'fasta', 'text')
Returns:
Sequence data
"""
endpoint = f"/sequence/region/{species}/{region}"
if format == "fasta":
headers = {"Content-Type": "text/x-fasta"}
url = f"{self.server}{endpoint}"
response = requests.get(url, headers=headers)
return response.text
return self._make_request(endpoint)
def get_variant(self, species: str, variant_id: str, include_pops: bool = True) -> Dict:
"""
Get variant information by ID.
Args:
species: Species name
variant_id: Variant identifier (e.g., 'rs699')
include_pops: Include population frequencies
Returns:
Variant information dictionary
"""
endpoint = f"/variation/{species}/{variant_id}"
params = {"pops": 1} if include_pops else {}
return self._make_request(endpoint, params=params)
def predict_variant_effect(
self,
species: str,
hgvs_notation: str
) -> List[Dict]:
"""
Predict variant consequences using VEP.
Args:
species: Species name
hgvs_notation: HGVS notation (e.g., 'ENST00000288602:c.803C>T')
Returns:
List of predicted consequences
"""
endpoint = f"/vep/{species}/hgvs/{hgvs_notation}"
return self._make_request(endpoint)
def find_orthologs(
self,
ensembl_id: str,
target_species: Optional[str] = None
) -> Dict:
"""
Find orthologs for a gene.
Args:
ensembl_id: Source gene Ensembl ID
target_species: Target species (optional, returns all if not specified)
Returns:
Homology information dictionary
"""
endpoint = f"/homology/id/{ensembl_id}"
params = {}
if target_species:
params["target_species"] = target_species
return self._make_request(endpoint, params=params)
def get_region_features(
self,
species: str,
region: str,
feature_type: str = "gene"
) -> List[Dict]:
"""
Get genomic features in a region.
Args:
species: Species name
region: Region string (e.g., '7:140424943-140624564')
feature_type: Feature type ('gene', 'transcript', 'variation', etc.)
Returns:
List of features
"""
endpoint = f"/overlap/region/{species}/{region}"
params = {"feature": feature_type}
return self._make_request(endpoint, params=params)
def get_species_info(self) -> List[Dict]:
"""
Get information about all available species.
Returns:
List of species information dictionaries
"""
endpoint = "/info/species"
result = self._make_request(endpoint)
return result.get("species", [])
def get_assembly_info(self, species: str) -> Dict:
"""
Get assembly information for a species.
Args:
species: Species name
Returns:
Assembly information dictionary
"""
endpoint = f"/info/assembly/{species}"
return self._make_request(endpoint)
def map_coordinates(
self,
species: str,
asm_from: str,
region: str,
asm_to: str
) -> Dict:
"""
Map coordinates between genome assemblies.
Args:
species: Species name
asm_from: Source assembly (e.g., 'GRCh37')
region: Region string (e.g., '7:140453136-140453136')
asm_to: Target assembly (e.g., 'GRCh38')
Returns:
Mapped coordinates
"""
endpoint = f"/map/{species}/{asm_from}/{region}/{asm_to}"
return self._make_request(endpoint)
def main():
"""Command-line interface for common Ensembl queries."""
parser = argparse.ArgumentParser(
description="Query the Ensembl database via REST API"
)
parser.add_argument("--gene", help="Gene symbol to look up")
parser.add_argument("--ensembl-id", help="Ensembl ID to look up")
parser.add_argument("--variant", help="Variant ID (e.g., rs699)")
parser.add_argument("--region", help="Genomic region (chr:start-end)")
parser.add_argument(
"--species",
default="human",
help="Species name (default: human)"
)
parser.add_argument(
"--orthologs",
help="Find orthologs for gene (provide Ensembl ID)"
)
parser.add_argument(
"--target-species",
help="Target species for ortholog search"
)
parser.add_argument(
"--sequence",
action="store_true",
help="Retrieve sequence (requires --gene or --ensembl-id or --region)"
)
parser.add_argument(
"--format",
choices=["json", "fasta"],
default="json",
help="Output format (default: json)"
)
parser.add_argument(
"--assembly",
default="GRCh37",
help="For GRCh37, use grch37.rest.ensembl.org server"
)
args = parser.parse_args()
# Select appropriate server
server = "https://rest.ensembl.org"
if args.assembly.lower() == "grch37":
server = "https://grch37.rest.ensembl.org"
client = EnsemblAPIClient(server=server)
try:
if args.gene:
print(f"Looking up gene: {args.gene}")
result = client.lookup_gene_by_symbol(args.species, args.gene)
if args.sequence:
print(f"\nRetrieving sequence for {result['id']}...")
seq_result = client.get_sequence(
result['id'],
format=args.format
)
print(json.dumps(seq_result, indent=2) if args.format == "json" else seq_result)
else:
print(json.dumps(result, indent=2))
elif args.ensembl_id:
print(f"Looking up ID: {args.ensembl_id}")
result = client.lookup_by_id(args.ensembl_id, expand=True)
if args.sequence:
print(f"\nRetrieving sequence...")
seq_result = client.get_sequence(
args.ensembl_id,
format=args.format
)
print(json.dumps(seq_result, indent=2) if args.format == "json" else seq_result)
else:
print(json.dumps(result, indent=2))
elif args.variant:
print(f"Looking up variant: {args.variant}")
result = client.get_variant(args.species, args.variant)
print(json.dumps(result, indent=2))
elif args.region:
if args.sequence:
print(f"Retrieving sequence for region: {args.region}")
result = client.get_region_sequence(
args.species,
args.region,
format=args.format
)
print(json.dumps(result, indent=2) if args.format == "json" else result)
else:
print(f"Finding features in region: {args.region}")
result = client.get_region_features(args.species, args.region)
print(json.dumps(result, indent=2))
elif args.orthologs:
print(f"Finding orthologs for: {args.orthologs}")
result = client.find_orthologs(
args.orthologs,
target_species=args.target_species
)
print(json.dumps(result, indent=2))
else:
parser.print_help()
except Exception as e:
print(f"Error: {e}")
return 1
return 0
if __name__ == "__main__":
exit(main())