#!/usr/bin/env python3 """ Ensembl REST API Query Script Reusable functions for common Ensembl database queries with built-in rate limiting and error handling. Usage: python ensembl_query.py --gene BRCA2 --species human python ensembl_query.py --variant rs699 --species human python ensembl_query.py --region "7:140424943-140624564" --species human """ import requests import time import json import argparse from typing import Dict, List, Optional, Any class EnsemblAPIClient: """Client for querying the Ensembl REST API with rate limiting and error handling.""" def __init__(self, server: str = "https://rest.ensembl.org", rate_limit: int = 15): """ Initialize the Ensembl API client. Args: server: Base URL for the Ensembl REST API rate_limit: Maximum requests per second (default 15 for anonymous users) """ self.server = server self.rate_limit = rate_limit self.request_count = 0 self.last_request_time = 0 def _rate_limit_check(self): """Enforce rate limiting before making requests.""" current_time = time.time() time_since_last = current_time - self.last_request_time if time_since_last < 1.0: if self.request_count >= self.rate_limit: sleep_time = 1.0 - time_since_last time.sleep(sleep_time) self.request_count = 0 self.last_request_time = time.time() else: self.request_count = 0 self.last_request_time = current_time def _make_request( self, endpoint: str, params: Optional[Dict] = None, max_retries: int = 3, method: str = "GET", data: Optional[Dict] = None ) -> Any: """ Make an API request with error handling and retries. Args: endpoint: API endpoint path params: Query parameters max_retries: Maximum number of retry attempts method: HTTP method (GET or POST) data: JSON data for POST requests Returns: JSON response data Raises: Exception: If request fails after max retries """ headers = {"Content-Type": "application/json"} url = f"{self.server}{endpoint}" for attempt in range(max_retries): self._rate_limit_check() self.request_count += 1 try: if method == "POST": response = requests.post(url, headers=headers, json=data) else: response = requests.get(url, headers=headers, params=params) if response.status_code == 200: return response.json() elif response.status_code == 429: # Rate limited - wait and retry retry_after = int(response.headers.get('Retry-After', 1)) print(f"Rate limited. Waiting {retry_after} seconds...") time.sleep(retry_after) elif response.status_code == 404: raise Exception(f"Resource not found: {endpoint}") else: response.raise_for_status() except requests.exceptions.RequestException as e: if attempt == max_retries - 1: raise Exception(f"Request failed after {max_retries} attempts: {e}") time.sleep(2 ** attempt) # Exponential backoff raise Exception(f"Failed after {max_retries} attempts") def lookup_gene_by_symbol(self, species: str, symbol: str, expand: bool = True) -> Dict: """ Look up gene information by symbol. Args: species: Species name (e.g., 'human', 'mouse') symbol: Gene symbol (e.g., 'BRCA2', 'TP53') expand: Include transcript information Returns: Gene information dictionary """ endpoint = f"/lookup/symbol/{species}/{symbol}" params = {"expand": 1} if expand else {} return self._make_request(endpoint, params=params) def lookup_by_id(self, ensembl_id: str, expand: bool = False) -> Dict: """ Look up object by Ensembl ID. Args: ensembl_id: Ensembl identifier (e.g., 'ENSG00000139618') expand: Include child objects Returns: Object information dictionary """ endpoint = f"/lookup/id/{ensembl_id}" params = {"expand": 1} if expand else {} return self._make_request(endpoint, params=params) def get_sequence( self, ensembl_id: str, seq_type: str = "genomic", format: str = "json" ) -> Any: """ Retrieve sequence by Ensembl ID. Args: ensembl_id: Ensembl identifier seq_type: Sequence type ('genomic', 'cds', 'cdna', 'protein') format: Output format ('json', 'fasta', 'text') Returns: Sequence data """ endpoint = f"/sequence/id/{ensembl_id}" params = {"type": seq_type} if format == "fasta": headers = {"Content-Type": "text/x-fasta"} url = f"{self.server}{endpoint}" response = requests.get(url, headers=headers, params=params) return response.text return self._make_request(endpoint, params=params) def get_region_sequence( self, species: str, region: str, format: str = "json" ) -> Any: """ Get genomic sequence for a region. Args: species: Species name region: Region string (e.g., '7:140424943-140624564') format: Output format ('json', 'fasta', 'text') Returns: Sequence data """ endpoint = f"/sequence/region/{species}/{region}" if format == "fasta": headers = {"Content-Type": "text/x-fasta"} url = f"{self.server}{endpoint}" response = requests.get(url, headers=headers) return response.text return self._make_request(endpoint) def get_variant(self, species: str, variant_id: str, include_pops: bool = True) -> Dict: """ Get variant information by ID. Args: species: Species name variant_id: Variant identifier (e.g., 'rs699') include_pops: Include population frequencies Returns: Variant information dictionary """ endpoint = f"/variation/{species}/{variant_id}" params = {"pops": 1} if include_pops else {} return self._make_request(endpoint, params=params) def predict_variant_effect( self, species: str, hgvs_notation: str ) -> List[Dict]: """ Predict variant consequences using VEP. Args: species: Species name hgvs_notation: HGVS notation (e.g., 'ENST00000288602:c.803C>T') Returns: List of predicted consequences """ endpoint = f"/vep/{species}/hgvs/{hgvs_notation}" return self._make_request(endpoint) def find_orthologs( self, ensembl_id: str, target_species: Optional[str] = None ) -> Dict: """ Find orthologs for a gene. Args: ensembl_id: Source gene Ensembl ID target_species: Target species (optional, returns all if not specified) Returns: Homology information dictionary """ endpoint = f"/homology/id/{ensembl_id}" params = {} if target_species: params["target_species"] = target_species return self._make_request(endpoint, params=params) def get_region_features( self, species: str, region: str, feature_type: str = "gene" ) -> List[Dict]: """ Get genomic features in a region. Args: species: Species name region: Region string (e.g., '7:140424943-140624564') feature_type: Feature type ('gene', 'transcript', 'variation', etc.) Returns: List of features """ endpoint = f"/overlap/region/{species}/{region}" params = {"feature": feature_type} return self._make_request(endpoint, params=params) def get_species_info(self) -> List[Dict]: """ Get information about all available species. Returns: List of species information dictionaries """ endpoint = "/info/species" result = self._make_request(endpoint) return result.get("species", []) def get_assembly_info(self, species: str) -> Dict: """ Get assembly information for a species. Args: species: Species name Returns: Assembly information dictionary """ endpoint = f"/info/assembly/{species}" return self._make_request(endpoint) def map_coordinates( self, species: str, asm_from: str, region: str, asm_to: str ) -> Dict: """ Map coordinates between genome assemblies. Args: species: Species name asm_from: Source assembly (e.g., 'GRCh37') region: Region string (e.g., '7:140453136-140453136') asm_to: Target assembly (e.g., 'GRCh38') Returns: Mapped coordinates """ endpoint = f"/map/{species}/{asm_from}/{region}/{asm_to}" return self._make_request(endpoint) def main(): """Command-line interface for common Ensembl queries.""" parser = argparse.ArgumentParser( description="Query the Ensembl database via REST API" ) parser.add_argument("--gene", help="Gene symbol to look up") parser.add_argument("--ensembl-id", help="Ensembl ID to look up") parser.add_argument("--variant", help="Variant ID (e.g., rs699)") parser.add_argument("--region", help="Genomic region (chr:start-end)") parser.add_argument( "--species", default="human", help="Species name (default: human)" ) parser.add_argument( "--orthologs", help="Find orthologs for gene (provide Ensembl ID)" ) parser.add_argument( "--target-species", help="Target species for ortholog search" ) parser.add_argument( "--sequence", action="store_true", help="Retrieve sequence (requires --gene or --ensembl-id or --region)" ) parser.add_argument( "--format", choices=["json", "fasta"], default="json", help="Output format (default: json)" ) parser.add_argument( "--assembly", default="GRCh37", help="For GRCh37, use grch37.rest.ensembl.org server" ) args = parser.parse_args() # Select appropriate server server = "https://rest.ensembl.org" if args.assembly.lower() == "grch37": server = "https://grch37.rest.ensembl.org" client = EnsemblAPIClient(server=server) try: if args.gene: print(f"Looking up gene: {args.gene}") result = client.lookup_gene_by_symbol(args.species, args.gene) if args.sequence: print(f"\nRetrieving sequence for {result['id']}...") seq_result = client.get_sequence( result['id'], format=args.format ) print(json.dumps(seq_result, indent=2) if args.format == "json" else seq_result) else: print(json.dumps(result, indent=2)) elif args.ensembl_id: print(f"Looking up ID: {args.ensembl_id}") result = client.lookup_by_id(args.ensembl_id, expand=True) if args.sequence: print(f"\nRetrieving sequence...") seq_result = client.get_sequence( args.ensembl_id, format=args.format ) print(json.dumps(seq_result, indent=2) if args.format == "json" else seq_result) else: print(json.dumps(result, indent=2)) elif args.variant: print(f"Looking up variant: {args.variant}") result = client.get_variant(args.species, args.variant) print(json.dumps(result, indent=2)) elif args.region: if args.sequence: print(f"Retrieving sequence for region: {args.region}") result = client.get_region_sequence( args.species, args.region, format=args.format ) print(json.dumps(result, indent=2) if args.format == "json" else result) else: print(f"Finding features in region: {args.region}") result = client.get_region_features(args.species, args.region) print(json.dumps(result, indent=2)) elif args.orthologs: print(f"Finding orthologs for: {args.orthologs}") result = client.find_orthologs( args.orthologs, target_species=args.target_species ) print(json.dumps(result, indent=2)) else: parser.print_help() except Exception as e: print(f"Error: {e}") return 1 return 0 if __name__ == "__main__": exit(main())