Initial commit
This commit is contained in:
256
skills/uniprot-database/scripts/uniprot_client.py
Normal file
256
skills/uniprot-database/scripts/uniprot_client.py
Normal file
@@ -0,0 +1,256 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
UniProt REST API Client
|
||||
|
||||
A Python client for interacting with the UniProt REST API.
|
||||
Provides helper functions for common operations including search,
|
||||
retrieval, ID mapping, and streaming.
|
||||
|
||||
Usage examples:
|
||||
# Search for proteins
|
||||
results = search_proteins("insulin AND organism_name:human", format="json")
|
||||
|
||||
# Get a single protein
|
||||
protein = get_protein("P12345", format="fasta")
|
||||
|
||||
# Map IDs
|
||||
mapped = map_ids(["P12345", "P04637"], from_db="UniProtKB_AC-ID", to_db="PDB")
|
||||
|
||||
# Stream large results
|
||||
for batch in stream_results("taxonomy_id:9606 AND reviewed:true", format="fasta"):
|
||||
process(batch)
|
||||
"""
|
||||
|
||||
import requests
|
||||
import time
|
||||
import json
|
||||
from typing import List, Dict, Optional, Generator
|
||||
from urllib.parse import urlencode
|
||||
|
||||
BASE_URL = "https://rest.uniprot.org"
|
||||
POLLING_INTERVAL = 3 # seconds
|
||||
|
||||
|
||||
def search_proteins(query: str, format: str = "json",
|
||||
fields: Optional[List[str]] = None,
|
||||
size: int = 25) -> Dict:
|
||||
"""
|
||||
Search UniProt database with a query.
|
||||
|
||||
Args:
|
||||
query: Search query (e.g., "insulin AND organism_name:human")
|
||||
format: Response format (json, tsv, xlsx, xml, fasta, txt, rdf)
|
||||
fields: List of fields to return (e.g., ["accession", "gene_names", "organism_name"])
|
||||
size: Number of results per page (default 25, max 500)
|
||||
|
||||
Returns:
|
||||
Response data in requested format
|
||||
"""
|
||||
endpoint = f"{BASE_URL}/uniprotkb/search"
|
||||
|
||||
params = {
|
||||
"query": query,
|
||||
"format": format,
|
||||
"size": size
|
||||
}
|
||||
|
||||
if fields:
|
||||
params["fields"] = ",".join(fields)
|
||||
|
||||
response = requests.get(endpoint, params=params)
|
||||
response.raise_for_status()
|
||||
|
||||
if format == "json":
|
||||
return response.json()
|
||||
else:
|
||||
return response.text
|
||||
|
||||
|
||||
def get_protein(accession: str, format: str = "json") -> str:
|
||||
"""
|
||||
Retrieve a single protein entry by accession number.
|
||||
|
||||
Args:
|
||||
accession: UniProt accession number (e.g., "P12345")
|
||||
format: Response format (json, txt, xml, fasta, gff, rdf)
|
||||
|
||||
Returns:
|
||||
Protein data in requested format
|
||||
"""
|
||||
endpoint = f"{BASE_URL}/uniprotkb/{accession}.{format}"
|
||||
|
||||
response = requests.get(endpoint)
|
||||
response.raise_for_status()
|
||||
|
||||
if format == "json":
|
||||
return response.json()
|
||||
else:
|
||||
return response.text
|
||||
|
||||
|
||||
def batch_retrieve(accessions: List[str], format: str = "json",
|
||||
fields: Optional[List[str]] = None) -> str:
|
||||
"""
|
||||
Retrieve multiple protein entries efficiently.
|
||||
|
||||
Args:
|
||||
accessions: List of UniProt accession numbers
|
||||
format: Response format
|
||||
fields: List of fields to return
|
||||
|
||||
Returns:
|
||||
Combined results in requested format
|
||||
"""
|
||||
query = " OR ".join([f"accession:{acc}" for acc in accessions])
|
||||
return search_proteins(query, format=format, fields=fields, size=len(accessions))
|
||||
|
||||
|
||||
def stream_results(query: str, format: str = "fasta",
|
||||
fields: Optional[List[str]] = None,
|
||||
chunk_size: int = 8192) -> Generator[str, None, None]:
|
||||
"""
|
||||
Stream large result sets without pagination.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
format: Response format
|
||||
fields: List of fields to return
|
||||
chunk_size: Size of chunks to yield
|
||||
|
||||
Yields:
|
||||
Chunks of response data
|
||||
"""
|
||||
endpoint = f"{BASE_URL}/uniprotkb/stream"
|
||||
|
||||
params = {
|
||||
"query": query,
|
||||
"format": format
|
||||
}
|
||||
|
||||
if fields:
|
||||
params["fields"] = ",".join(fields)
|
||||
|
||||
response = requests.get(endpoint, params=params, stream=True)
|
||||
response.raise_for_status()
|
||||
|
||||
for chunk in response.iter_content(chunk_size=chunk_size, decode_unicode=True):
|
||||
if chunk:
|
||||
yield chunk
|
||||
|
||||
|
||||
def map_ids(ids: List[str], from_db: str, to_db: str,
|
||||
format: str = "json") -> Dict:
|
||||
"""
|
||||
Map protein identifiers between different database systems.
|
||||
|
||||
Args:
|
||||
ids: List of identifiers to map (max 100,000)
|
||||
from_db: Source database (e.g., "UniProtKB_AC-ID", "Gene_Name")
|
||||
to_db: Target database (e.g., "PDB", "Ensembl", "RefSeq_Protein")
|
||||
format: Response format
|
||||
|
||||
Returns:
|
||||
Mapping results
|
||||
|
||||
Note:
|
||||
- Maximum 100,000 IDs per job
|
||||
- Results stored for 7 days
|
||||
- See id_mapping_databases.md for all supported databases
|
||||
"""
|
||||
if len(ids) > 100000:
|
||||
raise ValueError("Maximum 100,000 IDs allowed per mapping job")
|
||||
|
||||
# Step 1: Submit job
|
||||
submit_endpoint = f"{BASE_URL}/idmapping/run"
|
||||
|
||||
data = {
|
||||
"from": from_db,
|
||||
"to": to_db,
|
||||
"ids": ",".join(ids)
|
||||
}
|
||||
|
||||
response = requests.post(submit_endpoint, data=data)
|
||||
response.raise_for_status()
|
||||
job_id = response.json()["jobId"]
|
||||
|
||||
# Step 2: Poll for completion
|
||||
status_endpoint = f"{BASE_URL}/idmapping/status/{job_id}"
|
||||
|
||||
while True:
|
||||
response = requests.get(status_endpoint)
|
||||
response.raise_for_status()
|
||||
status = response.json()
|
||||
|
||||
if "results" in status or "failedIds" in status:
|
||||
break
|
||||
|
||||
time.sleep(POLLING_INTERVAL)
|
||||
|
||||
# Step 3: Retrieve results
|
||||
results_endpoint = f"{BASE_URL}/idmapping/results/{job_id}"
|
||||
|
||||
params = {"format": format}
|
||||
response = requests.get(results_endpoint, params=params)
|
||||
response.raise_for_status()
|
||||
|
||||
if format == "json":
|
||||
return response.json()
|
||||
else:
|
||||
return response.text
|
||||
|
||||
|
||||
def get_available_fields() -> List[Dict]:
|
||||
"""
|
||||
Get list of all available fields for queries.
|
||||
|
||||
Returns:
|
||||
List of field definitions with names and descriptions
|
||||
"""
|
||||
endpoint = f"{BASE_URL}/configure/uniprotkb/result-fields"
|
||||
|
||||
response = requests.get(endpoint)
|
||||
response.raise_for_status()
|
||||
|
||||
return response.json()
|
||||
|
||||
|
||||
def get_id_mapping_databases() -> Dict:
|
||||
"""
|
||||
Get list of all supported databases for ID mapping.
|
||||
|
||||
Returns:
|
||||
Dictionary of database groups and their supported databases
|
||||
"""
|
||||
endpoint = f"{BASE_URL}/configure/idmapping/fields"
|
||||
|
||||
response = requests.get(endpoint)
|
||||
response.raise_for_status()
|
||||
|
||||
return response.json()
|
||||
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
|
||||
# Example 1: Search for human insulin proteins
|
||||
print("Searching for human insulin proteins...")
|
||||
results = search_proteins(
|
||||
"insulin AND organism_name:human AND reviewed:true",
|
||||
format="json",
|
||||
fields=["accession", "id", "gene_names", "protein_name"],
|
||||
size=5
|
||||
)
|
||||
print(json.dumps(results, indent=2))
|
||||
|
||||
# Example 2: Get a specific protein in FASTA format
|
||||
print("\nRetrieving protein P01308 (human insulin)...")
|
||||
protein = get_protein("P01308", format="fasta")
|
||||
print(protein)
|
||||
|
||||
# Example 3: Map UniProt IDs to PDB IDs
|
||||
print("\nMapping UniProt IDs to PDB...")
|
||||
mapping = map_ids(
|
||||
["P01308", "P04637"],
|
||||
from_db="UniProtKB_AC-ID",
|
||||
to_db="PDB"
|
||||
)
|
||||
print(json.dumps(mapping, indent=2))
|
||||
Reference in New Issue
Block a user