""" STRING Database REST API Helper Functions This module provides Python functions for interacting with the STRING database API. All functions return raw response text or JSON which can be parsed as needed. API Base URL: https://string-db.org/api Documentation: https://string-db.org/help/api/ STRING provides protein-protein interaction data from over 40 sources covering 5000+ genomes with ~59.3 million proteins and 20+ billion interactions. """ import urllib.request import urllib.parse import urllib.error import json from typing import Optional, List, Union, Dict STRING_BASE_URL = "https://string-db.org/api" def string_map_ids(identifiers: Union[str, List[str]], species: int = 9606, limit: int = 1, echo_query: int = 1, caller_identity: str = "claude_scientific_skills") -> str: """ Map protein names, synonyms, and identifiers to STRING IDs. Args: identifiers: Single protein identifier or list of identifiers species: NCBI taxon ID (default: 9606 for human) limit: Number of matches to return per identifier (default: 1) echo_query: Include query term in output (1) or not (0) caller_identity: Application identifier for tracking Returns: str: TSV format with mapping results Examples: # Map single protein result = string_map_ids('TP53', species=9606) # Map multiple proteins result = string_map_ids(['TP53', 'BRCA1', 'EGFR'], species=9606) """ if isinstance(identifiers, list): identifiers_str = '\n'.join(identifiers) else: identifiers_str = identifiers params = { 'identifiers': identifiers_str, 'species': species, 'limit': limit, 'echo_query': echo_query, 'caller_identity': caller_identity } url = f"{STRING_BASE_URL}/tsv/get_string_ids" data = urllib.parse.urlencode(params).encode('utf-8') try: with urllib.request.urlopen(url, data=data) as response: return response.read().decode('utf-8') except urllib.error.HTTPError as e: return f"Error: {e.code} - {e.reason}" def string_network(identifiers: Union[str, List[str]], species: int = 9606, required_score: int = 400, network_type: str = "functional", add_nodes: int = 0, caller_identity: str = "claude_scientific_skills") -> str: """ Get protein-protein interaction network data. Args: identifiers: Protein identifier(s) - use STRING IDs for best results species: NCBI taxon ID (default: 9606 for human) required_score: Confidence threshold 0-1000 (default: 400 = medium confidence) network_type: 'functional' or 'physical' (default: functional) add_nodes: Number of additional nodes to add to network (0-10) caller_identity: Application identifier for tracking Returns: str: TSV format with interaction data Examples: # Get network for single protein network = string_network('9606.ENSP00000269305') # Get network with multiple proteins network = string_network(['9606.ENSP00000269305', '9606.ENSP00000275493']) # Get network with additional interacting proteins network = string_network('TP53', add_nodes=5, required_score=700) """ if isinstance(identifiers, list): identifiers_str = '%0d'.join(identifiers) else: identifiers_str = identifiers params = { 'identifiers': identifiers_str, 'species': species, 'required_score': required_score, 'network_type': network_type, 'add_nodes': add_nodes, 'caller_identity': caller_identity } url = f"{STRING_BASE_URL}/tsv/network?" + urllib.parse.urlencode(params) try: with urllib.request.urlopen(url) as response: return response.read().decode('utf-8') except urllib.error.HTTPError as e: return f"Error: {e.code} - {e.reason}" def string_network_image(identifiers: Union[str, List[str]], species: int = 9606, required_score: int = 400, network_flavor: str = "evidence", add_nodes: int = 0, caller_identity: str = "claude_scientific_skills") -> bytes: """ Get network visualization as PNG image. Args: identifiers: Protein identifier(s) species: NCBI taxon ID (default: 9606 for human) required_score: Confidence threshold 0-1000 (default: 400) network_flavor: 'evidence', 'confidence', or 'actions' (default: evidence) add_nodes: Number of additional nodes to add (0-10) caller_identity: Application identifier for tracking Returns: bytes: PNG image data Example: # Get network image img_data = string_network_image(['TP53', 'MDM2', 'ATM']) with open('network.png', 'wb') as f: f.write(img_data) """ if isinstance(identifiers, list): identifiers_str = '%0d'.join(identifiers) else: identifiers_str = identifiers params = { 'identifiers': identifiers_str, 'species': species, 'required_score': required_score, 'network_flavor': network_flavor, 'add_nodes': add_nodes, 'caller_identity': caller_identity } url = f"{STRING_BASE_URL}/image/network?" + urllib.parse.urlencode(params) try: with urllib.request.urlopen(url) as response: return response.read() except urllib.error.HTTPError as e: return f"Error: {e.code} - {e.reason}".encode() def string_interaction_partners(identifiers: Union[str, List[str]], species: int = 9606, required_score: int = 400, limit: int = 10, caller_identity: str = "claude_scientific_skills") -> str: """ Get all interaction partners for protein(s). Args: identifiers: Protein identifier(s) species: NCBI taxon ID (default: 9606 for human) required_score: Confidence threshold 0-1000 (default: 400) limit: Maximum number of partners to return (default: 10) caller_identity: Application identifier for tracking Returns: str: TSV format with interaction partners Example: # Get top 20 interactors of TP53 partners = string_interaction_partners('TP53', limit=20, required_score=700) """ if isinstance(identifiers, list): identifiers_str = '%0d'.join(identifiers) else: identifiers_str = identifiers params = { 'identifiers': identifiers_str, 'species': species, 'required_score': required_score, 'limit': limit, 'caller_identity': caller_identity } url = f"{STRING_BASE_URL}/tsv/interaction_partners?" + urllib.parse.urlencode(params) try: with urllib.request.urlopen(url) as response: return response.read().decode('utf-8') except urllib.error.HTTPError as e: return f"Error: {e.code} - {e.reason}" def string_enrichment(identifiers: Union[str, List[str]], species: int = 9606, caller_identity: str = "claude_scientific_skills") -> str: """ Perform functional enrichment analysis (Gene Ontology, KEGG, Pfam, etc.). Args: identifiers: List of protein identifiers species: NCBI taxon ID (default: 9606 for human) caller_identity: Application identifier for tracking Returns: str: TSV format with enrichment results Example: # Enrichment for a list of proteins proteins = ['TP53', 'MDM2', 'ATM', 'CHEK2', 'BRCA1'] enrichment = string_enrichment(proteins, species=9606) """ if isinstance(identifiers, list): identifiers_str = '%0d'.join(identifiers) else: identifiers_str = identifiers params = { 'identifiers': identifiers_str, 'species': species, 'caller_identity': caller_identity } url = f"{STRING_BASE_URL}/tsv/enrichment?" + urllib.parse.urlencode(params) try: with urllib.request.urlopen(url) as response: return response.read().decode('utf-8') except urllib.error.HTTPError as e: return f"Error: {e.code} - {e.reason}" def string_ppi_enrichment(identifiers: Union[str, List[str]], species: int = 9606, required_score: int = 400, caller_identity: str = "claude_scientific_skills") -> str: """ Test if network has more interactions than expected by chance. Args: identifiers: List of protein identifiers species: NCBI taxon ID (default: 9606 for human) required_score: Confidence threshold 0-1000 (default: 400) caller_identity: Application identifier for tracking Returns: str: JSON with PPI enrichment p-value Example: # Test if proteins are more connected than random proteins = ['TP53', 'MDM2', 'ATM', 'CHEK2'] ppi_result = string_ppi_enrichment(proteins) """ if isinstance(identifiers, list): identifiers_str = '%0d'.join(identifiers) else: identifiers_str = identifiers params = { 'identifiers': identifiers_str, 'species': species, 'required_score': required_score, 'caller_identity': caller_identity } url = f"{STRING_BASE_URL}/json/ppi_enrichment?" + urllib.parse.urlencode(params) try: with urllib.request.urlopen(url) as response: return response.read().decode('utf-8') except urllib.error.HTTPError as e: return f"Error: {e.code} - {e.reason}" def string_homology(identifiers: Union[str, List[str]], species: int = 9606, caller_identity: str = "claude_scientific_skills") -> str: """ Get homology/similarity scores between proteins. Args: identifiers: Protein identifier(s) species: NCBI taxon ID (default: 9606 for human) caller_identity: Application identifier for tracking Returns: str: TSV format with homology scores Example: # Get homology data homology = string_homology(['TP53', 'TP63', 'TP73']) """ if isinstance(identifiers, list): identifiers_str = '%0d'.join(identifiers) else: identifiers_str = identifiers params = { 'identifiers': identifiers_str, 'species': species, 'caller_identity': caller_identity } url = f"{STRING_BASE_URL}/tsv/homology?" + urllib.parse.urlencode(params) try: with urllib.request.urlopen(url) as response: return response.read().decode('utf-8') except urllib.error.HTTPError as e: return f"Error: {e.code} - {e.reason}" def string_version() -> str: """ Get current STRING database version. Returns: str: Version information Example: version = string_version() """ url = f"{STRING_BASE_URL}/tsv/version" try: with urllib.request.urlopen(url) as response: return response.read().decode('utf-8') except urllib.error.HTTPError as e: return f"Error: {e.code} - {e.reason}" if __name__ == "__main__": # Example usage print("STRING Version:") print(string_version()) print() print("Mapping protein names to STRING IDs:") mapping = string_map_ids(['TP53', 'BRCA1'], species=9606) print(mapping) print() print("Getting interaction network:") network = string_network('TP53', species=9606, add_nodes=3) print(network[:500] + "...")