Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/string-database/scripts/string_api.py
+++ b/skills/string-database/scripts/string_api.py
@@ -0,0 +1,369 @@
+"""
+STRING Database REST API Helper Functions
+
+This module provides Python functions for interacting with the STRING database API.
+All functions return the raw response body for the caller to parse: TSV or
+JSON text, or PNG bytes in the case of string_network_image.
+
+API Base URL: https://string-db.org/api
+Documentation: https://string-db.org/help/api/
+
+STRING provides protein-protein interaction data from over 40 sources covering
+5000+ genomes with ~59.3 million proteins and 20+ billion interactions.
+"""
+
+import urllib.request
+import urllib.parse
+import urllib.error
+import json
+from typing import Optional, List, Union, Dict
+
+
+STRING_BASE_URL = "https://string-db.org/api"
+
+
+def string_map_ids(identifiers: Union[str, List[str]],
+                   species: int = 9606,
+                   limit: int = 1,
+                   echo_query: int = 1,
+                   caller_identity: str = "claude_scientific_skills") -> str:
+    """
+    Map protein names, synonyms, and identifiers to STRING IDs.
+
+    Args:
+        identifiers: Single protein identifier or list of identifiers
+        species: NCBI taxon ID (default: 9606 for human)
+        limit: Number of matches to return per identifier (default: 1)
+        echo_query: Include query term in output (1) or not (0)
+        caller_identity: Application identifier for tracking
+
+    Returns:
+        str: TSV format with mapping results
+
+    Examples:
+        # Map single protein
+        result = string_map_ids('TP53', species=9606)
+
+        # Map multiple proteins
+        result = string_map_ids(['TP53', 'BRCA1', 'EGFR'], species=9606)
+    """
+    if isinstance(identifiers, list):
+        identifiers_str = '\n'.join(identifiers)
+    else:
+        identifiers_str = identifiers
+
+    params = {
+        'identifiers': identifiers_str,
+        'species': species,
+        'limit': limit,
+        'echo_query': echo_query,
+        'caller_identity': caller_identity
+    }
+
+    url = f"{STRING_BASE_URL}/tsv/get_string_ids"
+    data = urllib.parse.urlencode(params).encode('utf-8')
+
+    try:
+        with urllib.request.urlopen(url, data=data) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        return f"Error: {e.code} - {e.reason}"
+
+
+def string_network(identifiers: Union[str, List[str]],
+                   species: int = 9606,
+                   required_score: int = 400,
+                   network_type: str = "functional",
+                   add_nodes: int = 0,
+                   caller_identity: str = "claude_scientific_skills") -> str:
+    """
+    Get protein-protein interaction network data.
+
+    Args:
+        identifiers: Protein identifier(s) - use STRING IDs for best results
+        species: NCBI taxon ID (default: 9606 for human)
+        required_score: Confidence threshold 0-1000 (default: 400 = medium confidence)
+        network_type: 'functional' or 'physical' (default: functional)
+        add_nodes: Number of additional nodes to add to network (0-10)
+        caller_identity: Application identifier for tracking
+
+    Returns:
+        str: TSV format with interaction data
+
+    Examples:
+        # Get network for single protein
+        network = string_network('9606.ENSP00000269305')
+
+        # Get network with multiple proteins
+        network = string_network(['9606.ENSP00000269305', '9606.ENSP00000275493'])
+
+        # Get network with additional interacting proteins
+        network = string_network('TP53', add_nodes=5, required_score=700)
+    """
+    if isinstance(identifiers, list):
+        identifiers_str = '%0d'.join(identifiers)
+    else:
+        identifiers_str = identifiers
+
+    params = {
+        'identifiers': identifiers_str,
+        'species': species,
+        'required_score': required_score,
+        'network_type': network_type,
+        'add_nodes': add_nodes,
+        'caller_identity': caller_identity
+    }
+
+    url = f"{STRING_BASE_URL}/tsv/network?" + urllib.parse.urlencode(params)
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        return f"Error: {e.code} - {e.reason}"
+
+
+def string_network_image(identifiers: Union[str, List[str]],
+                        species: int = 9606,
+                        required_score: int = 400,
+                        network_flavor: str = "evidence",
+                        add_nodes: int = 0,
+                        caller_identity: str = "claude_scientific_skills") -> bytes:
+    """
+    Get network visualization as PNG image.
+
+    Args:
+        identifiers: Protein identifier(s)
+        species: NCBI taxon ID (default: 9606 for human)
+        required_score: Confidence threshold 0-1000 (default: 400)
+        network_flavor: 'evidence', 'confidence', or 'actions' (default: evidence)
+        add_nodes: Number of additional nodes to add (0-10)
+        caller_identity: Application identifier for tracking
+
+    Returns:
+        bytes: PNG image data
+
+    Example:
+        # Get network image
+        img_data = string_network_image(['TP53', 'MDM2', 'ATM'])
+        with open('network.png', 'wb') as f:
+            f.write(img_data)
+    """
+    if isinstance(identifiers, list):
+        identifiers_str = '%0d'.join(identifiers)
+    else:
+        identifiers_str = identifiers
+
+    params = {
+        'identifiers': identifiers_str,
+        'species': species,
+        'required_score': required_score,
+        'network_flavor': network_flavor,
+        'add_nodes': add_nodes,
+        'caller_identity': caller_identity
+    }
+
+    url = f"{STRING_BASE_URL}/image/network?" + urllib.parse.urlencode(params)
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            return response.read()
+    except urllib.error.HTTPError as e:
+        return f"Error: {e.code} - {e.reason}".encode()
+
+
+def string_interaction_partners(identifiers: Union[str, List[str]],
+                                species: int = 9606,
+                                required_score: int = 400,
+                                limit: int = 10,
+                                caller_identity: str = "claude_scientific_skills") -> str:
+    """
+    Get all interaction partners for protein(s).
+
+    Args:
+        identifiers: Protein identifier(s)
+        species: NCBI taxon ID (default: 9606 for human)
+        required_score: Confidence threshold 0-1000 (default: 400)
+        limit: Maximum number of partners to return (default: 10)
+        caller_identity: Application identifier for tracking
+
+    Returns:
+        str: TSV format with interaction partners
+
+    Example:
+        # Get top 20 interactors of TP53
+        partners = string_interaction_partners('TP53', limit=20, required_score=700)
+    """
+    if isinstance(identifiers, list):
+        identifiers_str = '%0d'.join(identifiers)
+    else:
+        identifiers_str = identifiers
+
+    params = {
+        'identifiers': identifiers_str,
+        'species': species,
+        'required_score': required_score,
+        'limit': limit,
+        'caller_identity': caller_identity
+    }
+
+    url = f"{STRING_BASE_URL}/tsv/interaction_partners?" + urllib.parse.urlencode(params)
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        return f"Error: {e.code} - {e.reason}"
+
+
+def string_enrichment(identifiers: Union[str, List[str]],
+                     species: int = 9606,
+                     caller_identity: str = "claude_scientific_skills") -> str:
+    """
+    Perform functional enrichment analysis (Gene Ontology, KEGG, Pfam, etc.).
+
+    Args:
+        identifiers: List of protein identifiers
+        species: NCBI taxon ID (default: 9606 for human)
+        caller_identity: Application identifier for tracking
+
+    Returns:
+        str: TSV format with enrichment results
+
+    Example:
+        # Enrichment for a list of proteins
+        proteins = ['TP53', 'MDM2', 'ATM', 'CHEK2', 'BRCA1']
+        enrichment = string_enrichment(proteins, species=9606)
+    """
+    if isinstance(identifiers, list):
+        identifiers_str = '%0d'.join(identifiers)
+    else:
+        identifiers_str = identifiers
+
+    params = {
+        'identifiers': identifiers_str,
+        'species': species,
+        'caller_identity': caller_identity
+    }
+
+    url = f"{STRING_BASE_URL}/tsv/enrichment?" + urllib.parse.urlencode(params)
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        return f"Error: {e.code} - {e.reason}"
+
+
+def string_ppi_enrichment(identifiers: Union[str, List[str]],
+                         species: int = 9606,
+                         required_score: int = 400,
+                         caller_identity: str = "claude_scientific_skills") -> str:
+    """
+    Test if network has more interactions than expected by chance.
+
+    Args:
+        identifiers: List of protein identifiers
+        species: NCBI taxon ID (default: 9606 for human)
+        required_score: Confidence threshold 0-1000 (default: 400)
+        caller_identity: Application identifier for tracking
+
+    Returns:
+        str: JSON with PPI enrichment p-value
+
+    Example:
+        # Test if proteins are more connected than random
+        proteins = ['TP53', 'MDM2', 'ATM', 'CHEK2']
+        ppi_result = string_ppi_enrichment(proteins)
+    """
+    if isinstance(identifiers, list):
+        identifiers_str = '%0d'.join(identifiers)
+    else:
+        identifiers_str = identifiers
+
+    params = {
+        'identifiers': identifiers_str,
+        'species': species,
+        'required_score': required_score,
+        'caller_identity': caller_identity
+    }
+
+    url = f"{STRING_BASE_URL}/json/ppi_enrichment?" + urllib.parse.urlencode(params)
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        return f"Error: {e.code} - {e.reason}"
+
+
+def string_homology(identifiers: Union[str, List[str]],
+                   species: int = 9606,
+                   caller_identity: str = "claude_scientific_skills") -> str:
+    """
+    Get homology/similarity scores between proteins.
+
+    Args:
+        identifiers: Protein identifier(s)
+        species: NCBI taxon ID (default: 9606 for human)
+        caller_identity: Application identifier for tracking
+
+    Returns:
+        str: TSV format with homology scores
+
+    Example:
+        # Get homology data
+        homology = string_homology(['TP53', 'TP63', 'TP73'])
+    """
+    if isinstance(identifiers, list):
+        identifiers_str = '%0d'.join(identifiers)
+    else:
+        identifiers_str = identifiers
+
+    params = {
+        'identifiers': identifiers_str,
+        'species': species,
+        'caller_identity': caller_identity
+    }
+
+    url = f"{STRING_BASE_URL}/tsv/homology?" + urllib.parse.urlencode(params)
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        return f"Error: {e.code} - {e.reason}"
+
+
+def string_version() -> str:
+    """
+    Get current STRING database version.
+
+    Returns:
+        str: Version information
+
+    Example:
+        version = string_version()
+    """
+    url = f"{STRING_BASE_URL}/tsv/version"
+
+    try:
+        with urllib.request.urlopen(url) as response:
+            return response.read().decode('utf-8')
+    except urllib.error.HTTPError as e:
+        return f"Error: {e.code} - {e.reason}"
+
+
+if __name__ == "__main__":
+    # Example usage
+    print("STRING Version:")
+    print(string_version())
+    print()
+
+    print("Mapping protein names to STRING IDs:")
+    mapping = string_map_ids(['TP53', 'BRCA1'], species=9606)
+    print(mapping)
+    print()
+
+    print("Getting interaction network:")
+    network = string_network('TP53', species=9606, add_nodes=3)
+    print(network[:500] + "...")