Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/pubchem-database/scripts/bioactivity_query.py
+++ b/skills/pubchem-database/scripts/bioactivity_query.py
@@ -0,0 +1,367 @@
+#!/usr/bin/env python3
+"""
+PubChem Bioactivity Data Retrieval
+
+This script provides functions for retrieving biological activity data
+from PubChem for compounds and assays.
+"""
+
+import sys
+import json
+import time
+from typing import Dict, List, Optional
+
+try:
+    import requests
+except ImportError:
+    print("Error: requests is not installed. Install it with: pip install requests")
+    sys.exit(1)
+
+
+# PUG REST endpoint root; the compound and assay query URLs in this script
+# are built on top of it.
+BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
+# PUG View endpoint root; used for the compound annotation lookups.
+PUG_VIEW_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
+
+# Rate limiting: 5 requests per second maximum
+REQUEST_DELAY = 0.21  # seconds between requests
+
+
+def rate_limited_request(url: str, method: str = 'GET', **kwargs) -> Optional[requests.Response]:
+    """
+    Make a rate-limited request to PubChem API.
+
+    Args:
+        url: Request URL
+        method: HTTP method ('GET' or 'POST')
+        **kwargs: Additional arguments for requests
+
+    Returns:
+        Response object or None on error
+    """
+    time.sleep(REQUEST_DELAY)
+
+    try:
+        if method.upper() == 'GET':
+            response = requests.get(url, **kwargs)
+        else:
+            response = requests.post(url, **kwargs)
+
+        response.raise_for_status()
+        return response
+    except requests.exceptions.RequestException as e:
+        print(f"Request error: {e}")
+        return None
+
+
+def get_bioassay_summary(cid: int) -> Optional[Dict]:
+    """
+    Get bioassay summary for a compound.
+
+    Args:
+        cid: PubChem Compound ID
+
+    Returns:
+        Dictionary containing bioassay summary data
+    """
+    url = f"{BASE_URL}/compound/cid/{cid}/assaysummary/JSON"
+    response = rate_limited_request(url)
+
+    if response and response.status_code == 200:
+        return response.json()
+    return None
+
+
+def get_compound_bioactivities(
+    cid: int,
+    activity_outcome: Optional[str] = None
+) -> List[Dict]:
+    """
+    Get bioactivity data for a compound.
+
+    Args:
+        cid: PubChem Compound ID
+        activity_outcome: Filter by activity ('active', 'inactive', 'inconclusive')
+
+    Returns:
+        List of bioactivity records
+    """
+    data = get_bioassay_summary(cid)
+
+    if not data:
+        return []
+
+    activities = []
+    table = data.get('Table', {})
+
+    for row in table.get('Row', []):
+        activity = {}
+        for i, cell in enumerate(row.get('Cell', [])):
+            column_name = table['Columns']['Column'][i]
+            activity[column_name] = cell
+
+        if activity_outcome:
+            if activity.get('Activity Outcome', '').lower() == activity_outcome.lower():
+                activities.append(activity)
+        else:
+            activities.append(activity)
+
+    return activities
+
+
+def get_assay_description(aid: int) -> Optional[Dict]:
+    """
+    Get detailed description for a specific assay.
+
+    Args:
+        aid: PubChem Assay ID (AID)
+
+    Returns:
+        Dictionary containing assay description
+    """
+    url = f"{BASE_URL}/assay/aid/{aid}/description/JSON"
+    response = rate_limited_request(url)
+
+    if response and response.status_code == 200:
+        return response.json()
+    return None
+
+
+def get_assay_targets(aid: int) -> List[str]:
+    """
+    Get biological targets for an assay.
+
+    Args:
+        aid: PubChem Assay ID
+
+    Returns:
+        List of target names
+    """
+    description = get_assay_description(aid)
+
+    if not description:
+        return []
+
+    targets = []
+    assay_data = description.get('PC_AssayContainer', [{}])[0]
+    assay = assay_data.get('assay', {})
+
+    # Extract target information
+    descr = assay.get('descr', {})
+    for target in descr.get('target', []):
+        mol_id = target.get('mol_id', '')
+        name = target.get('name', '')
+        if name:
+            targets.append(name)
+        elif mol_id:
+            targets.append(f"GI:{mol_id}")
+
+    return targets
+
+
+def search_assays_by_target(
+    target_name: str,
+    max_results: int = 100
+) -> List[int]:
+    """
+    Search for assays targeting a specific protein or gene.
+
+    Args:
+        target_name: Name of the target (e.g., 'EGFR', 'p53')
+        max_results: Maximum number of results
+
+    Returns:
+        List of Assay IDs (AIDs)
+    """
+    # Use PubChem's text search for assays
+    url = f"{BASE_URL}/assay/target/{target_name}/aids/JSON"
+    response = rate_limited_request(url)
+
+    if response and response.status_code == 200:
+        data = response.json()
+        aids = data.get('IdentifierList', {}).get('AID', [])
+        return aids[:max_results]
+    return []
+
+
+def get_active_compounds_in_assay(aid: int, max_results: int = 1000) -> List[int]:
+    """
+    Get list of active compounds in an assay.
+
+    Args:
+        aid: PubChem Assay ID
+        max_results: Maximum number of results
+
+    Returns:
+        List of Compound IDs (CIDs) that showed activity
+    """
+    url = f"{BASE_URL}/assay/aid/{aid}/cids/JSON?cids_type=active"
+    response = rate_limited_request(url)
+
+    if response and response.status_code == 200:
+        data = response.json()
+        cids = data.get('IdentifierList', {}).get('CID', [])
+        return cids[:max_results]
+    return []
+
+
+def get_compound_annotations(cid: int, section: Optional[str] = None) -> Optional[Dict]:
+    """
+    Get comprehensive compound annotations from PUG-View.
+
+    Args:
+        cid: PubChem Compound ID
+        section: Specific section to retrieve (e.g., 'Pharmacology and Biochemistry')
+
+    Returns:
+        Dictionary containing annotation data
+    """
+    url = f"{PUG_VIEW_URL}/data/compound/{cid}/JSON"
+
+    if section:
+        url += f"?heading={section}"
+
+    response = rate_limited_request(url)
+
+    if response and response.status_code == 200:
+        return response.json()
+    return None
+
+
+def get_drug_information(cid: int) -> Optional[Dict]:
+    """
+    Get drug and medication information for a compound.
+
+    Args:
+        cid: PubChem Compound ID
+
+    Returns:
+        Dictionary containing drug information
+    """
+    return get_compound_annotations(cid, section="Drug and Medication Information")
+
+
+def get_safety_hazards(cid: int) -> Optional[Dict]:
+    """
+    Get safety and hazard information for a compound.
+
+    Args:
+        cid: PubChem Compound ID
+
+    Returns:
+        Dictionary containing safety information
+    """
+    return get_compound_annotations(cid, section="Safety and Hazards")
+
+
+def summarize_bioactivities(cid: int) -> Dict:
+    """
+    Generate a summary of bioactivity data for a compound.
+
+    Args:
+        cid: PubChem Compound ID
+
+    Returns:
+        Dictionary with bioactivity summary statistics
+    """
+    activities = get_compound_bioactivities(cid)
+
+    summary = {
+        'total_assays': len(activities),
+        'active': 0,
+        'inactive': 0,
+        'inconclusive': 0,
+        'unspecified': 0,
+        'assay_types': {}
+    }
+
+    for activity in activities:
+        outcome = activity.get('Activity Outcome', '').lower()
+
+        if 'active' in outcome:
+            summary['active'] += 1
+        elif 'inactive' in outcome:
+            summary['inactive'] += 1
+        elif 'inconclusive' in outcome:
+            summary['inconclusive'] += 1
+        else:
+            summary['unspecified'] += 1
+
+    return summary
+
+
+def find_compounds_by_bioactivity(
+    target: str,
+    threshold: Optional[float] = None,
+    max_compounds: int = 100
+) -> List[Dict]:
+    """
+    Find compounds with bioactivity against a specific target.
+
+    Args:
+        target: Target name (e.g., 'EGFR')
+        threshold: Activity threshold (if applicable)
+        max_compounds: Maximum number of compounds to return
+
+    Returns:
+        List of dictionaries with compound information and activity data
+    """
+    # Step 1: Find assays for the target
+    assay_ids = search_assays_by_target(target, max_results=10)
+
+    if not assay_ids:
+        print(f"No assays found for target: {target}")
+        return []
+
+    # Step 2: Get active compounds from these assays
+    compound_set = set()
+    compound_data = []
+
+    for aid in assay_ids[:5]:  # Limit to first 5 assays
+        active_cids = get_active_compounds_in_assay(aid, max_results=max_compounds)
+
+        for cid in active_cids:
+            if cid not in compound_set and len(compound_data) < max_compounds:
+                compound_set.add(cid)
+                compound_data.append({
+                    'cid': cid,
+                    'aid': aid,
+                    'target': target
+                })
+
+        if len(compound_data) >= max_compounds:
+            break
+
+    return compound_data
+
+
+def main():
+    """Example usage of bioactivity query functions."""
+
+    # Example 1: Get bioassay summary for aspirin (CID 2244)
+    print("Example 1: Getting bioassay summary for aspirin (CID 2244)...")
+    summary = summarize_bioactivities(2244)
+    print(json.dumps(summary, indent=2))
+
+    # Example 2: Get active bioactivities for a compound
+    print("\nExample 2: Getting active bioactivities for aspirin...")
+    activities = get_compound_bioactivities(2244, activity_outcome='active')
+    print(f"Found {len(activities)} active bioactivities")
+    if activities:
+        print(f"First activity: {activities[0].get('Assay Name', 'N/A')}")
+
+    # Example 3: Get assay information
+    print("\nExample 3: Getting assay description...")
+    if activities:
+        aid = activities[0].get('AID', 0)
+        targets = get_assay_targets(aid)
+        print(f"Assay {aid} targets: {', '.join(targets) if targets else 'N/A'}")
+
+    # Example 4: Search for compounds targeting EGFR
+    print("\nExample 4: Searching for EGFR inhibitors...")
+    egfr_compounds = find_compounds_by_bioactivity('EGFR', max_compounds=5)
+    print(f"Found {len(egfr_compounds)} compounds with EGFR activity")
+    for comp in egfr_compounds[:5]:
+        print(f"  CID {comp['cid']} (from AID {comp['aid']})")
+
+
+if __name__ == '__main__':
+    main()
--- a/skills/pubchem-database/scripts/compound_search.py
+++ b/skills/pubchem-database/scripts/compound_search.py
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+"""
+PubChem Compound Search Utility
+
+This script provides functions for searching and retrieving compound information
+from PubChem using the PubChemPy library.
+"""
+
+import sys
+import json
+from typing import List, Dict, Optional, Union
+
+try:
+    import pubchempy as pcp
+except ImportError:
+    print("Error: pubchempy is not installed. Install it with: pip install pubchempy")
+    sys.exit(1)
+
+
+def search_by_name(name: str, max_results: int = 10) -> List[pcp.Compound]:
+    """
+    Search for compounds by name.
+
+    Args:
+        name: Chemical name to search for
+        max_results: Maximum number of results to return
+
+    Returns:
+        List of Compound objects
+    """
+    try:
+        compounds = pcp.get_compounds(name, 'name')
+        return compounds[:max_results]
+    except Exception as e:
+        print(f"Error searching for '{name}': {e}")
+        return []
+
+
+def search_by_smiles(smiles: str) -> Optional[pcp.Compound]:
+    """
+    Search for a compound by SMILES string.
+
+    Args:
+        smiles: SMILES string
+
+    Returns:
+        Compound object or None if not found
+    """
+    try:
+        compounds = pcp.get_compounds(smiles, 'smiles')
+        return compounds[0] if compounds else None
+    except Exception as e:
+        print(f"Error searching for SMILES '{smiles}': {e}")
+        return None
+
+
+def get_compound_by_cid(cid: int) -> Optional[pcp.Compound]:
+    """
+    Retrieve a compound by its CID (Compound ID).
+
+    Args:
+        cid: PubChem Compound ID
+
+    Returns:
+        Compound object or None if not found
+    """
+    try:
+        return pcp.Compound.from_cid(cid)
+    except Exception as e:
+        print(f"Error retrieving CID {cid}: {e}")
+        return None
+
+
+def get_compound_properties(
+    identifier: Union[str, int],
+    namespace: str = 'name',
+    properties: Optional[List[str]] = None
+) -> Dict:
+    """
+    Get specific properties for a compound.
+
+    Args:
+        identifier: Compound identifier (name, SMILES, CID, etc.)
+        namespace: Type of identifier ('name', 'smiles', 'cid', 'inchi', etc.)
+        properties: List of properties to retrieve. If None, returns common properties.
+
+    Returns:
+        Dictionary of properties
+    """
+    if properties is None:
+        properties = [
+            'MolecularFormula',
+            'MolecularWeight',
+            'CanonicalSMILES',
+            'IUPACName',
+            'XLogP',
+            'TPSA',
+            'HBondDonorCount',
+            'HBondAcceptorCount'
+        ]
+
+    try:
+        result = pcp.get_properties(properties, identifier, namespace)
+        return result[0] if result else {}
+    except Exception as e:
+        print(f"Error getting properties for '{identifier}': {e}")
+        return {}
+
+
+def similarity_search(
+    smiles: str,
+    threshold: int = 90,
+    max_records: int = 10
+) -> List[pcp.Compound]:
+    """
+    Perform similarity search for compounds similar to the query structure.
+
+    Args:
+        smiles: Query SMILES string
+        threshold: Similarity threshold (0-100)
+        max_records: Maximum number of results
+
+    Returns:
+        List of similar Compound objects
+    """
+    try:
+        compounds = pcp.get_compounds(
+            smiles,
+            'smiles',
+            searchtype='similarity',
+            Threshold=threshold,
+            MaxRecords=max_records
+        )
+        return compounds
+    except Exception as e:
+        print(f"Error in similarity search: {e}")
+        return []
+
+
+def substructure_search(
+    smiles: str,
+    max_records: int = 100
+) -> List[pcp.Compound]:
+    """
+    Perform substructure search for compounds containing the query structure.
+
+    Args:
+        smiles: Query SMILES string (substructure)
+        max_records: Maximum number of results
+
+    Returns:
+        List of Compound objects containing the substructure
+    """
+    try:
+        compounds = pcp.get_compounds(
+            smiles,
+            'smiles',
+            searchtype='substructure',
+            MaxRecords=max_records
+        )
+        return compounds
+    except Exception as e:
+        print(f"Error in substructure search: {e}")
+        return []
+
+
+def get_synonyms(identifier: Union[str, int], namespace: str = 'name') -> List[str]:
+    """
+    Get all synonyms for a compound.
+
+    Args:
+        identifier: Compound identifier
+        namespace: Type of identifier
+
+    Returns:
+        List of synonym strings
+    """
+    try:
+        results = pcp.get_synonyms(identifier, namespace)
+        if results:
+            return results[0].get('Synonym', [])
+        return []
+    except Exception as e:
+        print(f"Error getting synonyms: {e}")
+        return []
+
+
+def batch_search(
+    identifiers: List[str],
+    namespace: str = 'name',
+    properties: Optional[List[str]] = None
+) -> List[Dict]:
+    """
+    Batch search for multiple compounds.
+
+    Args:
+        identifiers: List of compound identifiers
+        namespace: Type of identifiers
+        properties: List of properties to retrieve
+
+    Returns:
+        List of dictionaries containing properties for each compound
+    """
+    results = []
+    for identifier in identifiers:
+        props = get_compound_properties(identifier, namespace, properties)
+        if props:
+            props['query'] = identifier
+            results.append(props)
+    return results
+
+
+def download_structure(
+    identifier: Union[str, int],
+    namespace: str = 'name',
+    format: str = 'SDF',
+    filename: Optional[str] = None
+) -> Optional[str]:
+    """
+    Download compound structure in specified format.
+
+    Args:
+        identifier: Compound identifier
+        namespace: Type of identifier
+        format: Output format ('SDF', 'JSON', 'PNG', etc.)
+        filename: Output filename (if None, returns data as string)
+
+    Returns:
+        Data string if filename is None, else None
+    """
+    try:
+        if filename:
+            pcp.download(format, identifier, namespace, filename, overwrite=True)
+            return None
+        else:
+            return pcp.download(format, identifier, namespace)
+    except Exception as e:
+        print(f"Error downloading structure: {e}")
+        return None
+
+
+def print_compound_info(compound: pcp.Compound) -> None:
+    """
+    Print formatted compound information.
+
+    Args:
+        compound: PubChemPy Compound object
+    """
+    print(f"\n{'='*60}")
+    print(f"Compound CID: {compound.cid}")
+    print(f"{'='*60}")
+    print(f"IUPAC Name: {compound.iupac_name or 'N/A'}")
+    print(f"Molecular Formula: {compound.molecular_formula or 'N/A'}")
+    print(f"Molecular Weight: {compound.molecular_weight or 'N/A'} g/mol")
+    print(f"Canonical SMILES: {compound.canonical_smiles or 'N/A'}")
+    print(f"InChI: {compound.inchi or 'N/A'}")
+    print(f"InChI Key: {compound.inchikey or 'N/A'}")
+    print(f"XLogP: {compound.xlogp or 'N/A'}")
+    print(f"TPSA: {compound.tpsa or 'N/A'} Ų")
+    print(f"H-Bond Donors: {compound.h_bond_donor_count or 'N/A'}")
+    print(f"H-Bond Acceptors: {compound.h_bond_acceptor_count or 'N/A'}")
+    print(f"{'='*60}\n")
+
+
+def main():
+    """Example usage of PubChem search functions."""
+
+    # Example 1: Search by name
+    print("Example 1: Searching for 'aspirin'...")
+    compounds = search_by_name('aspirin', max_results=1)
+    if compounds:
+        print_compound_info(compounds[0])
+
+    # Example 2: Get properties
+    print("\nExample 2: Getting properties for caffeine...")
+    props = get_compound_properties('caffeine', 'name')
+    print(json.dumps(props, indent=2))
+
+    # Example 3: Similarity search
+    print("\nExample 3: Finding compounds similar to benzene...")
+    benzene_smiles = 'c1ccccc1'
+    similar = similarity_search(benzene_smiles, threshold=95, max_records=5)
+    print(f"Found {len(similar)} similar compounds:")
+    for comp in similar:
+        print(f"  CID {comp.cid}: {comp.iupac_name or 'N/A'}")
+
+    # Example 4: Batch search
+    print("\nExample 4: Batch search for multiple compounds...")
+    names = ['aspirin', 'ibuprofen', 'paracetamol']
+    results = batch_search(names, properties=['MolecularFormula', 'MolecularWeight'])
+    for result in results:
+        print(f"  {result.get('query')}: {result.get('MolecularFormula')} "
+              f"({result.get('MolecularWeight')} g/mol)")
+
+
+if __name__ == '__main__':
+    main()