Initial commit
This commit is contained in:
367
skills/pubchem-database/scripts/bioactivity_query.py
Normal file
367
skills/pubchem-database/scripts/bioactivity_query.py
Normal file
@@ -0,0 +1,367 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PubChem Bioactivity Data Retrieval
|
||||
|
||||
This script provides functions for retrieving biological activity data
|
||||
from PubChem for compounds and assays.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
print("Error: requests is not installed. Install it with: pip install requests")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Root endpoints of the two PubChem web services used by this script.
BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
PUG_VIEW_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"

# Rate limiting: at most 5 requests per second, so wait slightly more
# than 1/5 s before each request.
REQUEST_DELAY = 0.21  # seconds between requests
|
||||
|
||||
|
||||
def rate_limited_request(url: str, method: str = 'GET', **kwargs) -> Optional[requests.Response]:
    """
    Make a rate-limited request to the PubChem API.

    Sleeps for REQUEST_DELAY seconds before every call, then issues the
    request and checks the HTTP status.

    Args:
        url: Request URL
        method: HTTP method. 'GET' (case-insensitive) issues a GET; any
            other value issues a POST.
        **kwargs: Additional arguments forwarded to requests. When no
            ``timeout`` is supplied, a default of 30 seconds is used.

    Returns:
        Response object, or None if the request failed or the server
        answered with an HTTP error status
    """
    # requests waits indefinitely unless a timeout is given; bound the wait
    # so a stalled connection cannot hang the caller. An explicit timeout
    # passed by the caller still wins.
    kwargs.setdefault('timeout', 30)

    time.sleep(REQUEST_DELAY)

    send = requests.get if method.upper() == 'GET' else requests.post
    try:
        response = send(url, **kwargs)
        # Turn 4xx/5xx answers into exceptions so they take the error path.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None
    return response
|
||||
|
||||
|
||||
def get_bioassay_summary(cid: int) -> Optional[Dict]:
    """
    Fetch the assay summary table for one compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Parsed JSON body of the ``assaysummary`` endpoint, or None when the
        request failed or did not answer with HTTP 200
    """
    endpoint = f"{BASE_URL}/compound/cid/{cid}/assaysummary/JSON"
    reply = rate_limited_request(endpoint)

    # Guard clause: anything but a successful 200 yields no data.
    if not reply or reply.status_code != 200:
        return None
    return reply.json()
|
||||
|
||||
|
||||
def get_compound_bioactivities(
    cid: int,
    activity_outcome: Optional[str] = None
) -> List[Dict]:
    """
    Get bioactivity data for a compound.

    Converts each row of the assay summary table into a dictionary keyed
    by the table's column names.

    Args:
        cid: PubChem Compound ID
        activity_outcome: Filter by activity ('active', 'inactive',
            'inconclusive'). Compared case-insensitively against the
            'Activity Outcome' column; None or '' disables filtering.

    Returns:
        List of bioactivity records (empty if no data could be retrieved)
    """
    data = get_bioassay_summary(cid)
    if not data:
        return []

    table = data.get('Table', {})
    # The header is the same for every row: read it once, and tolerate a
    # response that carries rows but no header instead of raising KeyError.
    columns = table.get('Columns', {}).get('Column', [])
    wanted = activity_outcome.lower() if activity_outcome else None

    activities = []
    for row in table.get('Row', []):
        # zip() stops at the shorter sequence, so a row with more cells
        # than there are column names no longer raises IndexError.
        activity = dict(zip(columns, row.get('Cell', [])))

        if wanted is not None:
            outcome = str(activity.get('Activity Outcome', '')).lower()
            if outcome != wanted:
                continue
        activities.append(activity)

    return activities
|
||||
|
||||
|
||||
def get_assay_description(aid: int) -> Optional[Dict]:
    """
    Fetch the description record of a single assay.

    Args:
        aid: PubChem Assay ID (AID)

    Returns:
        Parsed JSON body of the assay ``description`` endpoint, or None
        when the request failed or did not answer with HTTP 200
    """
    endpoint = f"{BASE_URL}/assay/aid/{aid}/description/JSON"
    reply = rate_limited_request(endpoint)

    # Guard clause: anything but a successful 200 yields no data.
    if not reply or reply.status_code != 200:
        return None
    return reply.json()
|
||||
|
||||
|
||||
def get_assay_targets(aid: int) -> List[str]:
    """
    List the biological targets recorded for an assay.

    Args:
        aid: PubChem Assay ID

    Returns:
        One label per target of the first assay container entry: the
        target's name when it has one, otherwise "GI:<mol_id>". Targets
        with neither field are skipped. Empty list if the description
        could not be retrieved.
    """
    description = get_assay_description(aid)
    if not description:
        return []

    # Walk PC_AssayContainer[0] -> assay -> descr -> target, treating every
    # missing level as empty.
    container = description.get('PC_AssayContainer', [{}])[0]
    target_entries = container.get('assay', {}).get('descr', {}).get('target', [])

    labels = []
    for entry in target_entries:
        label = entry.get('name', '')
        if label:
            labels.append(label)
            continue
        # No name: fall back to the molecule identifier, if present.
        identifier = entry.get('mol_id', '')
        if identifier:
            labels.append(f"GI:{identifier}")

    return labels
|
||||
|
||||
|
||||
def search_assays_by_target(
    target_name: str,
    max_results: int = 100,
    target_type: str = 'genesymbol'
) -> List[int]:
    """
    Search for assays targeting a specific protein or gene.

    Args:
        target_name: Identifier of the target (e.g., 'EGFR')
        max_results: Maximum number of results
        target_type: How PUG REST should interpret ``target_name``: one of
            'gi', 'proteinname', 'geneid', 'genesymbol' or 'accession'.
            Defaults to 'genesymbol'.

    Returns:
        List of Assay IDs (AIDs); empty if the request failed or nothing
        matched
    """
    from urllib.parse import quote

    # PUG REST expects assay/target/<target type>/<identifier>/aids; the
    # type segment is mandatory. The identifier is percent-encoded so that
    # names containing spaces or slashes stay inside their path segment.
    encoded_name = quote(str(target_name), safe='')
    url = f"{BASE_URL}/assay/target/{target_type}/{encoded_name}/aids/JSON"
    response = rate_limited_request(url)

    if response and response.status_code == 200:
        data = response.json()
        aids = data.get('IdentifierList', {}).get('AID', [])
        return aids[:max_results]
    return []
|
||||
|
||||
|
||||
def get_active_compounds_in_assay(aid: int, max_results: int = 1000) -> List[int]:
    """
    Get list of active compounds in an assay.

    Args:
        aid: PubChem Assay ID
        max_results: Maximum number of results

    Returns:
        List of Compound IDs (CIDs) that showed activity; empty if the
        request failed or the response carried no CIDs
    """
    url = f"{BASE_URL}/assay/aid/{aid}/cids/JSON?cids_type=active"
    response = rate_limited_request(url)

    if not response or response.status_code != 200:
        return []

    data = response.json()

    # Flat shape: {"IdentifierList": {"CID": [...]}}.
    cids = data.get('IdentifierList', {}).get('CID')
    if cids is None:
        # NOTE(review): PUG REST appears to return AID->CID mappings as
        # {"InformationList": {"Information": [{"AID": ..., "CID": [...]}]}}
        # unless list_return=flat is requested - confirm against a live
        # response. Accepting both shapes is harmless either way.
        cids = []
        for info in data.get('InformationList', {}).get('Information', []):
            cids.extend(info.get('CID', []))

    return cids[:max_results]
|
||||
|
||||
|
||||
def get_compound_annotations(cid: int, section: Optional[str] = None) -> Optional[Dict]:
    """
    Get comprehensive compound annotations from PUG-View.

    Args:
        cid: PubChem Compound ID
        section: Specific section to retrieve (e.g., 'Pharmacology and
            Biochemistry'). Sent as the ``heading`` query parameter; when
            omitted or empty the full record is requested.

    Returns:
        Dictionary containing annotation data, or None when the request
        failed or did not answer with HTTP 200
    """
    from urllib.parse import quote

    url = f"{PUG_VIEW_URL}/data/compound/{cid}/JSON"

    if section:
        # Percent-encode the heading: a raw '&', '#' or '+' in a section
        # name would otherwise be read as query-string syntax.
        url += f"?heading={quote(section, safe='')}"

    response = rate_limited_request(url)

    if response and response.status_code == 200:
        return response.json()
    return None
|
||||
|
||||
|
||||
def get_drug_information(cid: int) -> Optional[Dict]:
    """
    Fetch the "Drug and Medication Information" annotations of a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Whatever get_compound_annotations returns for that section
    """
    heading = "Drug and Medication Information"
    return get_compound_annotations(cid, section=heading)
|
||||
|
||||
|
||||
def get_safety_hazards(cid: int) -> Optional[Dict]:
    """
    Fetch the "Safety and Hazards" annotations of a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Whatever get_compound_annotations returns for that section
    """
    heading = "Safety and Hazards"
    return get_compound_annotations(cid, section=heading)
|
||||
|
||||
|
||||
def summarize_bioactivities(cid: int) -> Dict:
    """
    Generate a summary of bioactivity data for a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Dictionary with bioactivity summary statistics:
        'total_assays' (number of records), per-outcome counts under
        'active', 'inactive', 'inconclusive' and 'unspecified', and
        'assay_types' (always an empty dict; this function does not
        populate it).
    """
    activities = get_compound_bioactivities(cid)

    summary = {
        'total_assays': len(activities),
        'active': 0,
        'inactive': 0,
        'inconclusive': 0,
        'unspecified': 0,
        'assay_types': {}
    }

    # Order matters: "inactive" contains the substring "active", so the
    # more specific labels must be tested before 'active'. Testing 'active'
    # first would count every inactive result as active.
    labels_in_match_order = ('inactive', 'inconclusive', 'active')

    for activity in activities:
        outcome = str(activity.get('Activity Outcome', '')).lower()

        for label in labels_in_match_order:
            if label in outcome:
                summary[label] += 1
                break
        else:
            # No known label matched.
            summary['unspecified'] += 1

    return summary
|
||||
|
||||
|
||||
def find_compounds_by_bioactivity(
    target: str,
    threshold: Optional[float] = None,
    max_compounds: int = 100
) -> List[Dict]:
    """
    Collect compounds reported active in assays against a target.

    Args:
        target: Target name (e.g., 'EGFR')
        threshold: Activity threshold (accepted but not used by this
            implementation)
        max_compounds: Maximum number of compounds to return

    Returns:
        List of dictionaries with keys 'cid', 'aid' (the first assay in
        which the compound was seen) and 'target'; each CID appears once
    """
    # Step 1: look up assays for the target.
    candidate_aids = search_assays_by_target(target, max_results=10)
    if not candidate_aids:
        print(f"No assays found for target: {target}")
        return []

    # Step 2: gather active compounds, de-duplicated across assays.
    seen_cids = set()
    hits = []

    for aid in candidate_aids[:5]:  # Only the first 5 assays are queried
        for cid in get_active_compounds_in_assay(aid, max_results=max_compounds):
            if len(hits) >= max_compounds or cid in seen_cids:
                continue
            seen_cids.add(cid)
            hits.append({
                'cid': cid,
                'aid': aid,
                'target': target
            })

        # Stop querying further assays once the quota is filled.
        if len(hits) >= max_compounds:
            break

    return hits
|
||||
|
||||
|
||||
def main():
    """Run four demonstration queries and print their results."""
    aspirin_cid = 2244

    # Example 1: outcome counts for aspirin.
    print("Example 1: Getting bioassay summary for aspirin (CID 2244)...")
    print(json.dumps(summarize_bioactivities(aspirin_cid), indent=2))

    # Example 2: only the records where aspirin was active.
    print("\nExample 2: Getting active bioactivities for aspirin...")
    active_records = get_compound_bioactivities(aspirin_cid, activity_outcome='active')
    print(f"Found {len(active_records)} active bioactivities")
    if active_records:
        first_record = active_records[0]
        print(f"First activity: {first_record.get('Assay Name', 'N/A')}")

    # Example 3: targets of the first active assay, if there is one.
    print("\nExample 3: Getting assay description...")
    if active_records:
        aid = active_records[0].get('AID', 0)
        targets = get_assay_targets(aid)
        target_text = ', '.join(targets) if targets else 'N/A'
        print(f"Assay {aid} targets: {target_text}")

    # Example 4: compounds active against EGFR.
    print("\nExample 4: Searching for EGFR inhibitors...")
    egfr_compounds = find_compounds_by_bioactivity('EGFR', max_compounds=5)
    print(f"Found {len(egfr_compounds)} compounds with EGFR activity")
    for comp in egfr_compounds[:5]:
        cid, source_aid = comp['cid'], comp['aid']
        print(f" CID {cid} (from AID {source_aid})")
|
||||
|
||||
|
||||
# Run the examples only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
297
skills/pubchem-database/scripts/compound_search.py
Normal file
297
skills/pubchem-database/scripts/compound_search.py
Normal file
@@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PubChem Compound Search Utility
|
||||
|
||||
This script provides functions for searching and retrieving compound information
|
||||
from PubChem using the PubChemPy library.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
from typing import List, Dict, Optional, Union
|
||||
|
||||
try:
|
||||
import pubchempy as pcp
|
||||
except ImportError:
|
||||
print("Error: pubchempy is not installed. Install it with: pip install pubchempy")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def search_by_name(name: str, max_results: int = 10) -> List[pcp.Compound]:
    """
    Look up compounds by chemical name.

    Args:
        name: Chemical name to search for
        max_results: Maximum number of results to return

    Returns:
        Up to ``max_results`` Compound objects; an empty list if the
        lookup raised (the error is printed)
    """
    try:
        matches = pcp.get_compounds(name, 'name')
        limited = matches[:max_results]
    except Exception as e:
        print(f"Error searching for '{name}': {e}")
        return []
    return limited
|
||||
|
||||
|
||||
def search_by_smiles(smiles: str) -> Optional[pcp.Compound]:
    """
    Look up a compound by its SMILES string.

    Args:
        smiles: SMILES string

    Returns:
        The first matching Compound, or None if nothing matched or the
        lookup raised (the error is printed)
    """
    try:
        matches = pcp.get_compounds(smiles, 'smiles')
        if matches:
            return matches[0]
        return None
    except Exception as e:
        print(f"Error searching for SMILES '{smiles}': {e}")
        return None
|
||||
|
||||
|
||||
def get_compound_by_cid(cid: int) -> Optional[pcp.Compound]:
    """
    Load a compound record by its PubChem Compound ID.

    Args:
        cid: PubChem Compound ID

    Returns:
        Compound object, or None if retrieval raised (the error is
        printed)
    """
    try:
        compound = pcp.Compound.from_cid(cid)
    except Exception as e:
        print(f"Error retrieving CID {cid}: {e}")
        compound = None
    return compound
|
||||
|
||||
|
||||
def get_compound_properties(
    identifier: Union[str, int],
    namespace: str = 'name',
    properties: Optional[List[str]] = None
) -> Dict:
    """
    Retrieve selected properties of one compound.

    Args:
        identifier: Compound identifier (name, SMILES, CID, etc.)
        namespace: Type of identifier ('name', 'smiles', 'cid', 'inchi', etc.)
        properties: Property names to request. When None, a default set of
            eight common properties is used.

    Returns:
        Property dictionary of the first result; an empty dictionary if
        there were no results or the lookup raised (the error is printed)
    """
    requested = properties
    if requested is None:
        # Default selection used when the caller names no properties.
        requested = [
            'MolecularFormula',
            'MolecularWeight',
            'CanonicalSMILES',
            'IUPACName',
            'XLogP',
            'TPSA',
            'HBondDonorCount',
            'HBondAcceptorCount',
        ]

    try:
        rows = pcp.get_properties(requested, identifier, namespace)
        if not rows:
            return {}
        return rows[0]
    except Exception as e:
        print(f"Error getting properties for '{identifier}': {e}")
        return {}
|
||||
|
||||
|
||||
def similarity_search(
    smiles: str,
    threshold: int = 90,
    max_records: int = 10
) -> List[pcp.Compound]:
    """
    Find compounds structurally similar to a query SMILES.

    Args:
        smiles: Query SMILES string
        threshold: Similarity threshold (0-100)
        max_records: Maximum number of results

    Returns:
        List of similar Compound objects; an empty list if the search
        raised (the error is printed)
    """
    # Options forwarded to PubChemPy as keyword arguments.
    search_options = {
        'searchtype': 'similarity',
        'Threshold': threshold,
        'MaxRecords': max_records,
    }
    try:
        return pcp.get_compounds(smiles, 'smiles', **search_options)
    except Exception as e:
        print(f"Error in similarity search: {e}")
        return []
|
||||
|
||||
|
||||
def substructure_search(
    smiles: str,
    max_records: int = 100
) -> List[pcp.Compound]:
    """
    Find compounds that contain the query structure.

    Args:
        smiles: Query SMILES string (substructure)
        max_records: Maximum number of results

    Returns:
        List of Compound objects containing the substructure; an empty
        list if the search raised (the error is printed)
    """
    # Options forwarded to PubChemPy as keyword arguments.
    search_options = {
        'searchtype': 'substructure',
        'MaxRecords': max_records,
    }
    try:
        return pcp.get_compounds(smiles, 'smiles', **search_options)
    except Exception as e:
        print(f"Error in substructure search: {e}")
        return []
|
||||
|
||||
|
||||
def get_synonyms(identifier: Union[str, int], namespace: str = 'name') -> List[str]:
    """
    Retrieve the synonym list of a compound.

    Args:
        identifier: Compound identifier
        namespace: Type of identifier

    Returns:
        The 'Synonym' list of the first result; an empty list if there
        were no results, the key is absent, or the lookup raised (the
        error is printed)
    """
    try:
        results = pcp.get_synonyms(identifier, namespace)
        return results[0].get('Synonym', []) if results else []
    except Exception as e:
        print(f"Error getting synonyms: {e}")
        return []
|
||||
|
||||
|
||||
def batch_search(
    identifiers: List[str],
    namespace: str = 'name',
    properties: Optional[List[str]] = None
) -> List[Dict]:
    """
    Look up properties for several compounds, one request per identifier.

    Args:
        identifiers: List of compound identifiers
        namespace: Type of identifiers
        properties: List of properties to retrieve

    Returns:
        One property dictionary per identifier that produced a non-empty
        result, each extended with a 'query' key holding the identifier
        it was looked up by. Identifiers without a result are omitted.
    """
    collected = []
    for query in identifiers:
        record = get_compound_properties(query, namespace, properties)
        if not record:
            # Failed or empty lookup: leave it out of the output.
            continue
        record['query'] = query
        collected.append(record)
    return collected
|
||||
|
||||
|
||||
def download_structure(
    identifier: Union[str, int],
    namespace: str = 'name',
    format: str = 'SDF',
    filename: Optional[str] = None
) -> Optional[str]:
    """
    Download compound structure in specified format.

    Args:
        identifier: Compound identifier
        namespace: Type of identifier
        format: Output format ('SDF', 'JSON', 'PNG', etc.)
        filename: Output filename (if None, returns data as string).
            An existing file of that name is overwritten.

    Returns:
        Data string if filename is None, else None. None is also returned
        when the download fails, or when no filename was given and the
        data is not valid UTF-8 text (e.g. 'PNG'); the error is printed.
    """
    try:
        if filename:
            # pcp.download's signature is (outformat, path, identifier,
            # namespace, ...): the output path comes before the identifier.
            pcp.download(format, filename, identifier, namespace, overwrite=True)
            return None

        # pcp.download only writes to disk and returns nothing; pcp.get
        # returns the response body itself.
        raw = pcp.get(identifier, namespace, output=format)
        if isinstance(raw, bytes):
            try:
                return raw.decode('utf-8')
            except UnicodeDecodeError:
                print(f"Error downloading structure: {format} data is not text; "
                      "pass a filename to save it")
                return None
        return raw
    except Exception as e:
        print(f"Error downloading structure: {e}")
        return None
|
||||
|
||||
|
||||
def print_compound_info(compound: pcp.Compound) -> None:
    """
    Print formatted compound information.

    Prints the CID followed by one labelled line per property, framed by
    rules of 60 '=' characters. Properties that are None or an empty
    string are shown as 'N/A'; numeric zeros are printed as values.

    Args:
        compound: PubChemPy Compound object
    """
    def shown(value):
        # Only None and '' mean "missing". A plain `value or 'N/A'` would
        # also hide legitimate zeros such as 0 H-bond donors or a TPSA of 0.
        return 'N/A' if value is None or value == '' else value

    rule = '=' * 60

    print(f"\n{rule}")
    print(f"Compound CID: {compound.cid}")
    print(f"{rule}")
    print(f"IUPAC Name: {shown(compound.iupac_name)}")
    print(f"Molecular Formula: {shown(compound.molecular_formula)}")
    print(f"Molecular Weight: {shown(compound.molecular_weight)} g/mol")
    print(f"Canonical SMILES: {shown(compound.canonical_smiles)}")
    print(f"InChI: {shown(compound.inchi)}")
    print(f"InChI Key: {shown(compound.inchikey)}")
    print(f"XLogP: {shown(compound.xlogp)}")
    # Topological polar surface area is expressed in square angstroms.
    print(f"TPSA: {shown(compound.tpsa)} Å²")
    print(f"H-Bond Donors: {shown(compound.h_bond_donor_count)}")
    print(f"H-Bond Acceptors: {shown(compound.h_bond_acceptor_count)}")
    print(f"{rule}\n")
|
||||
|
||||
|
||||
def main():
    """Run four demonstration searches and print their results."""
    # Example 1: name lookup, showing the best hit in full.
    print("Example 1: Searching for 'aspirin'...")
    aspirin_hits = search_by_name('aspirin', max_results=1)
    if aspirin_hits:
        print_compound_info(aspirin_hits[0])

    # Example 2: default property set for caffeine.
    print("\nExample 2: Getting properties for caffeine...")
    caffeine_props = get_compound_properties('caffeine', 'name')
    print(json.dumps(caffeine_props, indent=2))

    # Example 3: compounds similar to benzene.
    print("\nExample 3: Finding compounds similar to benzene...")
    benzene_smiles = 'c1ccccc1'
    similar = similarity_search(benzene_smiles, threshold=95, max_records=5)
    print(f"Found {len(similar)} similar compounds:")
    for comp in similar:
        display_name = comp.iupac_name or 'N/A'
        print(f" CID {comp.cid}: {display_name}")

    # Example 4: formula and weight for several compounds at once.
    print("\nExample 4: Batch search for multiple compounds...")
    names = ['aspirin', 'ibuprofen', 'paracetamol']
    results = batch_search(names, properties=['MolecularFormula', 'MolecularWeight'])
    for result in results:
        query = result.get('query')
        formula = result.get('MolecularFormula')
        weight = result.get('MolecularWeight')
        print(f" {query}: {formula} ({weight} g/mol)")
|
||||
|
||||
|
||||
# Run the examples only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user