Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,367 @@
#!/usr/bin/env python3
"""
PubChem Bioactivity Data Retrieval
This script provides functions for retrieving biological activity data
from PubChem for compounds and assays.
"""
import sys
import json
import time
from typing import Dict, List, Optional
try:
import requests
except ImportError:
print("Error: requests is not installed. Install it with: pip install requests")
sys.exit(1)
# Root of the PUG REST API; the compound and assay endpoint URLs in this
# module are built by appending a path to it.
BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
# Root of the PUG-View API, used for the compound annotation lookups.
PUG_VIEW_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
# Rate limiting: 5 requests per second maximum.
# rate_limited_request() sleeps for this long before every call.
REQUEST_DELAY = 0.21 # seconds between requests
def rate_limited_request(url: str, method: str = 'GET', **kwargs) -> Optional[requests.Response]:
    """
    Make a rate-limited request to PubChem API.

    Sleeps for REQUEST_DELAY seconds before every call, sends the request,
    and turns any HTTP error status into a failure.

    Args:
        url: Request URL
        method: HTTP method ('GET' or 'POST'), matched case-insensitively.
            Any value other than 'GET' is sent as a POST.
        **kwargs: Additional arguments for requests. A ``timeout`` of
            30 seconds is applied unless the caller supplies one.

    Returns:
        Response object or None on error (the error is printed)
    """
    # requests never times out by default, so a stalled connection would
    # block the caller forever. An explicit caller-supplied timeout wins.
    kwargs.setdefault('timeout', 30)

    time.sleep(REQUEST_DELAY)
    try:
        if method.upper() == 'GET':
            response = requests.get(url, **kwargs)
        else:
            response = requests.post(url, **kwargs)
        # Raises for 4xx/5xx so callers only ever see successful responses.
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None
def get_bioassay_summary(cid: int) -> Optional[Dict]:
    """
    Fetch the assay summary for a single compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        The decoded JSON body of the ``assaysummary`` endpoint, or None
        when the request failed or did not answer with HTTP 200
    """
    endpoint = f"{BASE_URL}/compound/cid/{cid}/assaysummary/JSON"
    reply = rate_limited_request(endpoint)
    if not reply or reply.status_code != 200:
        return None
    return reply.json()
def get_compound_bioactivities(
    cid: int,
    activity_outcome: Optional[str] = None
) -> List[Dict]:
    """
    Get bioactivity data for a compound.

    Each row of the assay summary table is turned into a dictionary keyed
    by the table's column names.

    Args:
        cid: PubChem Compound ID
        activity_outcome: Filter by activity ('active', 'inactive',
            'inconclusive'); compared case-insensitively against the
            'Activity Outcome' column. None or '' disables filtering.

    Returns:
        List of bioactivity records (empty when no data is available)
    """
    data = get_bioassay_summary(cid)
    if not data:
        return []

    table = data.get('Table') or {}
    # The header is the same for every row: look it up once, and tolerate a
    # response that carries rows but no column list.
    columns = (table.get('Columns') or {}).get('Column') or []
    wanted = activity_outcome.lower() if activity_outcome else None

    activities = []
    for row in table.get('Row') or []:
        # zip() stops at the shorter sequence, so a row with more cells than
        # there are column names no longer raises IndexError.
        activity = dict(zip(columns, row.get('Cell') or []))
        if wanted is not None:
            # str() keeps a non-string cell from breaking .lower().
            outcome = str(activity.get('Activity Outcome', '')).lower()
            if outcome != wanted:
                continue
        activities.append(activity)
    return activities
def get_assay_description(aid: int) -> Optional[Dict]:
    """
    Fetch the description record of one assay.

    Args:
        aid: PubChem Assay ID (AID)

    Returns:
        The decoded JSON body of the assay ``description`` endpoint, or
        None when the request failed or did not answer with HTTP 200
    """
    endpoint = f"{BASE_URL}/assay/aid/{aid}/description/JSON"
    reply = rate_limited_request(endpoint)
    if not reply or reply.status_code != 200:
        return None
    return reply.json()
def get_assay_targets(aid: int) -> List[str]:
    """
    List the biological targets recorded in an assay description.

    Reads the ``target`` entries under ``assay -> descr`` of the first
    element of ``PC_AssayContainer``.

    Args:
        aid: PubChem Assay ID

    Returns:
        One label per target: its name when present, otherwise
        ``GI:<mol_id>``. Targets with neither are skipped. Empty when no
        description could be fetched.
    """
    description = get_assay_description(aid)
    if not description:
        return []

    first_entry = description.get('PC_AssayContainer', [{}])[0]
    target_entries = (
        first_entry.get('assay', {})
        .get('descr', {})
        .get('target', [])
    )

    labels = []
    for entry in target_entries:
        name = entry.get('name', '')
        mol_id = entry.get('mol_id', '')
        if name:
            labels.append(name)
            continue
        # Fall back to the molecule ID only when the target has no name.
        if mol_id:
            labels.append(f"GI:{mol_id}")
    return labels
def search_assays_by_target(
    target_name: str,
    max_results: int = 100,
    target_type: str = 'genesymbol'
) -> List[int]:
    """
    Search for assays targeting a specific protein or gene.

    Args:
        target_name: Name of the target (e.g., 'EGFR', 'p53')
        max_results: Maximum number of results
        target_type: How ``target_name`` is to be interpreted by PUG REST:
            one of 'gi', 'proteinname', 'geneid', 'genesymbol' or
            'accession'. Defaults to 'genesymbol'.

    Returns:
        List of Assay IDs (AIDs); empty when the request fails
    """
    from urllib.parse import quote

    # PUG REST addresses assay targets as assay/target/<type>/<name>; the
    # type segment is mandatory. Both segments are percent-encoded so that
    # names containing spaces or '/' cannot break the URL path.
    url = (
        f"{BASE_URL}/assay/target/"
        f"{quote(str(target_type), safe='')}/{quote(str(target_name), safe='')}"
        "/aids/JSON"
    )
    response = rate_limited_request(url)
    if response and response.status_code == 200:
        data = response.json()
        aids = data.get('IdentifierList', {}).get('AID', [])
        return aids[:max_results]
    return []
def get_active_compounds_in_assay(aid: int, max_results: int = 1000) -> List[int]:
    """
    Get list of active compounds in an assay.

    Args:
        aid: PubChem Assay ID
        max_results: Maximum number of results

    Returns:
        List of Compound IDs (CIDs) that showed activity; empty when the
        request fails or the response carries no CIDs
    """
    url = f"{BASE_URL}/assay/aid/{aid}/cids/JSON?cids_type=active"
    response = rate_limited_request(url)
    if not (response and response.status_code == 200):
        return []

    data = response.json()
    cids = list(data.get('IdentifierList', {}).get('CID', []))
    if not cids:
        # Assay-to-compound lookups may be reported as an InformationList
        # with one entry per AID rather than a flat IdentifierList, so
        # accept that shape as well.
        for info in data.get('InformationList', {}).get('Information', []):
            cids.extend(info.get('CID', []))
    return cids[:max_results]
def get_compound_annotations(cid: int, section: Optional[str] = None) -> Optional[Dict]:
    """
    Get comprehensive compound annotations from PUG-View.

    Args:
        cid: PubChem Compound ID
        section: Specific section to retrieve (e.g., 'Pharmacology and
            Biochemistry'). None or '' retrieves the full record.

    Returns:
        Dictionary containing annotation data, or None when the request
        failed or did not answer with HTTP 200
    """
    from urllib.parse import quote

    url = f"{PUG_VIEW_URL}/data/compound/{cid}/JSON"
    if section:
        # Percent-encode the heading: characters such as '&', '#' or '+'
        # would otherwise truncate or alter the query string.
        url += f"?heading={quote(section, safe='')}"
    response = rate_limited_request(url)
    if response and response.status_code == 200:
        return response.json()
    return None
def get_drug_information(cid: int) -> Optional[Dict]:
    """
    Fetch the "Drug and Medication Information" annotations of a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Whatever get_compound_annotations() returns for that section
    """
    heading = "Drug and Medication Information"
    return get_compound_annotations(cid, section=heading)
def get_safety_hazards(cid: int) -> Optional[Dict]:
    """
    Fetch the "Safety and Hazards" annotations of a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Whatever get_compound_annotations() returns for that section
    """
    heading = "Safety and Hazards"
    return get_compound_annotations(cid, section=heading)
def summarize_bioactivities(cid: int) -> Dict:
    """
    Generate a summary of bioactivity data for a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Dictionary with bioactivity summary statistics: 'total_assays'
        plus one counter each for 'active', 'inactive', 'inconclusive'
        and 'unspecified' outcomes. 'assay_types' is kept for
        compatibility and is always an empty dict.
    """
    activities = get_compound_bioactivities(cid)
    summary = {
        'total_assays': len(activities),
        'active': 0,
        'inactive': 0,
        'inconclusive': 0,
        'unspecified': 0,
        'assay_types': {}
    }
    for activity in activities:
        # str() keeps a non-string cell from breaking .lower().
        outcome = str(activity.get('Activity Outcome', '')).lower()
        # 'inactive' contains the substring 'active', so it has to be tested
        # first; otherwise every inactive result is counted as active.
        if 'inactive' in outcome:
            summary['inactive'] += 1
        elif 'inconclusive' in outcome:
            summary['inconclusive'] += 1
        elif 'active' in outcome:
            summary['active'] += 1
        else:
            summary['unspecified'] += 1
    return summary
def find_compounds_by_bioactivity(
    target: str,
    threshold: Optional[float] = None,
    max_compounds: int = 100
) -> List[Dict]:
    """
    Collect compounds reported active in assays against a target.

    Looks up at most 10 assays for the target, walks the first 5 of them
    and gathers their active compounds, keeping each CID only once.

    Args:
        target: Target name (e.g., 'EGFR')
        threshold: Activity threshold; accepted but not used by this
            function
        max_compounds: Maximum number of compounds to return

    Returns:
        List of ``{'cid', 'aid', 'target'}`` dictionaries, where 'aid' is
        the first assay in which the compound was seen
    """
    # Step 1: assays that mention the target
    candidate_aids = search_assays_by_target(target, max_results=10)
    if not candidate_aids:
        print(f"No assays found for target: {target}")
        return []

    # Step 2: active compounds from the first few assays, de-duplicated
    seen_cids = set()
    hits = []
    for aid in candidate_aids[:5]:
        for cid in get_active_compounds_in_assay(aid, max_results=max_compounds):
            if cid in seen_cids or len(hits) >= max_compounds:
                continue
            seen_cids.add(cid)
            hits.append({
                'cid': cid,
                'aid': aid,
                'target': target
            })
        # No need to query further assays once the quota is filled.
        if len(hits) >= max_compounds:
            break
    return hits
def main():
    """Run the demonstration queries and print their results."""
    aspirin_cid = 2244

    # Example 1: outcome counts for aspirin
    print("Example 1: Getting bioassay summary for aspirin (CID 2244)...")
    print(json.dumps(summarize_bioactivities(aspirin_cid), indent=2))

    # Example 2: only the records flagged as active
    print("\nExample 2: Getting active bioactivities for aspirin...")
    active_records = get_compound_bioactivities(aspirin_cid, activity_outcome='active')
    print(f"Found {len(active_records)} active bioactivities")
    if active_records:
        first_name = active_records[0].get('Assay Name', 'N/A')
        print(f"First activity: {first_name}")

    # Example 3: targets of the first active assay, if there is one
    print("\nExample 3: Getting assay description...")
    if active_records:
        aid = active_records[0].get('AID', 0)
        targets = get_assay_targets(aid)
        target_text = ', '.join(targets) if targets else 'N/A'
        print(f"Assay {aid} targets: {target_text}")

    # Example 4: compounds active against EGFR
    print("\nExample 4: Searching for EGFR inhibitors...")
    egfr_hits = find_compounds_by_bioactivity('EGFR', max_compounds=5)
    print(f"Found {len(egfr_hits)} compounds with EGFR activity")
    for hit in egfr_hits[:5]:
        print(f" CID {hit['cid']} (from AID {hit['aid']})")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
PubChem Compound Search Utility
This script provides functions for searching and retrieving compound information
from PubChem using the PubChemPy library.
"""
import sys
import json
from typing import List, Dict, Optional, Union
try:
import pubchempy as pcp
except ImportError:
print("Error: pubchempy is not installed. Install it with: pip install pubchempy")
sys.exit(1)
def search_by_name(name: str, max_results: int = 10) -> List[pcp.Compound]:
    """
    Look up compounds by chemical name.

    Args:
        name: Chemical name to search for
        max_results: Maximum number of results to return

    Returns:
        Up to ``max_results`` Compound objects; an empty list when the
        lookup raises (the error is printed)
    """
    try:
        matches = pcp.get_compounds(name, 'name')
        return matches[:max_results]
    except Exception as err:
        print(f"Error searching for '{name}': {err}")
    return []
def search_by_smiles(smiles: str) -> Optional[pcp.Compound]:
    """
    Look up a single compound by its SMILES string.

    Args:
        smiles: SMILES string

    Returns:
        The first matching Compound, or None when nothing matches or the
        lookup raises (the error is printed)
    """
    try:
        matches = pcp.get_compounds(smiles, 'smiles')
        if matches:
            return matches[0]
        return None
    except Exception as err:
        print(f"Error searching for SMILES '{smiles}': {err}")
    return None
def get_compound_by_cid(cid: int) -> Optional[pcp.Compound]:
    """
    Load a compound record by its PubChem Compound ID.

    Args:
        cid: PubChem Compound ID

    Returns:
        The Compound built by ``pcp.Compound.from_cid``, or None when that
        call raises (the error is printed)
    """
    try:
        compound = pcp.Compound.from_cid(cid)
        return compound
    except Exception as err:
        print(f"Error retrieving CID {cid}: {err}")
    return None
def get_compound_properties(
    identifier: Union[str, int],
    namespace: str = 'name',
    properties: Optional[List[str]] = None
) -> Dict:
    """
    Fetch selected properties of one compound.

    Args:
        identifier: Compound identifier (name, SMILES, CID, etc.)
        namespace: Type of identifier ('name', 'smiles', 'cid', 'inchi', etc.)
        properties: Property names to request. When None, a default set
            (formula, weight, canonical SMILES, IUPAC name, XLogP, TPSA and
            the H-bond donor/acceptor counts) is requested.

    Returns:
        The first property record returned, or an empty dict when there is
        none or the lookup raises (the error is printed)
    """
    default_properties = [
        'MolecularFormula',
        'MolecularWeight',
        'CanonicalSMILES',
        'IUPACName',
        'XLogP',
        'TPSA',
        'HBondDonorCount',
        'HBondAcceptorCount'
    ]
    requested = default_properties if properties is None else properties
    try:
        records = pcp.get_properties(requested, identifier, namespace)
        if records:
            return records[0]
        return {}
    except Exception as err:
        print(f"Error getting properties for '{identifier}': {err}")
    return {}
def similarity_search(
    smiles: str,
    threshold: int = 90,
    max_records: int = 10
) -> List[pcp.Compound]:
    """
    Find compounds that are structurally similar to a query SMILES.

    Args:
        smiles: Query SMILES string
        threshold: Similarity threshold (0-100)
        max_records: Maximum number of results

    Returns:
        List of similar Compound objects; empty when the search raises
        (the error is printed)
    """
    search_options = {
        'Threshold': threshold,
        'MaxRecords': max_records,
    }
    try:
        return pcp.get_compounds(
            smiles, 'smiles', searchtype='similarity', **search_options
        )
    except Exception as err:
        print(f"Error in similarity search: {err}")
    return []
def substructure_search(
    smiles: str,
    max_records: int = 100
) -> List[pcp.Compound]:
    """
    Find compounds that contain the query structure.

    Args:
        smiles: Query SMILES string (substructure)
        max_records: Maximum number of results

    Returns:
        List of Compound objects containing the substructure; empty when
        the search raises (the error is printed)
    """
    search_options = {'MaxRecords': max_records}
    try:
        return pcp.get_compounds(
            smiles, 'smiles', searchtype='substructure', **search_options
        )
    except Exception as err:
        print(f"Error in substructure search: {err}")
    return []
def get_synonyms(identifier: Union[str, int], namespace: str = 'name') -> List[str]:
    """
    List the synonyms recorded for a compound.

    Args:
        identifier: Compound identifier
        namespace: Type of identifier

    Returns:
        The 'Synonym' list of the first record returned; empty when there
        is no record, no such key, or the lookup raises (the error is
        printed)
    """
    try:
        records = pcp.get_synonyms(identifier, namespace)
        return records[0].get('Synonym', []) if records else []
    except Exception as err:
        print(f"Error getting synonyms: {err}")
    return []
def batch_search(
    identifiers: List[str],
    namespace: str = 'name',
    properties: Optional[List[str]] = None
) -> List[Dict]:
    """
    Fetch properties for several compounds, one lookup per identifier.

    Args:
        identifiers: List of compound identifiers
        namespace: Type of identifiers
        properties: List of properties to retrieve

    Returns:
        One property dictionary per identifier that produced a non-empty
        result, each tagged with the originating identifier under 'query'
    """
    collected = []
    for query in identifiers:
        found = get_compound_properties(query, namespace, properties)
        if not found:
            # Nothing came back for this identifier; leave it out.
            continue
        found['query'] = query
        collected.append(found)
    return collected
def download_structure(
    identifier: Union[str, int],
    namespace: str = 'name',
    format: str = 'SDF',
    filename: Optional[str] = None
) -> Optional[str]:
    """
    Download compound structure in specified format.

    Args:
        identifier: Compound identifier
        namespace: Type of identifier
        format: Output format ('SDF', 'JSON', 'PNG', etc.)
        filename: Output filename (if None, returns data as string). An
            existing file of that name is overwritten.

    Returns:
        Data string if filename is None, else None. None is also returned
        when the download fails or when the data is binary (e.g. 'PNG')
        and therefore cannot be returned as a string; the error is printed.
    """
    try:
        if filename:
            # pubchempy.download() takes the output path as its SECOND
            # positional argument: (outformat, path, identifier, namespace).
            pcp.download(format, filename, identifier, namespace, overwrite=True)
            return None
        # download() always writes to disk and returns None, so the raw
        # payload is fetched with get() when no file is wanted.
        data = pcp.get(identifier, namespace, output=format)
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        return data
    except UnicodeDecodeError:
        print(
            f"Error downloading structure: {format} data is binary; "
            "pass a filename to save it"
        )
        return None
    except Exception as e:
        print(f"Error downloading structure: {e}")
        return None
def print_compound_info(compound: pcp.Compound) -> None:
    """
    Print formatted compound information.

    Text fields that are missing or empty, and numeric fields that are
    None, are shown as 'N/A'. A numeric value of zero is printed as-is.

    Args:
        compound: PubChemPy Compound object
    """
    def shown(value):
        # `value or 'N/A'` would also replace legitimate zeros (an XLogP of
        # 0, zero H-bond donors), so only a missing value becomes 'N/A'.
        return 'N/A' if value is None else value

    print(f"\n{'='*60}")
    print(f"Compound CID: {compound.cid}")
    print(f"{'='*60}")
    print(f"IUPAC Name: {compound.iupac_name or 'N/A'}")
    print(f"Molecular Formula: {compound.molecular_formula or 'N/A'}")
    print(f"Molecular Weight: {shown(compound.molecular_weight)} g/mol")
    print(f"Canonical SMILES: {compound.canonical_smiles or 'N/A'}")
    print(f"InChI: {compound.inchi or 'N/A'}")
    print(f"InChI Key: {compound.inchikey or 'N/A'}")
    print(f"XLogP: {shown(compound.xlogp)}")
    print(f"TPSA: {shown(compound.tpsa)} Ų")
    print(f"H-Bond Donors: {shown(compound.h_bond_donor_count)}")
    print(f"H-Bond Acceptors: {shown(compound.h_bond_acceptor_count)}")
    print(f"{'='*60}\n")
def main():
    """Run the demonstration searches and print their results."""
    # Example 1: name lookup, showing the first hit only
    print("Example 1: Searching for 'aspirin'...")
    aspirin_hits = search_by_name('aspirin', max_results=1)
    if aspirin_hits:
        print_compound_info(aspirin_hits[0])

    # Example 2: default property set for one compound
    print("\nExample 2: Getting properties for caffeine...")
    caffeine_props = get_compound_properties('caffeine', 'name')
    print(json.dumps(caffeine_props, indent=2))

    # Example 3: similarity search around benzene
    print("\nExample 3: Finding compounds similar to benzene...")
    benzene_smiles = 'c1ccccc1'
    neighbours = similarity_search(benzene_smiles, threshold=95, max_records=5)
    print(f"Found {len(neighbours)} similar compounds:")
    for neighbour in neighbours:
        print(f" CID {neighbour.cid}: {neighbour.iupac_name or 'N/A'}")

    # Example 4: the same two properties for several names
    print("\nExample 4: Batch search for multiple compounds...")
    names = ['aspirin', 'ibuprofen', 'paracetamol']
    rows = batch_search(names, properties=['MolecularFormula', 'MolecularWeight'])
    for row in rows:
        query = row.get('query')
        formula = row.get('MolecularFormula')
        weight = row.get('MolecularWeight')
        print(f" {query}: {formula} ({weight} g/mol)")


if __name__ == '__main__':
    main()