Initial commit
This commit is contained in:
568
skills/pubchem-database/SKILL.md
Normal file
568
skills/pubchem-database/SKILL.md
Normal file
@@ -0,0 +1,568 @@
|
||||
---
|
||||
name: pubchem-database
|
||||
description: "Query PubChem via PUG-REST API/PubChemPy (110M+ compounds). Search by name/CID/SMILES, retrieve properties, similarity/substructure searches, bioactivity, for cheminformatics."
|
||||
---
|
||||
|
||||
# PubChem Database
|
||||
|
||||
## Overview
|
||||
|
||||
PubChem is the world's largest freely available chemical database with 110M+ compounds and 270M+ bioactivities. Query chemical structures by name, CID, or SMILES; retrieve molecular properties; perform similarity and substructure searches; and access bioactivity data using the PUG-REST API and PubChemPy.
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
This skill should be used when:
|
||||
- Searching for chemical compounds by name, structure (SMILES/InChI), or molecular formula
|
||||
- Retrieving molecular properties (MW, LogP, TPSA, hydrogen bonding descriptors)
|
||||
- Performing similarity searches to find structurally related compounds
|
||||
- Conducting substructure searches for specific chemical motifs
|
||||
- Accessing bioactivity data from screening assays
|
||||
- Converting between chemical identifier formats (CID, SMILES, InChI)
|
||||
- Batch processing multiple compounds for drug-likeness screening or property analysis
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. Chemical Structure Search
|
||||
|
||||
Search for compounds using multiple identifier types:
|
||||
|
||||
**By Chemical Name**:
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
compounds = pcp.get_compounds('aspirin', 'name')
|
||||
compound = compounds[0]
|
||||
```
|
||||
|
||||
**By CID (Compound ID)**:
|
||||
```python
|
||||
compound = pcp.Compound.from_cid(2244) # Aspirin
|
||||
```
|
||||
|
||||
**By SMILES**:
|
||||
```python
|
||||
compound = pcp.get_compounds('CC(=O)OC1=CC=CC=C1C(=O)O', 'smiles')[0]
|
||||
```
|
||||
|
||||
**By InChI**:
|
||||
```python
|
||||
compound = pcp.get_compounds('InChI=1S/C9H8O4/...', 'inchi')[0]
|
||||
```
|
||||
|
||||
**By Molecular Formula**:
|
||||
```python
|
||||
compounds = pcp.get_compounds('C9H8O4', 'formula')
|
||||
# Returns all compounds matching this formula
|
||||
```
|
||||
|
||||
### 2. Property Retrieval
|
||||
|
||||
Retrieve molecular properties for compounds using either high-level or low-level approaches:
|
||||
|
||||
**Using PubChemPy (Recommended)**:
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
# Get compound object with all properties
|
||||
compound = pcp.get_compounds('caffeine', 'name')[0]
|
||||
|
||||
# Access individual properties
|
||||
molecular_formula = compound.molecular_formula
|
||||
molecular_weight = compound.molecular_weight
|
||||
iupac_name = compound.iupac_name
|
||||
smiles = compound.canonical_smiles
|
||||
inchi = compound.inchi
|
||||
xlogp = compound.xlogp # Partition coefficient
|
||||
tpsa = compound.tpsa # Topological polar surface area
|
||||
```
|
||||
|
||||
**Get Specific Properties**:
|
||||
```python
|
||||
# Request only specific properties
|
||||
properties = pcp.get_properties(
|
||||
['MolecularFormula', 'MolecularWeight', 'CanonicalSMILES', 'XLogP'],
|
||||
'aspirin',
|
||||
'name'
|
||||
)
|
||||
# Returns list of dictionaries
|
||||
```
|
||||
|
||||
**Batch Property Retrieval**:
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
compound_names = ['aspirin', 'ibuprofen', 'paracetamol']
|
||||
all_properties = []
|
||||
|
||||
for name in compound_names:
|
||||
props = pcp.get_properties(
|
||||
['MolecularFormula', 'MolecularWeight', 'XLogP'],
|
||||
name,
|
||||
'name'
|
||||
)
|
||||
all_properties.extend(props)
|
||||
|
||||
df = pd.DataFrame(all_properties)
|
||||
```
|
||||
|
||||
**Available Properties**: MolecularFormula, MolecularWeight, CanonicalSMILES, IsomericSMILES, InChI, InChIKey, IUPACName, XLogP, TPSA, HBondDonorCount, HBondAcceptorCount, RotatableBondCount, Complexity, Charge, and many more (see `references/api_reference.md` for complete list).
|
||||
|
||||
### 3. Similarity Search
|
||||
|
||||
Find structurally similar compounds using Tanimoto similarity:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
# Start with a query compound
|
||||
query_compound = pcp.get_compounds('gefitinib', 'name')[0]
|
||||
query_smiles = query_compound.canonical_smiles
|
||||
|
||||
# Perform similarity search
|
||||
similar_compounds = pcp.get_compounds(
|
||||
query_smiles,
|
||||
'smiles',
|
||||
searchtype='similarity',
|
||||
Threshold=85, # Similarity threshold (0-100)
|
||||
MaxRecords=50
|
||||
)
|
||||
|
||||
# Process results
|
||||
for compound in similar_compounds[:10]:
|
||||
print(f"CID {compound.cid}: {compound.iupac_name}")
|
||||
print(f" MW: {compound.molecular_weight}")
|
||||
```
|
||||
|
||||
**Note**: Similarity searches are asynchronous for large queries and may take 15-30 seconds to complete. PubChemPy handles the asynchronous pattern automatically.
|
||||
|
||||
### 4. Substructure Search
|
||||
|
||||
Find compounds containing a specific structural motif:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
# Search for compounds containing pyridine ring
|
||||
pyridine_smiles = 'c1ccncc1'
|
||||
|
||||
matches = pcp.get_compounds(
|
||||
pyridine_smiles,
|
||||
'smiles',
|
||||
searchtype='substructure',
|
||||
MaxRecords=100
|
||||
)
|
||||
|
||||
print(f"Found {len(matches)} compounds containing pyridine")
|
||||
```
|
||||
|
||||
**Common Substructures**:
|
||||
- Benzene ring: `c1ccccc1`
|
||||
- Pyridine: `c1ccncc1`
|
||||
- Phenol: `c1ccc(O)cc1`
|
||||
- Carboxylic acid: `C(=O)O`
|
||||
|
||||
### 5. Format Conversion
|
||||
|
||||
Convert between different chemical structure formats:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
compound = pcp.get_compounds('aspirin', 'name')[0]
|
||||
|
||||
# Convert to different formats
|
||||
smiles = compound.canonical_smiles
|
||||
inchi = compound.inchi
|
||||
inchikey = compound.inchikey
|
||||
cid = compound.cid
|
||||
|
||||
# Download structure files
|
||||
pcp.download('SDF', 'aspirin.sdf', 'aspirin', 'name', overwrite=True)
|
||||
pcp.download('JSON', 'aspirin.json', '2244', 'cid', overwrite=True)
|
||||
```
|
||||
|
||||
### 6. Structure Visualization
|
||||
|
||||
Generate 2D structure images:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
# Download compound structure as PNG
|
||||
pcp.download('PNG', 'caffeine.png', 'caffeine', 'name', overwrite=True)
|
||||
|
||||
# Using direct URL (via requests)
|
||||
import requests
|
||||
|
||||
cid = 2244 # Aspirin
|
||||
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/PNG?image_size=large"
|
||||
response = requests.get(url)
|
||||
|
||||
with open('structure.png', 'wb') as f:
|
||||
f.write(response.content)
|
||||
```
|
||||
|
||||
### 7. Synonym Retrieval
|
||||
|
||||
Get all known names and synonyms for a compound:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
synonyms_data = pcp.get_synonyms('aspirin', 'name')
|
||||
|
||||
if synonyms_data:
|
||||
cid = synonyms_data[0]['CID']
|
||||
synonyms = synonyms_data[0]['Synonym']
|
||||
|
||||
print(f"CID {cid} has {len(synonyms)} synonyms:")
|
||||
for syn in synonyms[:10]: # First 10
|
||||
print(f" - {syn}")
|
||||
```
|
||||
|
||||
### 8. Bioactivity Data Access
|
||||
|
||||
Retrieve biological activity data from assays:
|
||||
|
||||
```python
|
||||
import requests
|
||||
import json
|
||||
|
||||
# Get bioassay summary for a compound
|
||||
cid = 2244 # Aspirin
|
||||
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/assaysummary/JSON"
|
||||
|
||||
response = requests.get(url)
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
# Process bioassay information
|
||||
table = data.get('Table', {})
|
||||
rows = table.get('Row', [])
|
||||
print(f"Found {len(rows)} bioassay records")
|
||||
```
|
||||
|
||||
**For more complex bioactivity queries**, use the `scripts/bioactivity_query.py` helper script which provides:
|
||||
- Bioassay summaries with activity outcome filtering
|
||||
- Assay target identification
|
||||
- Search for compounds by biological target
|
||||
- Active compound lists for specific assays
|
||||
|
||||
### 9. Comprehensive Compound Annotations
|
||||
|
||||
Access detailed compound information through PUG-View:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
cid = 2244
|
||||
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON"
|
||||
|
||||
response = requests.get(url)
|
||||
if response.status_code == 200:
|
||||
annotations = response.json()
|
||||
# Contains extensive data including:
|
||||
# - Chemical and Physical Properties
|
||||
# - Drug and Medication Information
|
||||
# - Pharmacology and Biochemistry
|
||||
# - Safety and Hazards
|
||||
# - Toxicity
|
||||
# - Literature references
|
||||
# - Patents
|
||||
```
|
||||
|
||||
**Get Specific Section**:
|
||||
```python
|
||||
# Get only drug information
|
||||
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON?heading=Drug and Medication Information"
|
||||
```
|
||||
|
||||
## Installation Requirements
|
||||
|
||||
Install PubChemPy for Python-based access:
|
||||
|
||||
```bash
|
||||
uv pip install pubchempy
|
||||
```
|
||||
|
||||
For direct API access and bioactivity queries:
|
||||
|
||||
```bash
|
||||
uv pip install requests
|
||||
```
|
||||
|
||||
Optional for data analysis:
|
||||
|
||||
```bash
|
||||
uv pip install pandas
|
||||
```
|
||||
|
||||
## Helper Scripts
|
||||
|
||||
This skill includes Python scripts for common PubChem tasks:
|
||||
|
||||
### scripts/compound_search.py
|
||||
|
||||
Provides utility functions for searching and retrieving compound information:
|
||||
|
||||
**Key Functions**:
|
||||
- `search_by_name(name, max_results=10)`: Search compounds by name
|
||||
- `search_by_smiles(smiles)`: Search by SMILES string
|
||||
- `get_compound_by_cid(cid)`: Retrieve compound by CID
|
||||
- `get_compound_properties(identifier, namespace, properties)`: Get specific properties
|
||||
- `similarity_search(smiles, threshold, max_records)`: Perform similarity search
|
||||
- `substructure_search(smiles, max_records)`: Perform substructure search
|
||||
- `get_synonyms(identifier, namespace)`: Get all synonyms
|
||||
- `batch_search(identifiers, namespace, properties)`: Batch search multiple compounds
|
||||
- `download_structure(identifier, namespace, format, filename)`: Download structures
|
||||
- `print_compound_info(compound)`: Print formatted compound information
|
||||
|
||||
**Usage**:
|
||||
```python
|
||||
from scripts.compound_search import search_by_name, get_compound_properties
|
||||
|
||||
# Search for a compound
|
||||
compounds = search_by_name('ibuprofen')
|
||||
|
||||
# Get specific properties
|
||||
props = get_compound_properties('aspirin', 'name', ['MolecularWeight', 'XLogP'])
|
||||
```
|
||||
|
||||
### scripts/bioactivity_query.py
|
||||
|
||||
Provides functions for retrieving biological activity data:
|
||||
|
||||
**Key Functions**:
|
||||
- `get_bioassay_summary(cid)`: Get bioassay summary for compound
|
||||
- `get_compound_bioactivities(cid, activity_outcome)`: Get filtered bioactivities
|
||||
- `get_assay_description(aid)`: Get detailed assay information
|
||||
- `get_assay_targets(aid)`: Get biological targets for assay
|
||||
- `search_assays_by_target(target_name, max_results)`: Find assays by target
|
||||
- `get_active_compounds_in_assay(aid, max_results)`: Get active compounds
|
||||
- `get_compound_annotations(cid, section)`: Get PUG-View annotations
|
||||
- `summarize_bioactivities(cid)`: Generate bioactivity summary statistics
|
||||
- `find_compounds_by_bioactivity(target, threshold, max_compounds)`: Find compounds by target
|
||||
|
||||
**Usage**:
|
||||
```python
|
||||
from scripts.bioactivity_query import get_bioassay_summary, summarize_bioactivities
|
||||
|
||||
# Get bioactivity summary
|
||||
summary = summarize_bioactivities(2244) # Aspirin
|
||||
print(f"Total assays: {summary['total_assays']}")
|
||||
print(f"Active: {summary['active']}, Inactive: {summary['inactive']}")
|
||||
```
|
||||
|
||||
## API Rate Limits and Best Practices
|
||||
|
||||
**Rate Limits**:
|
||||
- Maximum 5 requests per second
|
||||
- Maximum 400 requests per minute
|
||||
- Maximum 300 seconds running time per minute
|
||||
|
||||
**Best Practices**:
|
||||
1. **Use CIDs for repeated queries**: CIDs are more efficient than names or structures
|
||||
2. **Cache results locally**: Store frequently accessed data
|
||||
3. **Batch requests**: Combine multiple queries when possible
|
||||
4. **Implement delays**: Add 0.2-0.3 second delays between requests
|
||||
5. **Handle errors gracefully**: Check for HTTP errors and missing data
|
||||
6. **Use PubChemPy**: Higher-level abstraction handles many edge cases
|
||||
7. **Leverage asynchronous pattern**: For large similarity/substructure searches
|
||||
8. **Specify MaxRecords**: Limit results to avoid timeouts
|
||||
|
||||
**Error Handling**:
|
||||
```python
|
||||
from pubchempy import BadRequestError, NotFoundError, TimeoutError
|
||||
|
||||
try:
|
||||
compound = pcp.get_compounds('query', 'name')[0]
|
||||
except NotFoundError:
|
||||
print("Compound not found")
|
||||
except BadRequestError:
|
||||
print("Invalid request format")
|
||||
except TimeoutError:
|
||||
print("Request timed out - try reducing scope")
|
||||
except IndexError:
|
||||
print("No results returned")
|
||||
```
|
||||
|
||||
## Common Workflows
|
||||
|
||||
### Workflow 1: Chemical Identifier Conversion Pipeline
|
||||
|
||||
Convert between different chemical identifiers:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
# Start with any identifier type
|
||||
compound = pcp.get_compounds('caffeine', 'name')[0]
|
||||
|
||||
# Extract all identifier formats
|
||||
identifiers = {
|
||||
'CID': compound.cid,
|
||||
'Name': compound.iupac_name,
|
||||
'SMILES': compound.canonical_smiles,
|
||||
'InChI': compound.inchi,
|
||||
'InChIKey': compound.inchikey,
|
||||
'Formula': compound.molecular_formula
|
||||
}
|
||||
```
|
||||
|
||||
### Workflow 2: Drug-Like Property Screening
|
||||
|
||||
Screen compounds using Lipinski's Rule of Five:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
def check_drug_likeness(compound_name):
|
||||
compound = pcp.get_compounds(compound_name, 'name')[0]
|
||||
|
||||
# Lipinski's Rule of Five
|
||||
rules = {
|
||||
'MW <= 500': compound.molecular_weight <= 500,
|
||||
'LogP <= 5': compound.xlogp <= 5 if compound.xlogp is not None else None,
|
||||
'HBD <= 5': compound.h_bond_donor_count <= 5,
|
||||
'HBA <= 10': compound.h_bond_acceptor_count <= 10
|
||||
}
|
||||
|
||||
violations = sum(1 for v in rules.values() if v is False)
|
||||
return rules, violations
|
||||
|
||||
rules, violations = check_drug_likeness('aspirin')
|
||||
print(f"Lipinski violations: {violations}")
|
||||
```
|
||||
|
||||
### Workflow 3: Finding Similar Drug Candidates
|
||||
|
||||
Identify structurally similar compounds to a known drug:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
# Start with known drug
|
||||
reference_drug = pcp.get_compounds('imatinib', 'name')[0]
|
||||
reference_smiles = reference_drug.canonical_smiles
|
||||
|
||||
# Find similar compounds
|
||||
similar = pcp.get_compounds(
|
||||
reference_smiles,
|
||||
'smiles',
|
||||
searchtype='similarity',
|
||||
Threshold=85,
|
||||
MaxRecords=20
|
||||
)
|
||||
|
||||
# Filter by drug-like properties
|
||||
candidates = []
|
||||
for comp in similar:
|
||||
if comp.molecular_weight and 200 <= comp.molecular_weight <= 600:
|
||||
if comp.xlogp is not None and -1 <= comp.xlogp <= 5:
|
||||
candidates.append(comp)
|
||||
|
||||
print(f"Found {len(candidates)} drug-like candidates")
|
||||
```
|
||||
|
||||
### Workflow 4: Batch Compound Property Comparison
|
||||
|
||||
Compare properties across multiple compounds:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
import pandas as pd
|
||||
|
||||
compound_list = ['aspirin', 'ibuprofen', 'naproxen', 'celecoxib']
|
||||
|
||||
properties_list = []
|
||||
for name in compound_list:
|
||||
try:
|
||||
compound = pcp.get_compounds(name, 'name')[0]
|
||||
properties_list.append({
|
||||
'Name': name,
|
||||
'CID': compound.cid,
|
||||
'Formula': compound.molecular_formula,
|
||||
'MW': compound.molecular_weight,
|
||||
'LogP': compound.xlogp,
|
||||
'TPSA': compound.tpsa,
|
||||
'HBD': compound.h_bond_donor_count,
|
||||
'HBA': compound.h_bond_acceptor_count
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Error processing {name}: {e}")
|
||||
|
||||
df = pd.DataFrame(properties_list)
|
||||
print(df.to_string(index=False))
|
||||
```
|
||||
|
||||
### Workflow 5: Substructure-Based Virtual Screening
|
||||
|
||||
Screen for compounds containing specific pharmacophores:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
# Define pharmacophore (e.g., sulfonamide group)
|
||||
pharmacophore_smiles = 'S(=O)(=O)N'
|
||||
|
||||
# Search for compounds containing this substructure
|
||||
hits = pcp.get_compounds(
|
||||
pharmacophore_smiles,
|
||||
'smiles',
|
||||
searchtype='substructure',
|
||||
MaxRecords=100
|
||||
)
|
||||
|
||||
# Further filter by properties
|
||||
filtered_hits = [
|
||||
comp for comp in hits
|
||||
if comp.molecular_weight and comp.molecular_weight < 500
|
||||
]
|
||||
|
||||
print(f"Found {len(filtered_hits)} compounds with desired substructure")
|
||||
```
|
||||
|
||||
## Reference Documentation
|
||||
|
||||
For detailed API documentation, including complete property lists, URL patterns, advanced query options, and more examples, consult `references/api_reference.md`. This comprehensive reference includes:
|
||||
|
||||
- Complete PUG-REST API endpoint documentation
|
||||
- Full list of available molecular properties
|
||||
- Asynchronous request handling patterns
|
||||
- PubChemPy API reference
|
||||
- PUG-View API for annotations
|
||||
- Common workflows and use cases
|
||||
- Links to official PubChem documentation
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Compound Not Found**:
|
||||
- Try alternative names or synonyms
|
||||
- Use CID if known
|
||||
- Check spelling and chemical name format
|
||||
|
||||
**Timeout Errors**:
|
||||
- Reduce MaxRecords parameter
|
||||
- Add delays between requests
|
||||
- Use CIDs instead of names for faster queries
|
||||
|
||||
**Empty Property Values**:
|
||||
- Not all properties are available for all compounds
|
||||
- Check for a missing value before using a property: `if compound.xlogp is not None:` (a plain truthiness test would also skip valid values such as 0)
|
||||
- Some properties only available for certain compound types
|
||||
|
||||
**Rate Limit Exceeded**:
|
||||
- Implement delays (0.2-0.3 seconds) between requests
|
||||
- Use batch operations where possible
|
||||
- Consider caching results locally
|
||||
|
||||
**Similarity/Substructure Search Hangs**:
|
||||
- These are asynchronous operations that may take 15-30 seconds
|
||||
- PubChemPy handles polling automatically
|
||||
- Reduce MaxRecords if timing out
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- PubChem Home: https://pubchem.ncbi.nlm.nih.gov/
|
||||
- PUG-REST Documentation: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest
|
||||
- PUG-REST Tutorial: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest-tutorial
|
||||
- PubChemPy Documentation: https://pubchempy.readthedocs.io/
|
||||
- PubChemPy GitHub: https://github.com/mcs07/PubChemPy
|
||||
440
skills/pubchem-database/references/api_reference.md
Normal file
440
skills/pubchem-database/references/api_reference.md
Normal file
@@ -0,0 +1,440 @@
|
||||
# PubChem API Reference
|
||||
|
||||
## Overview
|
||||
|
||||
PubChem is the world's largest freely available chemical database maintained by the National Center for Biotechnology Information (NCBI). It contains over 110 million unique chemical structures and over 270 million bioactivities from more than 770 data sources.
|
||||
|
||||
## Database Structure
|
||||
|
||||
PubChem consists of three primary subdatabases:
|
||||
|
||||
1. **Compound Database**: Unique validated chemical structures with computed properties
|
||||
2. **Substance Database**: Deposited chemical substance records from data sources
|
||||
3. **BioAssay Database**: Biological activity test results for chemical compounds
|
||||
|
||||
## PubChem PUG-REST API
|
||||
|
||||
### Base URL Structure
|
||||
|
||||
```
|
||||
https://pubchem.ncbi.nlm.nih.gov/rest/pug/<input>/<operation>/<output>
|
||||
```
|
||||
|
||||
Components:
|
||||
- `<input>`: compound/cid, substance/sid, assay/aid, or search specifications
|
||||
- `<operation>`: Optional operations like property, synonyms, classification, etc.
|
||||
- `<output>`: Format such as JSON, XML, CSV, PNG, SDF, etc.
|
||||
|
||||
### Common Request Patterns
|
||||
|
||||
#### 1. Retrieve by Identifier
|
||||
|
||||
Get compound by CID (Compound ID):
|
||||
```
|
||||
GET /rest/pug/compound/cid/{cid}/property/{properties}/JSON
|
||||
```
|
||||
|
||||
Get compound by name:
|
||||
```
|
||||
GET /rest/pug/compound/name/{name}/property/{properties}/JSON
|
||||
```
|
||||
|
||||
Get compound by SMILES:
|
||||
```
|
||||
GET /rest/pug/compound/smiles/{smiles}/property/{properties}/JSON
|
||||
```
|
||||
|
||||
Get compound by InChI:
|
||||
```
|
||||
GET /rest/pug/compound/inchi/{inchi}/property/{properties}/JSON
|
||||
```
|
||||
|
||||
#### 2. Available Properties
|
||||
|
||||
Common molecular properties that can be retrieved:
|
||||
- `MolecularFormula`
|
||||
- `MolecularWeight`
|
||||
- `CanonicalSMILES`
|
||||
- `IsomericSMILES`
|
||||
- `InChI`
|
||||
- `InChIKey`
|
||||
- `IUPACName`
|
||||
- `XLogP`
|
||||
- `ExactMass`
|
||||
- `MonoisotopicMass`
|
||||
- `TPSA` (Topological Polar Surface Area)
|
||||
- `Complexity`
|
||||
- `Charge`
|
||||
- `HBondDonorCount`
|
||||
- `HBondAcceptorCount`
|
||||
- `RotatableBondCount`
|
||||
- `HeavyAtomCount`
|
||||
- `IsotopeAtomCount`
|
||||
- `AtomStereoCount`
|
||||
- `BondStereoCount`
|
||||
- `CovalentUnitCount`
|
||||
- `Volume3D`
|
||||
- `XStericQuadrupole3D`
|
||||
- `YStericQuadrupole3D`
|
||||
- `ZStericQuadrupole3D`
|
||||
- `FeatureCount3D`
|
||||
|
||||
To retrieve multiple properties, separate them with commas:
|
||||
```
|
||||
/property/MolecularFormula,MolecularWeight,CanonicalSMILES/JSON
|
||||
```
|
||||
|
||||
#### 3. Structure Search Operations
|
||||
|
||||
**Similarity Search**:
|
||||
```
|
||||
POST /rest/pug/compound/similarity/smiles/{smiles}/JSON
|
||||
Parameters: Threshold (default 90%)
|
||||
```
|
||||
|
||||
**Substructure Search**:
|
||||
```
|
||||
POST /rest/pug/compound/substructure/smiles/{smiles}/cids/JSON
|
||||
```
|
||||
|
||||
**Superstructure Search**:
|
||||
```
|
||||
POST /rest/pug/compound/superstructure/smiles/{smiles}/cids/JSON
|
||||
```
|
||||
|
||||
#### 4. Image Generation
|
||||
|
||||
Get 2D structure image:
|
||||
```
|
||||
GET /rest/pug/compound/cid/{cid}/PNG
|
||||
Optional parameters: image_size=small|large
|
||||
```
|
||||
|
||||
#### 5. Format Conversion
|
||||
|
||||
Get compound as SDF (Structure-Data File):
|
||||
```
|
||||
GET /rest/pug/compound/cid/{cid}/SDF
|
||||
```
|
||||
|
||||
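Get the full compound record as SDF (PUG-REST has no separate MOL output format; a single-record SDF carries the MOL block):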
|
||||
```
|
||||
GET /rest/pug/compound/cid/{cid}/record/SDF
|
||||
```
|
||||
|
||||
#### 6. Synonym Retrieval
|
||||
|
||||
Get all synonyms for a compound:
|
||||
```
|
||||
GET /rest/pug/compound/cid/{cid}/synonyms/JSON
|
||||
```
|
||||
|
||||
#### 7. Bioassay Data
|
||||
|
||||
Get bioassay data for a compound:
|
||||
```
|
||||
GET /rest/pug/compound/cid/{cid}/assaysummary/JSON
|
||||
```
|
||||
|
||||
Get specific assay information:
|
||||
```
|
||||
GET /rest/pug/assay/aid/{aid}/description/JSON
|
||||
```
|
||||
|
||||
### Asynchronous Requests
|
||||
|
||||
For large queries (similarity/substructure searches), PUG-REST uses an asynchronous pattern:
|
||||
|
||||
1. Submit the query (returns ListKey)
|
||||
2. Check status using the ListKey
|
||||
3. Retrieve results when ready
|
||||
|
||||
Example workflow:
|
||||
```python
|
||||
# Step 1: Submit similarity search
|
||||
response = requests.post(
|
||||
f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/similarity/smiles/{smiles}/cids/JSON",
|
||||
data={"Threshold": 90}
|
||||
)
|
||||
listkey = response.json()["Waiting"]["ListKey"]
|
||||
|
||||
# Step 2: Check status
|
||||
status_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/{listkey}/cids/JSON"
|
||||
|
||||
# Step 3: Poll until ready (with timeout)
|
||||
# Step 4: Retrieve results from the same URL
|
||||
```
|
||||
|
||||
### Usage Limits
|
||||
|
||||
**Rate Limits**:
|
||||
- Maximum 5 requests per second
|
||||
- Maximum 400 requests per minute
|
||||
- Maximum 300 seconds running time per minute
|
||||
|
||||
**Best Practices**:
|
||||
- Use batch requests when possible
|
||||
- Implement exponential backoff for retries
|
||||
- Cache results when appropriate
|
||||
- Use asynchronous pattern for large queries
|
||||
|
||||
## PubChemPy Python Library
|
||||
|
||||
PubChemPy is a Python wrapper that simplifies PUG-REST API access.
|
||||
|
||||
### Installation
|
||||
|
||||
```bash
|
||||
pip install pubchempy
|
||||
```
|
||||
|
||||
### Key Classes
|
||||
|
||||
#### Compound Class
|
||||
|
||||
Main class for representing chemical compounds:
|
||||
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
# Get by CID
|
||||
compound = pcp.Compound.from_cid(2244)
|
||||
|
||||
# Access properties
|
||||
compound.molecular_formula # 'C9H8O4'
|
||||
compound.molecular_weight # 180.16
|
||||
compound.iupac_name # '2-acetyloxybenzoic acid'
|
||||
compound.canonical_smiles # 'CC(=O)OC1=CC=CC=C1C(=O)O'
|
||||
compound.isomeric_smiles # Same as canonical for non-stereoisomers
|
||||
compound.inchi # InChI string
|
||||
compound.inchikey # InChI Key
|
||||
compound.xlogp # Partition coefficient
|
||||
compound.tpsa # Topological polar surface area
|
||||
```
|
||||
|
||||
#### Search Methods
|
||||
|
||||
**By Name**:
|
||||
```python
|
||||
compounds = pcp.get_compounds('aspirin', 'name')
|
||||
# Returns list of Compound objects
|
||||
```
|
||||
|
||||
**By SMILES**:
|
||||
```python
|
||||
compound = pcp.get_compounds('CC(=O)OC1=CC=CC=C1C(=O)O', 'smiles')[0]
|
||||
```
|
||||
|
||||
**By InChI**:
|
||||
```python
|
||||
compound = pcp.get_compounds('InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)', 'inchi')[0]
|
||||
```
|
||||
|
||||
**By Formula**:
|
||||
```python
|
||||
compounds = pcp.get_compounds('C9H8O4', 'formula')
|
||||
# Returns all compounds with this formula
|
||||
```
|
||||
|
||||
**Similarity Search**:
|
||||
```python
|
||||
results = pcp.get_compounds('CC(=O)OC1=CC=CC=C1C(=O)O', 'smiles',
|
||||
searchtype='similarity',
|
||||
Threshold=90)
|
||||
```
|
||||
|
||||
**Substructure Search**:
|
||||
```python
|
||||
results = pcp.get_compounds('c1ccccc1', 'smiles',
|
||||
searchtype='substructure')
|
||||
# Returns all compounds containing benzene ring
|
||||
```
|
||||
|
||||
#### Property Retrieval
|
||||
|
||||
Get specific properties for multiple compounds:
|
||||
```python
|
||||
properties = pcp.get_properties(
|
||||
['MolecularFormula', 'MolecularWeight', 'CanonicalSMILES'],
|
||||
'aspirin',
|
||||
'name'
|
||||
)
|
||||
# Returns list of dictionaries
|
||||
```
|
||||
|
||||
Get properties as pandas DataFrame:
|
||||
```python
|
||||
import pandas as pd
|
||||
df = pd.DataFrame(properties)
|
||||
```
|
||||
|
||||
#### Synonyms
|
||||
|
||||
Get all synonyms for a compound:
|
||||
```python
|
||||
synonyms = pcp.get_synonyms('aspirin', 'name')
|
||||
# Returns list of dictionaries with CID and synonym lists
|
||||
```
|
||||
|
||||
#### Download Formats
|
||||
|
||||
Download compound in various formats:
|
||||
```python
|
||||
# Get as SDF
|
||||
pcp.download('SDF', 'aspirin.sdf', 'aspirin', 'name', overwrite=True)  # writes the file; returns None
|
||||
|
||||
# Get as JSON
|
||||
pcp.download('JSON', 'aspirin.json', '2244', 'cid', overwrite=True)  # writes the file; returns None
|
||||
|
||||
# Get as PNG image
|
||||
pcp.download('PNG', 'aspirin.png', '2244', 'cid', overwrite=True)
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
from pubchempy import BadRequestError, NotFoundError, TimeoutError
|
||||
|
||||
try:
|
||||
compound = pcp.get_compounds('nonexistent', 'name')
|
||||
except NotFoundError:
|
||||
print("Compound not found")
|
||||
except BadRequestError:
|
||||
print("Invalid request")
|
||||
except TimeoutError:
|
||||
print("Request timed out")
|
||||
```
|
||||
|
||||
## PUG-View API
|
||||
|
||||
PUG-View provides access to full textual annotations and specialized reports.
|
||||
|
||||
### Key Endpoints
|
||||
|
||||
Get compound annotations:
|
||||
```
|
||||
GET /rest/pug_view/data/compound/{cid}/JSON
|
||||
```
|
||||
|
||||
Get specific annotation sections:
|
||||
```
|
||||
GET /rest/pug_view/data/compound/{cid}/JSON?heading={section_name}
|
||||
```
|
||||
|
||||
Available sections include:
|
||||
- Chemical and Physical Properties
|
||||
- Drug and Medication Information
|
||||
- Pharmacology and Biochemistry
|
||||
- Safety and Hazards
|
||||
- Toxicity
|
||||
- Literature
|
||||
- Patents
|
||||
- Biomolecular Interactions and Pathways
|
||||
|
||||
## Common Workflows
|
||||
|
||||
### 1. Chemical Identifier Conversion
|
||||
|
||||
Convert from name to SMILES to InChI:
|
||||
```python
|
||||
import pubchempy as pcp
|
||||
|
||||
compound = pcp.get_compounds('caffeine', 'name')[0]
|
||||
smiles = compound.canonical_smiles
|
||||
inchi = compound.inchi
|
||||
inchikey = compound.inchikey
|
||||
cid = compound.cid
|
||||
```
|
||||
|
||||
### 2. Batch Property Retrieval
|
||||
|
||||
Get properties for multiple compounds:
|
||||
```python
|
||||
compound_names = ['aspirin', 'ibuprofen', 'paracetamol']
|
||||
properties = []
|
||||
|
||||
for name in compound_names:
|
||||
props = pcp.get_properties(
|
||||
['MolecularFormula', 'MolecularWeight', 'XLogP'],
|
||||
name,
|
||||
'name'
|
||||
)
|
||||
properties.extend(props)
|
||||
|
||||
import pandas as pd
|
||||
df = pd.DataFrame(properties)
|
||||
```
|
||||
|
||||
### 3. Finding Similar Compounds
|
||||
|
||||
Find structurally similar compounds to a query:
|
||||
```python
|
||||
# Start with a known compound
|
||||
query_compound = pcp.get_compounds('gefitinib', 'name')[0]
|
||||
query_smiles = query_compound.canonical_smiles
|
||||
|
||||
# Perform similarity search
|
||||
similar = pcp.get_compounds(
|
||||
query_smiles,
|
||||
'smiles',
|
||||
searchtype='similarity',
|
||||
Threshold=85
|
||||
)
|
||||
|
||||
# Get properties for similar compounds
|
||||
for compound in similar[:10]: # First 10 results
|
||||
print(f"{compound.cid}: {compound.iupac_name}, MW: {compound.molecular_weight}")
|
||||
```
|
||||
|
||||
### 4. Substructure Screening
|
||||
|
||||
Find all compounds containing a specific substructure:
|
||||
```python
|
||||
# Search for compounds containing pyridine ring
|
||||
pyridine_smiles = 'c1ccncc1'
|
||||
|
||||
matches = pcp.get_compounds(
|
||||
pyridine_smiles,
|
||||
'smiles',
|
||||
searchtype='substructure',
|
||||
MaxRecords=100
|
||||
)
|
||||
|
||||
print(f"Found {len(matches)} compounds containing pyridine")
|
||||
```
|
||||
|
||||
### 5. Bioactivity Data Retrieval
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
cid = 2244 # Aspirin
|
||||
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/assaysummary/JSON"
|
||||
|
||||
response = requests.get(url)
|
||||
if response.status_code == 200:
|
||||
bioassay_data = response.json()
|
||||
# Process bioassay information
|
||||
```
|
||||
|
||||
## Tips and Best Practices
|
||||
|
||||
1. **Use CIDs for repeated queries**: CIDs are more efficient than names or structures
|
||||
2. **Cache results**: Store frequently accessed data locally
|
||||
3. **Batch requests**: Combine multiple queries when possible
|
||||
4. **Handle rate limits**: PubChem allows at most 5 requests per second, so add a short delay (about 0.2 s) between consecutive requests
|
||||
5. **Use appropriate search types**: Similarity for related compounds, substructure for motif finding
|
||||
6. **Leverage PubChemPy**: Higher-level abstraction simplifies common tasks
|
||||
7. **Handle missing data**: Not all properties are available for all compounds
|
||||
8. **Use the asynchronous pattern**: Prefer it for large similarity/substructure searches, which can take too long for a single blocking request
|
||||
9. **Specify output format**: Choose JSON for programmatic access, SDF for cheminformatics tools
|
||||
10. **Read documentation**: Full PUG-REST documentation available at https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- PubChem Home: https://pubchem.ncbi.nlm.nih.gov/
|
||||
- PUG-REST Documentation: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest
|
||||
- PUG-REST Tutorial: https://pubchem.ncbi.nlm.nih.gov/docs/pug-rest-tutorial
|
||||
- PubChemPy Documentation: https://pubchempy.readthedocs.io/
|
||||
- PubChemPy GitHub: https://github.com/mcs07/PubChemPy
|
||||
- IUPAC Tutorial: https://iupac.github.io/WFChemCookbook/datasources/pubchem_pugrest.html
|
||||
367
skills/pubchem-database/scripts/bioactivity_query.py
Normal file
367
skills/pubchem-database/scripts/bioactivity_query.py
Normal file
@@ -0,0 +1,367 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PubChem Bioactivity Data Retrieval
|
||||
|
||||
This script provides functions for retrieving biological activity data
|
||||
from PubChem for compounds and assays.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
print("Error: requests is not installed. Install it with: pip install requests")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Root URL of the PUG-REST service; the compound and assay endpoints in this script are built on it.
BASE_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
|
||||
# Root URL of the PUG-View service; get_compound_annotations builds its annotation URLs on it.
PUG_VIEW_URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
|
||||
|
||||
# Rate limiting: 5 requests per second maximum
|
||||
# Pause applied before every call in rate_limited_request; 0.21 s keeps sequential calls just under 5 per second.
REQUEST_DELAY = 0.21  # seconds between requests
|
||||
|
||||
|
||||
def rate_limited_request(url: str, method: str = 'GET', **kwargs) -> Optional[requests.Response]:
    """
    Make a rate-limited request to PubChem API.

    Sleeps for REQUEST_DELAY seconds before sending, so sequential calls
    made through this helper stay under the documented request rate.

    Args:
        url: Request URL
        method: HTTP method ('GET' or 'POST'). Matching is case-insensitive;
            any value other than 'GET' is sent as a POST.
        **kwargs: Additional arguments for requests. When no ``timeout`` is
            supplied, a default of 30 seconds is used.

    Returns:
        Response object or None on error (connection failure, timeout, or
        an HTTP 4xx/5xx status)
    """
    time.sleep(REQUEST_DELAY)

    # requests waits forever unless told otherwise; a stalled connection would
    # hang the whole script. Callers can still pass their own timeout.
    kwargs.setdefault('timeout', 30)

    send = requests.get if method.upper() == 'GET' else requests.post

    try:
        response = send(url, **kwargs)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None
|
||||
|
||||
|
||||
def get_bioassay_summary(cid: int) -> Optional[Dict]:
    """
    Fetch the assay summary table PubChem holds for one compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Parsed JSON of the assay summary, or None when the request fails
        or does not answer with HTTP 200
    """
    response = rate_limited_request(f"{BASE_URL}/compound/cid/{cid}/assaysummary/JSON")

    # Guard clause: anything other than a successful 200 yields no data.
    if not response or response.status_code != 200:
        return None
    return response.json()
|
||||
|
||||
|
||||
def get_compound_bioactivities(
    cid: int,
    activity_outcome: Optional[str] = None
) -> List[Dict]:
    """
    Get bioactivity data for a compound.

    Each row of the assay summary table is turned into a dictionary keyed by
    the table's column names.

    Args:
        cid: PubChem Compound ID
        activity_outcome: Filter by activity ('active', 'inactive', 'inconclusive').
            Compared case-insensitively against the 'Activity Outcome' column;
            None or an empty string disables filtering.

    Returns:
        List of bioactivity records (empty when no summary is available)
    """
    data = get_bioassay_summary(cid)

    if not data:
        return []

    table = data.get('Table', {})
    # Look the header up once; a missing header gives empty records instead
    # of a KeyError.
    columns = table.get('Columns', {}).get('Column', [])
    wanted = activity_outcome.lower() if activity_outcome else None

    activities = []
    for row in table.get('Row', []):
        # zip stops at the shorter sequence, so a row with more cells than
        # there are columns cannot raise IndexError.
        activity = dict(zip(columns, row.get('Cell', [])))

        # str() guards against a non-string cell in the outcome column.
        if wanted is None or str(activity.get('Activity Outcome', '')).lower() == wanted:
            activities.append(activity)

    return activities
|
||||
|
||||
|
||||
def get_assay_description(aid: int) -> Optional[Dict]:
    """
    Fetch the description record of a single assay.

    Args:
        aid: PubChem Assay ID (AID)

    Returns:
        Parsed JSON of the assay description, or None when the request
        fails or does not answer with HTTP 200
    """
    response = rate_limited_request(f"{BASE_URL}/assay/aid/{aid}/description/JSON")

    if not response or response.status_code != 200:
        return None
    return response.json()
|
||||
|
||||
|
||||
def get_assay_targets(aid: int) -> List[str]:
    """
    Get biological targets for an assay.

    Args:
        aid: PubChem Assay ID

    Returns:
        List of target names. A target without a name is reported as
        "GI:<mol_id>"; targets with neither field are skipped. Empty when
        the description is unavailable or lists no targets.
    """
    description = get_assay_description(aid)

    if not description:
        return []

    # `or [{}]` also covers a present-but-empty container list, which would
    # otherwise raise IndexError on [0].
    containers = description.get('PC_AssayContainer') or [{}]
    descr = containers[0].get('assay', {}).get('descr', {})

    targets = []
    for target in descr.get('target', []):
        name = target.get('name', '')
        mol_id = target.get('mol_id', '')
        if name:
            targets.append(name)
        elif mol_id:
            targets.append(f"GI:{mol_id}")

    return targets
|
||||
|
||||
|
||||
def search_assays_by_target(
    target_name: str,
    max_results: int = 100,
    target_type: str = 'genesymbol'
) -> List[int]:
    """
    Search for assays targeting a specific protein or gene.

    Args:
        target_name: Name of the target (e.g., 'EGFR', 'p53')
        max_results: Maximum number of results
        target_type: How PUG-REST should interpret ``target_name``: one of
            'genesymbol', 'proteinname', 'geneid', 'gi' or 'accession'.
            Defaults to 'genesymbol'.

    Returns:
        List of Assay IDs (AIDs); empty when the request fails
    """
    from urllib.parse import quote

    # PUG-REST's assay target input is target/<type>/<identifier>; the type
    # segment is mandatory. The identifier is escaped so names containing
    # spaces or slashes still form a valid path.
    encoded_name = quote(str(target_name), safe='')
    url = f"{BASE_URL}/assay/target/{target_type}/{encoded_name}/aids/JSON"
    response = rate_limited_request(url)

    if response and response.status_code == 200:
        data = response.json()
        aids = data.get('IdentifierList', {}).get('AID', [])
        return aids[:max_results]
    return []
|
||||
|
||||
|
||||
def get_active_compounds_in_assay(aid: int, max_results: int = 1000) -> List[int]:
    """
    Get list of active compounds in an assay.

    Args:
        aid: PubChem Assay ID
        max_results: Maximum number of results

    Returns:
        List of Compound IDs (CIDs) that showed activity; empty when the
        request fails or no CIDs are reported
    """
    url = f"{BASE_URL}/assay/aid/{aid}/cids/JSON?cids_type=active"
    response = rate_limited_request(url)

    if not (response and response.status_code == 200):
        return []

    data = response.json()

    # Flat answer: {"IdentifierList": {"CID": [...]}}
    cids = list(data.get('IdentifierList', {}).get('CID', []))

    # Grouped answer: {"InformationList": {"Information": [{"AID": ..., "CID": [...]}]}}
    # Reading only the flat shape would silently report no active compounds.
    if not cids:
        for info in data.get('InformationList', {}).get('Information', []):
            cids.extend(info.get('CID', []))

    return cids[:max_results]
|
||||
|
||||
|
||||
def get_compound_annotations(cid: int, section: Optional[str] = None) -> Optional[Dict]:
    """
    Get comprehensive compound annotations from PUG-View.

    Args:
        cid: PubChem Compound ID
        section: Specific section to retrieve (e.g., 'Pharmacology and Biochemistry').
            When omitted, the full record is requested.

    Returns:
        Dictionary containing annotation data, or None when the request
        fails or does not answer with HTTP 200
    """
    from urllib.parse import quote

    url = f"{PUG_VIEW_URL}/data/compound/{cid}/JSON"

    if section:
        # Headings contain spaces and may contain '&' or '#'; escape them so
        # the heading arrives as one query value.
        url += f"?heading={quote(section, safe='')}"

    response = rate_limited_request(url)

    if response and response.status_code == 200:
        return response.json()
    return None
|
||||
|
||||
|
||||
def get_drug_information(cid: int) -> Optional[Dict]:
    """
    Fetch the "Drug and Medication Information" annotations of a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Whatever get_compound_annotations returns for that section
    """
    drug_section = "Drug and Medication Information"
    return get_compound_annotations(cid, section=drug_section)
|
||||
|
||||
|
||||
def get_safety_hazards(cid: int) -> Optional[Dict]:
    """
    Fetch the "Safety and Hazards" annotations of a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Whatever get_compound_annotations returns for that section
    """
    safety_section = "Safety and Hazards"
    return get_compound_annotations(cid, section=safety_section)
|
||||
|
||||
|
||||
def summarize_bioactivities(cid: int) -> Dict:
    """
    Generate a summary of bioactivity data for a compound.

    Args:
        cid: PubChem Compound ID

    Returns:
        Dictionary with bioactivity summary statistics: 'total_assays' plus
        the counters 'active', 'inactive', 'inconclusive' and 'unspecified'.
        'assay_types' is part of the result shape but is not filled in by
        this function.
    """
    activities = get_compound_bioactivities(cid)

    summary = {
        'total_assays': len(activities),
        'active': 0,
        'inactive': 0,
        'inconclusive': 0,
        'unspecified': 0,
        'assay_types': {}
    }

    for activity in activities:
        outcome = str(activity.get('Activity Outcome', '')).lower()

        # "inactive" contains the substring "active", so it must be tested
        # first; otherwise every inactive result is counted as active.
        if 'inactive' in outcome:
            summary['inactive'] += 1
        elif 'inconclusive' in outcome:
            summary['inconclusive'] += 1
        elif 'active' in outcome:
            summary['active'] += 1
        else:
            summary['unspecified'] += 1

    return summary
|
||||
|
||||
|
||||
def find_compounds_by_bioactivity(
    target: str,
    threshold: Optional[float] = None,
    max_compounds: int = 100
) -> List[Dict]:
    """
    Collect compounds reported active in assays against a target.

    Args:
        target: Target name (e.g., 'EGFR')
        threshold: Activity threshold (accepted but not used by this function)
        max_compounds: Maximum number of compounds to return

    Returns:
        List of dictionaries with 'cid', 'aid' and 'target' keys, one per
        distinct compound, in the order they were first encountered
    """
    # Step 1: look up assays for the target
    aids = search_assays_by_target(target, max_results=10)
    if not aids:
        print(f"No assays found for target: {target}")
        return []

    # Step 2: walk the first five assays, keeping each compound only once
    seen_cids = set()
    hits = []

    for aid in aids[:5]:
        for cid in get_active_compounds_in_assay(aid, max_results=max_compounds):
            if cid in seen_cids or len(hits) >= max_compounds:
                continue
            seen_cids.add(cid)
            hits.append({
                'cid': cid,
                'aid': aid,
                'target': target
            })

        # Stop querying further assays once the quota is filled.
        if len(hits) >= max_compounds:
            break

    return hits
|
||||
|
||||
|
||||
def main():
    """Run four demonstration queries against PubChem and print the results."""

    # Example 1: outcome counts for aspirin (CID 2244)
    print("Example 1: Getting bioassay summary for aspirin (CID 2244)...")
    aspirin_summary = summarize_bioactivities(2244)
    print(json.dumps(aspirin_summary, indent=2))

    # Example 2: only the records flagged active
    print("\nExample 2: Getting active bioactivities for aspirin...")
    active_hits = get_compound_bioactivities(2244, activity_outcome='active')
    print(f"Found {len(active_hits)} active bioactivities")
    if active_hits:
        print(f"First activity: {active_hits[0].get('Assay Name', 'N/A')}")

    # Example 3: targets of the first active assay, when there is one
    print("\nExample 3: Getting assay description...")
    if active_hits:
        first_aid = active_hits[0].get('AID', 0)
        target_names = get_assay_targets(first_aid)
        target_text = ', '.join(target_names) if target_names else 'N/A'
        print(f"Assay {first_aid} targets: {target_text}")

    # Example 4: compounds active against EGFR
    print("\nExample 4: Searching for EGFR inhibitors...")
    egfr_hits = find_compounds_by_bioactivity('EGFR', max_compounds=5)
    print(f"Found {len(egfr_hits)} compounds with EGFR activity")
    for hit in egfr_hits[:5]:
        print(f" CID {hit['cid']} (from AID {hit['aid']})")


if __name__ == '__main__':
    main()
|
||||
297
skills/pubchem-database/scripts/compound_search.py
Normal file
297
skills/pubchem-database/scripts/compound_search.py
Normal file
@@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PubChem Compound Search Utility
|
||||
|
||||
This script provides functions for searching and retrieving compound information
|
||||
from PubChem using the PubChemPy library.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
from typing import List, Dict, Optional, Union
|
||||
|
||||
try:
|
||||
import pubchempy as pcp
|
||||
except ImportError:
|
||||
print("Error: pubchempy is not installed. Install it with: pip install pubchempy")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def search_by_name(name: str, max_results: int = 10) -> List[pcp.Compound]:
    """
    Look up compounds whose name matches the query.

    Args:
        name: Chemical name to search for
        max_results: Maximum number of results to return

    Returns:
        Up to ``max_results`` Compound objects; an empty list when the
        lookup raises
    """
    try:
        return pcp.get_compounds(name, 'name')[:max_results]
    except Exception as exc:
        print(f"Error searching for '{name}': {exc}")
        return []
|
||||
|
||||
|
||||
def search_by_smiles(smiles: str) -> Optional[pcp.Compound]:
    """
    Look up the compound matching a SMILES string.

    Args:
        smiles: SMILES string

    Returns:
        The first matching Compound, or None when nothing matches or the
        lookup raises
    """
    try:
        matches = pcp.get_compounds(smiles, 'smiles')
        if not matches:
            return None
        return matches[0]
    except Exception as exc:
        print(f"Error searching for SMILES '{smiles}': {exc}")
        return None
|
||||
|
||||
|
||||
def get_compound_by_cid(cid: int) -> Optional[pcp.Compound]:
    """
    Load a compound record directly from its CID (Compound ID).

    Args:
        cid: PubChem Compound ID

    Returns:
        Compound object, or None when the lookup raises
    """
    try:
        compound = pcp.Compound.from_cid(cid)
    except Exception as exc:
        print(f"Error retrieving CID {cid}: {exc}")
        return None
    return compound
|
||||
|
||||
|
||||
def get_compound_properties(
    identifier: Union[str, int],
    namespace: str = 'name',
    properties: Optional[List[str]] = None
) -> Dict:
    """
    Retrieve a set of properties for one compound.

    Args:
        identifier: Compound identifier (name, SMILES, CID, etc.)
        namespace: Type of identifier ('name', 'smiles', 'cid', 'inchi', etc.)
        properties: Property names to request. None selects a default set of
            common descriptors.

    Returns:
        Property dictionary of the first match; an empty dict when nothing
        is returned or the lookup raises
    """
    if properties is not None:
        requested = properties
    else:
        requested = [
            'MolecularFormula',
            'MolecularWeight',
            'CanonicalSMILES',
            'IUPACName',
            'XLogP',
            'TPSA',
            'HBondDonorCount',
            'HBondAcceptorCount'
        ]

    try:
        rows = pcp.get_properties(requested, identifier, namespace)
        if rows:
            return rows[0]
        return {}
    except Exception as exc:
        print(f"Error getting properties for '{identifier}': {exc}")
        return {}
|
||||
|
||||
|
||||
def similarity_search(
    smiles: str,
    threshold: int = 90,
    max_records: int = 10
) -> List[pcp.Compound]:
    """
    Find compounds structurally similar to a query structure.

    Args:
        smiles: Query SMILES string
        threshold: Similarity threshold (0-100)
        max_records: Maximum number of results

    Returns:
        List of similar Compound objects; an empty list when the search raises
    """
    search_options = {
        'searchtype': 'similarity',
        'Threshold': threshold,
        'MaxRecords': max_records,
    }
    try:
        return pcp.get_compounds(smiles, 'smiles', **search_options)
    except Exception as exc:
        print(f"Error in similarity search: {exc}")
        return []
|
||||
|
||||
|
||||
def substructure_search(
    smiles: str,
    max_records: int = 100
) -> List[pcp.Compound]:
    """
    Find compounds that contain the query structure as a substructure.

    Args:
        smiles: Query SMILES string (substructure)
        max_records: Maximum number of results

    Returns:
        List of Compound objects containing the substructure; an empty list
        when the search raises
    """
    search_options = {
        'searchtype': 'substructure',
        'MaxRecords': max_records,
    }
    try:
        return pcp.get_compounds(smiles, 'smiles', **search_options)
    except Exception as exc:
        print(f"Error in substructure search: {exc}")
        return []
|
||||
|
||||
|
||||
def get_synonyms(identifier: Union[str, int], namespace: str = 'name') -> List[str]:
    """
    List the synonyms recorded for a compound.

    Args:
        identifier: Compound identifier
        namespace: Type of identifier

    Returns:
        Synonym strings of the first match; an empty list when there is no
        match or the lookup raises
    """
    try:
        records = pcp.get_synonyms(identifier, namespace)
        return records[0].get('Synonym', []) if records else []
    except Exception as exc:
        print(f"Error getting synonyms: {exc}")
        return []
|
||||
|
||||
|
||||
def batch_search(
    identifiers: List[str],
    namespace: str = 'name',
    properties: Optional[List[str]] = None
) -> List[Dict]:
    """
    Retrieve properties for several compounds, one lookup per identifier.

    Args:
        identifiers: List of compound identifiers
        namespace: Type of identifiers
        properties: List of properties to retrieve

    Returns:
        One property dictionary per identifier that produced a result, each
        tagged with the originating identifier under the 'query' key
    """
    collected = []
    for ident in identifiers:
        record = get_compound_properties(ident, namespace, properties)
        if not record:
            # Nothing came back for this identifier; leave it out.
            continue
        record['query'] = ident
        collected.append(record)
    return collected
|
||||
|
||||
|
||||
def download_structure(
    identifier: Union[str, int],
    namespace: str = 'name',
    format: str = 'SDF',
    filename: Optional[str] = None
) -> Optional[Union[str, bytes]]:
    """
    Download compound structure in specified format.

    Args:
        identifier: Compound identifier
        namespace: Type of identifier
        format: Output format ('SDF', 'JSON', 'PNG', etc.)
        filename: Output filename (if None, returns data as string)

    Returns:
        None when ``filename`` is given (the data is written to that file,
        overwriting any existing one) or when the download fails. Otherwise
        the downloaded data: text for textual formats, raw bytes when the
        payload is not valid UTF-8 (e.g. 'PNG').
    """
    try:
        if filename:
            # pubchempy.download takes (outformat, path, identifier, namespace);
            # the output path comes second, not last.
            pcp.download(format, filename, identifier, namespace, overwrite=True)
            return None

        # pubchempy.download always writes to a path and returns nothing, so
        # the in-memory case fetches the raw payload with pubchempy.get.
        payload = pcp.get(identifier, namespace, output=format)
    except Exception as e:
        print(f"Error downloading structure: {e}")
        return None

    if isinstance(payload, bytes):
        try:
            return payload.decode('utf-8')
        except UnicodeDecodeError:
            # Binary formats cannot be represented as text; hand back the bytes.
            return payload
    return payload
|
||||
|
||||
|
||||
def print_compound_info(compound: pcp.Compound) -> None:
    """
    Print formatted compound information.

    Missing values (None or an empty string) are shown as 'N/A'. Numeric
    zeros are real values and are printed as-is.

    Args:
        compound: PubChemPy Compound object
    """
    def shown(value):
        # `value or 'N/A'` would hide legitimate zeros such as 0 H-bond
        # donors or an XLogP of 0, so only treat None/'' as missing.
        return 'N/A' if value is None or value == '' else value

    rule = '=' * 60

    print(f"\n{rule}")
    print(f"Compound CID: {compound.cid}")
    print(f"{rule}")
    print(f"IUPAC Name: {shown(compound.iupac_name)}")
    print(f"Molecular Formula: {shown(compound.molecular_formula)}")
    print(f"Molecular Weight: {shown(compound.molecular_weight)} g/mol")
    print(f"Canonical SMILES: {shown(compound.canonical_smiles)}")
    print(f"InChI: {shown(compound.inchi)}")
    print(f"InChI Key: {shown(compound.inchikey)}")
    print(f"XLogP: {shown(compound.xlogp)}")
    print(f"TPSA: {shown(compound.tpsa)} Ų")
    print(f"H-Bond Donors: {shown(compound.h_bond_donor_count)}")
    print(f"H-Bond Acceptors: {shown(compound.h_bond_acceptor_count)}")
    print(f"{rule}\n")
|
||||
|
||||
|
||||
def main():
    """Run four demonstration lookups against PubChem and print the results."""

    # Example 1: name lookup
    print("Example 1: Searching for 'aspirin'...")
    aspirin_hits = search_by_name('aspirin', max_results=1)
    if aspirin_hits:
        print_compound_info(aspirin_hits[0])

    # Example 2: default property set for one compound
    print("\nExample 2: Getting properties for caffeine...")
    caffeine_props = get_compound_properties('caffeine', 'name')
    print(json.dumps(caffeine_props, indent=2))

    # Example 3: neighbours of benzene by structural similarity
    print("\nExample 3: Finding compounds similar to benzene...")
    benzene_smiles = 'c1ccccc1'
    neighbours = similarity_search(benzene_smiles, threshold=95, max_records=5)
    print(f"Found {len(neighbours)} similar compounds:")
    for neighbour in neighbours:
        print(f" CID {neighbour.cid}: {neighbour.iupac_name or 'N/A'}")

    # Example 4: the same two properties for several compounds
    print("\nExample 4: Batch search for multiple compounds...")
    drug_names = ['aspirin', 'ibuprofen', 'paracetamol']
    rows = batch_search(drug_names, properties=['MolecularFormula', 'MolecularWeight'])
    for row in rows:
        print(f" {row.get('query')}: {row.get('MolecularFormula')} "
              f"({row.get('MolecularWeight')} g/mol)")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user