279 lines
7.0 KiB
Python
279 lines
7.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
ChEMBL Database Query Examples

This script demonstrates common query patterns for the ChEMBL database
using the chembl_webresource_client Python library.

Requirements:
    pip install chembl_webresource_client
    pip install pandas (optional, for data manipulation)
"""
|
|
|
|
from chembl_webresource_client.new_client import new_client
|
|
|
|
|
|
def get_molecule_info(chembl_id):
    """
    Look up a single molecule record by its ChEMBL ID.

    Args:
        chembl_id: ChEMBL identifier (e.g., 'CHEMBL25')

    Returns:
        The record returned by ``new_client.molecule.get`` for that ID
        (a dictionary of molecule information)
    """
    return new_client.molecule.get(chembl_id)
|
|
|
|
|
|
def search_molecules_by_name(name_pattern):
    """
    Find molecules whose preferred name contains the given text.

    The match uses the ``pref_name__icontains`` filter of the molecule
    resource.

    Args:
        name_pattern: Name or pattern to search for

    Returns:
        List of matching molecules
    """
    matches = new_client.molecule.filter(pref_name__icontains=name_pattern)
    # Materialise the query result so callers receive a plain list.
    return list(matches)
|
|
|
|
|
|
def find_molecules_by_properties(max_mw=500, min_logp=None, max_logp=None):
    """
    Select molecules by physicochemical property limits.

    The molecular-weight ceiling is always applied; each LogP bound is
    added to the query only when a value other than None is supplied.

    Args:
        max_mw: Maximum molecular weight
        min_logp: Minimum LogP value
        max_logp: Maximum LogP value

    Returns:
        List of matching molecules
    """
    query = {'molecule_properties__mw_freebase__lte': max_mw}

    # Optional bounds, paired with the filter key each one maps to.
    optional_bounds = (
        ('molecule_properties__alogp__gte', min_logp),
        ('molecule_properties__alogp__lte', max_logp),
    )
    query.update(
        {key: bound for key, bound in optional_bounds if bound is not None}
    )

    return list(new_client.molecule.filter(**query))
|
|
|
|
|
|
def get_target_info(target_chembl_id):
    """
    Look up a single biological target record by its ChEMBL ID.

    Args:
        target_chembl_id: ChEMBL target identifier (e.g., 'CHEMBL240')

    Returns:
        The record returned by ``new_client.target.get`` for that ID
        (a dictionary of target information)
    """
    return new_client.target.get(target_chembl_id)
|
|
|
|
|
|
def search_targets_by_name(target_name):
    """
    Find single-protein targets whose preferred name contains a keyword.

    Only targets with ``target_type`` equal to 'SINGLE PROTEIN' are
    considered.

    Args:
        target_name: Target name or keyword (e.g., 'kinase', 'EGFR')

    Returns:
        List of matching targets
    """
    criteria = {
        'target_type': 'SINGLE PROTEIN',
        'pref_name__icontains': target_name,
    }
    return list(new_client.target.filter(**criteria))
|
|
|
|
|
|
def get_bioactivity_data(target_chembl_id, activity_type='IC50', max_value=100):
    """
    Fetch activity records for one target, limited by type and potency.

    Records are restricted to those reported in nM whose standard value
    does not exceed ``max_value``.

    Args:
        target_chembl_id: ChEMBL target identifier
        activity_type: Type of activity (IC50, Ki, EC50, etc.)
        max_value: Maximum activity value in nM

    Returns:
        List of activity records
    """
    criteria = {
        'target_chembl_id': target_chembl_id,
        'standard_type': activity_type,
        'standard_value__lte': max_value,
        'standard_units': 'nM',
    }
    return list(new_client.activity.filter(**criteria))
|
|
|
|
|
|
def find_similar_compounds(smiles, similarity_threshold=85):
    """
    Run a similarity search against a query structure.

    Args:
        smiles: SMILES string of query molecule
        similarity_threshold: Minimum similarity percentage (0-100)

    Returns:
        List of similar compounds
    """
    hits = new_client.similarity.filter(
        smiles=smiles,
        similarity=similarity_threshold,
    )
    return list(hits)
|
|
|
|
|
|
def substructure_search(smiles):
    """
    Run a substructure search for the given fragment.

    Args:
        smiles: SMILES string of substructure

    Returns:
        List of compounds containing the substructure
    """
    hits = new_client.substructure.filter(smiles=smiles)
    return list(hits)
|
|
|
|
|
|
def get_drug_info(molecule_chembl_id):
    """
    Retrieve drug information including indications and mechanisms.

    The drug lookup is best-effort: if it fails, ``drug_info`` is None
    and the mechanism and indication queries still run.

    Args:
        molecule_chembl_id: ChEMBL molecule identifier

    Returns:
        Tuple of (drug_info, mechanisms, indications), where drug_info is
        the drug record or None, and the other two are lists
    """
    drug = new_client.drug
    mechanism = new_client.mechanism
    drug_indication = new_client.drug_indication

    try:
        drug_info = drug.get(molecule_chembl_id)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit still propagate.
        drug_info = None

    mechanisms = list(mechanism.filter(molecule_chembl_id=molecule_chembl_id))
    indications = list(
        drug_indication.filter(molecule_chembl_id=molecule_chembl_id)
    )

    return drug_info, mechanisms, indications
|
|
|
|
|
|
def find_kinase_inhibitors(max_ic50=100, max_targets=10):
    """
    Find potent kinase inhibitors.

    Single-protein targets whose preferred name contains 'kinase' are
    looked up first. IC50 activities reported in nM are then fetched for
    the first ``max_targets`` of them.

    Args:
        max_ic50: Maximum IC50 value in nM
        max_targets: How many kinase targets to query activities for
            (defaults to 10, the previously hard-coded limit)

    Returns:
        List of kinase inhibitor activities; empty when no kinase target
        was found
    """
    target = new_client.target
    activity = new_client.activity

    # Find kinase targets
    kinase_targets = target.filter(
        target_type='SINGLE PROTEIN',
        pref_name__icontains='kinase',
    )

    # Keep only the IDs of the first `max_targets` targets.
    target_ids = [
        t['target_chembl_id'] for t in kinase_targets[:max_targets]
    ]
    if not target_ids:
        # With no targets there is nothing to look up; avoid sending an
        # `__in` filter with an empty ID list.
        return []

    # Find activities
    results = activity.filter(
        target_chembl_id__in=target_ids,
        standard_type='IC50',
        standard_value__lte=max_ic50,
        standard_units='nM',
    )

    return list(results)
|
|
|
|
|
|
def get_compound_bioactivities(molecule_chembl_id):
    """
    Fetch the activity records for one compound that carry a pChEMBL value.

    Records are filtered with ``pchembl_value__isnull=False``.

    Args:
        molecule_chembl_id: ChEMBL molecule identifier

    Returns:
        List of activity records for the compound
    """
    criteria = {
        'molecule_chembl_id': molecule_chembl_id,
        'pchembl_value__isnull': False,
    }
    return list(new_client.activity.filter(**criteria))
|
|
|
|
|
|
def export_to_dataframe(data):
    """
    Convert ChEMBL data to pandas DataFrame (requires pandas).

    pandas is imported lazily so the rest of the module works without it.

    Args:
        data: List of ChEMBL records

    Returns:
        pandas DataFrame built from ``data``, or None (after printing an
        install hint) when pandas cannot be imported
    """
    # Only the import is guarded, so an error raised while building the
    # frame is not misreported as "pandas not installed".
    try:
        import pandas as pd
    except ImportError:
        print("pandas not installed. Install with: pip install pandas")
        return None

    return pd.DataFrame(data)
|
|
|
|
|
|
# Example usage
|
|
if __name__ == "__main__":
    # Runs three live queries against ChEMBL and prints a short summary
    # of each.
    print("ChEMBL Database Query Examples")
    print("=" * 50)

    # Example 1: Get information about aspirin
    print("\n1. Getting information about aspirin (CHEMBL25)...")
    aspirin = get_molecule_info('CHEMBL25')
    print(f"Name: {aspirin.get('pref_name')}")
    # `or {}` also covers a record whose 'molecule_properties' key is
    # present but None, which a .get() default alone would not handle.
    properties = aspirin.get('molecule_properties') or {}
    print(f"Formula: {properties.get('full_molformula')}")

    # Example 2: Search for EGFR inhibitors
    print("\n2. Searching for EGFR targets...")
    egfr_targets = search_targets_by_name('EGFR')
    if egfr_targets:
        print(f"Found {len(egfr_targets)} EGFR-related targets")
        print(f"First target: {egfr_targets[0]['pref_name']}")

    # Example 3: Find potent activities for a target
    print("\n3. Finding potent compounds for EGFR (CHEMBL203)...")
    activities = get_bioactivity_data('CHEMBL203', 'IC50', max_value=10)
    print(f"Found {len(activities)} compounds with IC50 <= 10 nM")

    print("\n" + "=" * 50)
    print("Examples completed successfully!")
|