405 lines
11 KiB
Python
405 lines
11 KiB
Python
#!/usr/bin/env python3
"""
TDC Molecular Generation with Oracles Template

This script demonstrates how to use TDC oracles for molecular generation
tasks, including goal-directed generation and distribution learning.

Usage:
    python molecular_generation.py
"""
|
|
|
|
from tdc.generation import MolGen
|
|
from tdc import Oracle
|
|
import numpy as np
|
|
|
|
|
|
def load_generation_dataset():
    """
    Load the ChEMBL_V29 molecular generation dataset and preview it.

    Prints a banner, the number of training molecules and the first five
    SMILES strings of the 'train' split.

    Returns:
        list: The SMILES strings of the 'train' split.

    Raises:
        KeyError: If the train split exposes neither a 'Drug' nor a
            'smiles' column.
    """
    print("=" * 60)
    print("Loading Molecular Generation Dataset")
    print("=" * 60)

    # Load ChEMBL dataset
    data = MolGen(name='ChEMBL_V29')

    # Get training molecules
    split = data.get_split()
    train_frame = split['train']

    # 'Drug' is tried first so behaviour is unchanged wherever it already
    # worked.
    # NOTE(review): TDC generation loaders appear to name this column
    # 'smiles' rather than 'Drug' - confirm against the installed PyTDC
    # version. The fallback avoids a KeyError in that case.
    for column in ('Drug', 'smiles'):
        if column in train_frame:
            break
    else:
        raise KeyError(
            "Train split has neither a 'Drug' nor a 'smiles' column"
        )
    train_smiles = train_frame[column].tolist()

    print(f"\nDataset: ChEMBL_V29")
    print(f"Training molecules: {len(train_smiles)}")

    # Display sample molecules
    print("\nSample SMILES:")
    for i, smiles in enumerate(train_smiles[:5], 1):
        print(f" {i}. {smiles}")

    return train_smiles
|
|
|
|
|
|
def single_oracle_example():
    """
    Example: Using a single oracle for molecular evaluation

    Creates a GSK3B oracle, scores four reference drug molecules one at a
    time and prints each SMILES string together with its score.

    Returns:
        None. Results are printed.
    """
    print("\n" + "=" * 60)
    print("Example 1: Single Oracle Evaluation")
    print("=" * 60)

    # Initialize oracle for GSK3B target
    oracle = Oracle(name='GSK3B')

    # Test molecules.
    # The entry labelled "Theophylline" used to be a second SMILES spelling
    # of caffeine (all three N-methyl groups present), so caffeine was
    # scored twice under a wrong label. It now holds 1,3-dimethylxanthine.
    test_molecules = [
        'CC(C)Cc1ccc(cc1)C(C)C(O)=O',    # Ibuprofen
        'CC(=O)Oc1ccccc1C(=O)O',         # Aspirin
        'Cn1c(=O)c2c(ncn2C)n(C)c1=O',    # Caffeine
        'CN1C2=C(C(=O)N(C1=O)C)NC=N2',   # Theophylline
    ]

    print("\nEvaluating molecules with GSK3B oracle:")
    print("-" * 60)

    # One oracle call per molecule; the score is formatted as a float.
    for smiles in test_molecules:
        score = oracle(smiles)
        print(f"SMILES: {smiles}")
        print(f"GSK3B score: {score:.4f}\n")
|
|
|
|
|
|
def multiple_oracles_example():
    """
    Example: Using multiple oracles for multi-objective optimization

    Scores one molecule with the QED, SA, GSK3B and LogP oracles, prints
    each score, then prints a weighted combination of the four together
    with the weights used.

    Returns:
        None. Results are printed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 2: Multiple Oracles (Multi-Objective)")
    print(rule)

    # Oracles are created in this order:
    #   QED   - drug-likeness
    #   SA    - synthetic accessibility
    #   GSK3B - target binding
    #   LogP  - lipophilicity
    oracle_names = ('QED', 'SA', 'GSK3B', 'LogP')
    oracles = {key: Oracle(name=key) for key in oracle_names}

    # Molecule under evaluation
    test_smiles = 'CC(C)Cc1ccc(cc1)C(C)C(O)=O'

    print(f"\nEvaluating: {test_smiles}")
    print("-" * 60)

    # Evaluate and report one property at a time.
    scores = {}
    for key, scorer in oracles.items():
        value = scorer(test_smiles)
        scores[key] = value
        print(f"{key:10s}: {value:.4f}")

    # Multi-objective score (weighted combination)
    print("\n--- Multi-Objective Scoring ---")

    weights = {'QED': 0.3, 'SA': 0.2, 'GSK3B': 0.4, 'LogP': 0.1}

    # SA is "lower is better", so it is inverted before being maximized.
    inverted_sa = 1.0 / (1.0 + scores['SA'])
    # LogP is scaled by 5.0 before weighting.
    scaled_logp = scores['LogP'] / 5.0

    qed_term = weights['QED'] * scores['QED']
    sa_term = weights['SA'] * inverted_sa
    gsk3b_term = weights['GSK3B'] * scores['GSK3B']
    logp_term = weights['LogP'] * scaled_logp
    multi_score = qed_term + sa_term + gsk3b_term + logp_term

    print(f"Multi-objective score: {multi_score:.4f}")
    print(f"Weights: {weights}")
|
|
|
|
|
|
def batch_evaluation_example():
    """
    Example: Batch evaluation of multiple molecules

    Passes a list of five SMILES strings to the DRD2 oracle in a single
    call, prints one line per molecule, then prints the mean, standard
    deviation, minimum and maximum of the returned scores.

    Returns:
        None. Results are printed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 3: Batch Evaluation")
    print(rule)

    # Sample molecules to score
    molecules = [
        'CC(C)Cc1ccc(cc1)C(C)C(O)=O',
        'CC(=O)Oc1ccccc1C(=O)O',
        'Cn1c(=O)c2c(ncn2C)n(C)c1=O',
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
        'CC(C)NCC(COc1ccc(cc1)COCCOC(C)C)O',
    ]

    # Initialize oracle
    oracle = Oracle(name='DRD2')

    print(f"\nBatch evaluating {len(molecules)} molecules with DRD2 oracle...")

    # The whole list goes through the oracle in one call.
    scores = oracle(molecules)

    print("\nResults:")
    print("-" * 60)
    for idx, smiles in enumerate(molecules):
        # SMILES are cut or padded to 40 characters; the "..." suffix is
        # always printed.
        print(f"{smiles[:40]:40s}... Score: {scores[idx]:.4f}")

    # Summary statistics over the batch
    print("\nStatistics:")
    summaries = (
        (" Mean score", np.mean),
        (" Std score", np.std),
        (" Min score", np.min),
        (" Max score", np.max),
    )
    for label, reducer in summaries:
        print(f"{label}: {reducer(scores):.4f}")
|
|
|
|
|
|
def goal_directed_generation_template():
    """
    Template for goal-directed molecular generation

    Prints a section banner followed by a Python snippet outlining a
    generate / score / select loop. The snippet is only printed, never
    executed.

    Returns:
        None. The template is printed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 4: Goal-Directed Generation Template")
    print(rule)

    # NOTE: the snippet is illustrative. Inside it, `candidates` is only
    # assigned on a commented-out line, so it must be adapted (a model
    # plugged in and those lines uncommented) before it can run.
    template = '''
# Template for goal-directed molecular generation

from tdc.generation import MolGen
from tdc import Oracle
import numpy as np

# 1. Load training data
data = MolGen(name='ChEMBL_V29')
train_smiles = data.get_split()['train']['Drug'].tolist()

# 2. Initialize oracle(s)
oracle = Oracle(name='GSK3B')

# 3. Initialize your generative model
# model = YourGenerativeModel()
# model.fit(train_smiles)

# 4. Generation loop
num_iterations = 100
num_molecules_per_iter = 100
best_molecules = []

for iteration in range(num_iterations):
    # Generate candidate molecules
    # candidates = model.generate(num_molecules_per_iter)

    # Evaluate with oracle
    scores = oracle(candidates)

    # Select top molecules
    top_indices = np.argsort(scores)[-10:]
    top_molecules = [candidates[i] for i in top_indices]
    top_scores = [scores[i] for i in top_indices]

    # Store best molecules
    best_molecules.extend(zip(top_molecules, top_scores))

    # Optional: Fine-tune model on top molecules
    # model.fine_tune(top_molecules)

    # Print progress
    print(f"Iteration {iteration}: Best score = {max(scores):.4f}")

# Sort and display top molecules
best_molecules.sort(key=lambda x: x[1], reverse=True)
print("\\nTop 10 molecules:")
for smiles, score in best_molecules[:10]:
    print(f"{smiles}: {score:.4f}")
'''

    print("\nGoal-Directed Generation Template:")
    print(rule)
    print(template)
|
|
|
|
|
|
def distribution_learning_example(train_smiles):
    """
    Example: Distribution learning evaluation

    Scores up to the first 1000 training molecules with the QED oracle,
    simulates a "generated" set by adding Gaussian noise (standard
    deviation 0.1, clipped to [0, 1]) to those scores, then prints summary
    statistics for both sets and a two-sample Kolmogorov-Smirnov comparison.

    Args:
        train_smiles: Sliceable collection of SMILES strings; only the
            first 1000 entries are used.

    Returns:
        None. Results are printed. When ``train_smiles`` is empty a notice
        is printed and the function returns before creating any oracle.
    """
    print("\n" + "=" * 60)
    print("Example 5: Distribution Learning")
    print("=" * 60)

    # Use subset for demonstration
    train_subset = train_smiles[:1000]

    # With no molecules the statistics below would be NaN and the KS test
    # cannot be computed, so stop early. len() is used rather than
    # truthiness so array-like inputs are handled too.
    if len(train_subset) == 0:
        print("\nNo training molecules provided; skipping distribution comparison.")
        return

    # Initialize oracle
    oracle = Oracle(name='QED')

    print("\nEvaluating property distribution...")

    # Evaluate training set. The scores are converted to a float array so
    # the arithmetic below does not depend on the oracle's container type.
    print("Computing training set distribution...")
    train_scores = np.asarray(oracle(train_subset), dtype=float)

    # Simulate generated molecules (in practice, use your generative model)
    # For demo: add noise to the training scores
    print("Computing generated set distribution...")
    generated_scores = train_scores + np.random.normal(0, 0.1, len(train_scores))
    generated_scores = np.clip(generated_scores, 0, 1)  # QED is [0, 1]

    # Compare distributions
    print("\n--- Distribution Statistics ---")
    print(f"Training set (n={len(train_subset)}):")
    print(f" Mean: {np.mean(train_scores):.4f}")
    print(f" Std: {np.std(train_scores):.4f}")
    print(f" Median: {np.median(train_scores):.4f}")

    print(f"\nGenerated set (n={len(generated_scores)}):")
    print(f" Mean: {np.mean(generated_scores):.4f}")
    print(f" Std: {np.std(generated_scores):.4f}")
    print(f" Median: {np.median(generated_scores):.4f}")

    # Distribution similarity metrics.
    # Imported here so scipy is only required when this example runs.
    from scipy.stats import ks_2samp
    ks_statistic, p_value = ks_2samp(train_scores, generated_scores)

    print(f"\nKolmogorov-Smirnov Test:")
    print(f" KS statistic: {ks_statistic:.4f}")
    print(f" P-value: {p_value:.4f}")

    if p_value > 0.05:
        print(" → Distributions are similar (p > 0.05)")
    else:
        print(" → Distributions are significantly different (p < 0.05)")
|
|
|
|
|
|
def available_oracles_info():
    """
    Display information about available oracles

    Prints a fixed catalogue of oracle names grouped by category, followed
    by a pointer to the detailed oracle reference file.

    Returns:
        None. The catalogue is printed.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 6: Available Oracles")
    print(rule)

    # Category -> oracle names, printed in this order.
    catalogue = {
        'Biochemical Targets': [
            'DRD2', 'GSK3B', 'JNK3', '5HT2A', 'ACE',
            'MAPK', 'CDK', 'P38', 'PARP1', 'PIK3CA',
        ],
        'Physicochemical Properties': [
            'QED', 'SA', 'LogP', 'MW', 'Lipinski',
        ],
        'Composite Metrics': [
            'Isomer_Meta', 'Median1', 'Median2',
            'Rediscovery', 'Similarity', 'Uniqueness', 'Novelty',
        ],
        'Specialized': [
            'ASKCOS', 'Docking', 'Vina',
        ],
    }

    print("\nAvailable Oracle Categories:")
    print("-" * 60)

    for heading in catalogue:
        print(f"\n{heading}:")
        # One bullet per oracle name, each on its own line.
        print("\n".join(f" - {entry}" for entry in catalogue[heading]))

    print("\nFor detailed oracle documentation, see:")
    print(" references/oracles.md")
|
|
|
|
|
|
def constraint_satisfaction_example():
    """
    Example: Molecular generation with constraints

    Defines inclusive [min, max] ranges for QED, SA, MW and LogP, creates
    one oracle per property, and checks three molecules against every
    range. Prints a per-property check mark and an overall PASS/FAIL for
    each molecule.

    Returns:
        None. Results are printed.
    """
    rule = "=" * 60
    dashes = "-" * 60

    print("\n" + rule)
    print("Example 7: Constraint Satisfaction")
    print(rule)

    # Property -> (lower bound, upper bound), both inclusive.
    constraints = {
        'QED': (0.5, 1.0),   # Drug-likeness >= 0.5
        'SA': (1.0, 5.0),    # Easy to synthesize
        'MW': (200, 500),    # Molecular weight 200-500 Da
        'LogP': (0, 3),      # Lipophilicity 0-3
    }

    # One oracle per constrained property, created in constraint order.
    oracles = {prop: Oracle(name=prop) for prop in constraints}

    # Molecules to check
    test_molecules = [
        'CC(C)Cc1ccc(cc1)C(C)C(O)=O',
        'CC(=O)Oc1ccccc1C(=O)O',
        'Cn1c(=O)c2c(ncn2C)n(C)c1=O',
    ]

    print("\nConstraints:")
    for prop, bounds in constraints.items():
        print(f" {prop}: [{bounds[0]}, {bounds[1]}]")

    print("\n" + dashes)
    print("Evaluating molecules against constraints:")
    print(dashes)

    for smiles in test_molecules:
        print(f"\nSMILES: {smiles}")

        # Every property is evaluated and reported, even after a failure;
        # the verdicts are combined once the inner loop has finished.
        verdicts = []
        for prop, (min_val, max_val) in constraints.items():
            score = oracles[prop](smiles)
            within_range = min_val <= score <= max_val

            if within_range:
                status = "✓"
            else:
                status = "✗"
            print(f" {prop:10s}: {score:7.2f} [{min_val:5.1f}, {max_val:5.1f}] {status}")

            verdicts.append(within_range)

        if all(verdicts):
            result = "PASS"
        else:
            result = "FAIL"
        print(f" Overall: {result}")
|
|
|
|
|
|
def main():
    """
    Main function to run all molecular generation examples

    Loads the training dataset, runs examples 1-7 in order (only the
    distribution learning example receives the loaded SMILES), then prints
    a closing banner with suggested next steps.

    Returns:
        None.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("TDC Molecular Generation with Oracles Examples")
    print(rule)

    # Dataset first: example 5 needs the training SMILES.
    train_smiles = load_generation_dataset()

    single_oracle_example()                      # Example 1: Single oracle
    multiple_oracles_example()                   # Example 2: Multiple oracles
    batch_evaluation_example()                   # Example 3: Batch evaluation
    goal_directed_generation_template()          # Example 4: Goal-directed template
    distribution_learning_example(train_smiles)  # Example 5: Distribution learning
    available_oracles_info()                     # Example 6: Available oracles
    constraint_satisfaction_example()            # Example 7: Constraint satisfaction

    print("\n" + rule)
    print("Molecular generation examples completed!")
    print(rule)
    print("\nNext steps:")
    next_steps = (
        "1. Implement your generative model",
        "2. Use oracles to guide generation",
        "3. Evaluate generated molecules",
        "4. Iterate and optimize",
    )
    for step in next_steps:
        print(step)
    print(rule)
|
|
|
|
|
|
# Run the examples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|