Initial commit
skills/pytdc/scripts/benchmark_evaluation.py (new file, 327 lines)
@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
TDC Benchmark Group Evaluation Template

This script demonstrates how to use TDC benchmark groups for systematic
model evaluation following the required 5-seed protocol.

Usage:
    python benchmark_evaluation.py
"""

from tdc.benchmark_group import admet_group
from tdc import Evaluator
import numpy as np
import pandas as pd


def load_benchmark_group():
    """
    Load the ADMET benchmark group.
    """
    print("=" * 60)
    print("Loading ADMET Benchmark Group")
    print("=" * 60)

    # Initialize benchmark group
    group = admet_group(path='data/')

    # Get available benchmarks
    print("\nAvailable benchmarks in ADMET group:")
    benchmark_names = group.dataset_names
    print(f"Total: {len(benchmark_names)} datasets")

    for i, name in enumerate(benchmark_names[:10], 1):
        print(f"  {i}. {name}")

    if len(benchmark_names) > 10:
        print(f"  ... and {len(benchmark_names) - 10} more")

    return group


def single_dataset_evaluation(group, dataset_name='Caco2_Wang'):
    """
    Example: Evaluate on a single dataset with the 5-seed protocol.
    """
    print("\n" + "=" * 60)
    print(f"Example 1: Single Dataset Evaluation ({dataset_name})")
    print("=" * 60)

    # Get the benchmark: a dict with 'name', 'train_val', and 'test'
    benchmark = group.get(dataset_name)
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']

    print(f"\nBenchmark structure:")
    print(f"  Keys: {list(benchmark.keys())}")

    # Required: evaluate with 5 different seeds
    predictions_list = []

    for seed in [1, 2, 3, 4, 5]:
        print(f"\n--- Seed {seed} ---")

        # Get the train/valid split for this seed
        train, valid = group.get_train_valid_split(
            benchmark=name, split_type='default', seed=seed
        )

        print(f"Train size: {len(train)}")
        print(f"Valid size: {len(valid)}")

        # TODO: Replace with your model training
        # model = YourModel()
        # model.fit(train['Drug'], train['Y'])

        # For demonstration, create dummy predictions
        # Replace with: y_pred = model.predict(test['Drug'])
        y_true = test['Y'].values

        # Simulate predictions (add controlled noise)
        np.random.seed(seed)
        y_pred = y_true + np.random.normal(0, 0.3, len(y_true))

        predictions_list.append({name: y_pred})

        # Evaluate this seed
        evaluator = Evaluator(name='MAE')
        score = evaluator(y_true, y_pred)
        print(f"MAE for seed {seed}: {score:.4f}")

    # Aggregate across all seeds
    print("\n--- Overall Evaluation ---")
    results = group.evaluate_many(predictions_list)

    print(f"\nResults for {dataset_name}:")
    mean_score, std_score = results[name]
    print(f"  Mean MAE: {mean_score:.4f}")
    print(f"  Std MAE:  {std_score:.4f}")

    return predictions_list, results


def multiple_datasets_evaluation(group):
    """
    Example: Evaluate on multiple datasets.
    """
    print("\n" + "=" * 60)
    print("Example 2: Multiple Datasets Evaluation")
    print("=" * 60)

    # Select a subset of datasets for demonstration
    selected_datasets = ['Caco2_Wang', 'HIA_Hou', 'Bioavailability_Ma']

    all_predictions = {}
    all_results = {}

    for dataset_name in selected_datasets:
        print(f"\n{'=' * 40}")
        print(f"Evaluating: {dataset_name}")
        print(f"{'=' * 40}")

        benchmark = group.get(dataset_name)
        name = benchmark['name']
        test = benchmark['test']
        predictions_list = []

        # Train and predict for each seed
        for seed in [1, 2, 3, 4, 5]:
            train, valid = group.get_train_valid_split(
                benchmark=name, split_type='default', seed=seed
            )

            # TODO: Replace with your model
            # model = YourModel()
            # model.fit(train['Drug'], train['Y'])
            # y_pred = model.predict(test['Drug'])

            # Dummy predictions for demonstration
            np.random.seed(seed)
            y_true = test['Y'].values
            y_pred = y_true + np.random.normal(0, 0.3, len(y_true))
            predictions_list.append({name: y_pred})

        all_predictions[dataset_name] = predictions_list

        # Evaluate this dataset across all 5 seeds
        results = group.evaluate_many(predictions_list)
        all_results[dataset_name] = results[name]

        mean_score, std_score = results[name]
        print(f"  {dataset_name}: {mean_score:.4f} ± {std_score:.4f}")

    # Summary
    print("\n" + "=" * 60)
    print("Summary of Results")
    print("=" * 60)

    results_df = pd.DataFrame([
        {
            'Dataset': name,
            'Mean MAE': f"{mean:.4f}",
            'Std MAE': f"{std:.4f}"
        }
        for name, (mean, std) in all_results.items()
    ])

    print(results_df.to_string(index=False))

    return all_predictions, all_results


def custom_model_template():
    """
    Template for integrating your own model with TDC benchmarks.
    """
    print("\n" + "=" * 60)
    print("Example 3: Custom Model Template")
    print("=" * 60)

    code_template = '''
# Template for using your own model with TDC benchmarks

from tdc.benchmark_group import admet_group
from your_library import YourModel  # Replace with your model

# Initialize benchmark group
group = admet_group(path='data/')
benchmark = group.get('Caco2_Wang')
name = benchmark['name']
train_val, test = benchmark['train_val'], benchmark['test']

predictions_list = []

for seed in [1, 2, 3, 4, 5]:
    # Get the train/valid split for this seed
    train, valid = group.get_train_valid_split(
        benchmark=name, split_type='default', seed=seed
    )

    # Extract features and labels
    X_train, y_train = train['Drug'], train['Y']
    X_valid, y_valid = valid['Drug'], valid['Y']
    X_test = test['Drug']

    # Initialize and train model
    model = YourModel(random_state=seed)
    model.fit(X_train, y_train)

    # Optionally use the validation set for early stopping
    # model.fit(X_train, y_train, validation_data=(X_valid, y_valid))

    # Make predictions on the test set
    predictions_list.append({name: model.predict(X_test)})

# Evaluate with TDC (returns {name: [mean, std]} across seeds)
results = group.evaluate_many(predictions_list)
print(f"Results: {results}")
'''

    print("\nCustom Model Integration Template:")
    print("=" * 60)
    print(code_template)

    return code_template


def multi_seed_statistics(predictions_list):
    """
    Example: Analyzing multi-seed prediction statistics.
    """
    print("\n" + "=" * 60)
    print("Example 4: Multi-Seed Statistics Analysis")
    print("=" * 60)

    # Analyze prediction variability across seeds
    # (predictions_list holds one {name: y_pred} dict per seed, in seed order)
    all_preds = np.array([list(preds.values())[0] for preds in predictions_list])

    print("\nPrediction statistics across 5 seeds:")
    print(f"  Shape: {all_preds.shape}")
    print(f"  Mean prediction: {all_preds.mean():.4f}")
    print(f"  Std across seeds: {all_preds.std(axis=0).mean():.4f}")
    print(f"  Min prediction: {all_preds.min():.4f}")
    print(f"  Max prediction: {all_preds.max():.4f}")

    # Per-sample variance
    per_sample_std = all_preds.std(axis=0)
    print(f"\nPer-sample prediction std:")
    print(f"  Mean: {per_sample_std.mean():.4f}")
    print(f"  Median: {np.median(per_sample_std):.4f}")
    print(f"  Max: {per_sample_std.max():.4f}")


def leaderboard_submission_guide():
    """
    Guide for submitting to TDC leaderboards.
    """
    print("\n" + "=" * 60)
    print("Example 5: Leaderboard Submission Guide")
    print("=" * 60)

    guide = """
To submit results to TDC leaderboards:

1. Evaluate your model following the 5-seed protocol:
   - Use seeds [1, 2, 3, 4, 5] exactly as provided
   - Do not modify the train/valid/test splits
   - Report mean ± std across all 5 seeds

2. Format your results:
   results = group.evaluate_many(predictions_list)
   # Returns: {'dataset_name': [mean_score, std_score]}

3. Submit to the leaderboard:
   - Visit: https://tdcommons.ai/benchmark/admet_group/
   - Click on your dataset of interest
   - Submit your results with:
     * Model name and description
     * Mean score ± standard deviation
     * Reference to paper/code (if available)

4. Best practices:
   - Report all datasets in the benchmark group
   - Include model hyperparameters
   - Share code for reproducibility
   - Compare against baseline models

5. Evaluation metrics:
   - The ADMET group uses MAE by default
   - Other groups may use different metrics
   - Check benchmark-specific requirements
"""

    print(guide)


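# The guide above reports results as "mean ± std". A minimal sketch of a
# helper that renders evaluate_many() output in that form (hypothetical
# convenience function, not part of TDC):
def format_for_submission(results):
    """Format {name: [mean, std]} results as leaderboard-ready strings."""
    return {name: f"{mean:.3f} ± {std:.3f}"
            for name, (mean, std) in results.items()}

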
def main():
    """
    Main function to run all benchmark evaluation examples.
    """
    print("\n" + "=" * 60)
    print("TDC Benchmark Group Evaluation Examples")
    print("=" * 60)

    # Load benchmark group
    group = load_benchmark_group()

    # Example 1: Single dataset evaluation
    predictions_list, results = single_dataset_evaluation(group)

    # Example 2: Multiple datasets evaluation
    all_predictions, all_results = multiple_datasets_evaluation(group)

    # Example 3: Custom model template
    custom_model_template()

    # Example 4: Multi-seed statistics
    multi_seed_statistics(predictions_list)

    # Example 5: Leaderboard submission guide
    leaderboard_submission_guide()

    print("\n" + "=" * 60)
    print("Benchmark evaluation examples completed!")
    print("=" * 60)
    print("\nNext steps:")
    print("1. Replace dummy predictions with your model")
    print("2. Run full evaluation on all benchmark datasets")
    print("3. Submit results to the TDC leaderboard")
    print("=" * 60)


if __name__ == "__main__":
    main()
skills/pytdc/scripts/load_and_split_data.py (new file, 214 lines)
@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
TDC Data Loading and Splitting Template

This script demonstrates how to load TDC datasets and apply different
splitting strategies for model training and evaluation.

Usage:
    python load_and_split_data.py
"""

from tdc.single_pred import ADME
from tdc.multi_pred import DTI
from tdc import Evaluator
import numpy as np


def load_single_pred_example():
    """
    Example: Loading and splitting a single-prediction dataset (ADME).
    """
    print("=" * 60)
    print("Example 1: Single-Prediction Task (ADME)")
    print("=" * 60)

    # Load Caco2 dataset (intestinal permeability)
    print("\nLoading Caco2_Wang dataset...")
    data = ADME(name='Caco2_Wang')

    # Get basic dataset info
    print(f"\nDataset size: {len(data.get_data())} molecules")
    data.print_stats()

    # Method 1: Scaffold split (default, recommended)
    print("\n--- Scaffold Split ---")
    split = data.get_split(method='scaffold', seed=42, frac=[0.7, 0.1, 0.2])

    train = split['train']
    valid = split['valid']
    test = split['test']

    print(f"Train: {len(train)} molecules")
    print(f"Valid: {len(valid)} molecules")
    print(f"Test: {len(test)} molecules")

    # Display sample data
    print("\nSample training data:")
    print(train.head(3))

    # Method 2: Random split
    print("\n--- Random Split ---")
    split_random = data.get_split(method='random', seed=42, frac=[0.8, 0.1, 0.1])
    print(f"Train: {len(split_random['train'])} molecules")
    print(f"Valid: {len(split_random['valid'])} molecules")
    print(f"Test: {len(split_random['test'])} molecules")

    return split


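# Scaffold splits group molecules by their Bemis-Murcko scaffold, so
# structurally related compounds land on the same side of the split. A
# minimal sketch of the underlying scaffold extraction, assuming RDKit
# is installed (TDC's scaffold split itself relies on it):
def show_scaffold_example():
    """Sketch: extract the Bemis-Murcko scaffold used by scaffold splits."""
    from rdkit.Chem.Scaffolds import MurckoScaffold
    smiles = 'CC(C)Cc1ccc(cc1)C(C)C(O)=O'  # Ibuprofen
    scaffold = MurckoScaffold.MurckoScaffoldSmiles(smiles=smiles)
    print(f"Molecule: {smiles}")
    print(f"Scaffold: {scaffold}")

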
def load_multi_pred_example():
    """
    Example: Loading and splitting a multi-prediction dataset (DTI).
    """
    print("\n" + "=" * 60)
    print("Example 2: Multi-Prediction Task (DTI)")
    print("=" * 60)

    # Load BindingDB Kd dataset (drug-target interactions)
    print("\nLoading BindingDB_Kd dataset...")
    data = DTI(name='BindingDB_Kd')

    # Get basic dataset info
    full_data = data.get_data()
    print(f"\nDataset size: {len(full_data)} drug-target pairs")
    print(f"Unique drugs: {full_data['Drug_ID'].nunique()}")
    print(f"Unique targets: {full_data['Target_ID'].nunique()}")

    # Method 1: Random split
    print("\n--- Random Split ---")
    split_random = data.get_split(method='random', seed=42)
    print(f"Train: {len(split_random['train'])} pairs")
    print(f"Valid: {len(split_random['valid'])} pairs")
    print(f"Test: {len(split_random['test'])} pairs")

    # Method 2: Cold drug split (unseen drugs in test)
    print("\n--- Cold Drug Split ---")
    split_cold_drug = data.get_split(method='cold_drug', seed=42)

    train = split_cold_drug['train']
    test = split_cold_drug['test']

    # Verify no drug overlap
    train_drugs = set(train['Drug_ID'])
    test_drugs = set(test['Drug_ID'])
    overlap = train_drugs & test_drugs

    print(f"Train: {len(train)} pairs, {len(train_drugs)} unique drugs")
    print(f"Test: {len(test)} pairs, {len(test_drugs)} unique drugs")
    print(f"Drug overlap: {len(overlap)} (should be 0)")

    # Method 3: Cold target split (unseen targets in test)
    print("\n--- Cold Target Split ---")
    split_cold_target = data.get_split(method='cold_target', seed=42)

    train = split_cold_target['train']
    test = split_cold_target['test']

    train_targets = set(train['Target_ID'])
    test_targets = set(test['Target_ID'])
    overlap = train_targets & test_targets

    print(f"Train: {len(train)} pairs, {len(train_targets)} unique targets")
    print(f"Test: {len(test)} pairs, {len(test_targets)} unique targets")
    print(f"Target overlap: {len(overlap)} (should be 0)")

    # Display sample data
    print("\nSample DTI data:")
    print(full_data.head(3))

    return split_cold_drug


def evaluation_example(split):
    """
    Example: Evaluating model predictions with TDC evaluators.
    """
    print("\n" + "=" * 60)
    print("Example 3: Model Evaluation")
    print("=" * 60)

    test = split['test']

    # For demonstration, create dummy predictions.
    # In practice, replace these with your model's predictions.
    np.random.seed(42)

    # Simulate predictions (replace with model.predict(test['Drug']))
    y_true = test['Y'].values
    y_pred = y_true + np.random.normal(0, 0.5, len(y_true))  # Add noise

    # Evaluate with different metrics
    print("\nEvaluating predictions...")

    # Regression metrics
    mae_evaluator = Evaluator(name='MAE')
    mae = mae_evaluator(y_true, y_pred)
    print(f"MAE: {mae:.4f}")

    rmse_evaluator = Evaluator(name='RMSE')
    rmse = rmse_evaluator(y_true, y_pred)
    print(f"RMSE: {rmse:.4f}")

    r2_evaluator = Evaluator(name='R2')
    r2 = r2_evaluator(y_true, y_pred)
    print(f"R²: {r2:.4f}")

    spearman_evaluator = Evaluator(name='Spearman')
    spearman = spearman_evaluator(y_true, y_pred)
    print(f"Spearman: {spearman:.4f}")


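# Many ADMET datasets (e.g. HIA_Hou) are binary classification tasks, where
# the regression metrics above do not apply. A minimal sketch using TDC's
# classification evaluators with simulated probability scores:
def classification_evaluation_example():
    """Sketch: evaluating binary predictions with TDC evaluators."""
    data = ADME(name='HIA_Hou')
    test = data.get_split(method='scaffold', seed=42)['test']

    # Simulated probability scores (replace with your model's output)
    np.random.seed(42)
    y_true = test['Y'].values
    y_score = np.clip(y_true * 0.7 + np.random.uniform(0, 0.3, len(y_true)), 0, 1)

    for metric in ['ROC-AUC', 'PR-AUC']:
        evaluator = Evaluator(name=metric)
        print(f"{metric}: {evaluator(y_true, y_score):.4f}")

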
def custom_split_example():
    """
    Example: Creating custom splits with different fractions.
    """
    print("\n" + "=" * 60)
    print("Example 4: Custom Split Fractions")
    print("=" * 60)

    data = ADME(name='HIA_Hou')

    # Custom split fractions (must sum to 1.0)
    custom_fracs = [
        ([0.6, 0.2, 0.2], "60/20/20 split"),
        ([0.8, 0.1, 0.1], "80/10/10 split"),
        ([0.7, 0.15, 0.15], "70/15/15 split")
    ]

    for frac, description in custom_fracs:
        split = data.get_split(method='scaffold', seed=42, frac=frac)
        print(f"\n{description}:")
        print(f"  Train: {len(split['train'])} ({frac[0]*100:.0f}%)")
        print(f"  Valid: {len(split['valid'])} ({frac[1]*100:.0f}%)")
        print(f"  Test: {len(split['test'])} ({frac[2]*100:.0f}%)")


def main():
    """
    Main function to run all examples.
    """
    print("\n" + "=" * 60)
    print("TDC Data Loading and Splitting Examples")
    print("=" * 60)

    # Example 1: Single prediction with scaffold split
    split = load_single_pred_example()

    # Example 2: Multi prediction with cold splits
    dti_split = load_multi_pred_example()

    # Example 3: Model evaluation
    evaluation_example(split)

    # Example 4: Custom split fractions
    custom_split_example()

    print("\n" + "=" * 60)
    print("Examples completed!")
    print("=" * 60)


if __name__ == "__main__":
    main()
skills/pytdc/scripts/molecular_generation.py (new file, 404 lines)
@@ -0,0 +1,404 @@
#!/usr/bin/env python3
"""
TDC Molecular Generation with Oracles Template

This script demonstrates how to use TDC oracles for molecular generation
tasks including goal-directed generation and distribution learning.

Usage:
    python molecular_generation.py
"""

from tdc.generation import MolGen
from tdc import Oracle
import numpy as np


def load_generation_dataset():
    """
    Load a molecular generation dataset.
    """
    print("=" * 60)
    print("Loading Molecular Generation Dataset")
    print("=" * 60)

    # Load ChEMBL dataset
    data = MolGen(name='ChEMBL_V29')

    # Get training molecules
    split = data.get_split()
    train_smiles = split['train']['Drug'].tolist()

    print(f"\nDataset: ChEMBL_V29")
    print(f"Training molecules: {len(train_smiles)}")

    # Display sample molecules
    print("\nSample SMILES:")
    for i, smiles in enumerate(train_smiles[:5], 1):
        print(f"  {i}. {smiles}")

    return train_smiles


def single_oracle_example():
    """
    Example: Using a single oracle for molecular evaluation.
    """
    print("\n" + "=" * 60)
    print("Example 1: Single Oracle Evaluation")
    print("=" * 60)

    # Initialize oracle for the GSK3B target
    oracle = Oracle(name='GSK3B')

    # Test molecules
    test_molecules = [
        'CC(C)Cc1ccc(cc1)C(C)C(O)=O',    # Ibuprofen
        'CC(=O)Oc1ccccc1C(=O)O',         # Aspirin
        'Cn1c(=O)c2c(ncn2C)n(C)c1=O',    # Caffeine
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'   # Theophylline
    ]

    print("\nEvaluating molecules with GSK3B oracle:")
    print("-" * 60)

    for smiles in test_molecules:
        score = oracle(smiles)
        print(f"SMILES: {smiles}")
        print(f"GSK3B score: {score:.4f}\n")


def multiple_oracles_example():
    """
    Example: Using multiple oracles for multi-objective optimization.
    """
    print("\n" + "=" * 60)
    print("Example 2: Multiple Oracles (Multi-Objective)")
    print("=" * 60)

    # Initialize multiple oracles
    oracles = {
        'QED': Oracle(name='QED'),      # Drug-likeness
        'SA': Oracle(name='SA'),        # Synthetic accessibility
        'GSK3B': Oracle(name='GSK3B'),  # Target binding
        'LogP': Oracle(name='LogP')     # Lipophilicity
    }

    # Test molecule
    test_smiles = 'CC(C)Cc1ccc(cc1)C(C)C(O)=O'

    print(f"\nEvaluating: {test_smiles}")
    print("-" * 60)

    scores = {}
    for name, oracle in oracles.items():
        score = oracle(test_smiles)
        scores[name] = score
        print(f"{name:10s}: {score:.4f}")

    # Multi-objective score (weighted combination)
    print("\n--- Multi-Objective Scoring ---")

    # Invert SA (lower is better, so we invert for maximization)
    sa_score = 1.0 / (1.0 + scores['SA'])

    # Weighted combination
    weights = {'QED': 0.3, 'SA': 0.2, 'GSK3B': 0.4, 'LogP': 0.1}
    multi_score = (
        weights['QED'] * scores['QED'] +
        weights['SA'] * sa_score +
        weights['GSK3B'] * scores['GSK3B'] +
        weights['LogP'] * (scores['LogP'] / 5.0)  # Normalize LogP
    )

    print(f"Multi-objective score: {multi_score:.4f}")
    print(f"Weights: {weights}")


def batch_evaluation_example():
    """
    Example: Batch evaluation of multiple molecules.
    """
    print("\n" + "=" * 60)
    print("Example 3: Batch Evaluation")
    print("=" * 60)

    # Sample molecules
    molecules = [
        'CC(C)Cc1ccc(cc1)C(C)C(O)=O',
        'CC(=O)Oc1ccccc1C(=O)O',
        'Cn1c(=O)c2c(ncn2C)n(C)c1=O',
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',
        'CC(C)NCC(COc1ccc(cc1)COCCOC(C)C)O'
    ]

    # Initialize oracle
    oracle = Oracle(name='DRD2')

    print(f"\nBatch evaluating {len(molecules)} molecules with DRD2 oracle...")

    # Batch evaluation (more efficient than individual calls)
    scores = oracle(molecules)

    print("\nResults:")
    print("-" * 60)
    for smiles, score in zip(molecules, scores):
        print(f"{smiles:40.40s}  Score: {score:.4f}")

    # Statistics
    print(f"\nStatistics:")
    print(f"  Mean score: {np.mean(scores):.4f}")
    print(f"  Std score: {np.std(scores):.4f}")
    print(f"  Min score: {np.min(scores):.4f}")
    print(f"  Max score: {np.max(scores):.4f}")


def goal_directed_generation_template():
    """
    Template for goal-directed molecular generation.
    """
    print("\n" + "=" * 60)
    print("Example 4: Goal-Directed Generation Template")
    print("=" * 60)

    template = '''
# Template for goal-directed molecular generation

from tdc.generation import MolGen
from tdc import Oracle
import numpy as np

# 1. Load training data
data = MolGen(name='ChEMBL_V29')
train_smiles = data.get_split()['train']['Drug'].tolist()

# 2. Initialize oracle(s)
oracle = Oracle(name='GSK3B')

# 3. Initialize your generative model (placeholder)
model = YourGenerativeModel()
model.fit(train_smiles)

# 4. Generation loop
num_iterations = 100
num_molecules_per_iter = 100
best_molecules = []

for iteration in range(num_iterations):
    # Generate candidate molecules
    candidates = model.generate(num_molecules_per_iter)

    # Evaluate with oracle
    scores = oracle(candidates)

    # Select top molecules
    top_indices = np.argsort(scores)[-10:]
    top_molecules = [candidates[i] for i in top_indices]
    top_scores = [scores[i] for i in top_indices]

    # Store best molecules
    best_molecules.extend(zip(top_molecules, top_scores))

    # Optional: Fine-tune model on top molecules
    # model.fine_tune(top_molecules)

    # Print progress
    print(f"Iteration {iteration}: Best score = {max(scores):.4f}")

# Sort and display top molecules
best_molecules.sort(key=lambda x: x[1], reverse=True)
print("\\nTop 10 molecules:")
for smiles, score in best_molecules[:10]:
    print(f"{smiles}: {score:.4f}")
'''

    print("\nGoal-Directed Generation Template:")
    print("=" * 60)
    print(template)


def distribution_learning_example(train_smiles):
    """
    Example: Distribution learning evaluation.
    """
    print("\n" + "=" * 60)
    print("Example 5: Distribution Learning")
    print("=" * 60)

    # Use a subset for demonstration
    train_subset = train_smiles[:1000]

    # Initialize oracle
    oracle = Oracle(name='QED')

    print("\nEvaluating property distribution...")

    # Evaluate training set
    print("Computing training set distribution...")
    train_scores = np.array(oracle(train_subset))

    # Simulate generated molecules (in practice, score your generative
    # model's output instead). For demo: perturb the training scores.
    print("Computing generated set distribution...")
    generated_scores = train_scores + np.random.normal(0, 0.1, len(train_scores))
    generated_scores = np.clip(generated_scores, 0, 1)  # QED lies in [0, 1]

    # Compare distributions
    print("\n--- Distribution Statistics ---")
    print(f"Training set (n={len(train_subset)}):")
    print(f"  Mean: {np.mean(train_scores):.4f}")
    print(f"  Std: {np.std(train_scores):.4f}")
    print(f"  Median: {np.median(train_scores):.4f}")

    print(f"\nGenerated set (n={len(generated_scores)}):")
    print(f"  Mean: {np.mean(generated_scores):.4f}")
    print(f"  Std: {np.std(generated_scores):.4f}")
    print(f"  Median: {np.median(generated_scores):.4f}")

    # Distribution similarity via a two-sample Kolmogorov-Smirnov test
    from scipy.stats import ks_2samp
    ks_statistic, p_value = ks_2samp(train_scores, generated_scores)

    print(f"\nKolmogorov-Smirnov Test:")
    print(f"  KS statistic: {ks_statistic:.4f}")
    print(f"  P-value: {p_value:.4f}")

    if p_value > 0.05:
        print("  → No significant difference detected (p > 0.05)")
    else:
        print("  → Distributions are significantly different (p <= 0.05)")


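# Alongside the KS test, distribution learning papers often report a scalar
# divergence; a minimal sketch using SciPy's 1-Wasserstein distance on the
# two score arrays computed above:
def wasserstein_example(train_scores, generated_scores):
    """Sketch: 1-Wasserstein distance between two score distributions."""
    from scipy.stats import wasserstein_distance
    dist = wasserstein_distance(train_scores, generated_scores)
    print(f"1-Wasserstein distance: {dist:.4f}")

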
def available_oracles_info():
    """
    Display information about available oracles.
    """
    print("\n" + "=" * 60)
    print("Example 6: Available Oracles")
    print("=" * 60)

    oracle_info = {
        'Biochemical Targets': [
            'DRD2', 'GSK3B', 'JNK3', '5HT2A', 'ACE',
            'MAPK', 'CDK', 'P38', 'PARP1', 'PIK3CA'
        ],
        'Physicochemical Properties': [
            'QED', 'SA', 'LogP', 'MW', 'Lipinski'
        ],
        'Composite Metrics': [
            'Isomer_Meta', 'Median1', 'Median2',
            'Rediscovery', 'Similarity', 'Uniqueness', 'Novelty'
        ],
        'Specialized': [
            'ASKCOS', 'Docking', 'Vina'
        ]
    }

    print("\nAvailable Oracle Categories:")
    print("-" * 60)

    for category, oracles in oracle_info.items():
        print(f"\n{category}:")
        for oracle_name in oracles:
            print(f"  - {oracle_name}")

    print("\nFor detailed oracle documentation, see:")
    print("  references/oracles.md")


def constraint_satisfaction_example():
    """
    Example: Molecular generation with property constraints.
    """
    print("\n" + "=" * 60)
    print("Example 7: Constraint Satisfaction")
    print("=" * 60)

    # Define constraints as (min, max) ranges per property
    constraints = {
        'QED': (0.5, 1.0),   # Drug-likeness >= 0.5
        'SA': (1.0, 5.0),    # Easy to synthesize
        'MW': (200, 500),    # Molecular weight 200-500 Da
        'LogP': (0, 3)       # Lipophilicity 0-3
    }

    # Initialize oracles
    oracles = {name: Oracle(name=name) for name in constraints.keys()}

    # Test molecules
    test_molecules = [
        'CC(C)Cc1ccc(cc1)C(C)C(O)=O',
        'CC(=O)Oc1ccccc1C(=O)O',
        'Cn1c(=O)c2c(ncn2C)n(C)c1=O'
    ]

    print("\nConstraints:")
    for prop, (min_val, max_val) in constraints.items():
        print(f"  {prop}: [{min_val}, {max_val}]")

    print("\n" + "-" * 60)
    print("Evaluating molecules against constraints:")
    print("-" * 60)

    for smiles in test_molecules:
        print(f"\nSMILES: {smiles}")

        satisfies_all = True
        for prop, (min_val, max_val) in constraints.items():
            score = oracles[prop](smiles)
            satisfies = min_val <= score <= max_val

            status = "✓" if satisfies else "✗"
            print(f"  {prop:10s}: {score:7.2f} [{min_val:5.1f}, {max_val:5.1f}] {status}")

            satisfies_all = satisfies_all and satisfies

        result = "PASS" if satisfies_all else "FAIL"
        print(f"  Overall: {result}")


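# The same constraint dict can drive a reusable filter over candidate
# molecules; a minimal sketch (hypothetical helper, not part of TDC):
def filter_by_constraints(smiles_list, constraints, oracles):
    """Return only the molecules whose scores fall inside every range."""
    return [
        smiles for smiles in smiles_list
        if all(lo <= oracles[prop](smiles) <= hi
               for prop, (lo, hi) in constraints.items())
    ]

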
def main():
    """
    Main function to run all molecular generation examples.
    """
    print("\n" + "=" * 60)
    print("TDC Molecular Generation with Oracles Examples")
    print("=" * 60)

    # Load generation dataset
    train_smiles = load_generation_dataset()

    # Example 1: Single oracle
    single_oracle_example()

    # Example 2: Multiple oracles
    multiple_oracles_example()

    # Example 3: Batch evaluation
    batch_evaluation_example()

    # Example 4: Goal-directed generation template
    goal_directed_generation_template()

    # Example 5: Distribution learning
    distribution_learning_example(train_smiles)

    # Example 6: Available oracles
    available_oracles_info()

    # Example 7: Constraint satisfaction
    constraint_satisfaction_example()

    print("\n" + "=" * 60)
    print("Molecular generation examples completed!")
    print("=" * 60)
    print("\nNext steps:")
    print("1. Implement your generative model")
    print("2. Use oracles to guide generation")
    print("3. Evaluate generated molecules")
    print("4. Iterate and optimize")
    print("=" * 60)


if __name__ == "__main__":
    main()