#!/usr/bin/env python3
"""
TDC Benchmark Group Evaluation Template

This script demonstrates how to use TDC benchmark groups for systematic
model evaluation following the required 5-seed protocol.

Usage:
    python benchmark_evaluation.py
"""

from tdc.benchmark_group import admet_group
from tdc import Evaluator
import numpy as np
import pandas as pd

def load_benchmark_group():
    """
    Load the ADMET benchmark group
    """
    print("=" * 60)
    print("Loading ADMET Benchmark Group")
    print("=" * 60)

    # Initialize benchmark group
    group = admet_group(path='data/')

    # Get available benchmarks
    print("\nAvailable benchmarks in ADMET group:")
    benchmark_names = group.dataset_names
    print(f"Total: {len(benchmark_names)} datasets")

    for i, name in enumerate(benchmark_names[:10], 1):
        print(f"  {i}. {name}")

    if len(benchmark_names) > 10:
        print(f"  ... and {len(benchmark_names) - 10} more")

    return group

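# Optional helper: a minimal sketch of what group.get() returns for a single
# benchmark, assuming the TDC BenchmarkGroup interface that exposes 'name',
# 'train_val', and 'test' keys. Not part of the original walkthrough.
def inspect_benchmark(group, dataset_name='Caco2_Wang'):
    """
    Peek at the fixed split returned by group.get() for one dataset.
    """
    benchmark = group.get(dataset_name)
    # 'train_val' is re-split per seed later; 'test' is fixed for the leaderboard
    print(f"\n{benchmark['name']}: "
          f"{len(benchmark['train_val'])} train+valid rows, "
          f"{len(benchmark['test'])} test rows")
    return benchmark
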
def single_dataset_evaluation(group, dataset_name='Caco2_Wang'):
    """
    Example: Evaluate on a single dataset with the 5-seed protocol
    """
    print("\n" + "=" * 60)
    print(f"Example 1: Single Dataset Evaluation ({dataset_name})")
    print("=" * 60)

    # Get the benchmark: the fixed train_val/test split plus the canonical name
    benchmark = group.get(dataset_name)
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']

    print("\nBenchmark structure:")
    print(f"  Keys: {list(benchmark.keys())}")
    print(f"  Train+valid size: {len(train_val)}")
    print(f"  Test size: {len(test)}")

    # Required: evaluate with 5 different seeds
    per_seed_predictions = {}
    predictions_list = []

    for seed in [1, 2, 3, 4, 5]:
        print(f"\n--- Seed {seed} ---")

        # Get the train/valid split for this seed
        train, valid = group.get_train_valid_split(
            benchmark=name, split_type='default', seed=seed
        )
        print(f"Train size: {len(train)}")
        print(f"Valid size: {len(valid)}")

        # TODO: Replace with your model training
        # model = YourModel()
        # model.fit(train['Drug'], train['Y'])
        # y_pred = model.predict(test['Drug'])

        # For demonstration, simulate predictions by adding controlled noise
        y_true = test['Y'].values
        np.random.seed(seed)
        y_pred = y_true + np.random.normal(0, 0.3, len(y_true))

        per_seed_predictions[seed] = y_pred
        predictions_list.append({name: y_pred})

        # Evaluate this seed on its own
        evaluator = Evaluator(name='MAE')
        score = evaluator(y_true, y_pred)
        print(f"MAE for seed {seed}: {score:.4f}")

    # Evaluate across all seeds (mean and std per dataset)
    print("\n--- Overall Evaluation ---")
    results = group.evaluate_many(predictions_list)

    print(f"\nResults for {dataset_name}:")
    mean_score, std_score = results[name]
    print(f"  Mean MAE: {mean_score:.4f}")
    print(f"  Std MAE:  {std_score:.4f}")

    return per_seed_predictions, results

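# A minimal sketch of replacing the simulated predictions above with a real
# baseline: Morgan fingerprints plus a random forest. This is not part of the
# TDC protocol itself; it assumes RDKit and scikit-learn are installed, and any
# featurizer/model combination could be substituted.
def baseline_predictions_sketch(group, dataset_name='Caco2_Wang'):
    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem
    from sklearn.ensemble import RandomForestRegressor

    def featurize(smiles_series):
        # 2048-bit Morgan fingerprints (radius 2) as a simple molecular
        # representation; invalid SMILES are not handled in this sketch
        features = []
        for smi in smiles_series:
            mol = Chem.MolFromSmiles(smi)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            arr = np.zeros((2048,))
            DataStructs.ConvertToNumpyArray(fp, arr)
            features.append(arr)
        return np.array(features)

    benchmark = group.get(dataset_name)
    name, test = benchmark['name'], benchmark['test']
    X_test = featurize(test['Drug'])

    predictions_list = []
    for seed in [1, 2, 3, 4, 5]:
        train, valid = group.get_train_valid_split(
            benchmark=name, split_type='default', seed=seed
        )
        model = RandomForestRegressor(n_estimators=100, random_state=seed)
        model.fit(featurize(train['Drug']), train['Y'])
        predictions_list.append({name: model.predict(X_test)})

    return group.evaluate_many(predictions_list)
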
def multiple_datasets_evaluation(group):
    """
    Example: Evaluate on multiple datasets
    """
    print("\n" + "=" * 60)
    print("Example 2: Multiple Datasets Evaluation")
    print("=" * 60)

    # Select a subset of datasets for demonstration
    selected_datasets = ['Caco2_Wang', 'HIA_Hou', 'Bioavailability_Ma']

    all_predictions = {}
    all_results = {}

    for dataset_name in selected_datasets:
        print(f"\n{'=' * 40}")
        print(f"Evaluating: {dataset_name}")
        print(f"{'=' * 40}")

        benchmark = group.get(dataset_name)
        name, test = benchmark['name'], benchmark['test']
        predictions_list = []

        # Train and predict for each seed
        for seed in [1, 2, 3, 4, 5]:
            train, valid = group.get_train_valid_split(
                benchmark=name, split_type='default', seed=seed
            )

            # TODO: Replace with your model
            # model = YourModel()
            # model.fit(train['Drug'], train['Y'])
            # y_pred = model.predict(test['Drug'])

            # Dummy predictions for demonstration
            np.random.seed(seed)
            y_true = test['Y'].values
            y_pred = y_true + np.random.normal(0, 0.3, len(y_true))
            predictions_list.append({name: y_pred})

        all_predictions[dataset_name] = predictions_list

        # Evaluate this dataset across the 5 seeds
        results = group.evaluate_many(predictions_list)
        all_results[dataset_name] = results[name]

        mean_score, std_score = results[name]
        print(f"  {dataset_name}: {mean_score:.4f} ± {std_score:.4f}")

    # Summary (note: the reported metric differs per dataset, e.g. MAE for
    # regression tasks and AUROC for classification tasks)
    print("\n" + "=" * 60)
    print("Summary of Results")
    print("=" * 60)

    results_df = pd.DataFrame([
        {
            'Dataset': dataset_name,
            'Mean score': f"{mean:.4f}",
            'Std score': f"{std:.4f}"
        }
        for dataset_name, (mean, std) in all_results.items()
    ])

    print(results_df.to_string(index=False))

    return all_predictions, all_results

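# A short sketch of scaling Example 2 up to the whole benchmark group: one
# predictions dict per seed, keyed by dataset name, which is the shape a full
# leaderboard submission uses. `predict_fn` is a placeholder for your own
# training/prediction routine and is not part of TDC.
def full_group_evaluation_sketch(group, predict_fn):
    predictions_list = []
    for seed in [1, 2, 3, 4, 5]:
        predictions = {}
        for dataset_name in group.dataset_names:
            benchmark = group.get(dataset_name)
            name, test = benchmark['name'], benchmark['test']
            train, valid = group.get_train_valid_split(
                benchmark=name, split_type='default', seed=seed
            )
            # predict_fn should return one prediction per row of `test`
            predictions[name] = predict_fn(train, valid, test, seed)
        predictions_list.append(predictions)
    return group.evaluate_many(predictions_list)
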
def custom_model_template():
    """
    Template for integrating your own model with TDC benchmarks
    """
    print("\n" + "=" * 60)
    print("Example 3: Custom Model Template")
    print("=" * 60)

    code_template = '''
# Template for using your own model with TDC benchmarks

from tdc.benchmark_group import admet_group
from your_library import YourModel  # Replace with your model

# Initialize benchmark group and fetch one benchmark
group = admet_group(path='data/')
benchmark = group.get('Caco2_Wang')
name = benchmark['name']
train_val, test = benchmark['train_val'], benchmark['test']

predictions_list = []

for seed in [1, 2, 3, 4, 5]:
    # Get the train/valid split for this seed
    train, valid = group.get_train_valid_split(
        benchmark=name, split_type='default', seed=seed
    )

    # Extract features and labels
    X_train, y_train = train['Drug'], train['Y']
    X_valid, y_valid = valid['Drug'], valid['Y']
    X_test = test['Drug']

    # Initialize and train the model
    model = YourModel(random_state=seed)
    model.fit(X_train, y_train)

    # Optionally use the validation set for early stopping
    # model.fit(X_train, y_train, validation_data=(X_valid, y_valid))

    # Make predictions on the test set
    predictions_list.append({name: model.predict(X_test)})

# Evaluate with TDC (mean and std across the 5 seeds)
results = group.evaluate_many(predictions_list)
print(f"Results: {results}")
'''

    print("\nCustom Model Integration Template:")
    print("=" * 60)
    print(code_template)

    return code_template

def multi_seed_statistics(predictions_dict):
    """
    Example: Analyzing multi-seed prediction statistics
    """
    print("\n" + "=" * 60)
    print("Example 4: Multi-Seed Statistics Analysis")
    print("=" * 60)

    # Analyze prediction variability across seeds
    all_preds = np.array([predictions_dict[seed] for seed in [1, 2, 3, 4, 5]])

    print("\nPrediction statistics across 5 seeds:")
    print(f"  Shape: {all_preds.shape}")
    print(f"  Mean prediction: {all_preds.mean():.4f}")
    print(f"  Std across seeds: {all_preds.std(axis=0).mean():.4f}")
    print(f"  Min prediction: {all_preds.min():.4f}")
    print(f"  Max prediction: {all_preds.max():.4f}")

    # Per-sample variance
    per_sample_std = all_preds.std(axis=0)
    print("\nPer-sample prediction std:")
    print(f"  Mean: {per_sample_std.mean():.4f}")
    print(f"  Median: {np.median(per_sample_std):.4f}")
    print(f"  Max: {per_sample_std.max():.4f}")

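# A small follow-on sketch: averaging the five per-seed prediction vectors gives
# a simple seed ensemble, which can be compared against the per-seed scores.
# Leaderboard submissions still report the per-seed mean ± std, not the ensemble.
# `y_true` must be the test labels of the same dataset the predictions cover.
def seed_ensemble_sketch(predictions_dict, y_true):
    evaluator = Evaluator(name='MAE')
    per_seed_mae = [evaluator(y_true, predictions_dict[s]) for s in [1, 2, 3, 4, 5]]
    ensemble_pred = np.mean([predictions_dict[s] for s in [1, 2, 3, 4, 5]], axis=0)
    print(f"Mean per-seed MAE: {np.mean(per_seed_mae):.4f}")
    print(f"Seed-ensemble MAE: {evaluator(y_true, ensemble_pred):.4f}")
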
def leaderboard_submission_guide():
    """
    Guide for submitting to TDC leaderboards
    """
    print("\n" + "=" * 60)
    print("Example 5: Leaderboard Submission Guide")
    print("=" * 60)

    guide = """
    To submit results to TDC leaderboards:

    1. Evaluate your model following the 5-seed protocol:
       - Use seeds [1, 2, 3, 4, 5] exactly as provided
       - Do not modify the train/valid/test splits
       - Report mean ± std across all 5 seeds

    2. Format your results:
       results = group.evaluate_many(predictions_list)
       # Returns: {'dataset_name': [mean_score, std_score]}

    3. Submit to the leaderboard:
       - Visit: https://tdcommons.ai/benchmark/admet_group/
       - Click on your dataset of interest
       - Submit your results with:
         * Model name and description
         * Mean score ± standard deviation
         * Reference to paper/code (if available)

    4. Best practices:
       - Report all datasets in the benchmark group
       - Include model hyperparameters
       - Share code for reproducibility
       - Compare against baseline models

    5. Evaluation metrics:
       - The ADMET group assigns a metric per dataset (e.g. MAE or Spearman
         for regression tasks, AUROC or AUPRC for classification tasks)
       - Other benchmark groups may use different metrics
       - Check the benchmark-specific requirements on the leaderboard page
    """

    print(guide)

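# A tiny convenience sketch for step 2 of the guide: flatten the dict returned
# by evaluate_many() into a DataFrame, e.g. for a submission write-up. The
# column names here are arbitrary choices, not a TDC requirement.
def results_to_dataframe(results):
    rows = [
        {'Dataset': dataset, 'Mean': mean, 'Std': std}
        for dataset, (mean, std) in results.items()
    ]
    return pd.DataFrame(rows).sort_values('Dataset').reset_index(drop=True)
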
def main():
    """
    Main function to run all benchmark evaluation examples
    """
    print("\n" + "=" * 60)
    print("TDC Benchmark Group Evaluation Examples")
    print("=" * 60)

    # Load benchmark group
    group = load_benchmark_group()

    # Example 1: Single dataset evaluation
    predictions, results = single_dataset_evaluation(group)

    # Example 2: Multiple datasets evaluation
    all_predictions, all_results = multiple_datasets_evaluation(group)

    # Example 3: Custom model template
    custom_model_template()

    # Example 4: Multi-seed statistics
    multi_seed_statistics(predictions)

    # Example 5: Leaderboard submission guide
    leaderboard_submission_guide()

    print("\n" + "=" * 60)
    print("Benchmark evaluation examples completed!")
    print("=" * 60)
    print("\nNext steps:")
    print("1. Replace dummy predictions with your model")
    print("2. Run full evaluation on all benchmark datasets")
    print("3. Submit results to TDC leaderboard")
    print("=" * 60)


if __name__ == "__main__":
    main()