#!/usr/bin/env python3
"""
TDC Benchmark Group Evaluation Template
This script demonstrates how to use TDC benchmark groups for systematic
model evaluation following the required 5-seed protocol.
Usage:
python benchmark_evaluation.py
"""
from tdc.benchmark_group import admet_group
from tdc import Evaluator
import numpy as np
import pandas as pd

def load_benchmark_group():
    """
    Load the ADMET benchmark group
    """
    print("=" * 60)
    print("Loading ADMET Benchmark Group")
    print("=" * 60)

    # Initialize benchmark group
    group = admet_group(path='data/')

    # Get available benchmarks
    print("\nAvailable benchmarks in ADMET group:")
    benchmark_names = group.dataset_names
    print(f"Total: {len(benchmark_names)} datasets")
    for i, name in enumerate(benchmark_names[:10], 1):
        print(f"  {i}. {name}")
    if len(benchmark_names) > 10:
        print(f"  ... and {len(benchmark_names) - 10} more")

    return group

def single_dataset_evaluation(group, dataset_name='Caco2_Wang'):
    """
    Example: Evaluate on a single dataset with the 5-seed protocol
    """
    print("\n" + "=" * 60)
    print(f"Example 1: Single Dataset Evaluation ({dataset_name})")
    print("=" * 60)

    # Get the benchmark: a dict with 'name', 'train_val', and 'test'
    benchmark = group.get(dataset_name)
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']
    print("\nBenchmark structure:")
    print(f"  Keys: {list(benchmark.keys())}")
    print(f"  Train+valid size: {len(train_val)}, Test size: {len(test)}")

    # Required: evaluate with 5 different seeds
    predictions_list = []
    for seed in [1, 2, 3, 4, 5]:
        print(f"\n--- Seed {seed} ---")

        # Get the train/valid split for this seed (the test set is fixed)
        train, valid = group.get_train_valid_split(
            benchmark=name, split_type='default', seed=seed)
        print(f"Train size: {len(train)}")
        print(f"Valid size: {len(valid)}")

        # TODO: Replace with your model training
        # model = YourModel()
        # model.fit(train['Drug'], train['Y'])

        # For demonstration, create dummy predictions by adding controlled
        # noise to the labels. Replace with: y_pred = model.predict(test['Drug'])
        y_true = test['Y'].values
        np.random.seed(seed)
        y_pred = y_true + np.random.normal(0, 0.3, len(y_true))
        predictions_list.append({name: y_pred})

        # Evaluate this seed on its own
        evaluator = Evaluator(name='MAE')
        score = evaluator(y_true, y_pred)
        print(f"MAE for seed {seed}: {score:.4f}")

    # Evaluate across all seeds
    print("\n--- Overall Evaluation ---")
    results = group.evaluate_many(predictions_list)
    print(f"\nResults for {dataset_name}:")
    mean_score, std_score = results[name]
    print(f"  Mean MAE: {mean_score:.4f}")
    print(f"  Std MAE: {std_score:.4f}")

    return predictions_list, results

def multiple_datasets_evaluation(group):
    """
    Example: Evaluate on multiple datasets
    """
    print("\n" + "=" * 60)
    print("Example 2: Multiple Datasets Evaluation")
    print("=" * 60)

    # Select a subset of datasets for demonstration
    selected_datasets = ['Caco2_Wang', 'HIA_Hou', 'Bioavailability_Ma']

    all_predictions = {}
    all_results = {}
    for dataset_name in selected_datasets:
        print(f"\n{'=' * 40}")
        print(f"Evaluating: {dataset_name}")
        print(f"{'=' * 40}")

        benchmark = group.get(dataset_name)
        name, test = benchmark['name'], benchmark['test']

        # Train and predict for each seed
        predictions_list = []
        for seed in [1, 2, 3, 4, 5]:
            train, valid = group.get_train_valid_split(
                benchmark=name, split_type='default', seed=seed)

            # TODO: Replace with your model
            # model = YourModel()
            # model.fit(train['Drug'], train['Y'])
            # y_pred = model.predict(test['Drug'])

            # Dummy predictions for demonstration
            np.random.seed(seed)
            y_true = test['Y'].values
            y_pred = y_true + np.random.normal(0, 0.3, len(y_true))
            predictions_list.append({name: y_pred})

        all_predictions[dataset_name] = predictions_list

        # Evaluate this dataset across all 5 seeds (TDC applies the
        # benchmark-specific default metric, e.g. MAE or AUROC)
        results = group.evaluate_many(predictions_list)
        all_results[dataset_name] = results[name]
        mean_score, std_score = results[name]
        print(f"  {dataset_name}: {mean_score:.4f} ± {std_score:.4f}")

    # Summary
    print("\n" + "=" * 60)
    print("Summary of Results")
    print("=" * 60)
    results_df = pd.DataFrame([
        {
            'Dataset': dataset,
            'Mean Score': f"{mean:.4f}",
            'Std Score': f"{std:.4f}",
        }
        for dataset, (mean, std) in all_results.items()
    ])
    print(results_df.to_string(index=False))

    return all_predictions, all_results

def custom_model_template():
    """
    Template for integrating your own model with TDC benchmarks
    """
    print("\n" + "=" * 60)
    print("Example 3: Custom Model Template")
    print("=" * 60)

    code_template = '''
# Template for using your own model with TDC benchmarks
from tdc.benchmark_group import admet_group
from your_library import YourModel  # Replace with your model

# Initialize the benchmark group and fetch one benchmark
group = admet_group(path='data/')
benchmark = group.get('Caco2_Wang')
name = benchmark['name']
train_val, test = benchmark['train_val'], benchmark['test']

predictions_list = []
for seed in [1, 2, 3, 4, 5]:
    # Get the train/valid split for this seed (the test set is fixed)
    train, valid = group.get_train_valid_split(
        benchmark=name, split_type='default', seed=seed)

    # Extract features and labels
    X_train, y_train = train['Drug'], train['Y']
    X_valid, y_valid = valid['Drug'], valid['Y']
    X_test = test['Drug']

    # Initialize and train model
    model = YourModel(random_state=seed)
    model.fit(X_train, y_train)
    # Optionally use the validation set for early stopping:
    # model.fit(X_train, y_train, validation_data=(X_valid, y_valid))

    # Make predictions on the fixed test set
    predictions_list.append({name: model.predict(X_test)})

# Evaluate across all 5 seeds with TDC
results = group.evaluate_many(predictions_list)
print(f"Results: {results}")  # {'caco2_wang': [mean_score, std_score]}
'''

    print("\nCustom Model Integration Template:")
    print("=" * 60)
    print(code_template)

    return code_template

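
# The template above leaves YourModel as a placeholder. The function below is
# a minimal, optional sketch of one concrete choice -- Morgan fingerprints plus
# a scikit-learn random forest -- and assumes rdkit and scikit-learn are
# installed. The function name and hyperparameters are illustrative only, and
# main() does not invoke it.
def fingerprint_baseline_sketch(group, dataset_name='Caco2_Wang'):
    """
    Hedged sketch: 5-seed evaluation of a Morgan-fingerprint random forest.
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from sklearn.ensemble import RandomForestRegressor

    def featurize(smiles_series):
        # 2048-bit Morgan fingerprints (radius 2) from SMILES strings;
        # assumes all SMILES in the benchmark parse cleanly
        fps = []
        for smi in smiles_series:
            mol = Chem.MolFromSmiles(smi)
            fps.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)))
        return np.array(fps)

    benchmark = group.get(dataset_name)
    name, test = benchmark['name'], benchmark['test']
    X_test = featurize(test['Drug'])

    predictions_list = []
    for seed in [1, 2, 3, 4, 5]:
        train, valid = group.get_train_valid_split(
            benchmark=name, split_type='default', seed=seed)
        model = RandomForestRegressor(n_estimators=300, random_state=seed, n_jobs=-1)
        model.fit(featurize(train['Drug']), train['Y'])
        predictions_list.append({name: model.predict(X_test)})

    # Returns e.g. {'caco2_wang': [mean_mae, std_mae]} for this regression task
    return group.evaluate_many(predictions_list)
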
def multi_seed_statistics(predictions_list):
    """
    Example: Analyzing multi-seed prediction statistics
    """
    print("\n" + "=" * 60)
    print("Example 4: Multi-Seed Statistics Analysis")
    print("=" * 60)

    # Stack the per-seed test predictions; the test set is identical across
    # seeds, so the arrays align sample-by-sample
    all_preds = np.array([list(preds.values())[0] for preds in predictions_list])

    print("\nPrediction statistics across 5 seeds:")
    print(f"  Shape: {all_preds.shape}")
    print(f"  Mean prediction: {all_preds.mean():.4f}")
    print(f"  Std across seeds: {all_preds.std(axis=0).mean():.4f}")
    print(f"  Min prediction: {all_preds.min():.4f}")
    print(f"  Max prediction: {all_preds.max():.4f}")

    # Per-sample variability across seeds
    per_sample_std = all_preds.std(axis=0)
    print("\nPer-sample prediction std:")
    print(f"  Mean: {per_sample_std.mean():.4f}")
    print(f"  Median: {np.median(per_sample_std):.4f}")
    print(f"  Max: {per_sample_std.max():.4f}")

def leaderboard_submission_guide():
    """
    Guide for submitting to TDC leaderboards
    """
    print("\n" + "=" * 60)
    print("Example 5: Leaderboard Submission Guide")
    print("=" * 60)

    guide = """
To submit results to TDC leaderboards:

1. Evaluate your model following the 5-seed protocol:
   - Use seeds [1, 2, 3, 4, 5] exactly as provided
   - Do not modify the train/valid/test splits
   - Report mean ± std across all 5 seeds

2. Format your results:
   results = group.evaluate_many(predictions_list)
   # Returns: {'dataset_name': [mean_score, std_score]}

3. Submit to the leaderboard:
   - Visit: https://tdcommons.ai/benchmark/admet_group/
   - Click on your dataset of interest
   - Submit your results with:
     * Model name and description
     * Mean score ± standard deviation
     * Reference to paper/code (if available)

4. Best practices:
   - Report all datasets in the benchmark group
   - Include model hyperparameters
   - Share code for reproducibility
   - Compare against baseline models

5. Evaluation metrics:
   - Metrics are benchmark-specific: ADMET regression tasks use MAE or
     Spearman, classification tasks use AUROC or AUPRC
   - Other benchmark groups may use different metrics
   - Check the benchmark-specific requirements before submitting
"""
    print(guide)

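
# The guide above asks for mean ± std per dataset. The helper below is a small
# optional sketch that flattens the dict returned by group.evaluate_many()
# into a table and saves it as a CSV to keep alongside a submission; the
# function name and file name are arbitrary, and main() does not invoke it.
def save_results_table(results, path='tdc_results.csv'):
    """
    Hedged sketch: write {'dataset': [mean, std]} results to a CSV file.
    """
    rows = [
        {'dataset': dataset, 'mean': mean, 'std': std}
        for dataset, (mean, std) in results.items()
    ]
    df = pd.DataFrame(rows)
    df.to_csv(path, index=False)
    print(df.to_string(index=False))
    return df
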
def main():
    """
    Main function to run all benchmark evaluation examples
    """
    print("\n" + "=" * 60)
    print("TDC Benchmark Group Evaluation Examples")
    print("=" * 60)

    # Load benchmark group
    group = load_benchmark_group()

    # Example 1: Single dataset evaluation
    predictions_list, results = single_dataset_evaluation(group)

    # Example 2: Multiple datasets evaluation
    all_predictions, all_results = multiple_datasets_evaluation(group)

    # Example 3: Custom model template
    custom_model_template()

    # Example 4: Multi-seed statistics
    multi_seed_statistics(predictions_list)

    # Example 5: Leaderboard submission guide
    leaderboard_submission_guide()

    print("\n" + "=" * 60)
    print("Benchmark evaluation examples completed!")
    print("=" * 60)
    print("\nNext steps:")
    print("1. Replace the dummy predictions with your model")
    print("2. Run the full evaluation on all benchmark datasets")
    print("3. Submit your results to the TDC leaderboard")
    print("=" * 60)


if __name__ == "__main__":
    main()