""" PyMC Model Comparison Script Utilities for comparing multiple Bayesian models using information criteria and cross-validation metrics. Usage: from scripts.model_comparison import compare_models, plot_model_comparison # Compare multiple models comparison = compare_models( {'model1': idata1, 'model2': idata2, 'model3': idata3}, ic='loo' ) # Visualize comparison plot_model_comparison(comparison, output_path='model_comparison.png') """ import arviz as az import numpy as np import pandas as pd import matplotlib.pyplot as plt from typing import Dict def compare_models(models_dict: Dict[str, az.InferenceData], ic='loo', scale='deviance', verbose=True): """ Compare multiple models using information criteria. Parameters ---------- models_dict : dict Dictionary mapping model names to InferenceData objects. All models must have log_likelihood computed. ic : str Information criterion to use: 'loo' (default) or 'waic' scale : str Scale for IC: 'deviance' (default), 'log', or 'negative_log' verbose : bool Print detailed comparison results (default: True) Returns ------- pd.DataFrame Comparison DataFrame with model rankings and statistics Notes ----- Models must be fit with idata_kwargs={'log_likelihood': True} or log-likelihood computed afterwards with pm.compute_log_likelihood(). """ if verbose: print("="*70) print(f" " * 25 + f"MODEL COMPARISON ({ic.upper()})") print("="*70) # Perform comparison comparison = az.compare(models_dict, ic=ic, scale=scale) if verbose: print("\nModel Rankings:") print("-"*70) print(comparison.to_string()) print("\n" + "="*70) print("INTERPRETATION GUIDE") print("="*70) print(f"• rank: Model ranking (0 = best)") print(f"• {ic}: {ic.upper()} estimate (lower is better)") print(f"• p_{ic}: Effective number of parameters") print(f"• d{ic}: Difference from best model") print(f"• weight: Model probability (pseudo-BMA)") print(f"• se: Standard error of {ic.upper()}") print(f"• dse: Standard error of the difference") print(f"• warning: True if model has reliability issues") print(f"• scale: {scale}") print("\n" + "="*70) print("MODEL SELECTION GUIDELINES") print("="*70) best_model = comparison.index[0] print(f"\n✓ Best model: {best_model}") # Check for clear winner if len(comparison) > 1: delta = comparison.iloc[1][f'd{ic}'] delta_se = comparison.iloc[1]['dse'] if delta > 10: print(f" → STRONG evidence for {best_model} (Δ{ic} > 10)") elif delta > 4: print(f" → MODERATE evidence for {best_model} (4 < Δ{ic} < 10)") elif delta > 2: print(f" → WEAK evidence for {best_model} (2 < Δ{ic} < 4)") else: print(f" → Models are SIMILAR (Δ{ic} < 2)") print(f" Consider model averaging or choose based on simplicity") # Check if difference is significant relative to SE if delta > 2 * delta_se: print(f" → Difference is > 2 SE, likely reliable") else: print(f" → Difference is < 2 SE, uncertain distinction") # Check for warnings if comparison['warning'].any(): print("\n⚠️ WARNING: Some models have reliability issues") warned_models = comparison[comparison['warning']].index.tolist() print(f" Models with warnings: {', '.join(warned_models)}") print(f" → Check Pareto-k diagnostics with check_loo_reliability()") return comparison def check_loo_reliability(models_dict: Dict[str, az.InferenceData], threshold=0.7, verbose=True): """ Check LOO-CV reliability using Pareto-k diagnostics. 


def check_loo_reliability(models_dict: Dict[str, az.InferenceData],
                          threshold=0.7, verbose=True):
    """
    Check LOO-CV reliability using Pareto-k diagnostics.

    Parameters
    ----------
    models_dict : dict
        Dictionary mapping model names to InferenceData objects
    threshold : float
        Pareto-k threshold for flagging observations (default: 0.7)
    verbose : bool
        Print detailed diagnostics (default: True)

    Returns
    -------
    dict
        Dictionary with Pareto-k diagnostics for each model
    """
    if verbose:
        print("=" * 70)
        print(" " * 20 + "LOO RELIABILITY CHECK")
        print("=" * 70)

    results = {}

    for name, idata in models_dict.items():
        if verbose:
            print(f"\n{name}:")
            print("-" * 70)

        # Compute LOO with pointwise results
        loo_result = az.loo(idata, pointwise=True)
        pareto_k = loo_result.pareto_k.values

        # Count problematic observations
        n_high = (pareto_k > threshold).sum()
        n_very_high = (pareto_k > 1.0).sum()

        results[name] = {
            'pareto_k': pareto_k,
            'n_high': n_high,
            'n_very_high': n_very_high,
            'max_k': pareto_k.max(),
            'loo': loo_result
        }

        if verbose:
            print("Pareto-k diagnostics:")
            print(f"  • Good (k < 0.5): {(pareto_k < 0.5).sum()} observations")
            print(f"  • OK (0.5 ≤ k < 0.7): {((pareto_k >= 0.5) & (pareto_k < 0.7)).sum()} observations")
            print(f"  • Bad (0.7 ≤ k < 1.0): {((pareto_k >= 0.7) & (pareto_k < 1.0)).sum()} observations")
            print(f"  • Very bad (k ≥ 1.0): {(pareto_k >= 1.0).sum()} observations")
            print(f"  • Maximum k: {pareto_k.max():.3f}")

            if n_high > 0:
                print(f"\n⚠️  {n_high} observations with k > {threshold}")
                print("   LOO approximation may be unreliable for these points")
                print("   Solutions:")
                print("   → Use WAIC instead (less sensitive to outliers)")
                print("   → Investigate influential observations")
                print("   → Consider more flexible model")

                if n_very_high > 0:
                    print(f"\n⚠️  {n_very_high} observations with k > 1.0")
                    print("   These points have very high influence")
                    print("   → Strongly consider K-fold CV or other validation")
            else:
                print(f"✓ All Pareto-k values < {threshold}")
                print("  LOO estimates are reliable")

    return results
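

# Hedged follow-up to check_loo_reliability(): plotting the Pareto-k values
# usually makes influential observations easy to spot. The sketch assumes
# `models` is the same dict passed to check_loo_reliability() and reuses the
# pointwise LOO result stored under the 'loo' key; it is illustrative only.
#
#     results = check_loo_reliability(models)
#     for name, diag in results.items():
#         ax = az.plot_khat(diag['loo'])          # one k value per observation
#         ax.set_title(f"Pareto-k diagnostics: {name}")
#         plt.show()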


def plot_model_comparison(comparison, output_path=None, show=True):
    """
    Visualize model comparison results.

    Parameters
    ----------
    comparison : pd.DataFrame
        Comparison DataFrame from az.compare()
    output_path : str, optional
        If provided, save plot to this path
    show : bool
        Whether to display plot (default: True)

    Returns
    -------
    matplotlib.figure.Figure
        The comparison figure
    """
    # Draw on an explicit axis so the figure we save and return is the one
    # az.plot_compare actually draws on
    fig, ax = plt.subplots(figsize=(10, 6))
    az.plot_compare(comparison, ax=ax)
    ax.set_title('Model Comparison', fontsize=14, fontweight='bold')
    fig.tight_layout()

    if output_path:
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"Comparison plot saved to {output_path}")

    if show:
        plt.show()
    else:
        plt.close(fig)

    return fig


def model_averaging(models_dict: Dict[str, az.InferenceData], weights=None,
                    var_name='y_obs', ic='loo'):
    """
    Perform Bayesian model averaging using model weights.

    Parameters
    ----------
    models_dict : dict
        Dictionary mapping model names to InferenceData objects
    weights : array-like, optional
        Model weights. If None, taken from az.compare()
        (stacking weights by default)
    var_name : str
        Name of the predicted variable (default: 'y_obs')
    ic : str
        Information criterion for computing weights if not provided

    Returns
    -------
    np.ndarray
        Averaged predictions across models
    np.ndarray
        Model weights used
    """
    if weights is None:
        comparison = az.compare(models_dict, ic=ic)
        weights = comparison['weight'].values
        model_names = comparison.index.tolist()
    else:
        model_names = list(models_dict.keys())
        weights = np.array(weights)
        weights = weights / weights.sum()  # Normalize

    print("=" * 70)
    print(" " * 22 + "BAYESIAN MODEL AVERAGING")
    print("=" * 70)

    print("\nModel weights:")
    for name, weight in zip(model_names, weights):
        print(f"  {name}: {weight:.4f} ({weight * 100:.2f}%)")

    # Extract predictions, keeping each weight paired with the model it belongs to
    predictions = []
    used_weights = []
    for name, weight in zip(model_names, weights):
        idata = models_dict[name]
        if 'posterior_predictive' in idata:
            predictions.append(idata.posterior_predictive[var_name].values)
            used_weights.append(weight)
        else:
            print(f"Warning: {name} missing posterior_predictive, skipping")

    # Weighted average (re-normalize in case any model was skipped)
    used_weights = np.array(used_weights)
    used_weights = used_weights / used_weights.sum()
    averaged = sum(w * p for w, p in zip(used_weights, predictions))

    print("\n✓ Model averaging complete")
    print(f"  Combined predictions using {len(predictions)} models")

    return averaged, used_weights


def cross_validation_comparison(models_dict: Dict[str, az.InferenceData],
                                k=10, verbose=True):
    """
    Perform k-fold cross-validation comparison (conceptual guide).

    Note: This function provides guidance. Full k-fold CV requires re-fitting
    models k times, which should be done in the main script.

    Parameters
    ----------
    models_dict : dict
        Dictionary of model names to InferenceData
    k : int
        Number of folds (default: 10)
    verbose : bool
        Print guidance

    Returns
    -------
    None
    """
    if verbose:
        print("=" * 70)
        print(" " * 20 + "K-FOLD CROSS-VALIDATION GUIDE")
        print("=" * 70)

        print(f"\nTo perform {k}-fold CV:")
        print("""
    1. Split data into k folds
    2. For each fold:
       - Train all models on k-1 folds
       - Compute log-likelihood on held-out fold
    3. Sum log-likelihoods across folds for each model
    4. Compare models using total CV score

    Example code:
    -------------
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    cv_scores = {name: [] for name in models_dict.keys()}

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        for name in models_dict.keys():
            # Fit model on train set
            with create_model(name, X_train, y_train) as model:
                idata = pm.sample()

            # Compute pointwise log-likelihood on the held-out fold
            with model:
                pm.set_data({'X': X_test, 'y': y_test})
                log_lik = pm.compute_log_likelihood(idata, extend_inferencedata=False)

            # Sum the held-out log-likelihood (averaged over posterior draws)
            cv_scores[name].append(float(log_lik['y_obs'].mean(('chain', 'draw')).sum()))

    # Compare total CV scores
    for name, scores in cv_scores.items():
        print(f"{name}: {np.sum(scores):.2f}")
    """)

        print("\nNote: K-fold CV is expensive but most reliable for model comparison")
        print("      Use when LOO has reliability issues (high Pareto-k values)")
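

# Hedged sketch: once per-fold scores exist (a `cv_scores` dict as built in the
# guide printed by cross_validation_comparison), a simple paired comparison of
# two models is the fold-wise difference with its standard error. The model
# names 'Simple' and 'Complex' are illustrative.
#
#     diffs = np.array(cv_scores['Complex']) - np.array(cv_scores['Simple'])
#     mean_diff = diffs.mean()
#     se_diff = diffs.std(ddof=1) / np.sqrt(len(diffs))
#     print(f"Δ CV score (Complex - Simple): {mean_diff:.2f} ± {se_diff:.2f}")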


# Example usage
if __name__ == '__main__':
    print("This script provides model comparison utilities for PyMC.")
    print("\nExample usage:")
    print("""
    import pymc as pm
    from scripts.model_comparison import (compare_models, check_loo_reliability,
                                          plot_model_comparison, model_averaging)

    # Fit multiple models (must include log_likelihood)
    with pm.Model() as model1:
        # ... define model 1 ...
        idata1 = pm.sample(idata_kwargs={'log_likelihood': True})

    with pm.Model() as model2:
        # ... define model 2 ...
        idata2 = pm.sample(idata_kwargs={'log_likelihood': True})

    # Compare models
    models = {'Simple': idata1, 'Complex': idata2}
    comparison = compare_models(models, ic='loo')

    # Check reliability
    reliability = check_loo_reliability(models)

    # Visualize
    plot_model_comparison(comparison, output_path='comparison.png')

    # Model averaging
    averaged_pred, weights = model_averaging(models, var_name='y_obs')
    """)
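
# Hedged addendum to the printed example above, reusing the same `models` dict;
# the calls are into this module and are shown here rather than executed:
#
#     # Pass explicit weights instead of letting az.compare() derive them
#     averaged_pred, weights = model_averaging(models, weights=[0.3, 0.7],
#                                              var_name='y_obs')
#
#     # Print the k-fold CV guidance (no models are refit)
#     cross_validation_comparison(models, k=10)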