""" Complete classification pipeline example with preprocessing, model training, hyperparameter tuning, and evaluation. """ import numpy as np import pandas as pd from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.impute import SimpleImputer from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.linear_model import LogisticRegression from sklearn.metrics import ( classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score ) import warnings warnings.filterwarnings('ignore') def create_preprocessing_pipeline(numeric_features, categorical_features): """ Create a preprocessing pipeline for mixed data types. Parameters: ----------- numeric_features : list List of numeric feature column names categorical_features : list List of categorical feature column names Returns: -------- ColumnTransformer Preprocessing pipeline """ # Numeric preprocessing numeric_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()) ]) # Categorical preprocessing categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) ]) # Combine transformers preprocessor = ColumnTransformer( transformers=[ ('num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features) ] ) return preprocessor def train_and_evaluate_model(X, y, numeric_features, categorical_features, test_size=0.2, random_state=42): """ Complete pipeline: preprocess, train, tune, and evaluate a classifier. 

    Parameters:
    -----------
    X : DataFrame or array
        Feature matrix
    y : Series or array
        Target variable
    numeric_features : list
        List of numeric feature names
    categorical_features : list
        List of categorical feature names
    test_size : float
        Proportion of data for testing
    random_state : int
        Random seed

    Returns:
    --------
    dict
        Dictionary containing trained model, predictions, and metrics
    """
    # Split data with stratification to preserve class proportions
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print(f"Class distribution in training: {pd.Series(y_train).value_counts().to_dict()}")

    # Create preprocessor
    preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)

    # Define models to compare
    models = {
        'Logistic Regression': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000, random_state=random_state))
        ]),
        'Random Forest': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=random_state))
        ]),
        'Gradient Boosting': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=random_state))
        ])
    }

    # Compare models using cross-validation
    print("\n" + "="*60)
    print("Model Comparison (5-Fold Cross-Validation)")
    print("="*60)

    cv_results = {}
    for name, model in models.items():
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        cv_results[name] = scores.mean()
        print(f"{name:20s}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

    # Select best model based on CV accuracy
    best_model_name = max(cv_results, key=cv_results.get)
    best_model = models[best_model_name]
    print(f"\nBest model: {best_model_name}")

    # Hyperparameter tuning for the best model
    if best_model_name == 'Random Forest':
        param_grid = {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [10, 20, None],
            'classifier__min_samples_split': [2, 5]
        }
    elif best_model_name == 'Gradient Boosting':
        param_grid = {
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 5]
        }
    else:  # Logistic Regression
        param_grid = {
            'classifier__C': [0.1, 1.0, 10.0],
            'classifier__penalty': ['l2']
        }

    print("\n" + "="*60)
    print("Hyperparameter Tuning")
    print("="*60)

    grid_search = GridSearchCV(
        best_model, param_grid, cv=5, scoring='accuracy',
        n_jobs=-1, verbose=0
    )
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

    # Evaluate on test set
    tuned_model = grid_search.best_estimator_
    y_pred = tuned_model.predict(X_test)
    y_pred_proba = tuned_model.predict_proba(X_test)

    print("\n" + "="*60)
    print("Test Set Evaluation")
    print("="*60)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # ROC AUC (if binary classification)
    if len(np.unique(y)) == 2:
        roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
        print(f"ROC AUC: {roc_auc:.4f}")

    print("\n" + "="*60)
    print("Classification Report")
    print("="*60)
    print(classification_report(y_test, y_pred))

    print("\n" + "="*60)
    print("Confusion Matrix")
    print("="*60)
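    # Note: scikit-learn's confusion_matrix puts true labels on rows and
    # predicted labels on columns, i.e. C[i, j] counts samples with true
    # label i that were predicted as label j.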
    print(confusion_matrix(y_test, y_pred))

    # Feature importance (if available)
    if hasattr(tuned_model.named_steps['classifier'], 'feature_importances_'):
        print("\n" + "="*60)
        print("Top 10 Most Important Features")
        print("="*60)

        feature_names = tuned_model.named_steps['preprocessor'].get_feature_names_out()
        importances = tuned_model.named_steps['classifier'].feature_importances_

        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False).head(10)

        print(feature_importance_df.to_string(index=False))

    return {
        'model': tuned_model,
        'y_test': y_test,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'metrics': {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    }


# Example usage
if __name__ == "__main__":
    # Load example dataset
    from sklearn.datasets import load_breast_cancer

    # Load data
    data = load_breast_cancer()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = data.target

    # For demonstration, treat all features as numeric
    numeric_features = X.columns.tolist()
    categorical_features = []

    print("="*60)
    print("Classification Pipeline Example")
    print("Dataset: Breast Cancer Wisconsin")
    print("="*60)

    # Run complete pipeline
    results = train_and_evaluate_model(
        X, y, numeric_features, categorical_features,
        test_size=0.2, random_state=42
    )

    print("\n" + "="*60)
    print("Pipeline Complete!")
    print("="*60)
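
    # --- Optional: persist the tuned pipeline for reuse ---
    # A minimal sketch, assuming joblib is available (it is a scikit-learn
    # dependency). The filename 'tuned_pipeline.joblib' is arbitrary.
    import joblib

    joblib.dump(results['model'], 'tuned_pipeline.joblib')
    reloaded = joblib.load('tuned_pipeline.joblib')

    # The reloaded pipeline reapplies the fitted imputation, scaling, and
    # encoding, so raw feature rows can be passed straight to predict().
    print(f"\nReloaded-pipeline predictions: {reloaded.predict(X.head(5))}")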
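
    # --- Optional: exercising the categorical branch ---
    # The breast-cancer features are all numeric, so the OneHotEncoder path
    # above never runs. A minimal synthetic sketch (random draws, so scores
    # near chance are expected) that pushes a categorical column with missing
    # values through the same pipeline; the column names here are made up
    # for illustration.
    rng = np.random.default_rng(42)
    n = 300
    segment = rng.choice(['a', 'b', 'c'], size=n).astype(object)
    segment[rng.random(n) < 0.1] = np.nan  # inject missing categories

    X_mixed = pd.DataFrame({
        'age': rng.normal(50, 10, n),
        'score': rng.normal(0, 1, n),
        'segment': segment,
    })
    y_mixed = rng.integers(0, 2, n)

    _ = train_and_evaluate_model(
        X_mixed, y_mixed,
        numeric_features=['age', 'score'],
        categorical_features=['segment'],
    )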