# Source listing metadata (residue from the file browser this was exported from):
#   gh-k-dense-ai-claude-scient…/skills/scikit-learn/scripts/classification_pipeline.py
#   Exported 2025-11-30 08:30:10 +08:00 — 258 lines, 7.9 KiB, Python.
"""
Complete classification pipeline example with preprocessing, model training,
hyperparameter tuning, and evaluation.
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
classification_report, confusion_matrix, roc_auc_score,
accuracy_score, precision_score, recall_score, f1_score
)
import warnings
# Suppress all warnings so the printed report below stays readable.
# NOTE(review): this is a blanket, module-wide filter — it also hides
# genuinely useful warnings (e.g. sklearn convergence issues). Consider
# narrowing with category=/module= arguments.
warnings.filterwarnings('ignore')
def create_preprocessing_pipeline(numeric_features, categorical_features):
    """Build a ColumnTransformer that preprocesses mixed-type columns.

    Numeric columns are median-imputed then standardized; categorical
    columns are imputed with the constant string 'missing' and one-hot
    encoded (categories unseen at fit time are ignored at transform time).

    Parameters
    ----------
    numeric_features : list
        Names of the numeric feature columns.
    categorical_features : list
        Names of the categorical feature columns.

    Returns
    -------
    ColumnTransformer
        The combined preprocessing transformer.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])
def _build_candidate_models(preprocessor, random_state):
    """Return the {name: Pipeline} mapping of candidate classifiers to compare."""
    return {
        'Logistic Regression': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000, random_state=random_state))
        ]),
        'Random Forest': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=random_state))
        ]),
        'Gradient Boosting': Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=random_state))
        ])
    }


def _param_grid_for(model_name):
    """Return the GridSearchCV parameter grid for the named candidate model."""
    if model_name == 'Random Forest':
        return {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [10, 20, None],
            'classifier__min_samples_split': [2, 5]
        }
    if model_name == 'Gradient Boosting':
        return {
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 5]
        }
    # Logistic Regression (default)
    return {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l2']
    }


def train_and_evaluate_model(X, y, numeric_features, categorical_features,
                             test_size=0.2, random_state=42):
    """
    Complete pipeline: preprocess, train, tune, and evaluate a classifier.

    Compares three candidate models with 5-fold CV on the training split,
    grid-searches the best one, and reports test-set metrics.

    Parameters
    ----------
    X : DataFrame or array
        Feature matrix.
    y : Series or array
        Target variable.
    numeric_features : list
        Names of the numeric feature columns.
    categorical_features : list
        Names of the categorical feature columns.
    test_size : float
        Proportion of data held out for testing.
    random_state : int
        Random seed for splitting, models, and tuning.

    Returns
    -------
    dict
        Keys: 'model' (tuned Pipeline), 'y_test', 'y_pred', 'y_pred_proba',
        and 'metrics' (accuracy/precision/recall/f1, plus 'roc_auc' when the
        target is binary).
    """
    # Stratified split so class ratios are preserved in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print(f"Class distribution in training: {pd.Series(y_train).value_counts().to_dict()}")

    preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)
    models = _build_candidate_models(preprocessor, random_state)

    # Compare candidates by mean CV accuracy on the training split only,
    # so the test set stays untouched until the final evaluation.
    print("\n" + "="*60)
    print("Model Comparison (5-Fold Cross-Validation)")
    print("="*60)
    cv_results = {}
    for name, model in models.items():
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        cv_results[name] = scores.mean()
        print(f"{name:20s}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

    best_model_name = max(cv_results, key=cv_results.get)
    best_model = models[best_model_name]
    print(f"\nBest model: {best_model_name}")

    # Hyperparameter tuning for the winning candidate.
    param_grid = _param_grid_for(best_model_name)
    print("\n" + "="*60)
    print("Hyperparameter Tuning")
    print("="*60)
    grid_search = GridSearchCV(
        best_model, param_grid, cv=5, scoring='accuracy',
        n_jobs=-1, verbose=0
    )
    grid_search.fit(X_train, y_train)
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

    # Final evaluation on the held-out test set.
    tuned_model = grid_search.best_estimator_
    y_pred = tuned_model.predict(X_test)
    y_pred_proba = tuned_model.predict_proba(X_test)

    print("\n" + "="*60)
    print("Test Set Evaluation")
    print("="*60)
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted')
    }
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-Score: {metrics['f1']:.4f}")

    # ROC AUC only applies to binary targets. Fix: the original printed this
    # value but omitted it from the returned metrics dict; it is now included
    # (backward-compatible extra key).
    if len(np.unique(y)) == 2:
        metrics['roc_auc'] = roc_auc_score(y_test, y_pred_proba[:, 1])
        print(f"ROC AUC: {metrics['roc_auc']:.4f}")

    print("\n" + "="*60)
    print("Classification Report")
    print("="*60)
    print(classification_report(y_test, y_pred))
    print("\n" + "="*60)
    print("Confusion Matrix")
    print("="*60)
    print(confusion_matrix(y_test, y_pred))

    # Feature importances exist only for the tree-based classifiers.
    if hasattr(tuned_model.named_steps['classifier'], 'feature_importances_'):
        print("\n" + "="*60)
        print("Top 10 Most Important Features")
        print("="*60)
        feature_names = tuned_model.named_steps['preprocessor'].get_feature_names_out()
        importances = tuned_model.named_steps['classifier'].feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False).head(10)
        print(feature_importance_df.to_string(index=False))

    return {
        'model': tuned_model,
        'y_test': y_test,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'metrics': metrics
    }
# Example usage
if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer

    # Demo dataset: Breast Cancer Wisconsin (binary target, numeric features).
    data = load_breast_cancer()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = data.target

    # Every column in this dataset is numeric; nothing to one-hot encode.
    numeric_features = list(X.columns)
    categorical_features = []

    print("="*60)
    print("Classification Pipeline Example")
    print("Dataset: Breast Cancer Wisconsin")
    print("="*60)

    # Run the full compare -> tune -> evaluate pipeline.
    results = train_and_evaluate_model(
        X, y, numeric_features, categorical_features,
        test_size=0.2, random_state=42
    )

    print("\n" + "="*60)
    print("Pipeline Complete!")
    print("="*60)