# -*- coding: utf-8 -*-
"""
Feature Engineering Toolkit Template.

This notebook provides a structured framework for feature engineering.
Follow the instructions and placeholders to create, select, and transform
features effectively.

Author: Claude
Date: [Date]
"""

# %% [markdown]
# # 1. Setup and Data Loading
#
# Import necessary libraries and load the dataset.

# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif
# Add more libraries as needed (e.g., feature-engine, category_encoders)

# %%
# Load your dataset
DATA_PATH = "[Path to your data file]"
try:
    df = pd.read_csv(DATA_PATH)
    print("Data loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the file path.")
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    raise

# %% [markdown]
# # 2. Data Exploration and Preprocessing
#
# Understand your data and perform initial cleaning and preprocessing steps.

# %%
# Basic data exploration
print(df.info())
print(df.describe())

# %%
# Handle missing values (choose an appropriate strategy: mean, median, mode, or removal)
# Example (median imputation; plain assignment is preferred over the deprecated
# chained-inplace pattern):
# df['column_with_missing'] = df['column_with_missing'].fillna(df['column_with_missing'].median())
# Note: computing imputation statistics on the full dataset leaks information from
# the test rows; for a stricter workflow, fit imputers on the training split only.
# Implement your missing value strategy here

# %%
# Handle categorical variables (one-hot encoding, label encoding, etc.)
# Example:
# df = pd.get_dummies(df, columns=['categorical_column'], drop_first=True)
# Implement your categorical encoding strategy here

# %%
# Split data into training and testing sets
TARGET_VARIABLE = "[Your target variable name]"
FEATURES = [col for col in df.columns if col != TARGET_VARIABLE]  # All columns except the target
X = df[FEATURES]
y = df[TARGET_VARIABLE]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)  # Adjust test_size and random_state as needed; consider stratify=y for classification
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# %% [markdown]
# # 3. Feature Creation
#
# Generate new features based on existing ones.

# %%
# Create new features (e.g., polynomial features, interaction terms, domain-specific features)
# Example:
# df['new_feature'] = df['feature1'] * df['feature2']
# Note: features created on df after the split will not appear in X_train/X_test;
# apply the same logic to both splits instead (two illustrative sketches follow).
# Implement your feature creation logic here
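# %% [markdown]
# The cell below is a minimal sketch of common feature-creation patterns: an
# interaction term, a ratio, and equal-width binning. The column names
# `feature1` and `feature2` are placeholders carried over from the example
# above, not columns guaranteed to exist in your data, so the cell is guarded
# and only runs if both are present.

# %%
if {'feature1', 'feature2'}.issubset(X_train.columns):
    for split in (X_train, X_test):
        # Interaction term: captures multiplicative effects between two features
        split['feat1_x_feat2'] = split['feature1'] * split['feature2']
        # Ratio feature, with a small constant to guard against division by zero
        split['feat1_per_feat2'] = split['feature1'] / (split['feature2'] + 1e-9)
    # Equal-width bins: fit the bin edges on the training split only, then
    # apply them to the test split (out-of-range test values become NaN)
    _, bin_edges = pd.cut(X_train['feature1'], bins=5, retbins=True)
    X_train['feature1_bin'] = pd.cut(X_train['feature1'], bins=bin_edges, labels=False, include_lowest=True)
    X_test['feature1_bin'] = pd.cut(X_test['feature1'], bins=bin_edges, labels=False, include_lowest=True)
    print("Created interaction, ratio, and binned features.")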
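# %% [markdown]
# Datetime columns are another common source of derived features. The sketch
# below assumes a hypothetical `timestamp` column; it is skipped if your data
# has none. Adapt the column name and the extracted components to your domain.

# %%
if 'timestamp' in X_train.columns:
    for split in (X_train, X_test):
        ts = pd.to_datetime(split['timestamp'])
        split['hour'] = ts.dt.hour            # time-of-day effects
        split['dayofweek'] = ts.dt.dayofweek  # weekly seasonality
        split['month'] = ts.dt.month          # annual seasonality
        split.drop(columns=['timestamp'], inplace=True)  # drop the raw datetime before modeling
    print("Created datetime features.")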
# %% [markdown]
# # 4. Feature Transformation
#
# Transform features to improve model performance (e.g., scaling, normalization, power transforms).

# %%
# Scaling numerical features
numerical_features = X_train.select_dtypes(include='number').columns.tolist()  # Select numerical features
scaler = StandardScaler()  # or MinMaxScaler, RobustScaler, etc.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])
print("Features scaled.")

# %%
# Apply power transforms (e.g., Yeo-Johnson, Box-Cox) to address skewness
# Note: Box-Cox requires strictly positive values; Yeo-Johnson also handles zeros and negatives.
# Example:
# from sklearn.preprocessing import PowerTransformer
# pt = PowerTransformer(method='yeo-johnson')
# X_train['feature_to_transform'] = pt.fit_transform(X_train[['feature_to_transform']])
# X_test['feature_to_transform'] = pt.transform(X_test[['feature_to_transform']])
# Implement your power transform logic here

# %%
# Apply QuantileTransformer for non-linear transformations
# Example:
# quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
# X_train['feature_to_transform'] = quantile_transformer.fit_transform(X_train[['feature_to_transform']])
# X_test['feature_to_transform'] = quantile_transformer.transform(X_test[['feature_to_transform']])
# Implement your quantile transform logic here

# %% [markdown]
# # 5. Feature Selection
#
# Select the most relevant features to reduce dimensionality and improve model performance.

# %%
# Univariate feature selection (e.g., SelectKBest, SelectPercentile)
# Example:
selector = SelectKBest(score_func=f_classif, k=10)  # f_classif assumes a classification
# target (use f_regression for regression); k must not exceed the number of features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
selected_features = X_train.columns[selector.get_support()].tolist()
print("Selected features:", selected_features)

# %%
# Implement other feature selection techniques (e.g., Recursive Feature Elimination, SelectFromModel)
# Example:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# estimator = LogisticRegression(solver='liblinear')
# selector = RFE(estimator, n_features_to_select=5, step=1)
# selector = selector.fit(X_train, y_train)
# selected_features = X_train.columns[selector.support_]

# %% [markdown]
# # 6. Feature Encoding (Advanced)

# %%
# Feature encoding using techniques like target encoding, weight of evidence, or embeddings.
# Requires installing feature-engine or category_encoders.
# Example:
# from feature_engine.encoding import MeanEncoder
# encoder = MeanEncoder(variables=['categorical_column'])
# encoder.fit(X_train, y_train)
# X_train_encoded = encoder.transform(X_train)
# X_test_encoded = encoder.transform(X_test)
# Implement advanced feature encoding techniques here

# %% [markdown]
# # 7. Evaluation and Refinement
#
# Evaluate the impact of feature engineering on model performance and refine the process.

# %%
# Train a model with and without feature engineering to compare performance
# Example:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# model = LogisticRegression()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy without feature engineering:", accuracy)
# model_engineered = LogisticRegression()
# model_engineered.fit(X_train_selected, y_train)  # Or X_train_encoded, depending on your selection
# y_pred_engineered = model_engineered.predict(X_test_selected)  # Or X_test_encoded
# accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
# print("Accuracy with feature engineering:", accuracy_engineered)
# Implement your model training and evaluation logic here (a leakage-safe
# pipeline sketch follows below)
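# %% [markdown]
# The cell below is a minimal sketch of a leakage-safe alternative: scaling,
# selection, and the model are wrapped in a single scikit-learn `Pipeline`, so
# cross-validation re-fits every step on each training fold. It assumes a
# classification target and that all remaining columns are numeric after
# encoding; swap the estimator and scoring for regression.

# %%
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# All preprocessing lives inside the pipeline, so no information from a
# held-out fold leaks into the fitted scaler or selector.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('select', SelectKBest(score_func=f_classif, k=min(10, X_train.shape[1]))),
    ('model', LogisticRegression(max_iter=1000)),
])
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validated accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")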
# %% [markdown]
# # 8. Conclusion
#
# Summarize the results of feature engineering and discuss potential improvements.

# %%
# Summarize the findings
# Discuss the impact of each feature engineering step
# Suggest further improvements and next steps
print("Feature engineering process completed.")
print("Further improvements can be made by exploring different feature combinations and model architectures.")