# -*- coding: utf-8 -*-
"""
Feature Engineering Toolkit Template.

This notebook provides a structured framework for feature engineering.
Follow the instructions and placeholders to create, select, and transform features effectively.

Author: Claude
Date: [Date]
"""

# %% [markdown]
# # 1. Setup and Data Loading
#
# Import necessary libraries and load the dataset.

# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif
# Add more libraries as needed (e.g., feature-engine, category_encoders)

# %%
# Load your dataset
DATA_PATH = "[Path to your data file]"

try:
    df = pd.read_csv(DATA_PATH)
    print("Data loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the file path.")
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    raise

# %% [markdown]
# # 2. Data Exploration and Preprocessing
#
# Understand your data and perform initial cleaning and preprocessing steps.

# %%
# Basic data exploration
df.info()  # info() writes its summary to stdout and returns None, so no print() wrapper is needed
print(df.describe())

# %%
# Handle missing values (choose a strategy per column: mean, median, mode, or removal)
# Example (assignment form; fillna with inplace=True on a column triggers
# chained-assignment warnings in recent pandas):
# df['column_with_missing'] = df['column_with_missing'].fillna(df['column_with_missing'].mean())
# Implement your missing value strategy here

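# %%
# Optional illustration (a minimal sketch): median imputation with scikit-learn's
# SimpleImputer, shown on a tiny synthetic frame so the cell runs standalone.
# The column name 'demo_value' is a hypothetical placeholder, not from your data.
from sklearn.impute import SimpleImputer

_demo = pd.DataFrame({"demo_value": [1.0, np.nan, 3.0, np.nan, 5.0]})
_imputer = SimpleImputer(strategy="median")  # median is robust to outliers
_demo[["demo_value"]] = _imputer.fit_transform(_demo[["demo_value"]])
print(_demo)
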
# %%
# Handle categorical variables (one-hot encoding, label encoding, etc.)
# Example:
# df = pd.get_dummies(df, columns=['categorical_column'], drop_first=True)
# Implement your categorical encoding strategy here

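# %%
# Optional illustration (a minimal sketch): one-hot encoding on a tiny synthetic frame,
# mirroring the commented pd.get_dummies example above. 'demo_color' is hypothetical.
_demo = pd.DataFrame({"demo_color": ["red", "green", "blue", "green"]})
_encoded = pd.get_dummies(_demo, columns=["demo_color"], drop_first=True)  # drop_first avoids a redundant column
print(_encoded)
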
# %%
# Split data into training and testing sets
TARGET_VARIABLE = "[Your target variable name]"
FEATURES = [col for col in df.columns if col != TARGET_VARIABLE]  # all columns except the target

X = df[FEATURES]
y = df[TARGET_VARIABLE]

# Adjust test_size and random_state as needed; for classification, consider
# stratify=y to preserve class proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# %% [markdown]
# # 3. Feature Creation
#
# Generate new features based on existing ones.

# %%
# Create new features (e.g., polynomial features, interaction terms, domain-specific features)
# Example:
# df['new_feature'] = df['feature1'] * df['feature2']
# Implement your feature creation logic here

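# %%
# Optional illustration (a minimal sketch): degree-2 polynomial and interaction features via
# scikit-learn's PolynomialFeatures, on a tiny synthetic frame. Column names are hypothetical.
from sklearn.preprocessing import PolynomialFeatures

_demo = pd.DataFrame({"f1": [1.0, 2.0, 3.0], "f2": [4.0, 5.0, 6.0]})
_poly = PolynomialFeatures(degree=2, include_bias=False)
_expanded = pd.DataFrame(
    _poly.fit_transform(_demo),
    columns=_poly.get_feature_names_out(_demo.columns),  # 'f1', 'f2', 'f1^2', 'f1 f2', 'f2^2'
)
print(_expanded)
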
# %% [markdown]
# # 4. Feature Transformation
#
# Transform features to improve model performance (e.g., scaling, normalization, power transforms).

# %%
# Scale numerical features (fit on the training set only to avoid data leakage)
numerical_features = X_train.select_dtypes(include=["number"]).columns.tolist()

scaler = StandardScaler()  # or MinMaxScaler, RobustScaler, etc.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

print("Features scaled.")

# %%
# Apply power transforms (e.g., Yeo-Johnson, Box-Cox) to reduce skewness.
# Note: Box-Cox requires strictly positive values; Yeo-Johnson handles zeros and negatives.
# Example:
# from sklearn.preprocessing import PowerTransformer
# pt = PowerTransformer(method='yeo-johnson')
# X_train['feature_to_transform'] = pt.fit_transform(X_train[['feature_to_transform']])
# X_test['feature_to_transform'] = pt.transform(X_test[['feature_to_transform']])

# Implement your power transform logic here

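# %%
# Optional illustration (a minimal sketch): Yeo-Johnson applied to synthetic right-skewed
# data, printing the skew before and after. A standalone demo; no project data is assumed.
from sklearn.preprocessing import PowerTransformer

_rng = np.random.default_rng(0)
_skewed = pd.Series(_rng.exponential(scale=2.0, size=500), name="demo_skewed")
_pt = PowerTransformer(method="yeo-johnson")
_transformed = pd.Series(_pt.fit_transform(_skewed.to_frame()).ravel())
print("Skew before:", round(_skewed.skew(), 2))
print("Skew after:", round(_transformed.skew(), 2))
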
# %%
# Apply QuantileTransformer for non-linear transformations (maps a feature onto a uniform
# or normal distribution; robust to outliers)
# Example:
# quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
# X_train['feature_to_transform'] = quantile_transformer.fit_transform(X_train[['feature_to_transform']])
# X_test['feature_to_transform'] = quantile_transformer.transform(X_test[['feature_to_transform']])

# Implement your quantile transform logic here

# %% [markdown]
# # 5. Feature Selection
#
# Select the most relevant features to reduce dimensionality and improve model performance.

# %%
# Univariate feature selection (e.g., SelectKBest, SelectPercentile)
# f_classif assumes a classification target; use f_regression for regression tasks.
selector = SelectKBest(score_func=f_classif, k=10)  # choose an appropriate scoring function and k
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

selected_feature_indices = selector.get_support(indices=True)
selected_features = [X_train.columns[i] for i in selected_feature_indices]

print("Selected features:", selected_features)

# %%
# Implement other feature selection techniques (e.g., Recursive Feature Elimination, SelectFromModel)
# Example:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# estimator = LogisticRegression(solver='liblinear')
# selector = RFE(estimator, n_features_to_select=5, step=1)
# selector = selector.fit(X_train, y_train)
# selected_features = X_train.columns[selector.support_]

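# %%
# Optional illustration (a minimal sketch): model-based selection with SelectFromModel and a
# random forest, on synthetic data so the cell runs standalone. All names are hypothetical.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

_X_demo, _y_demo = make_classification(n_samples=200, n_features=8, n_informative=3, random_state=0)
_sfm = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0), threshold="median")
_sfm.fit(_X_demo, _y_demo)
print("Kept feature mask:", _sfm.get_support())
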
# %% [markdown]
# # 6. Feature Encoding (Advanced)
#
# Encode categorical variables with target-aware techniques such as target (mean) encoding
# or weight of evidence.

# %%
# Feature encoding using techniques like target encoding, weight of evidence, or embeddings.
# Requires installing feature-engine or category_encoders.
# Caution: target-based encoders can leak the target; always fit them on training data only.
# Example:
# from feature_engine.encoding import MeanEncoder
# encoder = MeanEncoder(variables=['categorical_column'])
# encoder.fit(X_train, y_train)
# X_train_encoded = encoder.transform(X_train)
# X_test_encoded = encoder.transform(X_test)

# Implement advanced feature encoding techniques here

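# %%
# Optional illustration (a minimal sketch): hand-rolled target (mean) encoding on a tiny
# synthetic frame, computing category means on "training" rows only and mapping them onto
# "test" rows. Library encoders (MeanEncoder, category_encoders) add smoothing on top of
# this idea. All column names here are hypothetical.
_train_demo = pd.DataFrame({"city": ["a", "a", "b", "b", "c"], "target": [1, 0, 1, 1, 0]})
_test_demo = pd.DataFrame({"city": ["a", "b", "c", "d"]})
_means = _train_demo.groupby("city")["target"].mean()
_global_mean = _train_demo["target"].mean()  # fallback for categories unseen in training ('d')
_test_demo["city_encoded"] = _test_demo["city"].map(_means).fillna(_global_mean)
print(_test_demo)
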
# %% [markdown]
# # 7. Evaluation and Refinement
#
# Evaluate the impact of feature engineering on model performance and refine the process.

# %%
# Train a model with and without the engineered features to compare performance
# Example:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# model = LogisticRegression()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy without feature selection:", accuracy)

# model_engineered = LogisticRegression()
# model_engineered.fit(X_train_selected, y_train)  # or X_train_encoded, depending on your pipeline
# y_pred_engineered = model_engineered.predict(X_test_selected)  # or X_test_encoded
# accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
# print("Accuracy with feature selection:", accuracy_engineered)

# Implement your model training and evaluation logic here

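# %%
# Optional illustration (a minimal sketch): a cross-validated evaluation on synthetic data so
# the cell runs standalone. On real data, substitute X_train vs. X_train_selected from above.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

_X_demo, _y_demo = make_classification(n_samples=300, n_features=10, n_informative=4, random_state=0)
_scores = cross_val_score(LogisticRegression(max_iter=1000), _X_demo, _y_demo, cv=5)
print("Mean CV accuracy:", round(_scores.mean(), 3))
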
# %% [markdown]
# # 8. Conclusion
#
# Summarize the results of feature engineering and discuss potential improvements.

# %%
# Summarize the findings
# Discuss the impact of each feature engineering step
# Suggest further improvements and next steps
print("Feature engineering process completed.")
print("Further improvements can be made by exploring different feature combinations and model architectures.")