# -*- coding: utf-8 -*-
"""
Feature Engineering Toolkit Template.
This notebook provides a structured framework for feature engineering.
Follow the instructions and placeholders to create, select, and transform features effectively.
Author: Claude
Date: [Date]
"""
# %% [markdown]
# # 1. Setup and Data Loading
#
# Import necessary libraries and load the dataset.
# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif
# Add more libraries as needed (e.g., feature-engine, category_encoders)
# %%
# Load your dataset
DATA_PATH = "[Path to your data file]"
try:
    df = pd.read_csv(DATA_PATH)
    print("Data loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the file path.")
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    raise
# %% [markdown]
# # 2. Data Exploration and Preprocessing
#
# Understand your data and perform initial cleaning and preprocessing steps.
# %%
# Basic data exploration
print(df.info())
print(df.describe())
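# Missing-value counts per column (informs the strategy in the next cells)
print(df.isna().sum())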
# %%
# Handle missing values (choose an appropriate strategy per column: mean, median, mode, or removal)
# Example:
# df['column_with_missing'] = df['column_with_missing'].fillna(df['column_with_missing'].mean())
# Implement your missing value strategy here; a runnable sketch follows this cell.
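# %%
# A minimal runnable sketch, assuming numeric columns take the median and
# non-numeric columns take the mode; both are common defaults, not a
# universal recommendation. Adjust per column as your data requires.
for col in df.columns:
    if df[col].isna().any():
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())
        else:
            # Columns that are entirely NaN have no mode and should be dropped instead.
            df[col] = df[col].fillna(df[col].mode().iloc[0])
print("Remaining missing values:", df.isna().sum().sum())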
# %%
# Handle categorical variables (one-hot encoding, label encoding, etc.)
# Example:
# df = pd.get_dummies(df, columns=['categorical_column'], drop_first=True)
# Implement your categorical encoding strategy here
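# %%
# A minimal runnable sketch, assuming low-cardinality categoricals: one-hot
# encode every object/category column at once. High-cardinality columns are
# usually better served by the advanced encoders in section 6.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
# If your target column is categorical, exclude it from this list before encoding.
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print("Columns after encoding:", df.shape[1])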
# %%
# Split data into training and testing sets
TARGET_VARIABLE = "[Your target variable name]"
FEATURES = [col for col in df.columns if col != TARGET_VARIABLE] # Select all columns except the target as features
X = df[FEATURES]
y = df[TARGET_VARIABLE]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Adjust test_size and random_state as needed
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
# %% [markdown]
# # 3. Feature Creation
#
# Generate new features based on existing ones.
# %%
# Create new features (e.g., polynomial features, interaction terms, domain-specific features).
# Since the train/test split has already happened, apply the same logic to both splits.
# Example:
# X_train['new_feature'] = X_train['feature1'] * X_train['feature2']
# X_test['new_feature'] = X_test['feature1'] * X_test['feature2']
# Implement your feature creation logic here; a runnable sketch follows.
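# %%
# A minimal runnable sketch: pairwise interaction terms for all numeric
# columns via sklearn's PolynomialFeatures (interaction_only drops squared
# terms). This can explode the feature count, so prune it in section 5.
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
num_cols = X_train.select_dtypes(include='number').columns
X_train_poly = pd.DataFrame(
    poly.fit_transform(X_train[num_cols]),
    columns=poly.get_feature_names_out(num_cols),
    index=X_train.index,
)
X_test_poly = pd.DataFrame(
    poly.transform(X_test[num_cols]),
    columns=poly.get_feature_names_out(num_cols),
    index=X_test.index,
)
print("Interaction features created:", X_train_poly.shape[1])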
# %% [markdown]
# # 4. Feature Transformation
#
# Transform features to improve model performance (e.g., scaling, normalization, power transforms).
# %%
# Scaling numerical features
numerical_features = X_train.select_dtypes(include='number').columns.tolist()  # covers all numeric dtypes, not just int64/float64
scaler = StandardScaler() # or MinMaxScaler, RobustScaler, etc.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])
print("Features scaled.")
# %%
# Apply power transforms (e.g., Yeo-Johnson, Box-Cox) to address skewness
# Example:
# from sklearn.preprocessing import PowerTransformer
# pt = PowerTransformer(method='yeo-johnson')
# X_train['feature_to_transform'] = pt.fit_transform(X_train[['feature_to_transform']])
# X_test['feature_to_transform'] = pt.transform(X_test[['feature_to_transform']])
# Implement your power transform logic here
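# %%
# A minimal runnable sketch, assuming a skewness threshold of 1.0 (an
# arbitrary cutoff; tune it for your data): apply Yeo-Johnson to every
# numeric column that exceeds it, fitting on the training split only.
from sklearn.preprocessing import PowerTransformer

skewed = [c for c in X_train.select_dtypes(include='number').columns
          if abs(X_train[c].skew()) > 1.0]
if skewed:
    pt = PowerTransformer(method='yeo-johnson')
    X_train[skewed] = pt.fit_transform(X_train[skewed])
    X_test[skewed] = pt.transform(X_test[skewed])
print("Columns power-transformed:", skewed)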
# %%
# Apply QuantileTransformer for non-linear transformations
# Example:
# quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
# X_train['feature_to_transform'] = quantile_transformer.fit_transform(X_train[['feature_to_transform']])
# X_test['feature_to_transform'] = quantile_transformer.transform(X_test[['feature_to_transform']])
# Implement your quantile transform logic here
# %% [markdown]
# # 5. Feature Selection
#
# Select the most relevant features to reduce dimensionality and improve model performance.
# %%
# Univariate feature selection (e.g., SelectKBest, SelectPercentile)
# Example:
k = min(10, X_train.shape[1])  # SelectKBest raises an error if k exceeds the feature count
selector = SelectKBest(score_func=f_classif, k=k)  # f_classif assumes a classification target
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
selected_features = X_train.columns[selector.get_support()].tolist()
print("Selected features:", selected_features)
# %%
# Implement other feature selection techniques (e.g., Recursive Feature Elimination, SelectFromModel)
# Example:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# estimator = LogisticRegression(solver='liblinear')
# selector = RFE(estimator, n_features_to_select=5, step=1)
# selector = selector.fit(X_train, y_train)
# selected_features = X_train.columns[selector.support_]
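# %%
# A minimal runnable sketch of a model-based alternative: SelectFromModel
# with a random forest. threshold='median' keeps the top half of features
# by importance, which is an assumption, not a universal default.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='median',
)
sfm.fit(X_train, y_train)
print("SelectFromModel kept:", X_train.columns[sfm.get_support()].tolist())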
# %% [markdown]
# # 6. Feature Encoding (Advanced)
# %%
# Feature Encoding using techniques like target encoding, weight of evidence, or embeddings.
# Requires installing feature-engine or category_encoders
# Example:
# from feature_engine.encoding import MeanEncoder
# encoder = MeanEncoder(variables=['categorical_column'])
# encoder.fit(X_train, y_train)
# X_train_encoded = encoder.transform(X_train)
# X_test_encoded = encoder.transform(X_test)
# Implement advanced feature encoding techniques here
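# %%
# A hedged sketch using category_encoders (pip install category_encoders);
# 'categorical_column' is a placeholder name. TargetEncoder replaces each
# category with a smoothed mean of the target, fit on the training split only.
# from category_encoders import TargetEncoder
# te = TargetEncoder(cols=['categorical_column'], smoothing=1.0)
# X_train_encoded = te.fit_transform(X_train, y_train)
# X_test_encoded = te.transform(X_test)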
# %% [markdown]
# # 7. Evaluation and Refinement
#
# Evaluate the impact of feature engineering on model performance and refine the process.
# %%
# Train a model on the baseline feature set and on the engineered/selected set to compare performance
# Example:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy with baseline features:", accuracy)
# model_engineered = LogisticRegression(max_iter=1000)
# model_engineered.fit(X_train_selected, y_train)  # or X_train_encoded, depending on your pipeline
# y_pred_engineered = model_engineered.predict(X_test_selected)  # or X_test_encoded
# accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
# print("Accuracy with engineered features:", accuracy_engineered)
# Implement your model training and evaluation logic here; a runnable sketch follows.
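# %%
# A minimal runnable sketch, assuming a classification target: 5-fold
# cross-validation on the selected feature set from section 5. Swap the
# model and the scoring metric to suit your problem.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    LogisticRegression(max_iter=1000),
    X_train_selected, y_train, cv=5, scoring='accuracy',
)
print(f"CV accuracy on selected features: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")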
# %% [markdown]
# # 8. Conclusion
#
# Summarize the results of feature engineering and discuss potential improvements.
# %%
# Summarize the findings
# Discuss the impact of each feature engineering step
# Suggest further improvements and next steps
print("Feature engineering process completed.")
print("Further improvements can be made by exploring different feature combinations and model architectures.")