Initial commit

Zhongwei Li
2025-11-29 18:51:19 +08:00
commit d595b82716
11 changed files with 557 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
# Assets
Bundled resources for the feature-engineering-toolkit skill
- [ ] feature_engineering_template.ipynb: A Jupyter Notebook template for feature engineering, providing a structured framework for Claude to follow.
- [ ] example_dataset.csv: A sample dataset that Claude can use to test and demonstrate feature engineering techniques.
- [ ] configuration_template.yaml: A YAML template for configuring the feature engineering pipeline, allowing users to customize the process.

View File: configuration_template.yaml

@@ -0,0 +1,129 @@
# Configuration template for the feature engineering toolkit plugin
# This file allows you to customize the feature engineering pipeline.

# --- Data Input ---
data_input:
  # Path to the input data file (CSV, Parquet, etc.)
  # Supported formats: csv, parquet
  data_path: "data/input.csv" # REPLACE_ME: Path to your input data
  # File type of the input data
  file_type: "csv"
  # Separator for CSV files (e.g., comma, tab)
  csv_separator: ","
  # Quote character for CSV files
  csv_quotechar: '"'
  # Encoding of the input file
  encoding: "utf-8"
  # Target variable name
  target_variable: "target" # REPLACE_ME: Name of your target variable
  # Index column (optional)
  index_column: null # Set to a column name if you have an index column

# --- Feature Creation ---
feature_creation:
  # Enable or disable automated feature creation
  enabled: true
  # List of feature creation techniques to apply
  # Available techniques: polynomial_features, interaction_terms, aggregation_features, datetime_features
  techniques:
    - "polynomial_features"
    - "interaction_terms"
    - "datetime_features"
  # Polynomial Features configuration
  polynomial_features:
    degree: 2
    include_bias: false
  # Interaction Terms configuration
  interaction_terms:
    interaction_degree: 2
  # Aggregation Features configuration
  aggregation_features:
    # Group by column for aggregation (e.g., user_id, product_id)
    group_by: "user_id" # REPLACE_ME: Column to group by for aggregation
    # Aggregation functions to apply (e.g., mean, sum, count)
    aggregations:
      - "mean"
      - "sum"
  # Datetime Features configuration
  datetime_features:
    # List of datetime columns to extract features from
    datetime_columns:
      - "date" # REPLACE_ME: Name of your datetime column
    # List of datetime features to extract (e.g., year, month, day, dayofweek)
    features_to_extract:
      - "year"
      - "month"
      - "dayofweek"

# --- Feature Selection ---
feature_selection:
  # Enable or disable feature selection
  enabled: true
  # Selection method
  # Available methods: variance_threshold, select_k_best, recursive_feature_elimination
  method: "select_k_best"
  # Variance Threshold configuration
  variance_threshold:
    threshold: 0.0
  # Select K Best configuration
  select_k_best:
    k: 10 # REPLACE_ME: Number of top features to select
    score_func: "f_classif" # Or "f_regression"
  # Recursive Feature Elimination configuration
  recursive_feature_elimination:
    n_features_to_select: 10
    step: 1

# --- Feature Transformation ---
feature_transformation:
  # Enable or disable feature transformation
  enabled: true
  # Transformation method
  # Available methods: standard_scaler, min_max_scaler, robust_scaler, power_transformer, quantile_transformer
  method: "standard_scaler"
  # Standard Scaler configuration (no specific parameters)
  standard_scaler: {}
  # Min-Max Scaler configuration
  min_max_scaler:
    feature_range: [0, 1]
  # Robust Scaler configuration
  robust_scaler:
    quantile_range: [25.0, 75.0]
  # Power Transformer configuration
  power_transformer:
    method: "yeo-johnson"
    standardize: true
  # Quantile Transformer configuration
  quantile_transformer:
    n_quantiles: 100
    output_distribution: "uniform"

# --- Output ---
output:
  # Path to save the processed data
  output_path: "data/processed.csv" # REPLACE_ME: Path to save the processed data
  # Save format for the output data
  output_format: "csv"
  # Save the feature engineering report
  save_report: true
  # Path to save the feature engineering report
  report_path: "reports/feature_engineering_report.html"
  # Save the trained model (if applicable, e.g., for feature importance selection)
  save_model: false
  # Path to save the trained model
  model_path: "models/feature_selection_model.pkl"

# --- Advanced Options ---
advanced:
  # Number of CPU cores to use for parallel processing (-1 uses all available cores)
  n_jobs: -1
  # Random seed for reproducibility
  random_state: 42
  # Verbosity level (0: silent, 1: info, 2: debug)
  verbosity: 1
  # Handle missing values (imputation)
  handle_missing: true
  # Imputation strategy (mean, median, most_frequent, constant)
  imputation_strategy: "mean"
  # Constant value for imputation (used when imputation_strategy is "constant")
  imputation_constant: 0
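
The template above is plain YAML, so any YAML parser can consume it. Below is a minimal loading sketch, assuming PyYAML is installed and a filled-in copy is saved as `config.yaml`; the `load_config` helper and its checks are illustrative, not part of the toolkit itself.

```python
# Illustrative config loader (assumes PyYAML: pip install pyyaml).
import yaml

REQUIRED_SECTIONS = (
    "data_input", "feature_creation", "feature_selection",
    "feature_transformation", "output",
)

def load_config(path: str = "config.yaml") -> dict:
    """Load the pipeline configuration and verify the expected sections exist."""
    with open(path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # Fail fast if a top-level section was accidentally deleted or misindented.
    missing = [section for section in REQUIRED_SECTIONS if section not in config]
    if missing:
        raise KeyError(f"Missing configuration sections: {missing}")
    return config

if __name__ == "__main__":
    cfg = load_config()
    print("Target variable:", cfg["data_input"]["target_variable"])
```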

View File: example_dataset.csv

@@ -0,0 +1,50 @@
# example_dataset.csv
# This CSV file provides a sample dataset for demonstrating feature engineering techniques within the feature-engineering-toolkit plugin.
#
# Column Descriptions:
# - user_id: Unique identifier for each user (integer).
# - age: Age of the user (integer).
# - gender: Gender of the user (categorical: Male, Female, Other).
# - signup_date: Date the user signed up (YYYY-MM-DD).
# - last_login: Date of the user's last login (YYYY-MM-DD).
# - total_purchases: Total number of purchases made by the user (integer).
# - avg_purchase_value: Average value of each purchase (float).
# - country: Country of the user (categorical).
# - marketing_channel: The marketing channel through which the user signed up (categorical).
# - is_active: Indicates whether the user is currently active (boolean: True, False).
# - churned: Target variable indicating whether the user churned (boolean: True, False). This is what we want to predict.
#
# Instructions:
# - Use this dataset to experiment with feature engineering techniques.
# - Consider creating new features such as:
# - Time since signup (calculated from signup_date).
# - Time since last login (calculated from last_login).
# - Purchase frequency (total_purchases / time since signup).
# - Age groups (binning the age variable).
# - Interactions between features (e.g., age * avg_purchase_value).
# - Use feature selection techniques to identify the most important features for predicting churn.
# - Apply feature transformations (e.g., scaling, normalization, encoding categorical variables).
# - Remember to handle missing values appropriately (if any).
# - The 'churned' column is the target variable. The goal is to build a model that accurately predicts churn.
user_id,age,gender,signup_date,last_login,total_purchases,avg_purchase_value,country,marketing_channel,is_active,churned
1,25,Male,2023-01-15,2024-01-10,10,25.50,USA,Facebook,True,False
2,30,Female,2023-02-20,2024-01-15,5,50.00,Canada,Google Ads,True,False
3,40,Other,2023-03-10,2023-12-20,2,100.00,UK,Email,False,True
4,22,Male,2023-04-05,2024-01-05,15,15.75,Germany,Facebook,True,False
5,35,Female,2023-05-01,2023-11-30,1,200.00,France,Referral,False,True
6,28,Male,2023-06-12,2024-01-20,8,30.20,USA,Google Ads,True,False
7,45,Female,2023-07-08,2023-10-25,3,75.00,Canada,Email,False,True
8,31,Other,2023-08-03,2024-01-01,12,20.00,UK,Facebook,True,False
9,24,Male,2023-09-18,2023-12-10,7,40.00,Germany,Referral,False,True
10,38,Female,2023-10-22,2024-01-25,6,60.50,France,Google Ads,True,False
11,29,Male,2023-11-05,2023-12-15,4,80.00,USA,Email,False,True
12,33,Female,2023-12-01,2024-01-08,9,28.00,Canada,Facebook,True,False
13,42,Other,2024-01-02,2024-01-28,11,22.50,UK,Google Ads,True,False
14,27,Male,2023-01-28,2024-01-12,13,18.00,Germany,Referral,True,False
15,36,Female,2023-02-15,2023-11-01,0,0.00,France,Email,False,True
16,23,Male,2023-03-22,2024-01-18,14,17.25,USA,Facebook,True,False
17,39,Female,2023-04-10,2023-10-10,2,90.00,Canada,Google Ads,False,True
18,41,Other,2023-05-05,2024-01-03,16,14.50,UK,Referral,True,False
19,26,Male,2023-06-01,2023-12-25,5,55.00,Germany,Email,False,True
20,34,Female,2023-07-15,2024-01-22,17,13.00,France,Facebook,True,False
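
The header comments in this file suggest several derived features. A minimal pandas sketch of how they might be computed, assuming the file is saved as `example_dataset.csv`; the reference date used for the recency features is illustrative.

```python
# Illustrative only: derives the features suggested in the CSV header comments.
import pandas as pd

# comment="#" skips the explanatory header lines above the CSV header row.
df = pd.read_csv("example_dataset.csv", comment="#",
                 parse_dates=["signup_date", "last_login"])

# Recency features, measured against an illustrative reference date.
reference = pd.Timestamp("2024-02-01")
df["days_since_signup"] = (reference - df["signup_date"]).dt.days
df["days_since_last_login"] = (reference - df["last_login"]).dt.days

# Purchase frequency: purchases per day of tenure (clip guards against division by zero).
df["purchase_frequency"] = df["total_purchases"] / df["days_since_signup"].clip(lower=1)

# Age groups via binning, plus a simple interaction term.
df["age_group"] = pd.cut(df["age"], bins=[0, 25, 35, 45, 120],
                         labels=["<=25", "26-35", "36-45", "46+"])
df["age_x_avg_value"] = df["age"] * df["avg_purchase_value"]

print(df[["user_id", "days_since_signup", "purchase_frequency", "age_group"]].head())
```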

View File: feature_engineering_template.ipynb

@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-
"""
Feature Engineering Toolkit Template.
This notebook provides a structured framework for feature engineering.
Follow the instructions and placeholders to create, select, and transform features effectively.
Author: Claude
Date: [Date]
"""
# %% [markdown]
# # 1. Setup and Data Loading
#
# Import necessary libraries and load the dataset.
# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif
# Add more libraries as needed (e.g., feature-engine, category_encoders)
# %%
# Load your dataset
DATA_PATH = "[Path to your data file]"  # Set before loading so error messages can reference it
try:
    df = pd.read_csv(DATA_PATH)
    print("Data loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the file path.")
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    raise
# %% [markdown]
# # 2. Data Exploration and Preprocessing
#
# Understand your data and perform initial cleaning and preprocessing steps.
# %%
# Basic data exploration
print(df.info())
print(df.describe())
# %%
# Handle missing values (replace with appropriate strategy - mean, median, mode, or removal)
# Example:
# df['column_with_missing'] = df['column_with_missing'].fillna(df['column_with_missing'].mean())
# Implement your missing value strategy here
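# %%
# Hedged, runnable default (an addition, not part of the original template):
# median-impute the numeric columns; adjust the strategy per column as needed.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
print("Remaining missing values:", int(df.isna().sum().sum()))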
# %%
# Handle categorical variables (one-hot encoding, label encoding, etc.)
# Example:
# df = pd.get_dummies(df, columns=['categorical_column'], drop_first=True)
# Implement your categorical encoding strategy here
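# %%
# Hedged sketch (an assumption, not in the original template): one-hot encode
# the remaining object/category columns. This presumes low cardinality and that
# datetime columns were parsed beforehand; high-cardinality columns usually
# call for target or hash encoding instead (see section 6).
categorical_cols = df.select_dtypes(include=["object", "category"]).columns
df = pd.get_dummies(df, columns=list(categorical_cols), drop_first=True)
print("Shape after encoding:", df.shape)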
# %%
# Split data into training and testing sets
TARGET_VARIABLE = "[Your target variable name]"
FEATURES = [col for col in df.columns if col != TARGET_VARIABLE] # Select all columns except the target as features
X = df[FEATURES]
y = df[TARGET_VARIABLE]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Adjust test_size and random_state as needed
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
# %% [markdown]
# # 3. Feature Creation
#
# Generate new features based on existing ones.
# %%
# Create new features (e.g., polynomial features, interaction terms, domain-specific features)
# Example:
# df['new_feature'] = df['feature1'] * df['feature2']
# Implement your feature creation logic here
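# %%
# Hedged sketch: datetime-derived features, assuming a 'signup_date' column as
# in the bundled example_dataset.csv; the guard keeps the cell runnable on
# datasets without that column.
if "signup_date" in df.columns:
    df["signup_date"] = pd.to_datetime(df["signup_date"])
    df["signup_month"] = df["signup_date"].dt.month
    df["signup_dayofweek"] = df["signup_date"].dt.dayofweek
    print("Added signup_month and signup_dayofweek.")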
# %% [markdown]
# # 4. Feature Transformation
#
# Transform features to improve model performance (e.g., scaling, normalization, power transforms).
# %%
# Scaling numerical features
numerical_features = X_train.select_dtypes(include="number").columns.tolist() # Select numeric columns of any width (int32, float64, ...)
scaler = StandardScaler() # or MinMaxScaler, RobustScaler, etc.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])
print("Features scaled.")
# %%
# Apply power transforms (e.g., Yeo-Johnson, Box-Cox) to address skewness
# Example:
# from sklearn.preprocessing import PowerTransformer
# pt = PowerTransformer(method='yeo-johnson')
# X_train[['feature_to_transform']] = pt.fit_transform(X_train[['feature_to_transform']])
# X_test[['feature_to_transform']] = pt.transform(X_test[['feature_to_transform']])
# Implement your power transform logic here
# %%
# Apply QuantileTransformer for non-linear transformations
# Example:
# quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
# X_train[['feature_to_transform']] = quantile_transformer.fit_transform(X_train[['feature_to_transform']])
# X_test[['feature_to_transform']] = quantile_transformer.transform(X_test[['feature_to_transform']])
# Implement your quantile transform logic here
# %% [markdown]
# # 5. Feature Selection
#
# Select the most relevant features to reduce dimensionality and improve model performance.
# %%
# Univariate feature selection (e.g., SelectKBest, SelectPercentile)
# Example:
selector = SelectKBest(score_func=f_classif, k=min(10, X_train.shape[1])) # Choose a scoring function; k cannot exceed the feature count
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
selected_feature_indices = selector.get_support(indices=True)
selected_features = [X_train.columns[i] for i in selected_feature_indices]
print("Selected features:", selected_features)
# %%
# Implement other feature selection techniques (e.g., Recursive Feature Elimination, SelectFromModel)
# Example:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# estimator = LogisticRegression(solver='liblinear')
# selector = RFE(estimator, n_features_to_select=5, step=1)
# selector = selector.fit(X_train, y_train)
# selected_features = X_train.columns[selector.support_]
# %% [markdown]
# # 6. Feature Encoding (Advanced)
#
# Encode categorical variables with supervised encodings when one-hot encoding falls short.
# %%
# Feature Encoding using techniques like target encoding, weight of evidence, or embeddings.
# Requires installing feature-engine or category_encoders
# Example:
# from feature_engine.encoding import MeanEncoder
# encoder = MeanEncoder(variables=['categorical_column'])
# encoder.fit(X_train, y_train)
# X_train_encoded = encoder.transform(X_train)
# X_test_encoded = encoder.transform(X_test)
# Implement advanced feature encoding techniques here
# %% [markdown]
# # 7. Evaluation and Refinement
#
# Evaluate the impact of feature engineering on model performance and refine the process.
# %%
# Train a model with and without feature engineering to compare performance
# Example:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# model = LogisticRegression()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy without feature engineering:", accuracy)
# model_engineered = LogisticRegression()
# model_engineered.fit(X_train_selected, y_train) # Or X_train_encoded, depending on your selection
# y_pred_engineered = model_engineered.predict(X_test_selected) # Or X_test_encoded
# accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
# print("Accuracy with feature engineering:", accuracy_engineered)
# Implement your model training and evaluation logic here
# %% [markdown]
# # 8. Conclusion
#
# Summarize the results of feature engineering and discuss potential improvements.
# %%
# Summarize the findings
# Discuss the impact of each feature engineering step
# Suggest further improvements and next steps
print("Feature engineering process completed.")
print("Further improvements can be made by exploring different feature combinations and model architectures.")