Initial commit

skills/feature-engineering-toolkit/assets/README.md (new file, +7 lines)

@@ -0,0 +1,7 @@
# Assets

Bundled resources for the feature-engineering-toolkit skill.

- [ ] feature_engineering_template.ipynb: A Jupyter Notebook template for feature engineering, providing a structured framework for Claude to follow.
- [ ] example_dataset.csv: A sample dataset that Claude can use to test and demonstrate feature engineering techniques.
- [ ] configuration_template.yaml: A YAML template for configuring the feature engineering pipeline, allowing users to customize the process.
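For example, loading the bundled configuration (a minimal sketch assuming PyYAML is available; `load_config` is an illustrative helper, not part of the toolkit):

```python
import yaml  # PyYAML

def load_config(path="skills/feature-engineering-toolkit/assets/configuration_template.yaml"):
    """Load the pipeline configuration from the bundled YAML template."""
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

config = load_config()
print(config["data_input"]["data_path"])         # "data/input.csv" until you replace it
print(config["feature_creation"]["techniques"])  # ["polynomial_features", ...]
```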
skills/feature-engineering-toolkit/assets/configuration_template.yaml (new file, +129 lines)

@@ -0,0 +1,129 @@
# Configuration template for the feature engineering toolkit plugin
# This file allows you to customize the feature engineering pipeline.

# --- Data Input ---
data_input:
  # Path to the input data file (CSV, Parquet, etc.)
  # Supported formats: csv, parquet
  data_path: "data/input.csv"  # REPLACE_ME: Path to your input data
  # File type of the input data
  file_type: "csv"
  # Separator for CSV files (e.g., comma, tab)
  csv_separator: ","
  # Quote character for CSV files
  csv_quotechar: '"'
  # Encoding of the input file
  encoding: "utf-8"
  # Target variable name
  target_variable: "target"  # REPLACE_ME: Name of your target variable
  # Index column (optional)
  index_column: null  # Set to a column name if you have an index column

# --- Feature Creation ---
feature_creation:
  # Enable or disable automated feature creation
  enabled: true
  # List of feature creation techniques to apply
  # Available techniques: polynomial_features, interaction_terms, aggregation_features, datetime_features
  techniques:
    - "polynomial_features"
    - "interaction_terms"
    - "datetime_features"
  # Polynomial Features configuration
  polynomial_features:
    degree: 2
    include_bias: false
  # Interaction Terms configuration
  interaction_terms:
    interaction_degree: 2
  # Aggregation Features configuration
  aggregation_features:
    # Group-by column for aggregation (e.g., user_id, product_id)
    group_by: "user_id"  # REPLACE_ME: Column to group by for aggregation
    # Aggregation functions to apply (e.g., mean, sum, count)
    aggregations:
      - "mean"
      - "sum"
  # Datetime Features configuration
  datetime_features:
    # List of datetime columns to extract features from
    datetime_columns:
      - "date"  # REPLACE_ME: Name of your datetime column
    # List of datetime features to extract (e.g., year, month, day, dayofweek)
    features_to_extract:
      - "year"
      - "month"
      - "dayofweek"

# --- Feature Selection ---
feature_selection:
  # Enable or disable feature selection
  enabled: true
  # Selection method
  # Available methods: variance_threshold, select_k_best, recursive_feature_elimination
  method: "select_k_best"
  # Variance Threshold configuration
  variance_threshold:
    threshold: 0.0
  # Select K Best configuration
  select_k_best:
    k: 10  # REPLACE_ME: Number of top features to select
    score_func: "f_classif"  # Or "f_regression" for regression targets
  # Recursive Feature Elimination configuration
  recursive_feature_elimination:
    n_features_to_select: 10
    step: 1

# --- Feature Transformation ---
feature_transformation:
  # Enable or disable feature transformation
  enabled: true
  # Transformation method
  # Available methods: standard_scaler, min_max_scaler, robust_scaler, power_transformer, quantile_transformer
  method: "standard_scaler"
  # Standard Scaler configuration (no specific parameters)
  standard_scaler: {}
  # Min-Max Scaler configuration
  min_max_scaler:
    feature_range: [0, 1]
  # Robust Scaler configuration
  robust_scaler:
    quantile_range: [25.0, 75.0]
  # Power Transformer configuration
  power_transformer:
    method: "yeo-johnson"
    standardize: true
  # Quantile Transformer configuration
  quantile_transformer:
    n_quantiles: 100
    output_distribution: "uniform"

# --- Output ---
output:
  # Path to save the processed data
  output_path: "data/processed.csv"  # REPLACE_ME: Path to save the processed data
  # Save format for the output data
  output_format: "csv"
  # Save the feature engineering report
  save_report: true
  # Path to save the feature engineering report
  report_path: "reports/feature_engineering_report.html"
  # Save the trained model (if applicable, e.g., for feature-importance-based selection)
  save_model: false
  # Path to save the trained model
  model_path: "models/feature_selection_model.pkl"

# --- Advanced Options ---
advanced:
  # Number of CPU cores to use for parallel processing
  n_jobs: -1  # -1 means use all available cores
  # Random seed for reproducibility
  random_state: 42  # YOUR_VALUE_HERE: Set a random seed for reproducibility
  # Verbosity level (0: silent, 1: info, 2: debug)
  verbosity: 1
  # Handle missing values (imputation)
  handle_missing: true
  # Imputation strategy (mean, median, most_frequent, constant)
  imputation_strategy: "mean"
  # Constant value for imputation (if imputation_strategy is "constant")
  imputation_constant: 0
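How the pipeline consumes `feature_transformation` is not shown in this commit. As a hedged sketch, the `method` key could dispatch to scikit-learn transformers as below; the `SCALERS` table and `build_transformer` helper are assumptions for illustration, not part of the plugin:

```python
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    PowerTransformer, QuantileTransformer,
)

# One constructor per value accepted by feature_transformation.method.
SCALERS = {
    "standard_scaler": lambda cfg: StandardScaler(),
    "min_max_scaler": lambda cfg: MinMaxScaler(
        feature_range=tuple(cfg.get("feature_range", [0, 1]))),
    "robust_scaler": lambda cfg: RobustScaler(
        quantile_range=tuple(cfg.get("quantile_range", [25.0, 75.0]))),
    "power_transformer": lambda cfg: PowerTransformer(
        method=cfg.get("method", "yeo-johnson"),
        standardize=cfg.get("standardize", True)),
    "quantile_transformer": lambda cfg: QuantileTransformer(
        n_quantiles=cfg.get("n_quantiles", 100),
        output_distribution=cfg.get("output_distribution", "uniform")),
}

def build_transformer(transform_cfg):
    """Instantiate the scaler named by the config's feature_transformation block."""
    method = transform_cfg["method"]
    return SCALERS[method](transform_cfg.get(method, {}))
```

With the defaults above, `build_transformer(config["feature_transformation"])` would return a plain `StandardScaler()`.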
skills/feature-engineering-toolkit/assets/example_dataset.csv (new file, +50 lines)

@@ -0,0 +1,50 @@
# example_dataset.csv
# This CSV file provides a sample dataset for demonstrating feature engineering techniques within the feature-engineering-toolkit plugin.
#
# Column Descriptions:
# - user_id: Unique identifier for each user (integer).
# - age: Age of the user (integer).
# - gender: Gender of the user (categorical: Male, Female, Other).
# - signup_date: Date the user signed up (YYYY-MM-DD).
# - last_login: Date of the user's last login (YYYY-MM-DD).
# - total_purchases: Total number of purchases made by the user (integer).
# - avg_purchase_value: Average value of each purchase (float).
# - country: Country of the user (categorical).
# - marketing_channel: The marketing channel through which the user signed up (categorical).
# - is_active: Indicates whether the user is currently active (boolean: True, False).
# - churned: Target variable indicating whether the user churned (boolean: True, False). This is what we want to predict.
#
# Instructions:
# - Use this dataset to experiment with feature engineering techniques.
# - Consider creating new features such as:
#   - Time since signup (calculated from signup_date).
#   - Time since last login (calculated from last_login).
#   - Purchase frequency (total_purchases / time since signup).
#   - Age groups (binning the age variable).
#   - Interactions between features (e.g., age * avg_purchase_value).
# - Use feature selection techniques to identify the most important features for predicting churn.
# - Apply feature transformations (e.g., scaling, normalization, encoding categorical variables).
# - Remember to handle missing values appropriately (if any).
# - The 'churned' column is the target variable. The goal is to build a model that accurately predicts churn.

user_id,age,gender,signup_date,last_login,total_purchases,avg_purchase_value,country,marketing_channel,is_active,churned
1,25,Male,2023-01-15,2024-01-10,10,25.50,USA,Facebook,True,False
2,30,Female,2023-02-20,2024-01-15,5,50.00,Canada,Google Ads,True,False
3,40,Other,2023-03-10,2023-12-20,2,100.00,UK,Email,False,True
4,22,Male,2023-04-05,2024-01-05,15,15.75,Germany,Facebook,True,False
5,35,Female,2023-05-01,2023-11-30,1,200.00,France,Referral,False,True
6,28,Male,2023-06-12,2024-01-20,8,30.20,USA,Google Ads,True,False
7,45,Female,2023-07-08,2023-10-25,3,75.00,Canada,Email,False,True
8,31,Other,2023-08-03,2024-01-01,12,20.00,UK,Facebook,True,False
9,24,Male,2023-09-18,2023-12-10,7,40.00,Germany,Referral,False,True
10,38,Female,2023-10-22,2024-01-25,6,60.50,France,Google Ads,True,False
11,29,Male,2023-11-05,2023-12-15,4,80.00,USA,Email,False,True
12,33,Female,2023-12-01,2024-01-08,9,28.00,Canada,Facebook,True,False
13,42,Other,2024-01-02,2024-01-28,11,22.50,UK,Google Ads,True,False
14,27,Male,2023-01-28,2024-01-12,13,18.00,Germany,Referral,True,False
15,36,Female,2023-02-15,2023-11-01,0,0.00,France,Email,False,True
16,23,Male,2023-03-22,2024-01-18,14,17.25,USA,Facebook,True,False
17,39,Female,2023-04-10,2023-10-10,2,90.00,Canada,Google Ads,False,True
18,41,Other,2023-05-05,2024-01-03,16,14.50,UK,Referral,True,False
19,26,Male,2023-06-01,2023-12-25,5,55.00,Germany,Email,False,True
20,34,Female,2023-07-15,2024-01-22,17,13.00,France,Facebook,True,False
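The derived features suggested in the header comments can be computed directly from these columns. A minimal pandas sketch follows; the file path, the `as_of` snapshot date, and the age-bin edges are illustrative choices, not part of the dataset:

```python
import pandas as pd

# comment="#" skips the documentation lines at the top of the file
df = pd.read_csv("skills/feature-engineering-toolkit/assets/example_dataset.csv",
                 comment="#", parse_dates=["signup_date", "last_login"])

as_of = pd.Timestamp("2024-02-01")  # Illustrative snapshot date

df["days_since_signup"] = (as_of - df["signup_date"]).dt.days
df["days_since_last_login"] = (as_of - df["last_login"]).dt.days
# Purchase frequency: purchases per day of tenure (clip avoids division by zero)
df["purchase_frequency"] = df["total_purchases"] / df["days_since_signup"].clip(lower=1)
# Age groups: bin edges are arbitrary, for illustration only
df["age_group"] = pd.cut(df["age"], bins=[0, 25, 35, 45, 120],
                         labels=["<=25", "26-35", "36-45", "46+"])
# Interaction feature from the suggestions above
df["age_x_avg_purchase"] = df["age"] * df["avg_purchase_value"]
```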
skills/feature-engineering-toolkit/assets/feature_engineering_template.ipynb (new file, +196 lines)

@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-
"""
Feature Engineering Toolkit Template.

This notebook provides a structured framework for feature engineering.
Follow the instructions and placeholders to create, select, and transform features effectively.

Author: Claude
Date: [Date]
"""

# %% [markdown]
# # 1. Setup and Data Loading
#
# Import necessary libraries and load the dataset.

# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif
# Add more libraries as needed (e.g., feature-engine, category_encoders)

# %%
# Load your dataset
DATA_PATH = "[Path to your data file]"
try:
    df = pd.read_csv(DATA_PATH)
    print("Data loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the file path.")
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    raise

# %% [markdown]
# # 2. Data Exploration and Preprocessing
#
# Understand your data and perform initial cleaning and preprocessing steps.

# %%
# Basic data exploration
df.info()  # info() prints directly; wrapping it in print() would also print "None"
print(df.describe())

# %%
# Handle missing values (choose an appropriate strategy: mean, median, mode, or removal)
# Example (plain assignment avoids pandas' chained-assignment pitfalls with inplace=True):
# df['column_with_missing'] = df['column_with_missing'].fillna(df['column_with_missing'].mean())
# Implement your missing value strategy here

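# %% [markdown]
# A hedged sketch of the imputation step using scikit-learn's `SimpleImputer`,
# which supports the same strategies listed in configuration_template.yaml
# (mean, median, most_frequent, constant). In a real pipeline, fit the imputer
# on training data only to avoid leakage; fitting on `df` here mirrors the
# template's pre-split flow.

# %%
from sklearn.impute import SimpleImputer

numeric_cols = df.select_dtypes(include="number").columns
imputer = SimpleImputer(strategy="mean")  # or "median", "most_frequent", "constant"
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
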
# %%
# Handle categorical variables (one-hot encoding, label encoding, etc.)
# Example:
# df = pd.get_dummies(df, columns=['categorical_column'], drop_first=True)
# Implement your categorical encoding strategy here

# %%
# Split data into training and testing sets
TARGET_VARIABLE = "[Your target variable name]"
FEATURES = [col for col in df.columns if col != TARGET_VARIABLE]  # Select all columns except the target as features

X = df[FEATURES]
y = df[TARGET_VARIABLE]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test_size and random_state as needed

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# %% [markdown]
# # 3. Feature Creation
#
# Generate new features based on existing ones.

# %%
# Create new features (e.g., polynomial features, interaction terms, domain-specific features)
# Example:
# df['new_feature'] = df['feature1'] * df['feature2']
# Implement your feature creation logic here

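# %% [markdown]
# A minimal sketch of one technique named in configuration_template.yaml:
# polynomial/interaction features via scikit-learn, using the config's defaults
# (degree: 2, include_bias: false). It expands whatever numeric columns exist
# in X_train; adapt the column selection to your data.

# %%
from sklearn.preprocessing import PolynomialFeatures

numeric_cols = X_train.select_dtypes(include="number").columns
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_values = poly.fit_transform(X_train[numeric_cols])
poly_names = poly.get_feature_names_out(numeric_cols)
print(f"{len(numeric_cols)} numeric columns expanded to {poly_values.shape[1]} polynomial features")
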
# %% [markdown]
# # 4. Feature Transformation
#
# Transform features to improve model performance (e.g., scaling, normalization, power transforms).

# %%
# Scale numerical features
numerical_features = X_train.select_dtypes(include="number").columns.tolist()  # Catches all numeric dtypes, not just int64/float64

scaler = StandardScaler()  # or MinMaxScaler, RobustScaler, etc.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

print("Features scaled.")

# %%
# Apply power transforms (e.g., Yeo-Johnson, Box-Cox) to address skewness
# Example:
# from sklearn.preprocessing import PowerTransformer
# pt = PowerTransformer(method='yeo-johnson')
# X_train['feature_to_transform'] = pt.fit_transform(X_train[['feature_to_transform']])
# X_test['feature_to_transform'] = pt.transform(X_test[['feature_to_transform']])

# Implement your power transform logic here

# %%
# Apply QuantileTransformer for non-linear transformations
# Example:
# quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
# X_train['feature_to_transform'] = quantile_transformer.fit_transform(X_train[['feature_to_transform']])
# X_test['feature_to_transform'] = quantile_transformer.transform(X_test[['feature_to_transform']])

# Implement your quantile transform logic here

# %% [markdown]
# # 5. Feature Selection
#
# Select the most relevant features to reduce dimensionality and improve model performance.

# %%
# Univariate feature selection (e.g., SelectKBest, SelectPercentile)
selector = SelectKBest(score_func=f_classif, k=10)  # Choose an appropriate scoring function; k must not exceed the number of features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

selected_features = X_train.columns[selector.get_support()].tolist()

print("Selected features:", selected_features)

# %%
# Implement other feature selection techniques (e.g., Recursive Feature Elimination, SelectFromModel)
# Example:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# estimator = LogisticRegression(solver='liblinear')
# selector = RFE(estimator, n_features_to_select=5, step=1)
# selector = selector.fit(X_train, y_train)
# selected_features = X_train.columns[selector.support_]

# %% [markdown]
# # 6. Feature Encoding (Advanced)

# %%
# Feature encoding using techniques like target encoding, weight of evidence, or embeddings.
# Requires installing feature-engine or category_encoders
# Example:
# from feature_engine.encoding import MeanEncoder
# encoder = MeanEncoder(variables=['categorical_column'])
# encoder.fit(X_train, y_train)
# X_train_encoded = encoder.transform(X_train)
# X_test_encoded = encoder.transform(X_test)

# Implement advanced feature encoding techniques here

# %% [markdown]
# # 7. Evaluation and Refinement
#
# Evaluate the impact of feature engineering on model performance and refine the process.

# %%
# Train a model on all features and on the engineered/selected subset to compare performance
# Example:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# model = LogisticRegression()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy with all features:", accuracy)

# model_engineered = LogisticRegression()
# model_engineered.fit(X_train_selected, y_train)  # Or X_train_encoded, depending on your pipeline
# y_pred_engineered = model_engineered.predict(X_test_selected)  # Or X_test_encoded
# accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
# print("Accuracy with selected features:", accuracy_engineered)

# Implement your model training and evaluation logic here

# %% [markdown]
# # 8. Conclusion
#
# Summarize the results of feature engineering and discuss potential improvements.

# %%
# Summarize the findings
# Discuss the impact of each feature engineering step
# Suggest further improvements and next steps
print("Feature engineering process completed.")
print("Further improvements can be made by exploring different feature combinations and model architectures.")