Initial commit

Zhongwei Li
2025-11-29 18:51:19 +08:00
commit d595b82716
11 changed files with 557 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
# Assets
Bundled resources for the feature-engineering-toolkit skill
- [ ] feature_engineering_template.ipynb: A Jupyter Notebook template for feature engineering, providing a structured framework for Claude to follow.
- [ ] example_dataset.csv: A sample dataset that Claude can use to test and demonstrate feature engineering techniques.
- [ ] configuration_template.yaml: A YAML template for configuring the feature engineering pipeline, allowing users to customize the process.

View File: configuration_template.yaml

@@ -0,0 +1,129 @@
# Configuration template for the feature engineering toolkit plugin
# This file allows you to customize the feature engineering pipeline.

# --- Data Input ---
data_input:
  # Path to the input data file (CSV, Parquet, etc.)
  # Supported formats: csv, parquet
  data_path: "data/input.csv" # REPLACE_ME: Path to your input data
  # File type of the input data
  file_type: "csv"
  # Separator for CSV files (e.g., comma, tab)
  csv_separator: ","
  # Quote character for CSV files
  csv_quotechar: '"'
  # Encoding of the input file
  encoding: "utf-8"
  # Target variable name
  target_variable: "target" # REPLACE_ME: Name of your target variable
  # Index column (optional)
  index_column: null # Set to a column name if you have an index column

# --- Feature Creation ---
feature_creation:
  # Enable or disable automated feature creation
  enabled: true
  # List of feature creation techniques to apply
  # Available techniques: polynomial_features, interaction_terms, aggregation_features, datetime_features
  techniques:
    - "polynomial_features"
    - "interaction_terms"
    - "datetime_features"
  # Polynomial Features configuration
  polynomial_features:
    degree: 2
    include_bias: false
  # Interaction Terms configuration
  interaction_terms:
    interaction_degree: 2
  # Aggregation Features configuration
  aggregation_features:
    # Group by column for aggregation (e.g., user_id, product_id)
    group_by: "user_id" # REPLACE_ME: Column to group by for aggregation
    # Aggregation functions to apply (e.g., mean, sum, count)
    aggregations:
      - "mean"
      - "sum"
  # Datetime Features configuration
  datetime_features:
    # List of datetime columns to extract features from
    datetime_columns:
      - "date" # REPLACE_ME: Name of your datetime column
    # List of datetime features to extract (e.g., year, month, day, dayofweek)
    features_to_extract:
      - "year"
      - "month"
      - "dayofweek"

# --- Feature Selection ---
feature_selection:
  # Enable or disable feature selection
  enabled: true
  # Selection method
  # Available methods: variance_threshold, select_k_best, recursive_feature_elimination
  method: "select_k_best"
  # Variance Threshold configuration
  variance_threshold:
    threshold: 0.0
  # Select K Best configuration
  select_k_best:
    k: 10 # REPLACE_ME: Number of top features to select
    score_func: "f_classif" # Or "f_regression"
  # Recursive Feature Elimination configuration
  recursive_feature_elimination:
    n_features_to_select: 10
    step: 1

# --- Feature Transformation ---
feature_transformation:
  # Enable or disable feature transformation
  enabled: true
  # Transformation method
  # Available methods: standard_scaler, min_max_scaler, robust_scaler, power_transformer, quantile_transformer
  method: "standard_scaler"
  # Standard Scaler configuration (no specific parameters)
  standard_scaler: {}
  # Min-Max Scaler configuration
  min_max_scaler:
    feature_range: [0, 1]
  # Robust Scaler configuration
  robust_scaler:
    quantile_range: [25.0, 75.0]
  # Power Transformer configuration
  power_transformer:
    method: "yeo-johnson"
    standardize: true
  # Quantile Transformer configuration
  quantile_transformer:
    n_quantiles: 100
    output_distribution: "uniform"

# --- Output ---
output:
  # Path to save the processed data
  output_path: "data/processed.csv" # REPLACE_ME: Path to save the processed data
  # Save format for the output data
  output_format: "csv"
  # Save the feature engineering report
  save_report: true
  # Path to save the feature engineering report
  report_path: "reports/feature_engineering_report.html"
  # Save the trained model (if applicable, e.g., for feature importance selection)
  save_model: false
  # Path to save the trained model
  model_path: "models/feature_selection_model.pkl"

# --- Advanced Options ---
advanced:
  # Number of CPU cores to use for parallel processing (-1 uses all available cores)
  n_jobs: -1
  # Random seed for reproducibility
  random_state: 42
  # Verbosity level (0: silent, 1: info, 2: debug)
  verbosity: 1
  # Handle missing values (imputation)
  handle_missing: true
  # Imputation strategy (mean, median, most_frequent, constant)
  imputation_strategy: "mean"
  # Constant value for imputation (used when imputation_strategy is "constant")
  imputation_constant: 0
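
The template above is plain YAML, so any YAML parser can consume it. Below is a minimal loading sketch, assuming PyYAML is installed and a filled-in copy is saved as `config.yaml`; the `load_config` helper and its checks are illustrative, not part of the toolkit itself.

```python
# Illustrative config loader (assumes PyYAML: pip install pyyaml).
import yaml

REQUIRED_SECTIONS = (
    "data_input", "feature_creation", "feature_selection",
    "feature_transformation", "output",
)

def load_config(path: str = "config.yaml") -> dict:
    """Load the pipeline configuration and verify the expected sections exist."""
    with open(path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # Fail fast if a top-level section was accidentally deleted or misindented.
    missing = [section for section in REQUIRED_SECTIONS if section not in config]
    if missing:
        raise KeyError(f"Missing configuration sections: {missing}")
    return config

if __name__ == "__main__":
    cfg = load_config()
    print("Target variable:", cfg["data_input"]["target_variable"])
```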

View File: example_dataset.csv

@@ -0,0 +1,50 @@
# example_dataset.csv
# This CSV file provides a sample dataset for demonstrating feature engineering techniques within the feature-engineering-toolkit plugin.
#
# Column Descriptions:
# - user_id: Unique identifier for each user (integer).
# - age: Age of the user (integer).
# - gender: Gender of the user (categorical: Male, Female, Other).
# - signup_date: Date the user signed up (YYYY-MM-DD).
# - last_login: Date of the user's last login (YYYY-MM-DD).
# - total_purchases: Total number of purchases made by the user (integer).
# - avg_purchase_value: Average value of each purchase (float).
# - country: Country of the user (categorical).
# - marketing_channel: The marketing channel through which the user signed up (categorical).
# - is_active: Indicates whether the user is currently active (boolean: True, False).
# - churned: Target variable indicating whether the user churned (boolean: True, False). This is what we want to predict.
#
# Instructions:
# - Use this dataset to experiment with feature engineering techniques.
# - Consider creating new features such as:
# - Time since signup (calculated from signup_date).
# - Time since last login (calculated from last_login).
# - Purchase frequency (total_purchases / time since signup).
# - Age groups (binning the age variable).
# - Interactions between features (e.g., age * avg_purchase_value).
# - Use feature selection techniques to identify the most important features for predicting churn.
# - Apply feature transformations (e.g., scaling, normalization, encoding categorical variables).
# - Remember to handle missing values appropriately (if any).
# - The 'churned' column is the target variable. The goal is to build a model that accurately predicts churn.
user_id,age,gender,signup_date,last_login,total_purchases,avg_purchase_value,country,marketing_channel,is_active,churned
1,25,Male,2023-01-15,2024-01-10,10,25.50,USA,Facebook,True,False
2,30,Female,2023-02-20,2024-01-15,5,50.00,Canada,Google Ads,True,False
3,40,Other,2023-03-10,2023-12-20,2,100.00,UK,Email,False,True
4,22,Male,2023-04-05,2024-01-05,15,15.75,Germany,Facebook,True,False
5,35,Female,2023-05-01,2023-11-30,1,200.00,France,Referral,False,True
6,28,Male,2023-06-12,2024-01-20,8,30.20,USA,Google Ads,True,False
7,45,Female,2023-07-08,2023-10-25,3,75.00,Canada,Email,False,True
8,31,Other,2023-08-03,2024-01-01,12,20.00,UK,Facebook,True,False
9,24,Male,2023-09-18,2023-12-10,7,40.00,Germany,Referral,False,True
10,38,Female,2023-10-22,2024-01-25,6,60.50,France,Google Ads,True,False
11,29,Male,2023-11-05,2023-12-15,4,80.00,USA,Email,False,True
12,33,Female,2023-12-01,2024-01-08,9,28.00,Canada,Facebook,True,False
13,42,Other,2024-01-02,2024-01-28,11,22.50,UK,Google Ads,True,False
14,27,Male,2023-01-28,2024-01-12,13,18.00,Germany,Referral,True,False
15,36,Female,2023-02-15,2023-11-01,0,0.00,France,Email,False,True
16,23,Male,2023-03-22,2024-01-18,14,17.25,USA,Facebook,True,False
17,39,Female,2023-04-10,2023-10-10,2,90.00,Canada,Google Ads,False,True
18,41,Other,2023-05-05,2024-01-03,16,14.50,UK,Referral,True,False
19,26,Male,2023-06-01,2023-12-25,5,55.00,Germany,Email,False,True
20,34,Female,2023-07-15,2024-01-22,17,13.00,France,Facebook,True,False
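
The header comments in this file suggest several derived features. A minimal pandas sketch of how they might be computed, assuming the file is saved as `example_dataset.csv`; the reference date used for the recency features is illustrative.

```python
# Illustrative only: derives the features suggested in the CSV header comments.
import pandas as pd

# comment="#" skips the explanatory header lines above the CSV header row.
df = pd.read_csv("example_dataset.csv", comment="#",
                 parse_dates=["signup_date", "last_login"])

# Recency features, measured against an illustrative reference date.
reference = pd.Timestamp("2024-02-01")
df["days_since_signup"] = (reference - df["signup_date"]).dt.days
df["days_since_last_login"] = (reference - df["last_login"]).dt.days

# Purchase frequency: purchases per day of tenure (clip guards against division by zero).
df["purchase_frequency"] = df["total_purchases"] / df["days_since_signup"].clip(lower=1)

# Age groups via binning, plus a simple interaction term.
df["age_group"] = pd.cut(df["age"], bins=[0, 25, 35, 45, 120],
                         labels=["<=25", "26-35", "36-45", "46+"])
df["age_x_avg_value"] = df["age"] * df["avg_purchase_value"]

print(df[["user_id", "days_since_signup", "purchase_frequency", "age_group"]].head())
```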

View File: feature_engineering_template.ipynb

@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-
"""
Feature Engineering Toolkit Template.
This notebook provides a structured framework for feature engineering.
Follow the instructions and placeholders to create, select, and transform features effectively.
Author: Claude
Date: [Date]
"""
# %% [markdown]
# # 1. Setup and Data Loading
#
# Import necessary libraries and load the dataset.
# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif
# Add more libraries as needed (e.g., feature-engine, category_encoders)
# %%
# Load your dataset
DATA_PATH = "[Path to your data file]"  # Set before loading so error messages can reference it
try:
    df = pd.read_csv(DATA_PATH)
    print("Data loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the file path.")
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    raise
# %% [markdown]
# # 2. Data Exploration and Preprocessing
#
# Understand your data and perform initial cleaning and preprocessing steps.
# %%
# Basic data exploration
print(df.info())
print(df.describe())
# %%
# Handle missing values (replace with appropriate strategy - mean, median, mode, or removal)
# Example:
# df['column_with_missing'] = df['column_with_missing'].fillna(df['column_with_missing'].mean())
# Implement your missing value strategy here
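# %%
# Hedged, runnable default (an addition, not part of the original template):
# median-impute the numeric columns; adjust the strategy per column as needed.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
print("Remaining missing values:", int(df.isna().sum().sum()))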
# %%
# Handle categorical variables (one-hot encoding, label encoding, etc.)
# Example:
# df = pd.get_dummies(df, columns=['categorical_column'], drop_first=True)
# Implement your categorical encoding strategy here
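# %%
# Hedged sketch (an assumption, not in the original template): one-hot encode
# the remaining object/category columns. This presumes low cardinality and that
# datetime columns were parsed beforehand; high-cardinality columns usually
# call for target or hash encoding instead (see section 6).
categorical_cols = df.select_dtypes(include=["object", "category"]).columns
df = pd.get_dummies(df, columns=list(categorical_cols), drop_first=True)
print("Shape after encoding:", df.shape)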
# %%
# Split data into training and testing sets
TARGET_VARIABLE = "[Your target variable name]"
FEATURES = [col for col in df.columns if col != TARGET_VARIABLE] # Select all columns except the target as features
X = df[FEATURES]
y = df[TARGET_VARIABLE]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Adjust test_size and random_state as needed
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
# %% [markdown]
# # 3. Feature Creation
#
# Generate new features based on existing ones.
# %%
# Create new features (e.g., polynomial features, interaction terms, domain-specific features)
# Example:
# df['new_feature'] = df['feature1'] * df['feature2']
# Implement your feature creation logic here
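# %%
# Hedged sketch: datetime-derived features, assuming a 'signup_date' column as
# in the bundled example_dataset.csv; the guard keeps the cell runnable on
# datasets without that column.
if "signup_date" in df.columns:
    df["signup_date"] = pd.to_datetime(df["signup_date"])
    df["signup_month"] = df["signup_date"].dt.month
    df["signup_dayofweek"] = df["signup_date"].dt.dayofweek
    print("Added signup_month and signup_dayofweek.")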
# %% [markdown]
# # 4. Feature Transformation
#
# Transform features to improve model performance (e.g., scaling, normalization, power transforms).
# %%
# Scaling numerical features
numerical_features = X_train.select_dtypes(include="number").columns.tolist() # Select numeric columns of any width (int32, float64, ...)
scaler = StandardScaler() # or MinMaxScaler, RobustScaler, etc.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])
print("Features scaled.")
# %%
# Apply power transforms (e.g., Yeo-Johnson, Box-Cox) to address skewness
# Example:
# from sklearn.preprocessing import PowerTransformer
# pt = PowerTransformer(method='yeo-johnson')
# X_train[['feature_to_transform']] = pt.fit_transform(X_train[['feature_to_transform']])
# X_test[['feature_to_transform']] = pt.transform(X_test[['feature_to_transform']])
# Implement your power transform logic here
# %%
# Apply QuantileTransformer for non-linear transformations
# Example:
# quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
# X_train[['feature_to_transform']] = quantile_transformer.fit_transform(X_train[['feature_to_transform']])
# X_test[['feature_to_transform']] = quantile_transformer.transform(X_test[['feature_to_transform']])
# Implement your quantile transform logic here
# %% [markdown]
# # 5. Feature Selection
#
# Select the most relevant features to reduce dimensionality and improve model performance.
# %%
# Univariate feature selection (e.g., SelectKBest, SelectPercentile)
# Example:
selector = SelectKBest(score_func=f_classif, k=min(10, X_train.shape[1])) # Choose a scoring function; k cannot exceed the feature count
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
selected_feature_indices = selector.get_support(indices=True)
selected_features = [X_train.columns[i] for i in selected_feature_indices]
print("Selected features:", selected_features)
# %%
# Implement other feature selection techniques (e.g., Recursive Feature Elimination, SelectFromModel)
# Example:
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# estimator = LogisticRegression(solver='liblinear')
# selector = RFE(estimator, n_features_to_select=5, step=1)
# selector = selector.fit(X_train, y_train)
# selected_features = X_train.columns[selector.support_]
# %% [markdown]
# # 6. Feature Encoding (Advanced)
#
# Encode categorical variables with supervised encodings when one-hot encoding falls short.
# %%
# Feature Encoding using techniques like target encoding, weight of evidence, or embeddings.
# Requires installing feature-engine or category_encoders
# Example:
# from feature_engine.encoding import MeanEncoder
# encoder = MeanEncoder(variables=['categorical_column'])
# encoder.fit(X_train, y_train)
# X_train_encoded = encoder.transform(X_train)
# X_test_encoded = encoder.transform(X_test)
# Implement advanced feature encoding techniques here
# %% [markdown]
# # 7. Evaluation and Refinement
#
# Evaluate the impact of feature engineering on model performance and refine the process.
# %%
# Train a model with and without feature engineering to compare performance
# Example:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# model = LogisticRegression()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy without feature engineering:", accuracy)
# model_engineered = LogisticRegression()
# model_engineered.fit(X_train_selected, y_train) # Or X_train_encoded, depending on your selection
# y_pred_engineered = model_engineered.predict(X_test_selected) # Or X_test_encoded
# accuracy_engineered = accuracy_score(y_test, y_pred_engineered)
# print("Accuracy with feature engineering:", accuracy_engineered)
# Implement your model training and evaluation logic here
# %% [markdown]
# # 8. Conclusion
#
# Summarize the results of feature engineering and discuss potential improvements.
# %%
# Summarize the findings
# Discuss the impact of each feature engineering step
# Suggest further improvements and next steps
print("Feature engineering process completed.")
print("Further improvements can be made by exploring different feature combinations and model architectures.")