Initial commit

2025-11-29 18:51:19 +08:00
commit d595b82716
11 changed files with 557 additions and 0 deletions
--- a/skills/feature-engineering-toolkit/assets/configuration_template.yaml
+++ b/skills/feature-engineering-toolkit/assets/configuration_template.yaml
@@ -0,0 +1,129 @@
+# Configuration template for the feature engineering toolkit plugin
+# Customize the feature engineering pipeline here; edit values marked REPLACE_ME.
+
+# --- Data Input ---
+data_input:
+  # Path to the input data file (CSV or Parquet)
+  # Supported formats: csv, parquet
+  data_path: "data/input.csv" # REPLACE_ME: Path to your input data
+  # File type of the input data: "csv" or "parquet"
+  file_type: "csv"
+  # Separator for CSV files (e.g., comma, tab)
+  csv_separator: ","
+  # Quote character for CSV files
+  csv_quotechar: '"'
+  # Encoding of the input file
+  encoding: "utf-8"
+  # Target variable name
+  target_variable: "target" # REPLACE_ME: Name of your target variable
+  # Index column (optional)
+  index_column: null  # Set to column name if you have an index column
+
+# --- Feature Creation ---
+feature_creation:
+  # Enable or disable automated feature creation
+  enabled: true
+  # List of feature creation techniques to apply
+  # Available techniques: polynomial_features, interaction_terms, aggregation_features, datetime_features
+  techniques:
+    - "polynomial_features"
+    - "interaction_terms"
+    - "datetime_features"
+  # Polynomial Features configuration
+  polynomial_features:
+    degree: 2
+    include_bias: false
+  # Interaction Terms configuration
+  interaction_terms:
+    interaction_degree: 2
+  # Aggregation Features configuration (not in the default techniques list above)
+  aggregation_features:
+    # Group by column for aggregation (e.g., user_id, product_id)
+    group_by: "user_id" # REPLACE_ME: Column to group by for aggregation
+    # Aggregation functions to apply (e.g., mean, sum, count)
+    aggregations:
+      - "mean"
+      - "sum"
+  # Datetime Features configuration
+  datetime_features:
+    # List of datetime columns to extract features from
+    datetime_columns:
+      - "date" # REPLACE_ME: Name of your datetime column
+    # List of datetime features to extract (e.g., year, month, day, dayofweek)
+    features_to_extract:
+      - "year"
+      - "month"
+      - "dayofweek"
+
+# --- Feature Selection ---
+feature_selection:
+  # Enable or disable feature selection
+  enabled: true
+  # Selection method
+  # Available methods: variance_threshold, select_k_best, recursive_feature_elimination
+  method: "select_k_best"
+  # Variance Threshold configuration
+  variance_threshold:
+    threshold: 0.0
+  # Select K Best configuration
+  select_k_best:
+    k: 10 # REPLACE_ME: Number of top features to select
+    score_func: "f_classif"  # "f_classif" for classification targets; "f_regression" for regression
+  # Recursive Feature Elimination configuration
+  recursive_feature_elimination:
+    n_features_to_select: 10
+    step: 1
+
+# --- Feature Transformation ---
+feature_transformation:
+  # Enable or disable feature transformation
+  enabled: true
+  # Transformation method
+  # Available methods: standard_scaler, min_max_scaler, robust_scaler, power_transformer, quantile_transformer
+  method: "standard_scaler"
+  # Standard Scaler configuration (no specific parameters)
+  standard_scaler: {}
+  # Min-Max Scaler configuration
+  min_max_scaler:
+    feature_range: [0, 1]
+  # Robust Scaler configuration
+  robust_scaler:
+    quantile_range: [25.0, 75.0]
+  # Power Transformer configuration
+  power_transformer:
+    method: "yeo-johnson"
+    standardize: true
+  # Quantile Transformer configuration
+  quantile_transformer:
+    n_quantiles: 100
+    output_distribution: "uniform"
+
+# --- Output ---
+output:
+  # Path to save the processed data
+  output_path: "data/processed.csv" # REPLACE_ME: Path to save the processed data
+  # Save format for the output data
+  output_format: "csv"
+  # Save the feature engineering report
+  save_report: true
+  # Path to save the feature engineering report
+  report_path: "reports/feature_engineering_report.html"
+  # Save the trained model (if applicable, e.g., for importance-based feature selection)
+  save_model: false
+  # Path to save the trained model
+  model_path: "models/feature_selection_model.pkl"
+
+# --- Advanced Options ---
+advanced:
+  # Number of CPU cores to use for parallel processing
+  n_jobs: -1  # -1 uses all available cores
+  # Random seed for reproducibility
+  random_state: 42  # REPLACE_ME: Integer seed; keep it fixed for repeatable runs
+  # Verbosity level (0: silent, 1: info, 2: debug)
+  verbosity: 1
+  # Handle missing values (imputation)
+  handle_missing: true
+  # Imputation strategy (mean, median, most_frequent, constant)
+  imputation_strategy: "mean"
+  # Constant value for imputation (if imputation_strategy is "constant")
+  imputation_constant: 0