# Configuration template for the feature engineering toolkit plugin
# This file allows you to customize the feature engineering pipeline.

# --- Data Input ---
data_input:
  # Path to the input data file (CSV, Parquet, etc.)
  # Supported formats: csv, parquet
  data_path: "data/input.csv"  # REPLACE_ME: Path to your input data
  # File type of the input data
  file_type: "csv"
  # Separator for CSV files (e.g., comma, tab)
  csv_separator: ","
  # Quote character for CSV files
  csv_quotechar: '"'
  # Encoding of the input file
  encoding: "utf-8"
  # Target variable name
  target_variable: "target"  # REPLACE_ME: Name of your target variable
  # Index column (optional)
  index_column: null  # Set to a column name if your data has an index column
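  # Illustrative alternative (assumption, not part of the default template):
  # to read a Parquet file instead of CSV, you would set, for example:
  #   data_path: "data/input.parquet"
  #   file_type: "parquet"
  # The csv_* options above presumably apply only to CSV input.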

# --- Feature Creation ---
feature_creation:
  # Enable or disable automated feature creation
  enabled: true
  # List of feature creation techniques to apply
  # Available techniques: polynomial_features, interaction_terms, aggregation_features, datetime_features
  techniques:
    - "polynomial_features"
    - "interaction_terms"
    - "datetime_features"
  # Polynomial Features configuration
  polynomial_features:
    degree: 2
    include_bias: false
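  # Illustration (assuming the usual polynomial-features convention): with
  # degree: 2 and include_bias: false, input columns x1, x2 expand to
  # x1, x2, x1^2, x1*x2, x2^2 (no constant bias column).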
  # Interaction Terms configuration
  interaction_terms:
    interaction_degree: 2
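  # Illustration (assumption about this plugin's behaviour): degree-2 interaction
  # terms are pairwise products of distinct columns, e.g. x1*x2, x1*x3, x2*x3,
  # without the pure powers x1^2, x2^2.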
  # Aggregation Features configuration
  aggregation_features:
    # Group by column for aggregation (e.g., user_id, product_id)
    group_by: "user_id"  # REPLACE_ME: Column to group by for aggregation
    # Aggregation functions to apply (e.g., mean, sum, count)
    aggregations:
      - "mean"
      - "sum"
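  # Illustration (assumption): with group_by: "user_id" and aggregations mean and
  # sum, each numeric column is summarised per user and joined back onto the rows;
  # a hypothetical "amount" column would yield its per-user mean and sum.
  # Note: aggregation_features is configured here but presumably only runs if
  # "aggregation_features" is also added to the techniques list above.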
  # Datetime Features configuration
  datetime_features:
    # List of datetime columns to extract features from
    datetime_columns:
      - "date"  # REPLACE_ME: Name of your datetime column
    # List of datetime features to extract (e.g., year, month, day, dayofweek)
    features_to_extract:
      - "year"
      - "month"
      - "dayofweek"

# --- Feature Selection ---
feature_selection:
  # Enable or disable feature selection
  enabled: true
  # Selection method
  # Available methods: variance_threshold, select_k_best, recursive_feature_elimination
  method: "select_k_best"
  # Variance Threshold configuration
  variance_threshold:
    threshold: 0.0
  # Select K Best configuration
  select_k_best:
    k: 10  # REPLACE_ME: Number of top features to select
    score_func: "f_classif"  # "f_classif" for classification targets, "f_regression" for regression
  # Recursive Feature Elimination configuration
  recursive_feature_elimination:
    n_features_to_select: 10
    step: 1
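  # Note (assumption): only the block matching `method` above is applied; the
  # other method blocks are kept for convenience and ignored at run time.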

# --- Feature Transformation ---
feature_transformation:
  # Enable or disable feature transformation
  enabled: true
  # Transformation method
  # Available methods: standard_scaler, min_max_scaler, robust_scaler, power_transformer, quantile_transformer
  method: "standard_scaler"
  # Standard Scaler configuration (no specific parameters)
  standard_scaler: {}
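  # Illustration: standard scaling maps each feature x to (x - mean(x)) / std(x),
  # giving zero mean and unit variance.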
  # Min-Max Scaler configuration
  min_max_scaler:
    feature_range: [0, 1]
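  # Illustration: min-max scaling maps each feature x to
  # (x - min(x)) / (max(x) - min(x)), rescaled into feature_range.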
  # Robust Scaler configuration
  robust_scaler:
    quantile_range: [25.0, 75.0]
  # Power Transformer configuration
  power_transformer:
    method: "yeo-johnson"
    standardize: true
  # Quantile Transformer configuration
  quantile_transformer:
    n_quantiles: 100
    output_distribution: "uniform"

# --- Output ---
output:
  # Path to save the processed data
  output_path: "data/processed.csv"  # REPLACE_ME: Path to save the processed data
  # Save format for the output data
  output_format: "csv"
  # Save the feature engineering report
  save_report: true
  # Path to save the feature engineering report
  report_path: "reports/feature_engineering_report.html"
  # Save the trained model (if applicable, e.g., for feature-importance-based selection)
  save_model: false
  # Path to save the trained model
  model_path: "models/feature_selection_model.pkl"
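  # Illustrative alternative (assumption: parquet output is supported, mirroring
  # the supported input formats):
  #   output_path: "data/processed.parquet"
  #   output_format: "parquet"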

# --- Advanced Options ---
advanced:
  # Number of CPU cores to use for parallel processing
  n_jobs: -1  # -1 uses all available cores
  # Random seed for reproducibility
  random_state: 42  # Any fixed integer gives reproducible runs
  # Verbosity level (0: silent, 1: info, 2: debug)
  verbosity: 1
  # Handle missing values (imputation)
  handle_missing: true
  # Imputation strategy (mean, median, most_frequent, constant)
  imputation_strategy: "mean"
  # Constant value for imputation (if imputation_strategy is "constant")
  imputation_constant: 0
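  # Illustrative alternative: to fill missing values with a fixed number instead
  # of the column mean, set for example:
  #   imputation_strategy: "constant"
  #   imputation_constant: -999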