---
# pipeline_template.yaml
# Template configuration for an AutoML pipeline. Replace the REPLACE_ME
# placeholders below before using this file.

# --- General Pipeline Configuration ---
# Name of the pipeline (e.g., Customer Churn Prediction)
pipeline_name: "AutoML Pipeline - REPLACE_ME"
# Short description of the pipeline's purpose
description: "Automated Machine Learning pipeline for REPLACE_ME."
# Pipeline version (kept quoted so it is always read as a string)
version: "1.0.0"
# --- Data Source Configuration ---
data_source:
  # Type of data source (e.g., csv, database, api)
  type: "csv"
  # Path to the data file or connection string
  location: "data/YOUR_DATASET.csv"
  # Name of the target variable column
  target_column: "target"
  # Name of the index column (optional; null means none is configured)
  index_column: null
  # Delimiter for CSV files (e.g., ",", ";", "\t")
  delimiter: ","
  # Quote character for CSV files
  quotechar: '"'
  # Encoding of the data file
  encoding: "utf-8"
# --- Feature Engineering Configuration ---
feature_engineering:
  # Enable or disable feature engineering
  enabled: true
  # Strategy for handling missing numerical values
  # (e.g., mean, median, most_frequent, constant)
  numeric_imputation: "mean"
  # Method for encoding categorical features (e.g., onehot, ordinal, target)
  categorical_encoding: "onehot"
  # Scaling method for numeric features (e.g., standard, minmax, robust)
  feature_scaling: "standard"
  # Feature selection has its own on/off switch, separate from `enabled` above
  feature_selection:
    # Enable or disable feature selection
    enabled: false
    # Feature selection method (e.g., variance_threshold, selectkbest)
    method: "variance_threshold"
    # Threshold for feature selection (depends on the method)
    threshold: 0.01
# --- Model Training Configuration ---
model_training:
  # Machine learning algorithm to use
  # (e.g., xgboost, lightgbm, randomforest, logisticregression)
  algorithm: "xgboost"
  hyperparameter_tuning:
    # Enable or disable hyperparameter tuning
    enabled: true
    # Hyperparameter tuning method
    # (e.g., random_search, grid_search, bayesian_optimization)
    method: "random_search"
    # Number of trials for hyperparameter tuning
    n_trials: 50
    # Metric to optimize for (e.g., roc_auc, accuracy, f1, precision, recall)
    scoring_metric: "roc_auc"
  # Define hyperparameter ranges for each algorithm
  # NOTE(review): placed as a sibling of `hyperparameter_tuning`; confirm the
  # consumer does not expect it nested inside `hyperparameter_tuning` instead.
  hyperparameter_space:
    # Example for XGBoost
    xgboost:
      n_estimators: [100, 200, 300]
      learning_rate: [0.01, 0.1, 0.2]
      max_depth: [3, 5, 7]
    # Add hyperparameter spaces for other algorithms as needed
# --- Model Evaluation Configuration ---
model_evaluation:
  # Ratio for splitting data into training and validation sets
  # NOTE(review): presumably the held-out (validation) fraction - confirm
  # against the code that consumes this value.
  split_ratio: 0.2
  # List of metrics to evaluate the model
  scoring_metrics: ["roc_auc", "accuracy", "f1", "precision", "recall"]
  cross_validation:
    # Enable or disable cross-validation
    enabled: true
    # Number of folds for cross-validation
    n_folds: 5
# --- Model Deployment Configuration ---
model_deployment:
  # Enable or disable model deployment
  enabled: false
  # Target deployment environment (e.g., staging, production)
  environment: "staging"
  # Location to store the trained model (e.g., local, s3, gcp)
  model_registry: "local"
  # Path to save the trained model (placeholder - replace YOUR_MODEL)
  model_path: "models/YOUR_MODEL.pkl"
  # API endpoint for model deployment, if applicable (placeholder)
  api_endpoint: "YOUR_API_ENDPOINT"
# --- Logging Configuration ---
logging:
  # Logging level (e.g., DEBUG, INFO, WARNING, ERROR)
  level: "INFO"
  # Logging format
  format: "%(asctime)s - %(levelname)s - %(message)s"
  # Path to the log file
  file_path: "logs/pipeline.log"
# --- Error Handling Configuration ---
error_handling:
  # Action to take on pipeline failure (e.g., email_notification, retry, stop)
  on_failure: "email_notification"
  # List of email addresses to notify on failure (placeholder - replace)
  email_recipients: ["YOUR_EMAIL@example.com"]