# pipeline_template.yaml

# --- General Pipeline Configuration ---
pipeline_name: "AutoML Pipeline - REPLACE_ME" # Name of the pipeline (e.g., Customer Churn Prediction)
description: "Automated Machine Learning pipeline for REPLACE_ME." # Short description of the pipeline's purpose
version: "1.0.0" # Pipeline version

# --- Data Source Configuration ---
data_source:
  type: "csv" # Type of data source (e.g., csv, database, api)
  location: "data/YOUR_DATASET.csv" # Path to the data file or connection string
  target_column: "target" # Name of the target variable column
  index_column: null # Name of the index column (optional)
  delimiter: "," # Delimiter for CSV files (e.g., ",", ";", "\t")
  quotechar: '"' # Quote character for CSV files
  encoding: "utf-8" # Encoding of the data file

# --- Feature Engineering Configuration ---
feature_engineering:
  enabled: true # Enable or disable feature engineering
  numeric_imputation: "mean" # Strategy for imputing missing numeric values (e.g., mean, median, most_frequent, constant)
  categorical_encoding: "onehot" # Method for encoding categorical features (e.g., onehot, ordinal, target)
  feature_scaling: "standard" # Scaling method for numeric features (e.g., standard, minmax, robust)
  feature_selection:
    enabled: false # Enable or disable feature selection
    method: "variance_threshold" # Feature selection method (e.g., variance_threshold, selectkbest)
    threshold: 0.01 # Threshold for feature selection (interpretation depends on the method)

# --- Model Training Configuration ---
model_training:
  algorithm: "xgboost" # Machine learning algorithm to use (e.g., xgboost, lightgbm, randomforest, logisticregression)
  hyperparameter_tuning:
    enabled: true # Enable or disable hyperparameter tuning
    method: "random_search" # Hyperparameter tuning method (e.g., random_search, grid_search, bayesian_optimization)
    n_trials: 50 # Number of trials for hyperparameter tuning
    scoring_metric: "roc_auc" # Metric to optimize (e.g., roc_auc, accuracy, f1, precision, recall)
    hyperparameter_space: # Define hyperparameter ranges for each algorithm
      xgboost: # Example for XGBoost
        n_estimators: [100, 200, 300]
        learning_rate: [0.01, 0.1, 0.2]
        max_depth: [3, 5, 7]
      # Add hyperparameter spaces for other algorithms as needed

# --- Model Evaluation Configuration ---
model_evaluation:
  split_ratio: 0.2 # Fraction of the data held out for the validation set (remainder is used for training)
  scoring_metrics: ["roc_auc", "accuracy", "f1", "precision", "recall"] # List of metrics to evaluate the model
  cross_validation:
    enabled: true # Enable or disable cross-validation
    n_folds: 5 # Number of folds for cross-validation

# --- Model Deployment Configuration ---
model_deployment:
  enabled: false # Enable or disable model deployment
  environment: "staging" # Target deployment environment (e.g., staging, production)
  model_registry: "local" # Location to store the trained model (e.g., local, s3, gcp)
  model_path: "models/YOUR_MODEL.pkl" # Path to save the trained model
  api_endpoint: "YOUR_API_ENDPOINT" # API endpoint for model deployment (if applicable)

# --- Logging Configuration ---
logging:
  level: "INFO" # Logging level (e.g., DEBUG, INFO, WARNING, ERROR)
  format: "%(asctime)s - %(levelname)s - %(message)s" # Logging format
  file_path: "logs/pipeline.log" # Path to the log file

# --- Error Handling Configuration ---
error_handling:
  on_failure: "email_notification" # Action to take on pipeline failure (e.g., email_notification, retry, stop)
  email_recipients: ["YOUR_EMAIL@example.com"] # List of email addresses to notify on failure
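
# --- Usage (illustrative) ---
# A minimal sketch of how a pipeline runner might load this template, assuming PyYAML
# is available and the file is saved as pipeline_template.yaml (kept as comments so the
# file remains valid YAML; adapt to your own loader):
#
#   import yaml
#
#   with open("pipeline_template.yaml", "r", encoding="utf-8") as f:
#       config = yaml.safe_load(f)  # parse the template into a nested dict
#
#   print(config["pipeline_name"])                      # "AutoML Pipeline - REPLACE_ME"
#   print(config["model_training"]["algorithm"])        # "xgboost"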