Files
2025-11-29 18:50:58 +08:00

69 lines
3.6 KiB
YAML

# pipeline_template.yaml
# --- General Pipeline Configuration ---

# Human-readable name of the pipeline (e.g., Customer Churn Prediction).
pipeline_name: "AutoML Pipeline - REPLACE_ME"

# Short description of the pipeline's purpose.
description: "Automated Machine Learning pipeline for REPLACE_ME."

# Pipeline version; quoted so it is always read as a string.
version: "1.0.0"
# --- Data Source Configuration ---
# Where the input data lives and how to read it. All keys below are children
# of `data_source`; they must stay indented under it.
data_source:
  # Type of data source (e.g., csv, database, api).
  type: "csv"
  # Path to the data file, or a connection string.
  location: "data/YOUR_DATASET.csv"
  # Name of the target variable column.
  target_column: "target"
  # Name of the index column (optional; leave as null when unused).
  index_column: null
  # Delimiter for CSV files (e.g., ",", ";", "\t").
  delimiter: ","
  # Quote character for CSV files.
  quotechar: '"'
  # Encoding of the data file.
  encoding: "utf-8"
# --- Feature Engineering Configuration ---
# Preprocessing applied to the features. `feature_selection` is a nested
# group with its own `enabled` switch, separate from the section-level one.
feature_engineering:
  # Enable or disable feature engineering.
  enabled: true
  # Strategy for handling missing numerical values
  # (e.g., mean, median, most_frequent, constant).
  numeric_imputation: "mean"
  # Method for encoding categorical features (e.g., onehot, ordinal, target).
  categorical_encoding: "onehot"
  # Scaling method for numeric features (e.g., standard, minmax, robust).
  feature_scaling: "standard"
  feature_selection:
    # Enable or disable feature selection.
    enabled: false
    # Feature selection method (e.g., variance_threshold, selectkbest).
    method: "variance_threshold"
    # Threshold for feature selection (meaning depends on the method).
    threshold: 0.01
# --- Model Training Configuration ---
# NOTE(review): the nesting in this section was reconstructed from key order
# and the per-key comments. Confirm against the code that reads this file
# whether `hyperparameter_space` is a direct child of `model_training` (as
# written here) or belongs inside `hyperparameter_tuning`.
model_training:
  # Machine learning algorithm to use
  # (e.g., xgboost, lightgbm, randomforest, logisticregression).
  algorithm: "xgboost"
  hyperparameter_tuning:
    # Enable or disable hyperparameter tuning.
    enabled: true
    # Hyperparameter tuning method
    # (e.g., random_search, grid_search, bayesian_optimization).
    method: "random_search"
    # Number of trials for hyperparameter tuning.
    n_trials: 50
    # Metric to optimize for (e.g., roc_auc, accuracy, f1, precision, recall).
    scoring_metric: "roc_auc"
  # Hyperparameter ranges, one entry per algorithm name.
  hyperparameter_space:
    # Example for XGBoost.
    xgboost:
      n_estimators: [100, 200, 300]
      learning_rate: [0.01, 0.1, 0.2]
      max_depth: [3, 5, 7]
    # Add hyperparameter spaces for other algorithms as needed.
# --- Model Evaluation Configuration ---
# How the trained model is scored. `cross_validation` is a nested group with
# its own `enabled` switch.
model_evaluation:
  # Ratio for splitting data into training and validation sets.
  # NOTE(review): presumably the fraction held out for validation - confirm
  # with the code that consumes this value.
  split_ratio: 0.2
  # List of metrics to evaluate the model.
  scoring_metrics: ["roc_auc", "accuracy", "f1", "precision", "recall"]
  cross_validation:
    # Enable or disable cross-validation.
    enabled: true
    # Number of folds for cross-validation.
    n_folds: 5
# --- Model Deployment Configuration ---
# Where and whether the trained model is published. All keys below are
# children of `model_deployment`; they must stay indented under it.
model_deployment:
  # Enable or disable model deployment.
  enabled: false
  # Target deployment environment (e.g., staging, production).
  environment: "staging"
  # Location to store the trained model (e.g., local, s3, gcp).
  model_registry: "local"
  # Path to save the trained model.
  model_path: "models/YOUR_MODEL.pkl"
  # API endpoint for model deployment (if applicable).
  api_endpoint: "YOUR_API_ENDPOINT"
# --- Logging Configuration ---
# Log verbosity, line layout, and destination. All keys below are children
# of `logging`; they must stay indented under it.
logging:
  # Logging level (e.g., DEBUG, INFO, WARNING, ERROR).
  level: "INFO"
  # Logging format: timestamp, level name, and message, using %(name)s-style
  # fields. Keep it quoted, since the value starts with `%`.
  format: "%(asctime)s - %(levelname)s - %(message)s"
  # Path to the log file.
  file_path: "logs/pipeline.log"
# --- Error Handling Configuration ---
# What happens when the pipeline fails. All keys below are children of
# `error_handling`; they must stay indented under it.
error_handling:
  # Action to take on pipeline failure (e.g., email_notification, retry, stop).
  on_failure: "email_notification"
  # List of email addresses to notify on failure.
  email_recipients: ["YOUR_EMAIL@example.com"]