Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:50:58 +08:00
commit 3fb2d73fdf
11 changed files with 488 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
# Assets
Bundled resources for the automl-pipeline-builder skill:
- [ ] pipeline_template.yaml: YAML template for defining the structure and configuration of the AutoML pipeline.
- [ ] example_dataset.csv: Sample dataset that can be used as input for the AutoML pipeline.
- [ ] evaluation_report_template.html: HTML template for generating the model evaluation report.

View File

@@ -0,0 +1,203 @@
<!DOCTYPE html>
<!--
  AutoML model evaluation report template.
  {{placeholder}} tokens are substituted at render time; the syntax looks like
  a Mustache/Jinja-style engine — NOTE(review): confirm which engine the
  pipeline uses before changing any delimiter.
-->
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>AutoML Model Evaluation Report</title>
<style>
/* Basic Reset */
body, h1, h2, h3, p, table, th, td {
margin: 0;
padding: 0;
border: 0;
font-size: 100%;
font: inherit;
vertical-align: baseline;
}
/* General Styles */
body {
font-family: sans-serif;
line-height: 1.6;
background-color: #f4f4f4;
color: #333;
padding: 20px;
}
.container {
max-width: 960px;
margin: 0 auto;
background-color: #fff;
padding: 20px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
h1, h2, h3 {
margin-bottom: 15px;
color: #0056b3;
}
h1 {
font-size: 2.5em;
}
h2 {
font-size: 2em;
}
h3 {
font-size: 1.5em;
}
p {
margin-bottom: 15px;
}
/* Table Styles */
table {
width: 100%;
border-collapse: collapse;
margin-bottom: 20px;
}
th, td {
padding: 12px 15px;
text-align: left;
border-bottom: 1px solid #ddd;
}
th {
background-color: #f0f0f0;
font-weight: bold;
}
/* Responsive Design */
/* Below 768px: tighter padding, smaller headings, horizontally scrollable tables. */
@media (max-width: 768px) {
.container {
padding: 15px;
}
h1 {
font-size: 2em;
}
h2 {
font-size: 1.6em;
}
h3 {
font-size: 1.3em;
}
table {
display: block;
overflow-x: auto;
}
}
/* Specific Styles */
.model-summary {
margin-bottom: 30px;
}
.evaluation-metrics {
margin-bottom: 30px;
}
.visualizations {
margin-bottom: 30px;
}
.conclusion {
margin-bottom: 20px;
}
.visualization-image {
max-width: 100%;
height: auto;
border: 1px solid #ccc;
border-radius: 5px;
margin-bottom: 10px;
}
</style>
</head>
<body>
<div class="container">
<!-- Report Header -->
<h1>AutoML Model Evaluation Report</h1>
<p>Generated on: {{generation_date}}</p>
<!-- Model Summary -->
<section class="model-summary">
<h2>Model Summary</h2>
<p><strong>Model Name:</strong> {{model_name}}</p>
<p><strong>Algorithm:</strong> {{algorithm}}</p>
<p><strong>Dataset:</strong> {{dataset_name}}</p>
<p><strong>Features Used:</strong> {{features_used}}</p>
</section>
<!-- Evaluation Metrics -->
<section class="evaluation-metrics">
<h2>Evaluation Metrics</h2>
<table>
<thead>
<tr>
<th>Metric</th>
<th>Value</th>
</tr>
</thead>
<tbody>
<tr>
<td>Accuracy</td>
<td>{{accuracy}}</td>
</tr>
<tr>
<td>Precision</td>
<td>{{precision}}</td>
</tr>
<tr>
<td>Recall</td>
<td>{{recall}}</td>
</tr>
<tr>
<td>F1-Score</td>
<td>{{f1_score}}</td>
</tr>
<tr>
<td>AUC-ROC</td>
<td>{{auc_roc}}</td>
</tr>
</tbody>
</table>
</section>
<!-- Visualizations -->
<!-- Image placeholders expect a URL or path resolvable by the rendered page. -->
<section class="visualizations">
<h2>Visualizations</h2>
<h3>Confusion Matrix</h3>
<img src="{{confusion_matrix_image}}" alt="Confusion Matrix" class="visualization-image">
<h3>ROC Curve</h3>
<img src="{{roc_curve_image}}" alt="ROC Curve" class="visualization-image">
<h3>Feature Importance</h3>
<img src="{{feature_importance_image}}" alt="Feature Importance" class="visualization-image">
</section>
<!-- Conclusion -->
<section class="conclusion">
<h2>Conclusion</h2>
<p>{{conclusion_text}}</p>
</section>
<!-- Additional Notes -->
<section class="notes">
<h3>Additional Notes</h3>
<p>{{additional_notes}}</p>
</section>
</div>
</body>
</html>

View File

@@ -0,0 +1,36 @@
# Sample dataset for AutoML pipeline builder plugin
# This dataset is a simplified example and may not be suitable for all AutoML tasks.
# Replace it with your actual dataset for optimal results.
# Note: lines beginning with '#' are not standard CSV; strip them before parsing,
# or load with a comment-aware reader (e.g., pandas.read_csv(..., comment='#')).
#
# Columns:
# feature1: Numerical feature (e.g., age, income)
# feature2: Categorical feature (e.g., city, product type) - encoded as strings
# target: Target variable (e.g., churn, conversion) - binary (0 or 1)
feature1,feature2,target
25,New York,0
30,Los Angeles,1
40,Chicago,0
22,Houston,0
35,Phoenix,1
48,Philadelphia,1
28,San Antonio,0
32,San Diego,1
45,Dallas,0
27,San Jose,0
31,Austin,1
38,Jacksonville,0
24,Fort Worth,0
41,Columbus,1
29,Charlotte,0
33,San Francisco,1
46,Indianapolis,1
23,Seattle,0
36,Denver,1
49,Washington,1
# Add more data rows here. Aim for a larger dataset (hundreds or thousands of rows) for better AutoML performance.
# Example:
# 52,Miami,0
# 39,Boston,1
# Consider adding missing values (e.g., empty strings) to test the pipeline's handling of missing data.
# For categorical features with many unique values, consider using techniques like one-hot encoding or target encoding.
1 # Sample dataset for AutoML pipeline builder plugin
2 # This dataset is a simplified example and may not be suitable for all AutoML tasks.
3 # Replace this with your actual dataset for optimal results.
4 #
5 # Columns:
6 # feature1: Numerical feature (e.g., age, income)
7 # feature2: Categorical feature (e.g., city, product type) - encoded as strings
8 # target: Target variable (e.g., churn, conversion) - binary (0 or 1)
9 feature1,feature2,target
10 25,New York,0
11 30,Los Angeles,1
12 40,Chicago,0
13 22,Houston,0
14 35,Phoenix,1
15 48,Philadelphia,1
16 28,San Antonio,0
17 32,San Diego,1
18 45,Dallas,0
19 27,San Jose,0
20 31,Austin,1
21 38,Jacksonville,0
22 24,Fort Worth,0
23 41,Columbus,1
24 29,Charlotte,0
25 33,San Francisco,1
26 46,Indianapolis,1
27 23,Seattle,0
28 36,Denver,1
29 49,Washington,1
30 # Add more data rows here. Aim for a larger dataset (hundreds or thousands of rows) for better AutoML performance.
31 # Example:
32 # 52,Miami,0
33 # 39,Boston,1
34 # Consider adding missing values (e.g., empty strings) to test the pipeline's handling of missing data.
35 # For categorical features with many unique values, consider using techniques like one-hot encoding or target encoding.

View File

@@ -0,0 +1,69 @@
---
# pipeline_template.yaml
#
# Configuration template for an AutoML pipeline run. Replace the
# REPLACE_ME / YOUR_* placeholders before use.
#
# NOTE(review): the source view had lost all leading indentation; the nesting
# below is reconstructed from the section comments and key semantics — confirm
# it against the consuming pipeline code.

# --- General Pipeline Configuration ---
pipeline_name: "AutoML Pipeline - REPLACE_ME"  # Name of the pipeline (e.g., Customer Churn Prediction)
description: "Automated Machine Learning pipeline for REPLACE_ME."  # Short description of the pipeline's purpose
version: "1.0.0"  # Pipeline version (quoted so it stays a string, not a float)

# --- Data Source Configuration ---
data_source:
  type: "csv"  # Type of data source (e.g., csv, database, api)
  location: "data/YOUR_DATASET.csv"  # Path to the data file or connection string
  target_column: "target"  # Name of the target variable column
  index_column: null  # Name of the index column (optional)
  delimiter: ","  # Delimiter for CSV files (e.g., ",", ";", "\t")
  quotechar: '"'  # Quote character for CSV files
  encoding: "utf-8"  # Encoding of the data file

# --- Feature Engineering Configuration ---
feature_engineering:
  enabled: true  # Enable or disable feature engineering
  numeric_imputation: "mean"  # Strategy for missing numerical values (e.g., mean, median, most_frequent, constant)
  categorical_encoding: "onehot"  # Method for encoding categorical features (e.g., onehot, ordinal, target)
  feature_scaling: "standard"  # Scaling method for numeric features (e.g., standard, minmax, robust)
  feature_selection:
    enabled: false  # Enable or disable feature selection
    method: "variance_threshold"  # Feature selection method (e.g., variance_threshold, selectkbest)
    threshold: 0.01  # Threshold for feature selection (depends on the method)

# --- Model Training Configuration ---
model_training:
  algorithm: "xgboost"  # ML algorithm to use (e.g., xgboost, lightgbm, randomforest, logisticregression)
  hyperparameter_tuning:
    enabled: true  # Enable or disable hyperparameter tuning
    method: "random_search"  # Tuning method (e.g., random_search, grid_search, bayesian_optimization)
    n_trials: 50  # Number of trials for hyperparameter tuning
    scoring_metric: "roc_auc"  # Metric to optimize for (e.g., roc_auc, accuracy, f1, precision, recall)
  hyperparameter_space:  # Define hyperparameter ranges for each algorithm
    xgboost:  # Example for XGBoost
      n_estimators: [100, 200, 300]
      learning_rate: [0.01, 0.1, 0.2]
      max_depth: [3, 5, 7]
    # Add hyperparameter spaces for other algorithms as needed

# --- Model Evaluation Configuration ---
model_evaluation:
  split_ratio: 0.2  # Ratio for splitting data into training and validation sets
  scoring_metrics: ["roc_auc", "accuracy", "f1", "precision", "recall"]  # Metrics to evaluate the model
  cross_validation:
    enabled: true  # Enable or disable cross-validation
    n_folds: 5  # Number of folds for cross-validation

# --- Model Deployment Configuration ---
model_deployment:
  enabled: false  # Enable or disable model deployment
  environment: "staging"  # Target deployment environment (e.g., staging, production)
  model_registry: "local"  # Location to store the trained model (e.g., local, s3, gcp)
  model_path: "models/YOUR_MODEL.pkl"  # Path to save the trained model
  api_endpoint: "YOUR_API_ENDPOINT"  # API endpoint for model deployment (if applicable)

# --- Logging Configuration ---
logging:
  level: "INFO"  # Logging level (e.g., DEBUG, INFO, WARNING, ERROR)
  format: "%(asctime)s - %(levelname)s - %(message)s"  # Logging format (quoted: leading % is a YAML directive indicator)
  file_path: "logs/pipeline.log"  # Path to the log file

# --- Error Handling Configuration ---
error_handling:
  on_failure: "email_notification"  # Action on pipeline failure (e.g., email_notification, retry, stop)
  email_recipients: ["YOUR_EMAIL@example.com"]  # Email addresses to notify on failure