Initial commit

Zhongwei Li
2025-11-29 18:51:02 +08:00
commit b092d0393c
11 changed files with 371 additions and 0 deletions


@@ -0,0 +1,7 @@
# Assets
Bundled resources for the clustering-algorithm-runner skill.
- [ ] example_data.csv: Example CSV dataset for testing clustering algorithms.
- [ ] clustering_visualization.ipynb: Jupyter notebook demonstrating how to visualize clustering results using matplotlib and seaborn.
- [ ] config_template.json: Template JSON file for configuring clustering algorithm parameters.


@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
"""
Clustering Visualization Notebook
This notebook demonstrates how to visualize clustering results using matplotlib and seaborn.
It assumes that you have already run a clustering algorithm and have cluster assignments for your data.
Instructions:
1. Replace the placeholder data loading and clustering results with your actual data.
2. Adjust the visualization parameters (e.g., colors, markers) to suit your data and preferences.
3. Experiment with different visualization techniques to gain insights into your clustering results.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np  # used below to generate sample data
# --- Placeholder: Load your data here ---
# Replace this with your actual data loading code
# For example:
# data = pd.read_csv("your_data.csv")
# features = data[["feature1", "feature2"]] # Select the features used for clustering
# Generate some sample data if no data is loaded
np.random.seed(42) # for reproducibility
num_samples = 100
features = pd.DataFrame({
    'feature1': np.random.rand(num_samples),
    'feature2': np.random.rand(num_samples)
})
# --- Placeholder: Load your clustering results here ---
# Replace this with your actual cluster assignments
# For example:
# cluster_labels = model.labels_ # Assuming you used scikit-learn
# Or:
# cluster_labels = your_clustering_function(features)
# Generate some sample cluster labels if no cluster labels are loaded
num_clusters = 3
cluster_labels = np.random.randint(0, num_clusters, num_samples)
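# A minimal sketch of producing real labels instead, using scikit-learn's
# KMeans (assumes scikit-learn is installed; uncomment to replace the
# random labels above):
# from sklearn.cluster import KMeans
# model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42).fit(features)
# cluster_labels = model.labels_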
# --- Create a DataFrame for visualization ---
df = features.copy()
df['cluster'] = cluster_labels
# --- Visualization using matplotlib ---
plt.figure(figsize=(8, 6))
plt.title("Clustering Visualization (Matplotlib)")
# Define colors for each cluster
colors = ['red', 'green', 'blue', 'purple', 'orange'] # Add more colors if needed
for cluster_id in df['cluster'].unique():
    cluster_data = df[df['cluster'] == cluster_id]
    plt.scatter(cluster_data['feature1'], cluster_data['feature2'],
                color=colors[cluster_id % len(colors)],  # cycle through colors
                label=f'Cluster {cluster_id}')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()
plt.show()
# --- Visualization using seaborn ---
plt.figure(figsize=(8, 6))
plt.title("Clustering Visualization (Seaborn)")
sns.scatterplot(x='feature1', y='feature2', hue='cluster', data=df, palette='viridis') # or other palettes
plt.show()
# --- Additional Visualizations ---
# You can add more visualizations here, such as:
# - Pair plots
# - Box plots
# - Histograms
# Example: Pair plot
# sns.pairplot(df, hue='cluster')
# plt.show()
# --- Summary and Interpretation ---
# Add your interpretation of the clustering results here.
# For example:
# "The clustering algorithm has successfully separated the data into distinct groups based on feature1 and feature2."
# "Cluster 0 represents the group with low values for both feature1 and feature2."
# "Cluster 1 represents the group with high values for feature1 and low values for feature2."
# "Cluster 2 represents the group with high values for both feature1 and feature2."
# --- Next Steps ---
# Consider the following next steps:
# - Evaluate the clustering performance using metrics like silhouette score or Davies-Bouldin index.
# - Tune the clustering algorithm parameters to improve the results.
# - Investigate the characteristics of each cluster to gain insights into the data.
# - Use the clustering results for downstream tasks, such as customer segmentation or anomaly detection.
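# --- Optional: numeric evaluation (a minimal sketch) ---
# Assumes scikit-learn is available. Both metrics are computed from the
# features and the predicted labels only; no ground-truth labels are needed.
# from sklearn.metrics import silhouette_score, davies_bouldin_score
# print("Silhouette score:", silhouette_score(features, cluster_labels))
# print("Davies-Bouldin index:", davies_bouldin_score(features, cluster_labels))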


@@ -0,0 +1,63 @@
{
  "_comment": "Configuration template for clustering algorithms. Keys starting with _comment are documentation only; JSON forbids duplicate keys, so each comment key is unique.",
  "dataset_name": "example_dataset.csv",
  "_comment_dataset_name": "Path to the dataset file (CSV format)",
  "algorithm": "KMeans",
  "_comment_algorithm": "Clustering algorithm to use (e.g., KMeans, DBSCAN, AgglomerativeClustering)",
  "algorithm_params": {
    "_comment": "Parameters specific to the chosen algorithm",
    "KMeans": {
      "n_clusters": 3,
      "_comment_n_clusters": "Number of clusters for KMeans",
      "init": "k-means++",
      "_comment_init": "Initialization method for KMeans",
      "max_iter": 300,
      "_comment_max_iter": "Maximum number of iterations for KMeans"
    },
    "DBSCAN": {
      "eps": 0.5,
      "_comment_eps": "Maximum distance between two samples for them to be considered neighbors",
      "min_samples": 5,
      "_comment_min_samples": "Number of samples in a neighborhood for a point to be considered a core point"
    },
    "AgglomerativeClustering": {
      "n_clusters": 3,
      "_comment_n_clusters": "Number of clusters to find",
      "linkage": "ward",
      "_comment_linkage": "Linkage criterion: which distance to use between sets of observations. The algorithm merges the pair of clusters that minimizes this criterion.",
      "affinity": "euclidean",
      "_comment_affinity": "Metric used to compute the linkage: 'euclidean', 'l1', 'l2', 'manhattan', 'cosine', or 'precomputed'. If linkage is 'ward', only 'euclidean' is accepted. (scikit-learn renamed this parameter to 'metric' in 1.2.)"
    }
  },
  "preprocessing": {
    "_comment": "Preprocessing steps to apply to the dataset",
    "scale_data": true,
    "_comment_scale_data": "Whether to scale the data using StandardScaler",
    "handle_missing_values": "impute",
    "_comment_handle_missing_values": "How to handle missing values ('drop', 'impute', or null for none)",
    "imputation_strategy": "mean",
    "_comment_imputation_strategy": "Strategy to use for imputation (e.g., 'mean', 'median', 'most_frequent')"
  },
  "output": {
    "_comment": "Output configuration",
    "output_file": "clustering_results.csv",
    "_comment_output_file": "Name of the output CSV file for the cluster assignments",
    "include_original_data": true,
    "_comment_include_original_data": "Whether to include the original data in the output file along with cluster assignments"
  },
  "evaluation_metrics": [
    "silhouette_score",
    "davies_bouldin_score",
    "calinski_harabasz_score"
  ],
  "_comment_evaluation_metrics": "Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index. All three are internal metrics computed from the features and predicted labels; no ground-truth labels are required.",
  "advanced_options": {
    "_comment": "Advanced options for controlling the execution",
    "random_state": 42,
    "_comment_random_state": "Random seed for reproducibility",
    "n_jobs": -1,
    "_comment_n_jobs": "Number of CPU cores to use (-1 for all cores)"
  }
}
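Because the template keeps its documentation in `_comment*` keys, any runner has to strip those out before handing parameters to a library. A minimal sketch of consuming the template with scikit-learn (the direct class mapping is an illustration under that assumption, not the plugin's actual runner):

import json
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

with open("config_template.json") as f:
    cfg = json.load(f)

# Map the "algorithm" field to a scikit-learn estimator class.
ALGORITHMS = {
    "KMeans": KMeans,
    "DBSCAN": DBSCAN,
    "AgglomerativeClustering": AgglomerativeClustering,
}

name = cfg["algorithm"]
# Drop the _comment* keys so only real parameters reach the constructor.
# (Note: newer scikit-learn expects 'metric' rather than 'affinity' for
# AgglomerativeClustering.)
params = {k: v for k, v in cfg["algorithm_params"][name].items()
          if not k.startswith("_comment")}
model = ALGORITHMS[name](**params)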


@@ -0,0 +1,24 @@
# Example CSV data for clustering algorithm testing.
#
# This file demonstrates the expected format for input data to the clustering algorithms.
# Each row represents a data point, and each column represents a feature.
#
# Replace the placeholder data below with your own dataset.
# You can add or remove columns as needed, but ensure the column headers are descriptive.
#
# Note: The first row MUST be the header row containing column names.
#
# Example usage:
# 1. Save this file as 'example_data.csv' or a similar descriptive name.
# 2. Run the clustering algorithm using the command: /run-clustering
# (The plugin will prompt you for the data file name if it isn't the default).
feature_1,feature_2,feature_3
1.0,2.0,3.0
4.0,5.0,6.0
7.0,8.0,9.0
10.0,11.0,12.0
13.0,14.0,15.0
# Add more data points here...
# Example: 16.0,17.0,18.0
# Example: 19.0,20.0,21.0
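Since the `#` comment lines above are stored inside the CSV itself, the file needs a comment-aware reader. A minimal pandas sketch, assuming the file is saved as example_data.csv:

import pandas as pd

# Skip the '#' comment lines; pandas would otherwise treat them as data rows.
data = pd.read_csv("example_data.csv", comment="#")
print(data.head())  # columns: feature_1, feature_2, feature_3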