Initial commit

Zhongwei Li
2025-11-29 18:51:02 +08:00
commit b092d0393c
11 changed files with 371 additions and 0 deletions


@@ -0,0 +1,7 @@
# Assets
Bundled resources for the clustering-algorithm-runner skill.
- [ ] example_data.csv: Example CSV dataset for testing clustering algorithms.
- [ ] clustering_visualization.ipynb: Jupyter notebook demonstrating how to visualize clustering results using matplotlib and seaborn.
- [ ] config_template.json: Template JSON file for configuring clustering algorithm parameters.


@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
"""
Clustering Visualization Notebook
This notebook demonstrates how to visualize clustering results using matplotlib and seaborn.
It assumes that you have already run a clustering algorithm and have cluster assignments for your data.
Instructions:
1. Replace the placeholder data loading and clustering results with your actual data.
2. Adjust the visualization parameters (e.g., colors, markers) to suit your data and preferences.
3. Experiment with different visualization techniques to gain insights into your clustering results.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np  # used below to generate sample data
# --- Placeholder: Load your data here ---
# Replace this with your actual data loading code
# For example:
# data = pd.read_csv("your_data.csv")
# features = data[["feature1", "feature2"]] # Select the features used for clustering
# Generate some sample data if no data is loaded
np.random.seed(42) # for reproducibility
num_samples = 100
features = pd.DataFrame({
    'feature1': np.random.rand(num_samples),
    'feature2': np.random.rand(num_samples)
})
# --- Placeholder: Load your clustering results here ---
# Replace this with your actual cluster assignments
# For example:
# cluster_labels = model.labels_ # Assuming you used scikit-learn
# Or:
# cluster_labels = your_clustering_function(features)
# Generate some sample cluster labels if no cluster labels are loaded
num_clusters = 3
cluster_labels = np.random.randint(0, num_clusters, num_samples)
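# A minimal sketch of producing real labels instead, using scikit-learn's
# KMeans (assumes scikit-learn is installed; uncomment to replace the
# random labels above):
# from sklearn.cluster import KMeans
# model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42).fit(features)
# cluster_labels = model.labels_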
# --- Create a DataFrame for visualization ---
df = features.copy()
df['cluster'] = cluster_labels
# --- Visualization using matplotlib ---
plt.figure(figsize=(8, 6))
plt.title("Clustering Visualization (Matplotlib)")
# Define colors for each cluster
colors = ['red', 'green', 'blue', 'purple', 'orange'] # Add more colors if needed
for cluster_id in df['cluster'].unique():
    cluster_data = df[df['cluster'] == cluster_id]
    plt.scatter(cluster_data['feature1'], cluster_data['feature2'],
                color=colors[cluster_id % len(colors)],  # cycle through colors
                label=f'Cluster {cluster_id}')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()
plt.show()
# --- Visualization using seaborn ---
plt.figure(figsize=(8, 6))
plt.title("Clustering Visualization (Seaborn)")
sns.scatterplot(x='feature1', y='feature2', hue='cluster', data=df, palette='viridis') # or other palettes
plt.show()
# --- Additional Visualizations ---
# You can add more visualizations here, such as:
# - Pair plots
# - Box plots
# - Histograms
# Example: Pair plot
# sns.pairplot(df, hue='cluster')
# plt.show()
# --- Summary and Interpretation ---
# Add your interpretation of the clustering results here.
# For example:
# "The clustering algorithm has successfully separated the data into distinct groups based on feature1 and feature2."
# "Cluster 0 represents the group with low values for both feature1 and feature2."
# "Cluster 1 represents the group with high values for feature1 and low values for feature2."
# "Cluster 2 represents the group with high values for both feature1 and feature2."
# --- Next Steps ---
# Consider the following next steps:
# - Evaluate the clustering performance using metrics like silhouette score or Davies-Bouldin index.
# - Tune the clustering algorithm parameters to improve the results.
# - Investigate the characteristics of each cluster to gain insights into the data.
# - Use the clustering results for downstream tasks, such as customer segmentation or anomaly detection.
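# --- Optional: numeric evaluation (a minimal sketch) ---
# Assumes scikit-learn is available. Both metrics are computed from the
# features and the predicted labels only; no ground-truth labels are needed.
# from sklearn.metrics import silhouette_score, davies_bouldin_score
# print("Silhouette score:", silhouette_score(features, cluster_labels))
# print("Davies-Bouldin index:", davies_bouldin_score(features, cluster_labels))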


@@ -0,0 +1,63 @@
{
  "_comment": "Configuration template for clustering algorithms. Keys starting with _comment are documentation only; JSON forbids duplicate keys, so each comment key is unique.",
  "dataset_name": "example_dataset.csv",
  "_comment_dataset_name": "Path to the dataset file (CSV format)",
  "algorithm": "KMeans",
  "_comment_algorithm": "Clustering algorithm to use (e.g., KMeans, DBSCAN, AgglomerativeClustering)",
  "algorithm_params": {
    "_comment": "Parameters specific to the chosen algorithm",
    "KMeans": {
      "n_clusters": 3,
      "_comment_n_clusters": "Number of clusters for KMeans",
      "init": "k-means++",
      "_comment_init": "Initialization method for KMeans",
      "max_iter": 300,
      "_comment_max_iter": "Maximum number of iterations for KMeans"
    },
    "DBSCAN": {
      "eps": 0.5,
      "_comment_eps": "Maximum distance between two samples for them to be considered neighbors",
      "min_samples": 5,
      "_comment_min_samples": "Number of samples in a neighborhood for a point to be considered a core point"
    },
    "AgglomerativeClustering": {
      "n_clusters": 3,
      "_comment_n_clusters": "Number of clusters to find",
      "linkage": "ward",
      "_comment_linkage": "Linkage criterion: which distance to use between sets of observations. The algorithm merges the pair of clusters that minimizes this criterion.",
      "affinity": "euclidean",
      "_comment_affinity": "Metric used to compute the linkage: 'euclidean', 'l1', 'l2', 'manhattan', 'cosine', or 'precomputed'. If linkage is 'ward', only 'euclidean' is accepted. (scikit-learn renamed this parameter to 'metric' in 1.2.)"
    }
  },
  "preprocessing": {
    "_comment": "Preprocessing steps to apply to the dataset",
    "scale_data": true,
    "_comment_scale_data": "Whether to scale the data using StandardScaler",
    "handle_missing_values": "impute",
    "_comment_handle_missing_values": "How to handle missing values ('drop', 'impute', or null for none)",
    "imputation_strategy": "mean",
    "_comment_imputation_strategy": "Strategy to use for imputation (e.g., 'mean', 'median', 'most_frequent')"
  },
  "output": {
    "_comment": "Output configuration",
    "output_file": "clustering_results.csv",
    "_comment_output_file": "Name of the output CSV file for the cluster assignments",
    "include_original_data": true,
    "_comment_include_original_data": "Whether to include the original data in the output file along with cluster assignments"
  },
  "evaluation_metrics": [
    "silhouette_score",
    "davies_bouldin_score",
    "calinski_harabasz_score"
  ],
  "_comment_evaluation_metrics": "Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index. All three are internal metrics computed from the features and predicted labels; no ground-truth labels are required.",
  "advanced_options": {
    "_comment": "Advanced options for controlling the execution",
    "random_state": 42,
    "_comment_random_state": "Random seed for reproducibility",
    "n_jobs": -1,
    "_comment_n_jobs": "Number of CPU cores to use (-1 for all cores)"
  }
}
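Because the template keeps its documentation in `_comment*` keys, any runner has to strip those out before handing parameters to a library. A minimal sketch of consuming the template with scikit-learn (the direct class mapping is an illustration under that assumption, not the plugin's actual runner):

import json
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

with open("config_template.json") as f:
    cfg = json.load(f)

# Map the "algorithm" field to a scikit-learn estimator class.
ALGORITHMS = {
    "KMeans": KMeans,
    "DBSCAN": DBSCAN,
    "AgglomerativeClustering": AgglomerativeClustering,
}

name = cfg["algorithm"]
# Drop the _comment* keys so only real parameters reach the constructor.
# (Note: newer scikit-learn expects 'metric' rather than 'affinity' for
# AgglomerativeClustering.)
params = {k: v for k, v in cfg["algorithm_params"][name].items()
          if not k.startswith("_comment")}
model = ALGORITHMS[name](**params)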


@@ -0,0 +1,24 @@
# Example CSV data for clustering algorithm testing.
#
# This file demonstrates the expected format for input data to the clustering algorithms.
# Each row represents a data point, and each column represents a feature.
#
# Replace the placeholder data below with your own dataset.
# You can add or remove columns as needed, but ensure the column headers are descriptive.
#
# Note: The first row MUST be the header row containing column names.
#
# Example usage:
# 1. Save this file as 'example_data.csv' or a similar descriptive name.
# 2. Run the clustering algorithm using the command: /run-clustering
# (The plugin will prompt you for the data file name if it isn't the default).
feature_1,feature_2,feature_3
1.0,2.0,3.0
4.0,5.0,6.0
7.0,8.0,9.0
10.0,11.0,12.0
13.0,14.0,15.0
# Add more data points here...
# Example: 16.0,17.0,18.0
# Example: 19.0,20.0,21.0
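Since the `#` comment lines above are stored inside the CSV itself, the file needs a comment-aware reader. A minimal pandas sketch, assuming the file is saved as example_data.csv:

import pandas as pd

# Skip the '#' comment lines; pandas would otherwise treat them as data rows.
data = pd.read_csv("example_data.csv", comment="#")
print(data.head())  # columns: feature_1, feature_2, feature_3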