# File-viewer header (not part of the script): 98 lines, 3.5 KiB, Plaintext
# -*- coding: utf-8 -*-
"""
Clustering Visualization Notebook

This notebook demonstrates how to visualize clustering results using matplotlib and seaborn.

It assumes that you have already run a clustering algorithm and have cluster assignments for your data.

Instructions:
1. Replace the placeholder data loading and clustering results with your actual data.
2. Adjust the visualization parameters (e.g., colors, markers) to suit your data and preferences.
3. Experiment with different visualization techniques to gain insights into your clustering results.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Placeholder: Load your data here ---
# Replace this with your actual data loading code.
# For example:
# data = pd.read_csv("your_data.csv")
# features = data[["feature1", "feature2"]]  # Select the features used for clustering

# Generate some sample data if no data is loaded.
# Seeding NumPy's global RNG makes the sample reproducible; the sample cluster
# labels drawn later in the script come from the same global stream, so the
# seed call must stay ahead of every other random draw.
np.random.seed(42)
num_samples = 100
features = pd.DataFrame({
    'feature1': np.random.rand(num_samples),
    'feature2': np.random.rand(num_samples),
})

# --- Placeholder: Load your clustering results here ---
# Replace this with your actual cluster assignments.
# For example:
# cluster_labels = model.labels_  # Assuming you used scikit-learn
# Or:
# cluster_labels = your_clustering_function(features)

# Generate some sample cluster labels if no cluster labels are loaded.
# randint's upper bound is exclusive, so labels fall in 0..num_clusters - 1,
# one label per sample row.
num_clusters = 3
cluster_labels = np.random.randint(0, num_clusters, num_samples)


# --- Create a DataFrame for visualization ---
# Work on a copy so adding the 'cluster' column leaves `features` untouched.
df = features.copy()
df['cluster'] = cluster_labels

# --- Visualization using matplotlib ---
plt.figure(figsize=(8, 6))
plt.title("Clustering Visualization (Matplotlib)")

# Define colors for each cluster
colors = ['red', 'green', 'blue', 'purple', 'orange']  # Add more colors if needed

# One scatter call per cluster so each cluster gets its own color and legend
# entry. groupby yields the clusters in sorted label order, which keeps the
# legend order stable regardless of the row order in df.
for cluster_id, cluster_data in df.groupby('cluster'):
    plt.scatter(
        cluster_data['feature1'],
        cluster_data['feature2'],
        color=colors[cluster_id % len(colors)],  # Cycle through colors
        label=f'Cluster {cluster_id}',
    )

plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()
plt.show()


# --- Visualization using seaborn ---
plt.figure(figsize=(8, 6))
plt.title("Clustering Visualization (Seaborn)")
# With no explicit `ax`, scatterplot draws on the current axes, i.e. the figure
# created above; hue='cluster' colors each point by its cluster label.
sns.scatterplot(x='feature1', y='feature2', hue='cluster', data=df, palette='viridis')  # or other palettes
plt.show()

# --- Additional Visualizations ---
# You can add more visualizations here, such as:
# - Pair plots
# - Box plots
# - Histograms

# Example: Pair plot
# sns.pairplot(df, hue='cluster')
# plt.show()

# --- Summary and Interpretation ---
# Add your interpretation of the clustering results here.
# For example:
# "The clustering algorithm has successfully separated the data into distinct groups based on feature1 and feature2."
# "Cluster 0 represents the group with low values for both feature1 and feature2."
# "Cluster 1 represents the group with high values for feature1 and low values for feature2."
# "Cluster 2 represents the group with high values for both feature1 and feature2."

# --- Next Steps ---
# Consider the following next steps:
# - Evaluate the clustering performance using metrics like silhouette score or Davies-Bouldin index.
# - Tune the clustering algorithm parameters to improve the results.
# - Investigate the characteristics of each cluster to gain insights into the data.
# - Use the clustering results for downstream tasks, such as customer segmentation or anomaly detection.