Initial commit
This commit is contained in:
311
templates/clustering.ts
Normal file
311
templates/clustering.ts
Normal file
@@ -0,0 +1,311 @@
|
||||
/**
|
||||
* Document Clustering with Gemini Embeddings
|
||||
*
|
||||
* Demonstrates automatic grouping of similar documents using K-means clustering.
|
||||
* Useful for topic modeling, content organization, and duplicate detection.
|
||||
*
|
||||
* Setup:
|
||||
* 1. npm install @google/genai@^1.27.0
|
||||
* 2. export GEMINI_API_KEY="your-api-key"
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx clustering.ts
|
||||
*/
|
||||
|
||||
import { GoogleGenAI } from "@google/genai";
|
||||
|
||||
interface Document {
|
||||
id: string;
|
||||
text: string;
|
||||
embedding?: number[];
|
||||
}
|
||||
|
||||
interface Cluster {
|
||||
id: number;
|
||||
centroid: number[];
|
||||
documents: Document[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate cosine similarity
|
||||
*/
|
||||
function cosineSimilarity(a: number[], b: number[]): number {
|
||||
if (a.length !== b.length) {
|
||||
throw new Error('Vector dimensions must match');
|
||||
}
|
||||
|
||||
let dotProduct = 0, magA = 0, magB = 0;
|
||||
|
||||
for (let i = 0; i < a.length; i++) {
|
||||
dotProduct += a[i] * b[i];
|
||||
magA += a[i] * a[i];
|
||||
magB += b[i] * b[i];
|
||||
}
|
||||
|
||||
return dotProduct / (Math.sqrt(magA) * Math.sqrt(magB));
|
||||
}
|
||||
|
||||
/**
|
||||
* K-means clustering algorithm
|
||||
*/
|
||||
function kMeansClustering(
|
||||
documents: Document[],
|
||||
k: number = 3,
|
||||
maxIterations: number = 100
|
||||
): Cluster[] {
|
||||
if (documents.length === 0 || !documents[0].embedding) {
|
||||
throw new Error('Documents must have embeddings');
|
||||
}
|
||||
|
||||
const embeddings = documents.map(d => d.embedding!);
|
||||
|
||||
// 1. Initialize centroids randomly
|
||||
const centroids: number[][] = [];
|
||||
const usedIndices = new Set<number>();
|
||||
|
||||
for (let i = 0; i < k; i++) {
|
||||
let randomIndex: number;
|
||||
do {
|
||||
randomIndex = Math.floor(Math.random() * embeddings.length);
|
||||
} while (usedIndices.has(randomIndex));
|
||||
|
||||
usedIndices.add(randomIndex);
|
||||
centroids.push([...embeddings[randomIndex]]);
|
||||
}
|
||||
|
||||
console.log(`🔄 Starting K-means clustering (k=${k}, max iterations=${maxIterations})\n`);
|
||||
|
||||
// 2. Iterate until convergence
|
||||
let iteration = 0;
|
||||
let converged = false;
|
||||
|
||||
while (iteration < maxIterations && !converged) {
|
||||
// Assign each document to nearest centroid
|
||||
const clusters: Document[][] = Array(k).fill(null).map(() => []);
|
||||
|
||||
documents.forEach((doc, idx) => {
|
||||
const embedding = embeddings[idx];
|
||||
let maxSimilarity = -Infinity;
|
||||
let closestCluster = 0;
|
||||
|
||||
centroids.forEach((centroid, i) => {
|
||||
const similarity = cosineSimilarity(embedding, centroid);
|
||||
if (similarity > maxSimilarity) {
|
||||
maxSimilarity = similarity;
|
||||
closestCluster = i;
|
||||
}
|
||||
});
|
||||
|
||||
clusters[closestCluster].push(doc);
|
||||
});
|
||||
|
||||
// Update centroids (average of cluster members)
|
||||
converged = true;
|
||||
clusters.forEach((cluster, i) => {
|
||||
if (cluster.length === 0) return;
|
||||
|
||||
const newCentroid = cluster[0].embedding!.map((_, dim) =>
|
||||
cluster.reduce((sum, doc) => sum + doc.embedding![dim], 0) / cluster.length
|
||||
);
|
||||
|
||||
// Check if centroid changed significantly
|
||||
const similarity = cosineSimilarity(centroids[i], newCentroid);
|
||||
if (similarity < 0.9999) {
|
||||
converged = false;
|
||||
}
|
||||
|
||||
centroids[i] = newCentroid;
|
||||
});
|
||||
|
||||
iteration++;
|
||||
|
||||
if (iteration % 10 === 0) {
|
||||
console.log(`Iteration ${iteration}...`);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`✅ Converged after ${iteration} iterations\n`);
|
||||
|
||||
// Build final clusters
|
||||
const finalClusters: Cluster[] = centroids.map((centroid, i) => ({
|
||||
id: i,
|
||||
centroid,
|
||||
documents: documents.filter((doc) => {
|
||||
const similarities = centroids.map(c => cosineSimilarity(doc.embedding!, c));
|
||||
return similarities.indexOf(Math.max(...similarities)) === i;
|
||||
})
|
||||
}));
|
||||
|
||||
return finalClusters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clustering by similarity threshold (alternative to K-means)
|
||||
*/
|
||||
function clusterByThreshold(
|
||||
documents: Document[],
|
||||
threshold: number = 0.8
|
||||
): Cluster[] {
|
||||
if (documents.length === 0 || !documents[0].embedding) {
|
||||
throw new Error('Documents must have embeddings');
|
||||
}
|
||||
|
||||
const clusters: Cluster[] = [];
|
||||
const assigned = new Set<number>();
|
||||
|
||||
documents.forEach((doc, idx) => {
|
||||
if (assigned.has(idx)) return;
|
||||
|
||||
const clusterDocs = [doc];
|
||||
assigned.add(idx);
|
||||
|
||||
documents.forEach((otherDoc, otherIdx) => {
|
||||
if (idx !== otherIdx && !assigned.has(otherIdx)) {
|
||||
const similarity = cosineSimilarity(doc.embedding!, otherDoc.embedding!);
|
||||
|
||||
if (similarity >= threshold) {
|
||||
clusterDocs.push(otherDoc);
|
||||
assigned.add(otherIdx);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
clusters.push({
|
||||
id: clusters.length,
|
||||
centroid: doc.embedding!,
|
||||
documents: clusterDocs
|
||||
});
|
||||
});
|
||||
|
||||
return clusters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print cluster summary
|
||||
*/
|
||||
function printClusters(clusters: Cluster[], method: string): void {
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log(`${method} Results`);
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
||||
|
||||
clusters.forEach(cluster => {
|
||||
console.log(`📁 Cluster ${cluster.id + 1} (${cluster.documents.length} documents):`);
|
||||
console.log(`${'─'.repeat(50)}`);
|
||||
|
||||
cluster.documents.forEach(doc => {
|
||||
const preview = doc.text.substring(0, 80) + (doc.text.length > 80 ? '...' : '');
|
||||
console.log(` • [${doc.id}] ${preview}`);
|
||||
});
|
||||
|
||||
console.log('');
|
||||
});
|
||||
|
||||
console.log(`Total clusters: ${clusters.length}\n`);
|
||||
}
|
||||
|
||||
// Example usage
|
||||
async function main() {
|
||||
try {
|
||||
const apiKey = process.env.GEMINI_API_KEY;
|
||||
if (!apiKey) {
|
||||
throw new Error('GEMINI_API_KEY environment variable not set');
|
||||
}
|
||||
|
||||
const ai = new GoogleGenAI({ apiKey });
|
||||
|
||||
// Sample documents (3 topics: Geography, AI/ML, Food)
|
||||
const documents: Document[] = [
|
||||
// Geography
|
||||
{ id: 'doc1', text: 'Paris is the capital of France. It is known for the Eiffel Tower and the Louvre Museum.' },
|
||||
{ id: 'doc2', text: 'London is the capital of the United Kingdom and home to Big Ben and Buckingham Palace.' },
|
||||
{ id: 'doc3', text: 'Rome is the capital of Italy and famous for the Colosseum and Vatican City.' },
|
||||
|
||||
// AI/ML
|
||||
{ id: 'doc4', text: 'Machine learning is a subset of artificial intelligence that enables computers to learn from data.' },
|
||||
{ id: 'doc5', text: 'Deep learning uses neural networks with multiple layers to learn complex patterns in data.' },
|
||||
{ id: 'doc6', text: 'Natural language processing is a branch of AI that helps computers understand human language.' },
|
||||
|
||||
// Food
|
||||
{ id: 'doc7', text: 'Pizza originated in Italy and is now popular worldwide. It typically has a tomato base and cheese.' },
|
||||
{ id: 'doc8', text: 'Sushi is a Japanese dish made with vinegared rice and various ingredients like raw fish.' },
|
||||
{ id: 'doc9', text: 'Tacos are a traditional Mexican food consisting of a tortilla filled with various ingredients.' }
|
||||
];
|
||||
|
||||
console.log(`\n📚 Generating embeddings for ${documents.length} documents...\n`);
|
||||
|
||||
// Generate embeddings
|
||||
for (const doc of documents) {
|
||||
const response = await ai.models.embedContent({
|
||||
model: 'gemini-embedding-001',
|
||||
content: doc.text,
|
||||
config: {
|
||||
taskType: 'CLUSTERING', // ← Optimized for clustering
|
||||
outputDimensionality: 768
|
||||
}
|
||||
});
|
||||
|
||||
doc.embedding = response.embedding.values;
|
||||
console.log(`✅ Embedded: ${doc.id}`);
|
||||
}
|
||||
|
||||
console.log('');
|
||||
|
||||
// Method 1: K-means clustering
|
||||
const kMeansClusters = kMeansClustering(documents, 3, 100);
|
||||
printClusters(kMeansClusters, 'K-Means Clustering (k=3)');
|
||||
|
||||
// Method 2: Threshold-based clustering
|
||||
console.log('🔄 Running threshold-based clustering (threshold=0.7)...\n');
|
||||
const thresholdClusters = clusterByThreshold(documents, 0.7);
|
||||
printClusters(thresholdClusters, 'Threshold-Based Clustering (≥70% similarity)');
|
||||
|
||||
// Example: Find intra-cluster similarities
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
||||
console.log('Cluster Quality Analysis');
|
||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
||||
|
||||
kMeansClusters.forEach(cluster => {
|
||||
if (cluster.documents.length < 2) return;
|
||||
|
||||
const similarities: number[] = [];
|
||||
|
||||
for (let i = 0; i < cluster.documents.length; i++) {
|
||||
for (let j = i + 1; j < cluster.documents.length; j++) {
|
||||
const sim = cosineSimilarity(
|
||||
cluster.documents[i].embedding!,
|
||||
cluster.documents[j].embedding!
|
||||
);
|
||||
similarities.push(sim);
|
||||
}
|
||||
}
|
||||
|
||||
const avgSimilarity = similarities.reduce((a, b) => a + b, 0) / similarities.length;
|
||||
const minSimilarity = Math.min(...similarities);
|
||||
const maxSimilarity = Math.max(...similarities);
|
||||
|
||||
console.log(`Cluster ${cluster.id + 1}:`);
|
||||
console.log(` Documents: ${cluster.documents.map(d => d.id).join(', ')}`);
|
||||
console.log(` Avg similarity: ${(avgSimilarity * 100).toFixed(1)}%`);
|
||||
console.log(` Min similarity: ${(minSimilarity * 100).toFixed(1)}%`);
|
||||
console.log(` Max similarity: ${(maxSimilarity * 100).toFixed(1)}%`);
|
||||
console.log('');
|
||||
});
|
||||
|
||||
} catch (error: any) {
|
||||
console.error('❌ Error:', error.message);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
main();
|
||||
|
||||
/**
|
||||
* Expected output:
|
||||
*
|
||||
* Cluster 1: Geography documents (Paris, London, Rome)
|
||||
* Cluster 2: AI/ML documents (Machine learning, Deep learning, NLP)
|
||||
* Cluster 3: Food documents (Pizza, Sushi, Tacos)
|
||||
*
|
||||
* This demonstrates how embeddings capture semantic meaning,
|
||||
* allowing automatic topic discovery without manual labeling.
|
||||
*/
|
||||
Reference in New Issue
Block a user