Initial commit

2025-11-30 08:24:54 +08:00
commit 7927519669
17 changed files with 4377 additions and 0 deletions
--- a/templates/clustering.ts
+++ b/templates/clustering.ts
@@ -0,0 +1,311 @@
+/**
+ * Document Clustering with Gemini Embeddings
+ *
+ * Demonstrates automatic grouping of similar documents using K-means clustering.
+ * Useful for topic modeling, content organization, and duplicate detection.
+ *
+ * Setup:
+ * 1. npm install @google/genai@^1.27.0
+ * 2. export GEMINI_API_KEY="your-api-key"
+ *
+ * Usage:
+ * npx tsx clustering.ts
+ */
+
+import { GoogleGenAI } from "@google/genai";
+
+interface Document {
+  id: string;
+  text: string;
+  embedding?: number[];
+}
+
+interface Cluster {
+  id: number;
+  centroid: number[];
+  documents: Document[];
+}
+
+/**
+ * Calculate cosine similarity
+ */
+function cosineSimilarity(a: number[], b: number[]): number {
+  if (a.length !== b.length) {
+    throw new Error('Vector dimensions must match');
+  }
+
+  let dotProduct = 0, magA = 0, magB = 0;
+
+  for (let i = 0; i < a.length; i++) {
+    dotProduct += a[i] * b[i];
+    magA += a[i] * a[i];
+    magB += b[i] * b[i];
+  }
+
+  return dotProduct / (Math.sqrt(magA) * Math.sqrt(magB));
+}
+
+/**
+ * K-means clustering algorithm
+ */
+function kMeansClustering(
+  documents: Document[],
+  k: number = 3,
+  maxIterations: number = 100
+): Cluster[] {
+  if (documents.length === 0 || !documents[0].embedding) {
+    throw new Error('Documents must have embeddings');
+  }
+
+  const embeddings = documents.map(d => d.embedding!);
+
+  // 1. Initialize centroids randomly
+  const centroids: number[][] = [];
+  const usedIndices = new Set<number>();
+
+  for (let i = 0; i < k; i++) {
+    let randomIndex: number;
+    do {
+      randomIndex = Math.floor(Math.random() * embeddings.length);
+    } while (usedIndices.has(randomIndex));
+
+    usedIndices.add(randomIndex);
+    centroids.push([...embeddings[randomIndex]]);
+  }
+
+  console.log(`🔄 Starting K-means clustering (k=${k}, max iterations=${maxIterations})\n`);
+
+  // 2. Iterate until convergence
+  let iteration = 0;
+  let converged = false;
+
+  while (iteration < maxIterations && !converged) {
+    // Assign each document to nearest centroid
+    const clusters: Document[][] = Array(k).fill(null).map(() => []);
+
+    documents.forEach((doc, idx) => {
+      const embedding = embeddings[idx];
+      let maxSimilarity = -Infinity;
+      let closestCluster = 0;
+
+      centroids.forEach((centroid, i) => {
+        const similarity = cosineSimilarity(embedding, centroid);
+        if (similarity > maxSimilarity) {
+          maxSimilarity = similarity;
+          closestCluster = i;
+        }
+      });
+
+      clusters[closestCluster].push(doc);
+    });
+
+    // Update centroids (average of cluster members)
+    converged = true;
+    clusters.forEach((cluster, i) => {
+      if (cluster.length === 0) return;
+
+      const newCentroid = cluster[0].embedding!.map((_, dim) =>
+        cluster.reduce((sum, doc) => sum + doc.embedding![dim], 0) / cluster.length
+      );
+
+      // Check if centroid changed significantly
+      const similarity = cosineSimilarity(centroids[i], newCentroid);
+      if (similarity < 0.9999) {
+        converged = false;
+      }
+
+      centroids[i] = newCentroid;
+    });
+
+    iteration++;
+
+    if (iteration % 10 === 0) {
+      console.log(`Iteration ${iteration}...`);
+    }
+  }
+
+  console.log(`✅ Converged after ${iteration} iterations\n`);
+
+  // Build final clusters
+  const finalClusters: Cluster[] = centroids.map((centroid, i) => ({
+    id: i,
+    centroid,
+    documents: documents.filter((doc) => {
+      const similarities = centroids.map(c => cosineSimilarity(doc.embedding!, c));
+      return similarities.indexOf(Math.max(...similarities)) === i;
+    })
+  }));
+
+  return finalClusters;
+}
+
+/**
+ * Clustering by similarity threshold (alternative to K-means)
+ */
+function clusterByThreshold(
+  documents: Document[],
+  threshold: number = 0.8
+): Cluster[] {
+  if (documents.length === 0 || !documents[0].embedding) {
+    throw new Error('Documents must have embeddings');
+  }
+
+  const clusters: Cluster[] = [];
+  const assigned = new Set<number>();
+
+  documents.forEach((doc, idx) => {
+    if (assigned.has(idx)) return;
+
+    const clusterDocs = [doc];
+    assigned.add(idx);
+
+    documents.forEach((otherDoc, otherIdx) => {
+      if (idx !== otherIdx && !assigned.has(otherIdx)) {
+        const similarity = cosineSimilarity(doc.embedding!, otherDoc.embedding!);
+
+        if (similarity >= threshold) {
+          clusterDocs.push(otherDoc);
+          assigned.add(otherIdx);
+        }
+      }
+    });
+
+    clusters.push({
+      id: clusters.length,
+      centroid: doc.embedding!,
+      documents: clusterDocs
+    });
+  });
+
+  return clusters;
+}
+
+/**
+ * Print cluster summary
+ */
+function printClusters(clusters: Cluster[], method: string): void {
+  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+  console.log(`${method} Results`);
+  console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
+
+  clusters.forEach(cluster => {
+    console.log(`📁 Cluster ${cluster.id + 1} (${cluster.documents.length} documents):`);
+    console.log(`${'─'.repeat(50)}`);
+
+    cluster.documents.forEach(doc => {
+      const preview = doc.text.substring(0, 80) + (doc.text.length > 80 ? '...' : '');
+      console.log(`  • [${doc.id}] ${preview}`);
+    });
+
+    console.log('');
+  });
+
+  console.log(`Total clusters: ${clusters.length}\n`);
+}
+
+// Example usage
+async function main() {
+  try {
+    const apiKey = process.env.GEMINI_API_KEY;
+    if (!apiKey) {
+      throw new Error('GEMINI_API_KEY environment variable not set');
+    }
+
+    const ai = new GoogleGenAI({ apiKey });
+
+    // Sample documents (3 topics: Geography, AI/ML, Food)
+    const documents: Document[] = [
+      // Geography
+      { id: 'doc1', text: 'Paris is the capital of France. It is known for the Eiffel Tower and the Louvre Museum.' },
+      { id: 'doc2', text: 'London is the capital of the United Kingdom and home to Big Ben and Buckingham Palace.' },
+      { id: 'doc3', text: 'Rome is the capital of Italy and famous for the Colosseum and Vatican City.' },
+
+      // AI/ML
+      { id: 'doc4', text: 'Machine learning is a subset of artificial intelligence that enables computers to learn from data.' },
+      { id: 'doc5', text: 'Deep learning uses neural networks with multiple layers to learn complex patterns in data.' },
+      { id: 'doc6', text: 'Natural language processing is a branch of AI that helps computers understand human language.' },
+
+      // Food
+      { id: 'doc7', text: 'Pizza originated in Italy and is now popular worldwide. It typically has a tomato base and cheese.' },
+      { id: 'doc8', text: 'Sushi is a Japanese dish made with vinegared rice and various ingredients like raw fish.' },
+      { id: 'doc9', text: 'Tacos are a traditional Mexican food consisting of a tortilla filled with various ingredients.' }
+    ];
+
+    console.log(`\n📚 Generating embeddings for ${documents.length} documents...\n`);
+
+    // Generate embeddings
+    for (const doc of documents) {
+      const response = await ai.models.embedContent({
+        model: 'gemini-embedding-001',
+        content: doc.text,
+        config: {
+          taskType: 'CLUSTERING', // ← Optimized for clustering
+          outputDimensionality: 768
+        }
+      });
+
+      doc.embedding = response.embedding.values;
+      console.log(`✅ Embedded: ${doc.id}`);
+    }
+
+    console.log('');
+
+    // Method 1: K-means clustering
+    const kMeansClusters = kMeansClustering(documents, 3, 100);
+    printClusters(kMeansClusters, 'K-Means Clustering (k=3)');
+
+    // Method 2: Threshold-based clustering
+    console.log('🔄 Running threshold-based clustering (threshold=0.7)...\n');
+    const thresholdClusters = clusterByThreshold(documents, 0.7);
+    printClusters(thresholdClusters, 'Threshold-Based Clustering (≥70% similarity)');
+
+    // Example: Find intra-cluster similarities
+    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
+    console.log('Cluster Quality Analysis');
+    console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
+
+    kMeansClusters.forEach(cluster => {
+      if (cluster.documents.length < 2) return;
+
+      const similarities: number[] = [];
+
+      for (let i = 0; i < cluster.documents.length; i++) {
+        for (let j = i + 1; j < cluster.documents.length; j++) {
+          const sim = cosineSimilarity(
+            cluster.documents[i].embedding!,
+            cluster.documents[j].embedding!
+          );
+          similarities.push(sim);
+        }
+      }
+
+      const avgSimilarity = similarities.reduce((a, b) => a + b, 0) / similarities.length;
+      const minSimilarity = Math.min(...similarities);
+      const maxSimilarity = Math.max(...similarities);
+
+      console.log(`Cluster ${cluster.id + 1}:`);
+      console.log(`  Documents: ${cluster.documents.map(d => d.id).join(', ')}`);
+      console.log(`  Avg similarity: ${(avgSimilarity * 100).toFixed(1)}%`);
+      console.log(`  Min similarity: ${(minSimilarity * 100).toFixed(1)}%`);
+      console.log(`  Max similarity: ${(maxSimilarity * 100).toFixed(1)}%`);
+      console.log('');
+    });
+
+  } catch (error: any) {
+    console.error('❌ Error:', error.message);
+    process.exit(1);
+  }
+}
+
+main();
+
+/**
+ * Expected output:
+ *
+ * Cluster 1: Geography documents (Paris, London, Rome)
+ * Cluster 2: AI/ML documents (Machine learning, Deep learning, NLP)
+ * Cluster 3: Food documents (Pizza, Sushi, Tacos)
+ *
+ * This demonstrates how embeddings capture semantic meaning,
+ * allowing automatic topic discovery without manual labeling.
+ */