// gh-jezweb-claude-skills-ski…/templates/semantic-search.ts

/**
 * Semantic Search with Gemini Embeddings
 *
 * Demonstrates semantic similarity search using cosine similarity.
 * Finds documents based on meaning, not just keyword matching.
 *
 * Setup:
 * 1. npm install @google/genai@^1.27.0
 * 2. export GEMINI_API_KEY="your-api-key"
 *
 * Usage:
 * npx tsx semantic-search.ts
 */

import { GoogleGenAI } from "@google/genai";

/** A text document held by the search index. */
interface Document {
  /** Caller-supplied identifier; documents are looked up by this value. */
  id: string;
  /** Raw text that is sent to the embedding model. */
  text: string;
  /** Embedding vector for `text`; optional because it is only set once the document has been indexed. */
  embedding?: number[];
}

/** One scored hit produced by a similarity search. */
interface SearchResult {
  /** The indexed document that was scored. */
  document: Document;
  /** Similarity between this document and the query/reference embedding; results are sorted by this, highest first. */
  similarity: number;
}

/**
 * Cosine similarity of two equal-length vectors.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same number of components as `a`.
 * @returns The cosine of the angle between the vectors (1 means same
 *   direction), or 0 when either vector has zero magnitude.
 * @throws Error if the two vectors differ in length.
 */
function cosineSimilarity(a: number[], b: number[]): number {
  const dims = a.length;
  if (dims !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }

  // Accumulate the dot product and both squared norms in a single pass.
  let dot = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  for (let idx = 0; idx < dims; idx += 1) {
    const x = a[idx];
    const y = b[idx];
    dot += x * y;
    sumSqA += x * x;
    sumSqB += y * y;
  }

  // A zero-magnitude vector has no direction; report 0 instead of dividing by zero.
  const bothNonZero = sumSqA !== 0 && sumSqB !== 0;
  return bothNonZero ? dot / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB)) : 0;
}

/**
 * Scale a vector to unit length.
 *
 * Once vectors are unit length, their dot product can stand in for cosine
 * similarity.
 *
 * @param vector - Vector to normalize; it is not modified.
 * @returns A new array with magnitude 1, or the input array itself when its
 *   magnitude is 0 (a zero vector cannot be scaled to unit length).
 */
function normalizeVector(vector: number[]): number[] {
  const sumOfSquares = vector.reduce((acc, component) => acc + component * component, 0);
  const length = Math.sqrt(sumOfSquares);

  return length === 0 ? vector : vector.map(component => component / length);
}

/**
 * Dot product of two equal-length vectors.
 *
 * Only a valid stand-in for cosine similarity when both vectors are already
 * normalized to unit length.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same number of components as `a`.
 * @returns The sum of the component-wise products (0 for empty vectors).
 * @throws Error if the two vectors differ in length. Without this check a
 *   shorter `b` would silently yield NaN and a longer `b` would be truncated.
 */
function dotProduct(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }

  return a.reduce((sum, val, i) => sum + val * b[i], 0);
}

/**
 * In-memory semantic search index backed by Gemini embeddings.
 *
 * Documents are embedded when indexed and kept in memory; each query is
 * embedded on demand and scored against every stored document.
 */
class SemanticSearch {
  /** Embedding model used for both documents and queries. */
  private static readonly EMBEDDING_MODEL = 'gemini-embedding-001';
  /** Requested vector size; documents and queries must use the same value to be comparable. */
  private static readonly EMBEDDING_DIMENSIONS = 768;

  private readonly ai: GoogleGenAI;
  /** Indexed documents; every entry is stored with its embedding. */
  private readonly documents: Required<Document>[] = [];
  private readonly normalized: boolean;

  /**
   * @param apiKey - Gemini API key passed to the SDK client.
   * @param normalized - When true, embeddings are scaled to unit length as
   *   they are produced and scored with a plain dot product instead of a full
   *   cosine similarity.
   */
  constructor(apiKey: string, normalized: boolean = false) {
    this.ai = new GoogleGenAI({ apiKey });
    this.normalized = normalized;
  }

  /**
   * Index documents (generate and store embeddings).
   *
   * Documents are embedded one at a time, in order, and appended to the index.
   *
   * @param documents - Documents to embed and add.
   * @throws Error if the embedding call fails or returns no vector.
   */
  async indexDocuments(documents: Array<{ id: string; text: string }>): Promise<void> {
    console.log(`\n📚 Indexing ${documents.length} documents...\n`);

    for (const doc of documents) {
      const embedding = await this.embed(doc.text, 'RETRIEVAL_DOCUMENT');

      this.documents.push({
        id: doc.id,
        text: doc.text,
        embedding
      });

      console.log(`✅ Indexed: ${doc.id}`);
    }

    console.log(`\n✨ Indexing complete! ${this.documents.length} documents ready.\n`);
  }

  /**
   * Search for documents similar to a free-text query.
   *
   * @param query - Query text; embedded with the RETRIEVAL_QUERY task type.
   * @param topK - Maximum number of results to return.
   * @returns Up to `topK` results, most similar first.
   * @throws Error if nothing has been indexed, or if the embedding call fails.
   */
  async search(query: string, topK: number = 5): Promise<SearchResult[]> {
    if (this.documents.length === 0) {
      throw new Error('No documents indexed. Call indexDocuments() first.');
    }

    console.log(`🔍 Searching for: "${query}"\n`);

    const queryEmbedding = await this.embed(query, 'RETRIEVAL_QUERY');
    return this.rank(queryEmbedding, this.documents, topK);
  }

  /**
   * Find documents similar to an already-indexed document.
   *
   * Uses the stored embedding, so no API call is made.
   *
   * @param documentId - Id of the reference document.
   * @param topK - Maximum number of results to return.
   * @returns Up to `topK` other documents, most similar first.
   * @throws Error if no indexed document has the given id.
   */
  findSimilar(documentId: string, topK: number = 5): SearchResult[] {
    const reference = this.documents.find(d => d.id === documentId);

    if (!reference) {
      throw new Error(`Document not found: ${documentId}`);
    }

    // Exclude the reference document itself from its own result list.
    const others = this.documents.filter(d => d.id !== documentId);
    return this.rank(reference.embedding, others, topK);
  }

  /**
   * Embed one piece of text, normalizing the vector when this index is in
   * normalized mode.
   */
  private async embed(
    text: string,
    taskType: 'RETRIEVAL_DOCUMENT' | 'RETRIEVAL_QUERY'
  ): Promise<number[]> {
    // The SDK expects the input under `contents` and returns the vectors in
    // the `embeddings` array (one entry per input).
    const response = await this.ai.models.embedContent({
      model: SemanticSearch.EMBEDDING_MODEL,
      contents: text,
      config: {
        taskType,
        outputDimensionality: SemanticSearch.EMBEDDING_DIMENSIONS
      }
    });

    const values = response.embeddings?.[0]?.values;
    if (!values || values.length === 0) {
      throw new Error(`Embedding API returned no values (taskType: ${taskType})`);
    }

    return this.normalized ? normalizeVector(values) : values;
  }

  /**
   * Score every candidate against a reference embedding and return the best
   * `topK`, highest similarity first.
   */
  private rank(
    reference: number[],
    candidates: Required<Document>[],
    topK: number
  ): SearchResult[] {
    // Unit-length vectors only need a dot product; otherwise compute full cosine similarity.
    const score = this.normalized ? dotProduct : cosineSimilarity;

    return candidates
      .map((document): SearchResult => ({
        document,
        similarity: score(reference, document.embedding)
      }))
      .sort((x, y) => y.similarity - x.similarity)
      // Clamp so a negative topK yields no results instead of slicing from the end.
      .slice(0, Math.max(0, topK));
  }
}

// Example usage

/**
 * Demo entry point: indexes six sample documents, then runs three query
 * searches and one document-to-document similarity lookup, printing the
 * ranked results for each.
 *
 * Requires the GEMINI_API_KEY environment variable. Any failure is logged and
 * the process exits with code 1.
 */
async function main() {
  /** Print a section banner for one example. */
  const printHeading = (title: string): void => {
    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
    console.log(title);
    console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');
  };

  /**
   * Print ranked results as "N. [score%] id" followed by the document text.
   * `annotate`, when given, supplies an extra label appended to the first line.
   */
  const printResults = (
    results: SearchResult[],
    annotate?: (result: SearchResult) => string
  ): void => {
    results.forEach((result, i) => {
      const label = annotate ? ` ${annotate(result)}` : '';
      console.log(`${i + 1}. [${(result.similarity * 100).toFixed(1)}%] ${result.document.id}${label}`);
      console.log(`   ${result.document.text}\n`);
    });
  };

  try {
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      throw new Error('GEMINI_API_KEY environment variable not set');
    }

    // Initialize search engine
    const search = new SemanticSearch(apiKey, false); // Set true for normalized vectors

    // Sample documents
    const documents = [
      {
        id: 'doc1',
        text: 'Paris is the capital of France. It is known for the Eiffel Tower and the Louvre Museum.'
      },
      {
        id: 'doc2',
        text: 'Machine learning is a subset of artificial intelligence that enables computers to learn from data.'
      },
      {
        id: 'doc3',
        text: 'The Eiffel Tower is an iconic landmark in Paris, France, built in 1889.'
      },
      {
        id: 'doc4',
        text: 'Deep learning uses neural networks with multiple layers to learn complex patterns in data.'
      },
      {
        id: 'doc5',
        text: 'London is the capital of the United Kingdom and home to Big Ben and Buckingham Palace.'
      },
      {
        id: 'doc6',
        text: 'Natural language processing is a branch of AI that helps computers understand human language.'
      }
    ];

    // Index documents
    await search.indexDocuments(documents);

    // Example 1: Search by query
    printHeading('Example 1: Search by Query');
    printResults(await search.search("What is the capital of France?", 3));

    // Example 2: Different query
    printHeading('Example 2: AI-related Query');
    printResults(await search.search("Tell me about artificial intelligence", 3));

    // Example 3: Find similar documents (uses stored embeddings, no API call)
    printHeading('Example 3: Find Similar to doc1 (Paris)');
    printResults(search.findSimilar('doc1', 3));

    // Example 4: Demonstrate semantic vs keyword matching
    printHeading('Example 4: Semantic Understanding');
    // Only doc4 contains the literal phrase; the other hits rank on meaning alone.
    console.log('Query: "neural networks" (exact keyword appears in only one document)\n');

    printResults(await search.search("neural networks", 3), result =>
      result.document.text.toLowerCase().includes('neural') ? '✓ keyword' : '✗ no keyword'
    );

    console.log('📊 Note: High similarity even without exact keyword match!');
    console.log('This demonstrates semantic understanding.\n');

  } catch (error: unknown) {
    // Non-Error values can be thrown too; stringify them instead of printing `undefined`.
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
    process.exit(1);
  }
}

main();