/**
 * Semantic Search with Gemini Embeddings
 *
 * Demonstrates semantic similarity search using cosine similarity.
 * Finds documents based on meaning, not just keyword matching.
 *
 * Setup:
 *   1. npm install @google/genai@^1.27.0
 *   2. export GEMINI_API_KEY="your-api-key"
 *
 * Usage:
 *   npx tsx semantic-search.ts
 */
import { GoogleGenAI } from "@google/genai";

/** A piece of text that can be indexed; `embedding` is filled in once indexed. */
interface Document {
  id: string;
  text: string;
  embedding?: number[];
}

/** A document paired with its similarity score against a query vector. */
interface SearchResult {
  document: Document;
  similarity: number;
}

/** Internal shape of a stored document: the embedding is always present. */
type IndexedDocument = Document & { embedding: number[] };

/** Task types this example passes to the embedding endpoint. */
type EmbeddingTaskType = 'RETRIEVAL_DOCUMENT' | 'RETRIEVAL_QUERY';

const EMBEDDING_MODEL = 'gemini-embedding-001';
const EMBEDDING_DIMENSIONS = 768;

/**
 * Throw when two vectors cannot be compared element by element.
 *
 * @throws Error if the lengths differ.
 */
function assertSameDimensions(a: number[], b: number[]): void {
  if (a.length !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }
}

/**
 * Calculate cosine similarity between two vectors.
 *
 * @param a First vector.
 * @param b Second vector; must have the same length as `a`.
 * @returns A value between -1 and 1 (1 = same direction). Returns 0 when
 *   either vector has zero magnitude, since the angle is undefined there.
 * @throws Error if the vectors have different lengths.
 */
function cosineSimilarity(a: number[], b: number[]): number {
  assertSameDimensions(a, b);

  // Accumulate the dot product and both squared magnitudes in a single pass.
  let dot = 0;
  let squaredA = 0;
  let squaredB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    squaredA += a[i] * a[i];
    squaredB += b[i] * b[i];
  }

  if (squaredA === 0 || squaredB === 0) {
    return 0;
  }
  return dot / (Math.sqrt(squaredA) * Math.sqrt(squaredB));
}

/**
 * Normalize a vector to unit length.
 * Useful for faster similarity calculations (dot product instead of cosine).
 *
 * @param vector Input vector; it is never mutated.
 * @returns A new array. A zero vector is returned as an unchanged copy,
 *   because it has no direction to preserve.
 */
function normalizeVector(vector: number[]): number[] {
  const magnitude = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0));
  if (magnitude === 0) {
    return [...vector];
  }
  return vector.map(v => v / magnitude);
}

/**
 * Calculate the dot product of two vectors.
 * This equals the cosine similarity only when both vectors are unit length.
 *
 * @throws Error if the vectors have different lengths.
 */
function dotProduct(a: number[], b: number[]): number {
  // Without this check a longer `a` yields NaN and a longer `b` is silently truncated.
  assertSameDimensions(a, b);
  return a.reduce((sum, val, i) => sum + val * b[i], 0);
}

/**
 * In-memory semantic search index backed by Gemini embeddings.
 *
 * When constructed with `normalized = true`, every embedding is scaled to
 * unit length on the way in and similarity is computed with a plain dot
 * product; otherwise full cosine similarity is used.
 */
class SemanticSearch {
  private readonly ai: GoogleGenAI;
  private readonly documents: IndexedDocument[] = [];
  private readonly normalized: boolean;

  /**
   * @param apiKey Gemini API key.
   * @param normalized Store unit-length vectors and score with dot product.
   */
  constructor(apiKey: string, normalized: boolean = false) {
    this.ai = new GoogleGenAI({ apiKey });
    this.normalized = normalized;
  }

  /**
   * Index documents (generate and store embeddings).
   *
   * Documents are embedded one at a time, in order, and appended to the
   * index; progress is logged per document.
   *
   * @throws Error if the embedding call fails or returns no vector.
   */
  async indexDocuments(documents: Array<{ id: string; text: string }>): Promise<void> {
    console.log(`\n📚 Indexing ${documents.length} documents...\n`);

    for (const doc of documents) {
      // Documents for indexing use the RETRIEVAL_DOCUMENT task type.
      const embedding = await this.embed(doc.text, 'RETRIEVAL_DOCUMENT');
      this.documents.push({ id: doc.id, text: doc.text, embedding });
      console.log(`✅ Indexed: ${doc.id}`);
    }

    console.log(`\n✨ Indexing complete! ${this.documents.length} documents ready.\n`);
  }

  /**
   * Search for documents similar to a free-text query.
   *
   * @param query Text to search for.
   * @param topK Maximum number of results to return.
   * @returns Results sorted by similarity, highest first.
   * @throws Error if nothing has been indexed yet, or if embedding fails.
   */
  async search(query: string, topK: number = 5): Promise<SearchResult[]> {
    if (this.documents.length === 0) {
      throw new Error('No documents indexed. Call indexDocuments() first.');
    }

    console.log(`🔍 Searching for: "${query}"\n`);

    // Queries use the RETRIEVAL_QUERY task type, not the document one.
    const queryEmbedding = await this.embed(query, 'RETRIEVAL_QUERY');
    return this.rank(queryEmbedding, this.documents, topK);
  }

  /**
   * Find documents similar to an already-indexed document.
   *
   * @param documentId Id of the reference document.
   * @param topK Maximum number of results to return.
   * @returns Results sorted by similarity, highest first, excluding the
   *   reference document itself.
   * @throws Error if no indexed document has the given id.
   */
  findSimilar(documentId: string, topK: number = 5): SearchResult[] {
    const reference = this.documents.find(d => d.id === documentId);
    if (!reference) {
      throw new Error(`Document not found: ${documentId}`);
    }

    const others = this.documents.filter(d => d.id !== documentId);
    return this.rank(reference.embedding, others, topK);
  }

  /** Embed one text, normalizing the vector when this index is in normalized mode. */
  private async embed(text: string, taskType: EmbeddingTaskType): Promise<number[]> {
    const response = await this.ai.models.embedContent({
      model: EMBEDDING_MODEL,
      contents: text,
      config: {
        taskType,
        outputDimensionality: EMBEDDING_DIMENSIONS
      }
    });

    // One input was sent, so the vector is expected in the first slot of `embeddings`.
    const values = response.embeddings?.[0]?.values;
    if (!values || values.length === 0) {
      throw new Error('Embedding response contained no values');
    }
    return this.normalized ? normalizeVector(values) : values;
  }

  /** Score candidates against a vector and return the best `topK`, highest first. */
  private rank(
    queryEmbedding: number[],
    candidates: IndexedDocument[],
    topK: number
  ): SearchResult[] {
    const score = this.normalized ? dotProduct : cosineSimilarity;
    // A negative end index would make slice() count from the end; clamp to "no results".
    const limit = Math.max(0, topK);

    return candidates
      .map((document): SearchResult => ({
        document,
        similarity: score(queryEmbedding, document.embedding)
      }))
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, limit);
  }
}

/** Print a section banner for one example. */
function printHeading(title: string): void {
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
  console.log(title);
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');
}

/**
 * Print ranked results as "N. [score%] id", followed by the document text.
 *
 * @param annotate Optional per-result suffix appended after the id.
 */
function printResults(
  results: SearchResult[],
  annotate?: (result: SearchResult) => string
): void {
  results.forEach((result, i) => {
    const percent = (result.similarity * 100).toFixed(1);
    const suffix = annotate ? ` ${annotate(result)}` : '';
    console.log(`${i + 1}. [${percent}%] ${result.document.id}${suffix}`);
    console.log(` ${result.document.text}\n`);
  });
}

// Example usage
async function main(): Promise<void> {
  try {
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      throw new Error('GEMINI_API_KEY environment variable not set');
    }

    // Initialize search engine
    const search = new SemanticSearch(apiKey, false); // Set true for normalized vectors

    // Sample documents
    const documents = [
      {
        id: 'doc1',
        text: 'Paris is the capital of France. It is known for the Eiffel Tower and the Louvre Museum.'
      },
      {
        id: 'doc2',
        text: 'Machine learning is a subset of artificial intelligence that enables computers to learn from data.'
      },
      {
        id: 'doc3',
        text: 'The Eiffel Tower is an iconic landmark in Paris, France, built in 1889.'
      },
      {
        id: 'doc4',
        text: 'Deep learning uses neural networks with multiple layers to learn complex patterns in data.'
      },
      {
        id: 'doc5',
        text: 'London is the capital of the United Kingdom and home to Big Ben and Buckingham Palace.'
      },
      {
        id: 'doc6',
        text: 'Natural language processing is a branch of AI that helps computers understand human language.'
      }
    ];

    // Index documents
    await search.indexDocuments(documents);

    // Example 1: Search by query
    printHeading('Example 1: Search by Query');
    const query1 = "What is the capital of France?";
    printResults(await search.search(query1, 3));

    // Example 2: Different query
    printHeading('Example 2: AI-related Query');
    const query2 = "Tell me about artificial intelligence";
    printResults(await search.search(query2, 3));

    // Example 3: Find similar documents
    printHeading('Example 3: Find Similar to doc1 (Paris)');
    printResults(search.findSimilar('doc1', 3));

    // Example 4: Demonstrate semantic vs keyword matching
    printHeading('Example 4: Semantic Understanding');
    // doc4 does contain "neural", so each result is flagged rather than
    // claiming that no document matches the keyword.
    console.log('Query: "neural networks" (each result is flagged by whether it contains the keyword)\n');
    const query3 = "neural networks";
    printResults(await search.search(query3, 3), result =>
      result.document.text.toLowerCase().includes('neural') ? '✓ keyword' : '✗ no keyword'
    );

    console.log('📊 Note: High similarity even without exact keyword match!');
    console.log('This demonstrates semantic understanding.\n');
  } catch (error: unknown) {
    // Anything can be thrown, so narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
    process.exit(1);
  }
}

main();