290 lines
9.0 KiB
TypeScript
290 lines
9.0 KiB
TypeScript
/**
 * Semantic Search with Gemini Embeddings
 *
 * Demonstrates semantic similarity search using cosine similarity.
 * Finds documents based on meaning, not just keyword matching.
 *
 * Setup:
 *   1. npm install @google/genai@^1.27.0
 *   2. export GEMINI_API_KEY="your-api-key"
 *
 * Usage:
 *   npx tsx semantic-search.ts
 */
|
|
|
|
import { GoogleGenAI } from "@google/genai";
|
|
|
|
/**
 * A piece of text held by the search index, optionally paired with its
 * embedding vector.
 */
interface Document {
  /** Identifier used to refer to this document (e.g. when looking it up by id). */
  id: string;

  /** The raw text content of the document. */
  text: string;

  /** Embedding vector for `text`; optional because it may not have been computed yet. */
  embedding?: Array<number>;
}
|
|
|
|
/**
 * One ranked hit: a document together with the similarity score it received.
 */
interface SearchResult {
  /** The document that was scored. */
  document: Document;

  /** Similarity score for `document`; higher means more similar. */
  similarity: number;
}
|
|
|
|
/**
 * Cosine similarity of two equal-length vectors.
 *
 * The result is the dot product divided by the product of the two Euclidean
 * norms, so it lies (up to floating-point rounding) in [-1, 1], with 1 meaning
 * the vectors point in the same direction.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same length as `a`.
 * @returns The cosine similarity, or 0 when either vector has zero magnitude.
 * @throws Error when the vectors have different lengths.
 */
function cosineSimilarity(a: number[], b: number[]): number {
  const dims = a.length;
  if (dims !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }

  // Accumulate the dot product and both squared norms in a single pass.
  let dot = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;
  let idx = 0;
  while (idx < dims) {
    const x = a[idx];
    const y = b[idx];
    dot += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
    idx += 1;
  }

  // A zero vector has no direction; report 0 instead of dividing by zero.
  const hasZeroVector = sumSquaresA === 0 || sumSquaresB === 0;
  return hasZeroVector ? 0 : dot / (Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB));
}
|
|
|
|
/**
 * Scale a vector to unit (Euclidean) length.
 *
 * Unit-length vectors let a plain dot product stand in for cosine similarity.
 * The input is never mutated and the result is always a new array, so callers
 * may safely modify the returned vector.
 *
 * @param vector - The vector to normalize.
 * @returns A new array holding `vector` divided by its magnitude. A
 *   zero-magnitude (or empty) vector cannot be scaled and is returned as an
 *   unchanged copy.
 */
function normalizeVector(vector: number[]): number[] {
  const magnitude = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0));

  if (magnitude === 0) {
    // Return a copy, not the input itself, so the result never aliases the
    // caller's array regardless of which branch was taken.
    return [...vector];
  }

  return vector.map((v) => v / magnitude);
}
|
|
|
|
/**
 * Dot product of two equal-length vectors.
 *
 * The result equals the cosine similarity only when both inputs are already
 * normalized to unit length.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same length as `a`.
 * @returns The sum of the element-wise products (0 for two empty vectors).
 * @throws Error when the vectors have different lengths. Without this check a
 *   shorter `b` would silently produce NaN and a longer `b` would be silently
 *   truncated.
 */
function dotProduct(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }

  return a.reduce((sum, val, i) => sum + val * b[i], 0);
}
|
|
|
|
/**
 * In-memory semantic search index backed by Gemini embeddings.
 *
 * Documents are embedded once when indexed; queries are embedded on demand and
 * ranked against the stored vectors. When constructed with `normalized = true`
 * every vector is scaled to unit length so ranking can use a plain dot product
 * instead of full cosine similarity.
 */
class SemanticSearch {
  /** Embedding model used for both documents and queries. */
  private static readonly EMBEDDING_MODEL = 'gemini-embedding-001';

  /** Requested embedding size; must be the same for documents and queries. */
  private static readonly EMBEDDING_DIMENSIONS = 768;

  private readonly ai: GoogleGenAI;

  /** Indexed documents; every entry carries its embedding. */
  private readonly documents: Array<Required<Document>> = [];

  private readonly normalized: boolean;

  /**
   * @param apiKey - Gemini API key passed to the `GoogleGenAI` client.
   * @param normalized - When true, store unit-length vectors and rank with a
   *   dot product; when false, rank with cosine similarity.
   */
  constructor(apiKey: string, normalized: boolean = false) {
    this.ai = new GoogleGenAI({ apiKey });
    this.normalized = normalized;
  }

  /**
   * Index documents (generate and store embeddings).
   *
   * Documents are embedded one at a time, in order, and appended to the
   * index; progress is logged to the console.
   *
   * @param documents - Documents to embed and add to the index.
   * @throws Error when the embedding request fails or returns no vector.
   */
  async indexDocuments(documents: Array<{ id: string; text: string }>): Promise<void> {
    console.log(`\n📚 Indexing ${documents.length} documents...\n`);

    for (const doc of documents) {
      const embedding = await this.embed(doc.text, 'RETRIEVAL_DOCUMENT');

      this.documents.push({ id: doc.id, text: doc.text, embedding });

      console.log(`✅ Indexed: ${doc.id}`);
    }

    console.log(`\n✨ Indexing complete! ${this.documents.length} documents ready.\n`);
  }

  /**
   * Search for the documents most similar to a free-text query.
   *
   * @param query - Query text to embed and compare against the index.
   * @param topK - Maximum number of results to return (negative values are
   *   treated as 0).
   * @returns Up to `topK` results, sorted by descending similarity.
   * @throws Error when nothing has been indexed yet, or when the embedding
   *   request fails or returns no vector.
   */
  async search(query: string, topK: number = 5): Promise<SearchResult[]> {
    if (this.documents.length === 0) {
      throw new Error('No documents indexed. Call indexDocuments() first.');
    }

    console.log(`🔍 Searching for: "${query}"\n`);

    const queryEmbedding = await this.embed(query, 'RETRIEVAL_QUERY');

    return this.rank(queryEmbedding, this.documents, topK);
  }

  /**
   * Find the indexed documents most similar to an already-indexed document.
   *
   * @param documentId - Id of the reference document.
   * @param topK - Maximum number of results to return (negative values are
   *   treated as 0).
   * @returns Up to `topK` results, sorted by descending similarity; documents
   *   sharing `documentId` are excluded.
   * @throws Error when no indexed document has the given id.
   */
  findSimilar(documentId: string, topK: number = 5): SearchResult[] {
    const reference = this.documents.find((d) => d.id === documentId);

    if (!reference) {
      throw new Error(`Document not found: ${documentId}`);
    }

    // Exclude the reference document itself from its own neighbours.
    const others = this.documents.filter((d) => d.id !== documentId);

    return this.rank(reference.embedding, others, topK);
  }

  /**
   * Request an embedding for `text`, normalizing it when this index was
   * configured for normalized vectors.
   *
   * The `@google/genai` SDK takes the input under `contents` and returns the
   * vectors under `response.embeddings`, one entry per input.
   */
  private async embed(
    text: string,
    taskType: 'RETRIEVAL_DOCUMENT' | 'RETRIEVAL_QUERY'
  ): Promise<number[]> {
    const response = await this.ai.models.embedContent({
      model: SemanticSearch.EMBEDDING_MODEL,
      contents: text,
      config: {
        taskType,
        outputDimensionality: SemanticSearch.EMBEDDING_DIMENSIONS
      }
    });

    const values = response.embeddings?.[0]?.values;
    if (!values || values.length === 0) {
      // Fail loudly here rather than storing an unusable vector.
      throw new Error('Embedding API returned no embedding values');
    }

    return this.normalized ? normalizeVector(values) : values;
  }

  /** Score `candidates` against `reference` and return the best `topK`, highest first. */
  private rank(
    reference: number[],
    candidates: Array<Required<Document>>,
    topK: number
  ): SearchResult[] {
    // Unit-length vectors only need a dot product; otherwise use full cosine.
    const score = this.normalized ? dotProduct : cosineSimilarity;

    return candidates
      .map((document) => ({ document, similarity: score(reference, document.embedding) }))
      .sort((x, y) => y.similarity - x.similarity)
      // Clamp so a negative topK yields no results instead of trimming the tail.
      .slice(0, Math.max(0, topK));
  }
}
|
|
|
|
/** Print a section heading framed by horizontal rules. */
function printHeading(title: string): void {
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
  console.log(title);
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');
}

/**
 * Print ranked results as "<rank>. [<similarity %>] <id>" followed by the text.
 * `annotate`, when given, supplies an extra label appended to the first line.
 */
function printResults(
  results: SearchResult[],
  annotate?: (result: SearchResult) => string
): void {
  results.forEach((result, i) => {
    const percent = (result.similarity * 100).toFixed(1);
    const label = annotate ? ` ${annotate(result)}` : '';
    console.log(`${i + 1}. [${percent}%] ${result.document.id}${label}`);
    console.log(` ${result.document.text}\n`);
  });
}

/**
 * Example usage: index six sample documents, then run three query searches
 * and one document-to-document similarity lookup, printing the results.
 *
 * Reads the API key from the `GEMINI_API_KEY` environment variable. Any
 * failure is reported on stderr and the process exits with status 1.
 */
async function main() {
  try {
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      throw new Error('GEMINI_API_KEY environment variable not set');
    }

    // Initialize search engine
    const search = new SemanticSearch(apiKey, false); // Set true for normalized vectors

    // Sample documents
    const documents = [
      {
        id: 'doc1',
        text: 'Paris is the capital of France. It is known for the Eiffel Tower and the Louvre Museum.'
      },
      {
        id: 'doc2',
        text: 'Machine learning is a subset of artificial intelligence that enables computers to learn from data.'
      },
      {
        id: 'doc3',
        text: 'The Eiffel Tower is an iconic landmark in Paris, France, built in 1889.'
      },
      {
        id: 'doc4',
        text: 'Deep learning uses neural networks with multiple layers to learn complex patterns in data.'
      },
      {
        id: 'doc5',
        text: 'London is the capital of the United Kingdom and home to Big Ben and Buckingham Palace.'
      },
      {
        id: 'doc6',
        text: 'Natural language processing is a branch of AI that helps computers understand human language.'
      }
    ];

    // Index documents
    await search.indexDocuments(documents);

    // Example 1: Search by query
    printHeading('Example 1: Search by Query');
    printResults(await search.search('What is the capital of France?', 3));

    // Example 2: Different query
    printHeading('Example 2: AI-related Query');
    printResults(await search.search('Tell me about artificial intelligence', 3));

    // Example 3: Find similar documents
    printHeading('Example 3: Find Similar to doc1 (Paris)');
    printResults(search.findSimilar('doc1', 3));

    // Example 4: Demonstrate semantic vs keyword matching
    printHeading('Example 4: Semantic Understanding');
    // Only doc4 contains the literal phrase; any other hit is matched on meaning alone.
    console.log('Query: "neural networks" (exact keyword appears only in doc4)\n');

    printResults(await search.search('neural networks', 3), (result) =>
      result.document.text.toLowerCase().includes('neural') ? '✓ keyword' : '✗ no keyword'
    );

    console.log('📊 Note: High similarity even without exact keyword match!');
    console.log('This demonstrates semantic understanding.\n');
  } catch (error: unknown) {
    // Anything can be thrown in JavaScript; only Error instances have `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
    process.exit(1);
  }
}
|
|
|
|
// Script entry point. `main` handles its own failures, so the returned promise
// is intentionally not awaited.
void main();
|