Files
gh-jezweb-claude-skills-ski…/templates/semantic-search.ts
2025-11-30 08:24:54 +08:00

290 lines
9.0 KiB
TypeScript

/**
* Semantic Search with Gemini Embeddings
*
* Demonstrates semantic similarity search using cosine similarity.
* Finds documents based on meaning, not just keyword matching.
*
* Setup:
* 1. npm install @google/genai@^1.27.0
* 2. export GEMINI_API_KEY="your-api-key"
*
* Usage:
* npx tsx semantic-search.ts
*/
import { GoogleGenAI } from "@google/genai";
/**
 * A piece of text tracked by the search index.
 */
interface Document {
// Caller-supplied identifier; SemanticSearch.findSimilar() looks documents up by this value.
id: string;
// Raw text that is sent to the embedding model and echoed back in search results.
text: string;
// Embedding vector for `text`. Optional on the type; SemanticSearch.indexDocuments() fills it in when it stores the document.
embedding?: number[];
}
/**
 * One scored hit produced by SemanticSearch.search() or SemanticSearch.findSimilar().
 */
interface SearchResult {
// The indexed document that was scored.
document: Document;
// Score against the query/reference vector: cosine similarity, or the dot product of
// normalized vectors when the index was created in normalized mode. Results are sorted
// by this value in descending order.
similarity: number;
}
/**
 * Cosine similarity of two equal-length vectors.
 *
 * The result is the dot product divided by the product of the two Euclidean
 * norms, so it lies (up to rounding) between -1 and 1, with 1 meaning the
 * vectors point in the same direction.
 *
 * @param a First vector.
 * @param b Second vector; must have the same length as `a`.
 * @returns The cosine of the angle between `a` and `b`, or 0 when either
 *          vector has zero magnitude (including two empty vectors).
 * @throws Error when the vectors differ in length.
 */
function cosineSimilarity(a: number[], b: number[]): number {
  const dims = a.length;
  if (dims !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }

  // Single pass: accumulate the dot product and both squared norms together.
  let dot = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;
  for (const [idx, x] of a.entries()) {
    const y = b[idx];
    dot += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
  }

  // A zero vector has no direction; report 0 instead of dividing by zero.
  const hasZeroVector = sumSquaresA === 0 || sumSquaresB === 0;
  return hasZeroVector ? 0 : dot / (Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB));
}
/**
 * Scale a vector to unit (Euclidean) length.
 *
 * Unit-length vectors let similarity be computed with a plain dot product
 * instead of a full cosine-similarity calculation.
 *
 * @param vector Vector to normalize; it is never mutated.
 * @returns A new array. For a zero-magnitude (or empty) input the result is an
 *          unscaled copy, since such a vector has no direction to preserve.
 */
function normalizeVector(vector: number[]): number[] {
  const magnitude = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0));
  if (magnitude === 0) {
    // Return a copy rather than the input itself so callers always own the
    // result and can mutate it without affecting the original vector.
    return vector.slice();
  }
  return vector.map(v => v / magnitude);
}
/**
 * Dot product of two equal-length vectors.
 *
 * Equals the cosine similarity only when both inputs are already unit-length,
 * so use it as a similarity score for normalized vectors only.
 *
 * @param a First vector.
 * @param b Second vector; must have the same length as `a`.
 * @returns Sum of the element-wise products (0 for two empty vectors).
 * @throws Error when the vectors differ in length. Without this check a
 *         shorter `b` would silently produce NaN and a longer `b` would be
 *         silently truncated.
 */
function dotProduct(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error(`Vector dimensions must match: ${a.length} vs ${b.length}`);
  }
  return a.reduce((sum, val, i) => sum + val * b[i], 0);
}
/**
 * In-memory semantic search index backed by Gemini embeddings.
 *
 * Documents are embedded with task type RETRIEVAL_DOCUMENT and queries with
 * RETRIEVAL_QUERY, both at 768 output dimensions. When constructed with
 * `normalized = true`, every vector is scaled to unit length as it is produced
 * and scoring uses a plain dot product; otherwise full cosine similarity is used.
 */
class SemanticSearch {
  private readonly ai: GoogleGenAI;
  // Every stored document carries its embedding, so scoring needs no null checks.
  private readonly documents: Array<Document & { embedding: number[] }> = [];
  private readonly normalized: boolean = false;

  /**
   * @param apiKey Gemini API key passed to the GoogleGenAI client.
   * @param normalized When true, store unit-length vectors and score with a dot product.
   */
  constructor(apiKey: string, normalized: boolean = false) {
    this.ai = new GoogleGenAI({ apiKey });
    this.normalized = normalized;
  }

  /**
   * Index documents (generate and store embeddings).
   *
   * Documents are embedded one at a time and appended to the index as each
   * request succeeds, so a failure part-way through leaves the earlier
   * documents indexed.
   *
   * @param documents Documents to embed and add to the index.
   * @throws Error if the embedding request fails or returns no vector.
   */
  async indexDocuments(documents: Array<{ id: string; text: string }>): Promise<void> {
    console.log(`\n📚 Indexing ${documents.length} documents...\n`);

    for (const doc of documents) {
      const embedding = await this.embed(doc.text, 'RETRIEVAL_DOCUMENT');
      this.documents.push({ id: doc.id, text: doc.text, embedding });
      console.log(`✅ Indexed: ${doc.id}`);
    }

    console.log(`\n✨ Indexing complete! ${this.documents.length} documents ready.\n`);
  }

  /**
   * Search for the indexed documents most similar to a free-text query.
   *
   * @param query Text to search for.
   * @param topK Maximum number of results to return (negative values are treated as 0).
   * @returns Up to `topK` results, most similar first.
   * @throws Error if nothing has been indexed yet, or if embedding the query fails.
   */
  async search(query: string, topK: number = 5): Promise<SearchResult[]> {
    if (this.documents.length === 0) {
      throw new Error('No documents indexed. Call indexDocuments() first.');
    }

    console.log(`🔍 Searching for: "${query}"\n`);

    const queryEmbedding = await this.embed(query, 'RETRIEVAL_QUERY');
    const results: SearchResult[] = this.documents.map(doc => ({
      document: doc,
      similarity: this.score(queryEmbedding, doc.embedding)
    }));

    return this.rank(results, topK);
  }

  /**
   * Find the indexed documents most similar to an already-indexed document.
   *
   * @param documentId Id of the reference document.
   * @param topK Maximum number of results to return (negative values are treated as 0).
   * @returns Up to `topK` other documents, most similar first; documents with
   *          the reference id are excluded.
   * @throws Error if no indexed document has the given id.
   */
  findSimilar(documentId: string, topK: number = 5): SearchResult[] {
    const reference = this.documents.find(d => d.id === documentId);
    if (!reference) {
      throw new Error(`Document not found: ${documentId}`);
    }

    const results: SearchResult[] = this.documents
      .filter(d => d.id !== documentId) // Exclude the document itself
      .map(d => ({
        document: d,
        similarity: this.score(reference.embedding, d.embedding)
      }));

    return this.rank(results, topK);
  }

  /**
   * Request an embedding for `text` and apply normalization if enabled.
   *
   * Uses the @google/genai request/response shape: the input goes in
   * `contents` and the vector comes back as `embeddings[0].values`.
   */
  private async embed(
    text: string,
    taskType: 'RETRIEVAL_DOCUMENT' | 'RETRIEVAL_QUERY'
  ): Promise<number[]> {
    const response = await this.ai.models.embedContent({
      model: 'gemini-embedding-001',
      contents: text,
      config: {
        taskType,
        outputDimensionality: 768
      }
    });

    const values = response.embeddings?.[0]?.values;
    if (!values || values.length === 0) {
      throw new Error(`Embedding API returned no vector (taskType: ${taskType})`);
    }

    // Normalize if requested (faster similarity calculation)
    return this.normalized ? normalizeVector(values) : values;
  }

  /** Score two vectors using the metric that matches the index mode. */
  private score(a: number[], b: number[]): number {
    return this.normalized ? dotProduct(a, b) : cosineSimilarity(a, b);
  }

  /**
   * Sort by similarity (descending) and keep the first `topK` entries.
   * Sorts `results` in place; callers pass a freshly built array.
   */
  private rank(results: SearchResult[], topK: number): SearchResult[] {
    return results
      .sort((x, y) => y.similarity - x.similarity)
      .slice(0, Math.max(0, topK));
  }
}
// Example usage
/**
 * Demo entry point: indexes six sample documents, then runs three query
 * searches and one find-similar lookup, printing the ranked results.
 *
 * Reads the API key from the GEMINI_API_KEY environment variable. Any failure
 * is logged to stderr and the process exits with status 1.
 */
async function main(): Promise<void> {
  const divider = '━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━';

  // Print a section banner for one example.
  const printHeading = (title: string): void => {
    console.log(divider);
    console.log(title);
    console.log(`${divider}\n`);
  };

  // Print ranked results as "N. [score%] id<suffix>" followed by the document text.
  const printResults = (
    results: ReadonlyArray<{ similarity: number; document: { id: string; text: string } }>,
    suffix: (text: string) => string = () => ''
  ): void => {
    results.forEach((result, i) => {
      const percent = (result.similarity * 100).toFixed(1);
      console.log(`${i + 1}. [${percent}%] ${result.document.id}${suffix(result.document.text)}`);
      console.log(` ${result.document.text}\n`);
    });
  };

  try {
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey) {
      throw new Error('GEMINI_API_KEY environment variable not set');
    }

    // Initialize search engine
    const search = new SemanticSearch(apiKey, false); // Set true for normalized vectors

    // Sample documents
    const documents = [
      {
        id: 'doc1',
        text: 'Paris is the capital of France. It is known for the Eiffel Tower and the Louvre Museum.'
      },
      {
        id: 'doc2',
        text: 'Machine learning is a subset of artificial intelligence that enables computers to learn from data.'
      },
      {
        id: 'doc3',
        text: 'The Eiffel Tower is an iconic landmark in Paris, France, built in 1889.'
      },
      {
        id: 'doc4',
        text: 'Deep learning uses neural networks with multiple layers to learn complex patterns in data.'
      },
      {
        id: 'doc5',
        text: 'London is the capital of the United Kingdom and home to Big Ben and Buckingham Palace.'
      },
      {
        id: 'doc6',
        text: 'Natural language processing is a branch of AI that helps computers understand human language.'
      }
    ];

    // Index documents
    await search.indexDocuments(documents);

    // Example 1: Search by query
    printHeading('Example 1: Search by Query');
    printResults(await search.search("What is the capital of France?", 3));

    // Example 2: Different query
    printHeading('Example 2: AI-related Query');
    printResults(await search.search("Tell me about artificial intelligence", 3));

    // Example 3: Find similar documents
    printHeading('Example 3: Find Similar to doc1 (Paris)');
    printResults(search.findSimilar('doc1', 3));

    // Example 4: Demonstrate semantic vs keyword matching
    printHeading('Example 4: Semantic Understanding');
    // Only doc4 literally contains the query phrase, so the banner says so
    // instead of claiming that no document matches by keyword.
    console.log('Query: "neural networks" (only doc4 contains the exact keywords)\n');
    printResults(await search.search("neural networks", 3), text => {
      const hasKeyword = text.toLowerCase().includes('neural');
      return ` ${hasKeyword ? '✓ keyword' : '✗ no keyword'}`;
    });
    console.log('📊 Note: High similarity even without exact keyword match!');
    console.log('This demonstrates semantic understanding.\n');
  } catch (error: unknown) {
    // Anything can be thrown, so narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.error('❌ Error:', message);
    process.exit(1);
  }
}

main();