# RAG Implementation Patterns

Complete guide to Retrieval-Augmented Generation (RAG) patterns using Gemini embeddings and Cloudflare Vectorize.

---

## RAG Workflow Overview

```
┌─────────────────────────────────────────────────────────┐
│              DOCUMENT INGESTION (Offline)               │
└─────────────────────────────────────────────────────────┘
  Documents
      ↓
  Chunking (500 words)
      ↓
  Generate Embeddings (RETRIEVAL_DOCUMENT)
      ↓
  Store in Vectorize + Metadata

┌─────────────────────────────────────────────────────────┐
│               QUERY PROCESSING (Runtime)                │
└─────────────────────────────────────────────────────────┘
  User Query
      ↓
  Generate Embedding (RETRIEVAL_QUERY)
      ↓
  Vector Search (top-K)
      ↓
  Retrieve Documents
      ↓
  Generate Response (LLM + Context)
      ↓
  Stream to User
```
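
Every pattern below calls two helpers, `generateEmbedding` and `generateResponse`, that are not defined elsewhere in this file. The sketch below shows one minimal way to implement them against the Gemini REST API; the model names (`gemini-embedding-001`, `gemini-2.5-flash`) and the error handling are assumptions to adapt to your own setup.

```typescript
// Minimal sketch of the shared helpers used throughout this guide.
// Model names are assumptions; swap in whichever Gemini models you actually use.

type EmbeddingTaskType = 'RETRIEVAL_QUERY' | 'RETRIEVAL_DOCUMENT';

const GEMINI_BASE = 'https://generativelanguage.googleapis.com/v1beta/models';

async function generateEmbedding(
  text: string,
  apiKey: string,
  taskType: EmbeddingTaskType
): Promise<number[]> {
  const res = await fetch(`${GEMINI_BASE}/gemini-embedding-001:embedContent`, {
    method: 'POST',
    headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
    body: JSON.stringify({
      content: { parts: [{ text }] },
      taskType // RETRIEVAL_QUERY for queries, RETRIEVAL_DOCUMENT for corpus chunks
    })
  });
  if (!res.ok) throw new Error(`Embedding request failed: ${res.status}`);
  const data = await res.json() as { embedding: { values: number[] } };
  return data.embedding.values;
}

async function generateResponse(context: string, query: string, apiKey: string): Promise<string> {
  // When no context is supplied, fall back to answering the query directly.
  const prompt = context
    ? `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:`
    : query;

  const res = await fetch(`${GEMINI_BASE}/gemini-2.5-flash:generateContent`, {
    method: 'POST',
    headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
    body: JSON.stringify({ contents: [{ parts: [{ text: prompt }] }] })
  });
  if (!res.ok) throw new Error(`Generation request failed: ${res.status}`);
  const data = await res.json() as {
    candidates?: { content: { parts: { text: string }[] } }[];
  };
  return data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
}
```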

---

## Pattern 1: Basic RAG

**Use when**: Simple Q&A over a knowledge base

```typescript
async function basicRAG(query: string, env: Env): Promise<string> {
  // 1. Embed query
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');

  // 2. Search Vectorize
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });

  // 3. Concatenate context
  const context = results.matches
    .map(m => m.metadata?.text)
    .join('\n\n');

  // 4. Generate response
  const response = await generateResponse(context, query, env.GEMINI_API_KEY);

  return response;
}
```
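
For context, here is one way `basicRAG` might be wired into a Worker. The `Env` shape and binding names (`VECTORIZE`, `GEMINI_API_KEY`, `D1`) are assumptions; match them to your own Wrangler configuration, with the binding types coming from `@cloudflare/workers-types`.

```typescript
// Hypothetical Worker entry point wiring up basicRAG.
// Binding names are assumptions; align them with your wrangler.toml.
interface Env {
  VECTORIZE: VectorizeIndex;
  GEMINI_API_KEY: string;
  D1: D1Database; // only needed for the hybrid-search pattern below
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const query = searchParams.get('q');
    if (!query) return new Response('Missing ?q= parameter', { status: 400 });

    const answer = await basicRAG(query, env);
    return Response.json({ query, answer });
  }
};
```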

---

## Pattern 2: Chunked RAG (Recommended)

**Use when**: Documents are longer than 2,048 tokens

### Chunking Strategies

```typescript
// Strategy A: Fixed-size chunks with overlap
function chunkWithOverlap(text: string, size = 500, overlap = 50): string[] {
  const words = text.split(/\s+/);
  const chunks: string[] = [];

  for (let i = 0; i < words.length; i += size - overlap) {
    chunks.push(words.slice(i, i + size).join(' '));
  }

  return chunks;
}

// Strategy B: Sentence-based chunks
function chunkBySentences(text: string, maxSentences = 10): string[] {
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [];
  const chunks: string[] = [];

  for (let i = 0; i < sentences.length; i += maxSentences) {
    chunks.push(sentences.slice(i, i + maxSentences).join(' '));
  }

  return chunks;
}

// Strategy C: Semantic chunks (preserves paragraphs)
function chunkByParagraphs(text: string): string[] {
  return text.split(/\n\n+/).filter(p => p.trim().length > 50);
}
```

### Implementation

```typescript
async function ingestWithChunking(doc: Document, env: Env) {
  const chunks = chunkWithOverlap(doc.text, 500, 50);

  const vectors = [];
  for (let i = 0; i < chunks.length; i++) {
    const embedding = await generateEmbedding(chunks[i], env.GEMINI_API_KEY, 'RETRIEVAL_DOCUMENT');

    vectors.push({
      id: `${doc.id}-chunk-${i}`,
      values: embedding,
      metadata: {
        documentId: doc.id,
        chunkIndex: i,
        text: chunks[i],
        title: doc.title
      }
    });
  }

  await env.VECTORIZE.insert(vectors);
}
```
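
Embedding one chunk per request works, but it costs one round trip per chunk. The Gemini API also exposes a batch embedding endpoint (`batchEmbedContents`); the sketch below shows how ingestion could use it. The endpoint path and payload shape are stated from memory, so verify them against the embeddings docs linked at the end of this guide.

```typescript
// Hedged sketch: embed all of a document's chunks in a single batch request.
// Assumes the Gemini batchEmbedContents endpoint; verify the request shape before relying on it.
async function embedChunksBatch(chunks: string[], apiKey: string): Promise<number[][]> {
  const model = 'models/gemini-embedding-001'; // assumed model name
  const res = await fetch(
    `https://generativelanguage.googleapis.com/v1beta/${model}:batchEmbedContents`,
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({
        requests: chunks.map(text => ({
          model,
          content: { parts: [{ text }] },
          taskType: 'RETRIEVAL_DOCUMENT'
        }))
      })
    }
  );
  if (!res.ok) throw new Error(`Batch embedding failed: ${res.status}`);
  const data = await res.json() as { embeddings: { values: number[] }[] };
  return data.embeddings.map(e => e.values);
}
```

This reduces ingestion to one embedding call per document instead of one per chunk.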

---

## Pattern 3: Hybrid Search (Keyword + Semantic)

**Use when**: You need both exact keyword matches and semantic understanding

```typescript
async function hybridSearch(query: string, env: Env) {
  // 1. Vector search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const vectorResults = await env.VECTORIZE.query(queryEmbedding, { topK: 10 });

  // 2. Keyword search (using metadata or D1; the `relevance` column assumes a
  //    full-text-search setup, since a plain LIKE query has no built-in ranking)
  const keywordResults = await env.D1.prepare(
    'SELECT * FROM documents WHERE text LIKE ? ORDER BY relevance DESC LIMIT 10'
  ).bind(`%${query}%`).all();

  // 3. Merge and re-rank
  const combined = mergeResults(vectorResults.matches, keywordResults.results);

  // 4. Generate response from top results
  const context = combined.slice(0, 5).map(r => r.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
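
`mergeResults` is not defined in this guide. A common way to combine a keyword ranking and a vector ranking is reciprocal rank fusion (RRF). The sketch below is a hypothetical implementation; it assumes each D1 row exposes `id` and `text` columns, which is an assumption about your schema.

```typescript
// Hypothetical mergeResults using reciprocal rank fusion (RRF).
// Items that rank highly in either list float to the top; k = 60 is the conventional constant.
interface RankedDoc { id: string; text: string; score: number; }

function mergeResults(
  vectorMatches: { id: string; metadata?: Record<string, unknown> }[],
  keywordRows: { id: string; text: string }[],
  k = 60
): RankedDoc[] {
  const fused = new Map<string, RankedDoc>();

  const addList = (items: { id: string; text: string }[]) => {
    items.forEach((item, rank) => {
      const existing = fused.get(item.id) ?? { id: item.id, text: item.text, score: 0 };
      existing.score += 1 / (k + rank + 1); // higher rank contributes more
      fused.set(item.id, existing);
    });
  };

  addList(vectorMatches.map(m => ({ id: m.id, text: String(m.metadata?.text ?? '') })));
  addList(keywordRows);

  return [...fused.values()].sort((a, b) => b.score - a.score);
}
```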

---

## Pattern 4: Filtered RAG

**Use when**: Need to filter by category, date, or metadata

```typescript
async function filteredRAG(query: string, filters: { category?: string; minDate?: number }, env: Env) {
  // 1. Vector search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 20 }); // Fetch extra, filter below

  // 2. Filter in the application layer (or use Vectorize's native metadata filtering; see the sketch below)
  const filtered = results.matches.filter(match => {
    if (filters.category && match.metadata?.category !== filters.category) return false;
    if (filters.minDate && (match.metadata?.timestamp as number) < filters.minDate) return false;
    return true;
  });

  // 3. Take top 5 after filtering
  const topResults = filtered.slice(0, 5);

  // 4. Generate response
  const context = topResults.map(r => r.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
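
The application-layer filter always works, but Vectorize can also filter at query time through a `filter` option when metadata indexes exist for the filtered fields (Wrangler has a command for creating them). The sketch below assumes that capability and the `$gte` range operator; verify the exact option and operator names against the Vectorize docs linked at the end of this guide.

```typescript
// Hedged sketch: native metadata filtering in Vectorize.
// Assumes metadata indexes exist for `category` and `timestamp`; operator syntax may differ.
async function filteredRAGNative(
  query: string,
  filters: { category?: string; minDate?: number },
  env: Env
) {
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');

  const results = await env.VECTORIZE.query(queryEmbedding, {
    topK: 5,
    returnMetadata: true,
    filter: {
      ...(filters.category ? { category: filters.category } : {}),
      ...(filters.minDate ? { timestamp: { $gte: filters.minDate } } : {})
    }
  });

  const context = results.matches.map(m => m.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```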

---

## Pattern 5: Streaming RAG

**Use when**: Real-time responses with immediate feedback

```typescript
async function streamingRAG(query: string, env: Env): Promise<ReadableStream> {
  // 1. Embed query and search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });

  const context = results.matches.map(m => m.metadata?.text).join('\n\n');

  // 2. Stream response from Gemini
  const response = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent',
    {
      method: 'POST',
      headers: {
        'x-goog-api-key': env.GEMINI_API_KEY,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        contents: [{
          parts: [{ text: `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:` }]
        }]
      })
    }
  );

  // Pass the upstream body straight through (chunked JSON; see the SSE note below)
  return response.body!;
}
```
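
Note that `streamGenerateContent` returns a stream of JSON chunks by default, so forwarding `response.body` as-is means the client has to parse that format. The Gemini API also supports server-sent events when `?alt=sse` is appended to the URL; the sketch below is one hedged way to turn that SSE stream into plain text deltas with a `TransformStream`.

```typescript
// Hedged sketch: forward only the generated text from a Gemini SSE stream (`?alt=sse`).
// Assumes each `data:` line is a JSON chunk shaped like a GenerateContentResponse.
function extractTextStream(upstream: ReadableStream<Uint8Array>): ReadableStream<Uint8Array> {
  const decoder = new TextDecoder();
  const encoder = new TextEncoder();
  let buffer = '';

  return upstream.pipeThrough(new TransformStream<Uint8Array, Uint8Array>({
    transform(chunk, controller) {
      buffer += decoder.decode(chunk, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop() ?? ''; // keep any partial line for the next chunk

      for (const line of lines) {
        if (!line.startsWith('data: ')) continue;
        try {
          const parsed = JSON.parse(line.slice(6));
          const text = parsed.candidates?.[0]?.content?.parts?.[0]?.text;
          if (text) controller.enqueue(encoder.encode(text));
        } catch {
          // ignore partial or non-JSON lines
        }
      }
    }
  }));
}

// Usage inside streamingRAG (assuming `?alt=sse` was added to the request URL):
// return extractTextStream(response.body!);
```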

---

## Pattern 6: Multi-Query RAG

**Use when**: Query might be ambiguous or multi-faceted

```typescript
async function multiQueryRAG(query: string, env: Env) {
  // 1. Generate multiple query variations
  const queryVariations = await generateQueryVariations(query, env.GEMINI_API_KEY);
  // Returns: ["original query", "rephrased version 1", "rephrased version 2"]

  // 2. Search with each variation
  const allResults = await Promise.all(
    queryVariations.map(async q => {
      const embedding = await generateEmbedding(q, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
      return await env.VECTORIZE.query(embedding, { topK: 3 });
    })
  );

  // 3. Merge and deduplicate
  const uniqueResults = deduplicateById(allResults.flatMap(r => r.matches));

  // 4. Generate response
  const context = uniqueResults.slice(0, 5).map(r => r.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
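
Neither `generateQueryVariations` nor `deduplicateById` is defined above. `deduplicateById` is a one-liner; `generateQueryVariations` can be a single Gemini call that asks for rephrasings, one per line. Both sketches below are assumptions about how you might implement them, reusing the `generateResponse` helper sketched earlier.

```typescript
// Hypothetical helpers for the multi-query pattern.

// Keep only the first match seen for each vector id.
function deduplicateById<T extends { id: string }>(matches: T[]): T[] {
  const seen = new Set<string>();
  return matches.filter(m => (seen.has(m.id) ? false : (seen.add(m.id), true)));
}

// Ask the model for alternative phrasings; always include the original query.
async function generateQueryVariations(query: string, apiKey: string): Promise<string[]> {
  const prompt =
    `Rewrite the following search query in 2 different ways that preserve its meaning. ` +
    `Return one rewrite per line with no numbering.\n\nQuery: ${query}`;

  const raw = await generateResponse('', prompt, apiKey);
  const rewrites = raw.split('\n').map(s => s.trim()).filter(Boolean).slice(0, 2);

  return [query, ...rewrites];
}
```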

---

## Pattern 7: Conversational RAG

**Use when**: Multi-turn conversations with context

```typescript
interface ConversationHistory {
  role: 'user' | 'assistant';
  content: string;
}

async function conversationalRAG(
  query: string,
  history: ConversationHistory[],
  env: Env
) {
  // 1. Create contextualized query from history
  const contextualizedQuery = await reformulateQuery(query, history, env.GEMINI_API_KEY);

  // 2. Search with contextualized query
  const embedding = await generateEmbedding(contextualizedQuery, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(embedding, { topK: 3 });

  const retrievedContext = results.matches.map(m => m.metadata?.text).join('\n\n');

  // 3. Generate response with conversation history
  const prompt = `
Conversation history:
${history.map(h => `${h.role}: ${h.content}`).join('\n')}

Retrieved context:
${retrievedContext}

User: ${query}
Assistant:`;

  return await generateResponse(prompt, query, env.GEMINI_API_KEY);
}
```
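
`reformulateQuery` is the step that turns a follow-up like "what about its pricing?" into a standalone question before embedding. A hedged sketch, built on the `generateResponse` helper sketched earlier:

```typescript
// Hypothetical query reformulation: rewrite the latest user turn as a standalone question.
async function reformulateQuery(
  query: string,
  history: ConversationHistory[],
  apiKey: string
): Promise<string> {
  if (history.length === 0) return query;

  const prompt = `Given the conversation below, rewrite the user's last message as a single,
self-contained search query. Return only the rewritten query.

Conversation:
${history.map(h => `${h.role}: ${h.content}`).join('\n')}

Last message: ${query}`;

  const rewritten = await generateResponse('', prompt, apiKey);
  return rewritten.trim() || query;
}
```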

---

## Pattern 8: Citation RAG

**Use when**: Need to cite sources in responses

```typescript
async function citationRAG(query: string, env: Env) {
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 5, returnMetadata: true });

  // Build context with citations
  const contextWithCitations = results.matches.map((match, i) =>
    `[${i + 1}] ${match.metadata?.text}\nSource: ${match.metadata?.url || match.id}`
  ).join('\n\n');

  const prompt = `Answer the question using the provided sources. Include citations [1], [2], etc. in your answer.

Sources:
${contextWithCitations}

Question: ${query}

Answer (with citations):`;

  const response = await generateResponse(prompt, query, env.GEMINI_API_KEY);

  return {
    answer: response,
    sources: results.matches.map((m, i) => ({
      citation: i + 1,
      text: m.metadata?.text,
      url: m.metadata?.url,
      score: m.score
    }))
  };
}
```

---

## Best Practices

### 1. Chunk Size Optimization

```typescript
// Test different chunk sizes for your use case
const chunkSizes = [200, 500, 1000, 1500];

for (const size of chunkSizes) {
  const accuracy = await testRetrievalAccuracy(size);
  console.log(`Chunk size ${size}: ${accuracy}% accuracy`);
}

// Recommendation: 500-1000 words with ~10% overlap
```

### 2. Context Window Management

```typescript
// Don't exceed the LLM's context window
function truncateContext(chunks: string[], maxTokens = 4000): string {
  let context = '';
  let estimatedTokens = 0;

  for (const chunk of chunks) {
    const chunkTokens = chunk.split(/\s+/).length * 1.3; // Rough estimate: ~1.3 tokens per word
    if (estimatedTokens + chunkTokens > maxTokens) break;

    context += chunk + '\n\n';
    estimatedTokens += chunkTokens;
  }

  return context;
}
```
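
The ~1.3 tokens-per-word estimate is fine for budgeting, but the Gemini API also exposes a `countTokens` method when an exact count matters before sending a large context. A hedged sketch (the model name is an assumption):

```typescript
// Hedged sketch: exact token counting via the Gemini countTokens endpoint.
async function countTokens(text: string, apiKey: string): Promise<number> {
  const res = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:countTokens',
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({ contents: [{ parts: [{ text }] }] })
    }
  );
  if (!res.ok) throw new Error(`countTokens failed: ${res.status}`);
  const data = await res.json() as { totalTokens: number };
  return data.totalTokens;
}
```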

### 3. Re-ranking

```typescript
// Re-rank results after retrieval
function rerank(results: VectorizeMatch[], query: string): VectorizeMatch[] {
  return results
    .map(result => ({
      ...result,
      rerankScore: calculateRelevance(result.metadata?.text, query)
    }))
    .sort((a, b) => b.rerankScore - a.rerankScore);
}
```
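
`calculateRelevance` is left undefined above. A simple lexical-overlap score is enough to demonstrate the idea; real deployments typically use a dedicated reranker model or cross-encoder instead. The sketch below is a hypothetical keyword-overlap implementation.

```typescript
// Hypothetical lightweight relevance score: fraction of query terms present in the chunk.
function calculateRelevance(text: string | undefined, query: string): number {
  if (!text) return 0;

  const terms = query.toLowerCase().split(/\W+/).filter(t => t.length > 2);
  if (terms.length === 0) return 0;

  const haystack = text.toLowerCase();
  const hits = terms.filter(t => haystack.includes(t)).length;

  return hits / terms.length;
}
```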

### 4. Fallback Strategies

```typescript
async function ragWithFallback(query: string, env: Env) {
  const results = await searchVectorize(query, env);

  if (results.matches.length === 0 || results.matches[0].score < 0.7) {
    // Fallback: answer with the LLM alone, without retrieved context
    return await generateResponse('', query, env.GEMINI_API_KEY);
  }

  // Normal RAG flow
  const context = results.matches.map(m => m.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```

---

## Performance Optimization

### 1. Caching

```typescript
// Cache embeddings in memory (per Worker isolate)
const embeddingCache = new Map<string, number[]>();

async function getCachedEmbedding(text: string, apiKey: string) {
  const key = hashText(text);

  if (embeddingCache.has(key)) {
    return embeddingCache.get(key)!;
  }

  const embedding = await generateEmbedding(text, apiKey, 'RETRIEVAL_QUERY');
  embeddingCache.set(key, embedding);

  return embedding;
}
```
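
`hashText` only needs to produce a stable cache key, and a non-cryptographic hash keeps it synchronous, matching how it is called above. Keep in mind that a module-level `Map` lives only as long as a single Worker isolate; Workers KV or the Cache API are options if the cache should survive across requests (a suggestion, not shown here).

```typescript
// Hypothetical hashText: 32-bit FNV-1a, returned as a hex string.
// Not cryptographic; it only needs to be a stable, cheap cache key.
function hashText(text: string): string {
  let hash = 0x811c9dc5;
  for (let i = 0; i < text.length; i++) {
    hash ^= text.charCodeAt(i);
    hash = Math.imul(hash, 0x01000193);
  }
  return (hash >>> 0).toString(16);
}
```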

### 2. Batch Processing

```typescript
// Ingest documents in parallel, a few at a time
async function batchIngest(documents: Document[], env: Env, concurrency = 5) {
  for (let i = 0; i < documents.length; i += concurrency) {
    const batch = documents.slice(i, i + concurrency);

    await Promise.all(
      batch.map(doc => ingestDocument(doc, env))
    );
  }
}
```

---

## Common Pitfalls

### ❌ Don't: Use the same task type for queries and documents

```typescript
// Wrong
const embedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_DOCUMENT');
```

### ✅ Do: Use the correct task types

```typescript
// Correct
const queryEmbedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_QUERY');
const docEmbedding = await generateEmbedding(doc, apiKey, 'RETRIEVAL_DOCUMENT');
```

### ❌ Don't: Return too many or too few results

```typescript
// Too few (might miss relevant info)
const results = await env.VECTORIZE.query(embedding, { topK: 1 });

// Too many (noise, cost)
const results = await env.VECTORIZE.query(embedding, { topK: 50 });
```

### ✅ Do: Find the optimal topK for your use case

```typescript
// Test different topK values
const topK = 5; // Good default for most use cases
const results = await env.VECTORIZE.query(embedding, { topK });
```

---

## Complete Example

See `templates/rag-with-vectorize.ts` for a production-ready implementation combining these patterns.

---

## Official Documentation

- **Gemini Embeddings**: https://ai.google.dev/gemini-api/docs/embeddings
- **Vectorize**: https://developers.cloudflare.com/vectorize/
- **RAG Best Practices**: https://ai.google.dev/gemini-api/docs/document-processing