# RAG Implementation Patterns

A complete guide to Retrieval-Augmented Generation (RAG) patterns using Gemini embeddings and Cloudflare Vectorize.

---

## RAG Workflow Overview

```
┌────────────────────────────────────────────────┐
│          DOCUMENT INGESTION (Offline)          │
└────────────────────────────────────────────────┘
Documents
    ↓
Chunking (500 words)
    ↓
Generate Embeddings (RETRIEVAL_DOCUMENT)
    ↓
Store in Vectorize + Metadata

┌────────────────────────────────────────────────┐
│           QUERY PROCESSING (Runtime)           │
└────────────────────────────────────────────────┘
User Query
    ↓
Generate Embedding (RETRIEVAL_QUERY)
    ↓
Vector Search (top-K)
    ↓
Retrieve Documents
    ↓
Generate Response (LLM + Context)
    ↓
Stream to User
```

---

## Pattern 1: Basic RAG

**Use when**: Simple Q&A over a knowledge base

```typescript
async function basicRAG(query: string, env: Env): Promise<string> {
  // 1. Embed query
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');

  // 2. Search Vectorize
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });

  // 3. Concatenate context
  const context = results.matches
    .map(m => m.metadata?.text)
    .join('\n\n');

  // 4. Generate response
  const response = await generateResponse(context, query, env.GEMINI_API_KEY);

  return response;
}
```
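
All patterns in this guide assume two small helpers, `generateEmbedding` and `generateResponse`, which are not defined elsewhere in this document. The sketch below is one minimal way to implement them against the Gemini REST API; the model names (`text-embedding-004`, `gemini-2.5-flash`) and the prompt template are assumptions to adapt to your setup.

```typescript
// Minimal helper sketches (assumed, not prescriptive).
type TaskType = 'RETRIEVAL_QUERY' | 'RETRIEVAL_DOCUMENT';

async function generateEmbedding(text: string, apiKey: string, taskType: TaskType): Promise<number[]> {
  const res = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/text-embedding-004:embedContent',
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({ content: { parts: [{ text }] }, taskType })
    }
  );
  if (!res.ok) throw new Error(`Embedding request failed: ${res.status}`);
  const data = await res.json() as { embedding: { values: number[] } };
  return data.embedding.values;
}

async function generateResponse(context: string, query: string, apiKey: string): Promise<string> {
  const res = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent',
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({
        contents: [{ parts: [{ text: `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:` }] }]
      })
    }
  );
  if (!res.ok) throw new Error(`Generation request failed: ${res.status}`);
  const data = await res.json() as { candidates?: { content: { parts: { text: string }[] } }[] };
  return data.candidates?.[0]?.content.parts.map(p => p.text).join('') ?? '';
}
```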

---

## Pattern 2: Chunked RAG (Recommended)

**Use when**: Documents are longer than 2,048 tokens

### Chunking Strategies

```typescript
// Strategy A: Fixed-size chunks with overlap
function chunkWithOverlap(text: string, size = 500, overlap = 50): string[] {
  const words = text.split(/\s+/);
  const chunks: string[] = [];
  const step = Math.max(1, size - overlap); // guard against overlap >= size

  for (let i = 0; i < words.length; i += step) {
    chunks.push(words.slice(i, i + size).join(' '));
  }

  return chunks;
}

// Strategy B: Sentence-based chunks
function chunkBySentences(text: string, maxSentences = 10): string[] {
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [];
  const chunks: string[] = [];

  for (let i = 0; i < sentences.length; i += maxSentences) {
    chunks.push(sentences.slice(i, i + maxSentences).join(' '));
  }

  return chunks;
}

// Strategy C: Semantic chunks (preserves paragraphs)
function chunkByParagraphs(text: string): string[] {
  return text.split(/\n\n+/).filter(p => p.trim().length > 50);
}
```

### Implementation

```typescript
async function ingestWithChunking(doc: Document, env: Env) {
  const chunks = chunkWithOverlap(doc.text, 500, 50);

  const vectors = [];
  for (let i = 0; i < chunks.length; i++) {
    const embedding = await generateEmbedding(chunks[i], env.GEMINI_API_KEY, 'RETRIEVAL_DOCUMENT');

    vectors.push({
      id: `${doc.id}-chunk-${i}`,
      values: embedding,
      metadata: {
        documentId: doc.id,
        chunkIndex: i,
        text: chunks[i],
        title: doc.title
      }
    });
  }

  await env.VECTORIZE.insert(vectors);
}
```
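
When querying a chunked index, several of the top matches often come from the same document. A small post-processing step (a sketch, not part of the ingestion code above) can collapse duplicates so the retrieved context covers more distinct sources; it assumes the `documentId` metadata written during ingestion and that matches arrive sorted by score, highest first.

```typescript
// Keep only the best-scoring chunk(s) per document.
function collapseChunksByDocument(matches: VectorizeMatch[], perDocument = 1): VectorizeMatch[] {
  const seen = new Map<string, number>();
  const collapsed: VectorizeMatch[] = [];

  for (const match of matches) {
    const docId = String(match.metadata?.documentId ?? match.id);
    const count = seen.get(docId) ?? 0;
    if (count < perDocument) {
      collapsed.push(match);
      seen.set(docId, count + 1);
    }
  }

  return collapsed;
}
```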

---

## Pattern 3: Hybrid Search (Keyword + Semantic)

**Use when**: You need both exact keyword matches and semantic understanding

```typescript
async function hybridSearch(query: string, env: Env) {
  // 1. Vector search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const vectorResults = await env.VECTORIZE.query(queryEmbedding, { topK: 10 });

  // 2. Keyword search (using metadata or D1)
  // Note: a plain LIKE query has no relevance ranking; use a D1 FTS5 virtual
  // table if you need ranked keyword results.
  const keywordResults = await env.D1.prepare(
    'SELECT * FROM documents WHERE text LIKE ? LIMIT 10'
  ).bind(`%${query}%`).all();

  // 3. Merge and re-rank
  const combined = mergeResults(vectorResults.matches, keywordResults.results);

  // 4. Generate response from top results
  const context = combined.slice(0, 5).map(r => r.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
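
`mergeResults` is left undefined above. One common way to combine ranked lists from different retrievers is reciprocal rank fusion (RRF). The sketch below is illustrative only: it assumes each keyword row exposes `id` and `text` columns, that chunk metadata includes a `text` field, and that both sides share a compatible id scheme.

```typescript
// Reciprocal rank fusion over the vector and keyword result lists (sketch).
function mergeResults(
  vectorMatches: VectorizeMatch[],
  keywordRows: { id: string; text: string }[],
  k = 60
): { id: string; text: string; score: number }[] {
  const fused = new Map<string, { id: string; text: string; score: number }>();

  const add = (id: string, text: string, rank: number) => {
    const entry = fused.get(id) ?? { id, text, score: 0 };
    entry.score += 1 / (k + rank + 1); // later ranks contribute less
    fused.set(id, entry);
  };

  vectorMatches.forEach((m, rank) => add(m.id, String(m.metadata?.text ?? ''), rank));
  keywordRows.forEach((row, rank) => add(row.id, row.text, rank));

  return [...fused.values()].sort((a, b) => b.score - a.score);
}
```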

---

## Pattern 4: Filtered RAG

**Use when**: Need to filter by category, date, or metadata

```typescript
async function filteredRAG(query: string, filters: { category?: string; minDate?: number }, env: Env) {
  // 1. Vector search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 20 }); // Fetch more, then filter

  // 2. Filter in the application layer (requires no metadata indexes;
  //    Vectorize also supports native query-time filtering, see below)
  const filtered = results.matches.filter(match => {
    if (filters.category && match.metadata?.category !== filters.category) return false;
    if (filters.minDate && match.metadata?.timestamp < filters.minDate) return false;
    return true;
  });

  // 3. Take top 5 after filtering
  const topResults = filtered.slice(0, 5);

  // 4. Generate response
  const context = topResults.map(r => r.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
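
Vectorize can also filter at query time on metadata properties that have a metadata index configured. The variant below is a sketch under that assumption (indexes on `category` and `timestamp`); check the Vectorize metadata-filtering docs for the exact operators and index setup available to you.

```typescript
// Variant using Vectorize's query-time metadata filter (sketch; assumes
// metadata indexes exist for `category` and `timestamp`).
async function filteredRAGNative(
  query: string,
  filters: { category?: string; minDate?: number },
  env: Env
) {
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');

  const results = await env.VECTORIZE.query(queryEmbedding, {
    topK: 5,
    returnMetadata: true,
    filter: {
      ...(filters.category ? { category: filters.category } : {}),
      ...(filters.minDate ? { timestamp: { $gte: filters.minDate } } : {})
    }
  });

  const context = results.matches.map(m => m.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```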

---

## Pattern 5: Streaming RAG

**Use when**: Real-time responses with immediate feedback

```typescript
async function streamingRAG(query: string, env: Env): Promise<ReadableStream> {
  // 1. Embed query and search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });

  const context = results.matches.map(m => m.metadata?.text).join('\n\n');

  // 2. Stream response from Gemini
  // (append `?alt=sse` to the URL to receive server-sent events
  //  instead of a streamed JSON array)
  const response = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent',
    {
      method: 'POST',
      headers: {
        'x-goog-api-key': env.GEMINI_API_KEY,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        contents: [{
          parts: [{ text: `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:` }]
        }]
      })
    }
  );

  return response.body!;
}
```
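
As a usage sketch (not from the original), a Worker can pipe that stream straight through to the client. The route shape and content type below are assumptions; if you request `?alt=sse` upstream, return `text/event-stream` instead.

```typescript
// Hypothetical Worker entry point that proxies the model stream to the caller.
export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { query } = await request.json() as { query: string };
    const stream = await streamingRAG(query, env);

    // Mirrors the upstream format: a streamed JSON array by default.
    return new Response(stream, {
      headers: { 'Content-Type': 'application/json; charset=utf-8' }
    });
  }
};
```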

---

## Pattern 6: Multi-Query RAG

**Use when**: Query might be ambiguous or multi-faceted

```typescript
async function multiQueryRAG(query: string, env: Env) {
  // 1. Generate multiple query variations
  const queryVariations = await generateQueryVariations(query, env.GEMINI_API_KEY);
  // Returns: ["original query", "rephrased version 1", "rephrased version 2"]

  // 2. Search with each variation
  const allResults = await Promise.all(
    queryVariations.map(async q => {
      const embedding = await generateEmbedding(q, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
      return await env.VECTORIZE.query(embedding, { topK: 3 });
    })
  );

  // 3. Merge and deduplicate
  const uniqueResults = deduplicateById(allResults.flatMap(r => r.matches));

  // 4. Generate response
  const context = uniqueResults.slice(0, 5).map(r => r.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
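
The two helpers referenced above are not defined in this guide; here is one possible sketch. `generateQueryVariations` reuses the `generateResponse` helper from Pattern 1 to ask the model for rephrasings, and `deduplicateById` keeps the best-scoring match per vector id after merging the lists.

```typescript
// Sketch: ask the model for rephrasings, then parse one query per line.
async function generateQueryVariations(query: string, apiKey: string): Promise<string[]> {
  const prompt =
    `Rewrite the following search query in two different ways, one per line, ` +
    `without numbering or commentary:\n\n${query}`;
  const raw = await generateResponse('', prompt, apiKey);

  const variations = raw
    .split('\n')
    .map(line => line.trim())
    .filter(line => line.length > 0)
    .slice(0, 2);

  return [query, ...variations];
}

// Sketch: sort merged matches by score, then keep the first occurrence of each id.
function deduplicateById(matches: VectorizeMatch[]): VectorizeMatch[] {
  const sorted = [...matches].sort((a, b) => b.score - a.score);
  const seen = new Set<string>();
  return sorted.filter(m => !seen.has(m.id) && (seen.add(m.id), true));
}
```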

---

## Pattern 7: Conversational RAG

**Use when**: Multi-turn conversations with context

```typescript
interface ConversationHistory {
  role: 'user' | 'assistant';
  content: string;
}

async function conversationalRAG(
  query: string,
  history: ConversationHistory[],
  env: Env
) {
  // 1. Create contextualized query from history
  const contextualizedQuery = await reformulateQuery(query, history, env.GEMINI_API_KEY);

  // 2. Search with contextualized query
  const embedding = await generateEmbedding(contextualizedQuery, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(embedding, { topK: 3 });

  const retrievedContext = results.matches.map(m => m.metadata?.text).join('\n\n');

  // 3. Generate response with conversation history
  const prompt = `
Conversation history:
${history.map(h => `${h.role}: ${h.content}`).join('\n')}

Retrieved context:
${retrievedContext}

User: ${query}
Assistant:`;

  return await generateResponse(prompt, query, env.GEMINI_API_KEY);
}
```
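
`reformulateQuery` is assumed above: the idea is to rewrite a follow-up such as "what about its pricing?" into a standalone query before embedding it. A minimal sketch reusing the `generateResponse` helper:

```typescript
// Sketch: turn a follow-up question into a self-contained search query.
async function reformulateQuery(
  query: string,
  history: ConversationHistory[],
  apiKey: string
): Promise<string> {
  if (history.length === 0) return query;

  const prompt =
    `Given this conversation, rewrite the user's last question as a standalone ` +
    `search query. Reply with the query only.\n\n` +
    `${history.map(h => `${h.role}: ${h.content}`).join('\n')}\n` +
    `user: ${query}`;

  const rewritten = await generateResponse('', prompt, apiKey);
  return rewritten.trim() || query;
}
```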

---

## Pattern 8: Citation RAG

**Use when**: Need to cite sources in responses

```typescript
async function citationRAG(query: string, env: Env) {
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 5, returnMetadata: true });

  // Build context with citations
  const contextWithCitations = results.matches.map((match, i) =>
    `[${i + 1}] ${match.metadata?.text}\nSource: ${match.metadata?.url || match.id}`
  ).join('\n\n');

  const prompt = `Answer the question using the provided sources. Include citations [1], [2], etc. in your answer.

Sources:
${contextWithCitations}

Question: ${query}

Answer (with citations):`;

  const response = await generateResponse(prompt, query, env.GEMINI_API_KEY);

  return {
    answer: response,
    sources: results.matches.map((m, i) => ({
      citation: i + 1,
      text: m.metadata?.text,
      url: m.metadata?.url,
      score: m.score
    }))
  };
}
```

---

## Best Practices

### 1. Chunk Size Optimization

```typescript
// Test different chunk sizes for your use case
const chunkSizes = [200, 500, 1000, 1500];

for (const size of chunkSizes) {
  const accuracy = await testRetrievalAccuracy(size);
  console.log(`Chunk size ${size}: ${accuracy}% accuracy`);
}

// Recommendation: 500-1000 words with 10% overlap
```
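
`testRetrievalAccuracy` is left abstract above. One way to make it concrete, sketched below with extra parameters beyond the one-argument call shown, is recall@K over a small hand-labeled set of query → expected document pairs; the `EvalCase` shape and signature are illustrative assumptions.

```typescript
// Sketch: recall@K over a labeled evaluation set (names are illustrative).
interface EvalCase { query: string; expectedDocumentId: string; }

async function testRetrievalAccuracy(
  chunkSize: number,           // assumes the corpus was re-ingested with this chunk size beforehand
  evalSet: EvalCase[],
  env: Env,
  topK = 5
): Promise<number> {
  let hits = 0;

  for (const { query, expectedDocumentId } of evalSet) {
    const embedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
    const results = await env.VECTORIZE.query(embedding, { topK });

    if (results.matches.some(m => m.metadata?.documentId === expectedDocumentId)) {
      hits++;
    }
  }

  return (hits / evalSet.length) * 100; // percentage, to match the loop above
}
```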

### 2. Context Window Management

```typescript
// Don't exceed the LLM context window
function truncateContext(chunks: string[], maxTokens = 4000): string {
  let context = '';
  let estimatedTokens = 0;

  for (const chunk of chunks) {
    const chunkTokens = chunk.split(/\s+/).length * 1.3; // Rough estimate: ~1.3 tokens per word
    if (estimatedTokens + chunkTokens > maxTokens) break;

    context += chunk + '\n\n';
    estimatedTokens += chunkTokens;
  }

  return context;
}
```

### 3. Re-ranking

```typescript
// Re-rank results after retrieval
function rerank(results: VectorizeMatch[], query: string): VectorizeMatch[] {
  return results
    .map(result => ({
      ...result,
      rerankScore: calculateRelevance(result.metadata?.text, query)
    }))
    .sort((a, b) => b.rerankScore - a.rerankScore);
}
```
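
`calculateRelevance` can be anything from a cheap lexical heuristic to a second model call. A minimal lexical sketch (an assumption, not the only option) scores each chunk by the fraction of query terms it contains; an LLM or cross-encoder re-ranker is the usual upgrade when quality matters more than latency.

```typescript
// Sketch: fraction of query terms that appear in the chunk (0..1).
function calculateRelevance(text: string | undefined, query: string): number {
  if (!text) return 0;

  const haystack = text.toLowerCase();
  const terms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
  if (terms.length === 0) return 0;

  const hits = terms.filter(term => haystack.includes(term)).length;
  return hits / terms.length;
}
```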

### 4. Fallback Strategies

```typescript
async function ragWithFallback(query: string, env: Env) {
  const results = await searchVectorize(query, env);

  if (results.matches.length === 0 || results.matches[0].score < 0.7) {
    // Fallback: use the LLM without RAG
    return await generateResponse('', query, env.GEMINI_API_KEY);
  }

  // Normal RAG flow
  const context = results.matches.map(m => m.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```

---

## Performance Optimization

### 1. Caching

```typescript
// Cache embeddings
// Note: in Workers, an in-memory Map lives only as long as the isolate;
// use KV or the Cache API if you need persistence across requests.
const embeddingCache = new Map<string, number[]>();

async function getCachedEmbedding(text: string, apiKey: string) {
  const key = hashText(text);

  if (embeddingCache.has(key)) {
    return embeddingCache.get(key)!;
  }

  const embedding = await generateEmbedding(text, apiKey, 'RETRIEVAL_QUERY');
  embeddingCache.set(key, embedding);

  return embedding;
}
```
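
`hashText` is assumed above; any stable key works. A tiny synchronous FNV-1a hash (sketched below) is enough for an in-memory map key. If you move the cache to KV for cross-request reuse, a SHA-256 digest via `crypto.subtle.digest` is the more conventional choice.

```typescript
// Sketch: 32-bit FNV-1a hash, returned as a hex string cache key.
function hashText(text: string): string {
  let hash = 0x811c9dc5;
  for (let i = 0; i < text.length; i++) {
    hash ^= text.charCodeAt(i);
    hash = Math.imul(hash, 0x01000193);
  }
  return (hash >>> 0).toString(16);
}
```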

### 2. Batch Processing

```typescript
// Ingest documents in parallel, a few at a time
async function batchIngest(documents: Document[], env: Env, concurrency = 5) {
  for (let i = 0; i < documents.length; i += concurrency) {
    const batch = documents.slice(i, i + concurrency);

    await Promise.all(
      // ingestDocument: e.g. ingestWithChunking from Pattern 2
      batch.map(doc => ingestDocument(doc, env))
    );
  }
}
```

---

## Common Pitfalls

### ❌ Don't: Use the same task type for queries and documents

```typescript
// Wrong
const embedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_DOCUMENT');
```

### ✅ Do: Use the correct task types

```typescript
// Correct
const queryEmbedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_QUERY');
const docEmbedding = await generateEmbedding(doc, apiKey, 'RETRIEVAL_DOCUMENT');
```

### ❌ Don't: Return too many or too few results

```typescript
// Too few (might miss relevant info)
const results = await env.VECTORIZE.query(embedding, { topK: 1 });

// Too many (noise, cost)
const results = await env.VECTORIZE.query(embedding, { topK: 50 });
```

### ✅ Do: Find the optimal topK for your use case

```typescript
// Test different topK values
const topK = 5; // Good default for most use cases
const results = await env.VECTORIZE.query(embedding, { topK });
```

---

## Complete Example

See `templates/rag-with-vectorize.ts` for a production-ready implementation combining these patterns.

---

## Official Documentation

- **Gemini Embeddings**: https://ai.google.dev/gemini-api/docs/embeddings
- **Vectorize**: https://developers.cloudflare.com/vectorize/
- **RAG Best Practices**: https://ai.google.dev/gemini-api/docs/document-processing