RAG Implementation Patterns
Complete guide to Retrieval Augmented Generation patterns using Gemini embeddings and Cloudflare Vectorize.
RAG Workflow Overview
┌─────────────────────────────────────────────────────────┐
│ DOCUMENT INGESTION (Offline) │
└─────────────────────────────────────────────────────────┘
Documents
↓
Chunking (500 words)
↓
Generate Embeddings (RETRIEVAL_DOCUMENT)
↓
Store in Vectorize + Metadata
┌─────────────────────────────────────────────────────────┐
│ QUERY PROCESSING (Runtime) │
└─────────────────────────────────────────────────────────┘
User Query
↓
Generate Embedding (RETRIEVAL_QUERY)
↓
Vector Search (top-K)
↓
Retrieve Documents
↓
Generate Response (LLM + Context)
↓
Stream to User
Pattern 1: Basic RAG
Use when: Simple Q&A over a knowledge base
async function basicRAG(query: string, env: Env): Promise<string> {
// 1. Embed query
const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
// 2. Search Vectorize
const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });
// 3. Concatenate context
const context = results.matches
.map(m => m.metadata?.text)
.join('\n\n');
// 4. Generate response
const response = await generateResponse(context, query, env.GEMINI_API_KEY);
return response;
}
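Every pattern in this guide assumes two small helpers, generateEmbedding and generateResponse. Here is a minimal sketch against the public Gemini REST API; the gemini-embedding-001 model name is an assumption, so use whichever embedding model your Vectorize index dimensions were created for.
const GEMINI_BASE = 'https://generativelanguage.googleapis.com/v1beta/models';
async function generateEmbedding(
  text: string,
  apiKey: string,
  taskType: 'RETRIEVAL_QUERY' | 'RETRIEVAL_DOCUMENT'
): Promise<number[]> {
  const res = await fetch(`${GEMINI_BASE}/gemini-embedding-001:embedContent`, {
    method: 'POST',
    headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
    body: JSON.stringify({ content: { parts: [{ text }] }, taskType })
  });
  if (!res.ok) throw new Error(`Embedding request failed: ${res.status}`);
  const data = await res.json() as { embedding: { values: number[] } };
  return data.embedding.values;
}
async function generateResponse(context: string, query: string, apiKey: string): Promise<string> {
  // When no context is retrieved (e.g. the fallback pattern), send the bare query
  const prompt = context
    ? `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:`
    : query;
  const res = await fetch(`${GEMINI_BASE}/gemini-2.5-flash:generateContent`, {
    method: 'POST',
    headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
    body: JSON.stringify({ contents: [{ parts: [{ text: prompt }] }] })
  });
  if (!res.ok) throw new Error(`Generation request failed: ${res.status}`);
  const data = await res.json() as any;
  return data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
}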
Pattern 2: Chunked RAG (Recommended)
Use when: Documents are longer than 2,048 tokens
Chunking Strategies
// Strategy A: Fixed-size chunks with overlap
function chunkWithOverlap(text: string, size = 500, overlap = 50): string[] {
const words = text.split(/\s+/);
const chunks: string[] = [];
for (let i = 0; i < words.length; i += size - overlap) {
chunks.push(words.slice(i, i + size).join(' '));
}
return chunks;
}
// Strategy B: Sentence-based chunks
function chunkBySentences(text: string, maxSentences = 10): string[] {
const sentences = text.match(/[^.!?]+[.!?]+/g) || [];
const chunks: string[] = [];
for (let i = 0; i < sentences.length; i += maxSentences) {
chunks.push(sentences.slice(i, i + maxSentences).join(' '));
}
return chunks;
}
// Strategy C: Semantic chunks (preserves paragraphs)
function chunkByParagraphs(text: string): string[] {
return text.split(/\n\n+/).filter(p => p.trim().length > 50);
}
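Whichever strategy you pick, it is worth guarding against chunks that exceed the embedding model's 2,048-token input limit mentioned above. A sketch, using the same rough 1.3 tokens-per-word heuristic as the context-window section below:
// Split any over-long chunk with the fixed-size strategy; the 1.3 factor
// is a heuristic, not an exact tokenizer
function enforceTokenLimit(chunks: string[], maxTokens = 2048): string[] {
  return chunks.flatMap(chunk => {
    const words = chunk.split(/\s+/);
    if (words.length * 1.3 <= maxTokens) return [chunk];
    return chunkWithOverlap(chunk, Math.floor(maxTokens / 1.3), 50);
  });
}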
Implementation
async function ingestWithChunking(doc: Document, env: Env) {
const chunks = chunkWithOverlap(doc.text, 500, 50);
const vectors = [];
for (let i = 0; i < chunks.length; i++) {
const embedding = await generateEmbedding(chunks[i], env.GEMINI_API_KEY, 'RETRIEVAL_DOCUMENT');
vectors.push({
id: `${doc.id}-chunk-${i}`,
values: embedding,
metadata: {
documentId: doc.id,
chunkIndex: i,
text: chunks[i],
title: doc.title
}
});
}
await env.VECTORIZE.insert(vectors);
}
Pattern 3: Hybrid Search (Keyword + Semantic)
Use when: You need both exact keyword matches and semantic understanding
async function hybridSearch(query: string, env: Env) {
// 1. Vector search
const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
const vectorResults = await env.VECTORIZE.query(queryEmbedding, { topK: 10 });
// 2. Keyword search (using metadata or D1)
const keywordResults = await env.D1.prepare(
'SELECT * FROM documents WHERE text LIKE ? ORDER BY relevance DESC LIMIT 10'
).bind(`%${query}%`).all();
// 3. Merge and re-rank
const combined = mergeResults(vectorResults.matches, keywordResults.results);
// 4. Generate response from top results
const context = combined.slice(0, 5).map(r => r.text).join('\n\n');
return await generateResponse(context, query, env.GEMINI_API_KEY);
}
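mergeResults is assumed above. One common way to combine rankings from two retrievers is reciprocal rank fusion (RRF); a sketch, where the shape of the keyword rows (id and text columns) is an assumption about your D1 schema:
// Reciprocal rank fusion: each list contributes 1/(k + rank), so items
// ranked highly by either retriever float to the top
function mergeResults(
  vectorMatches: { id: string; score: number; metadata?: Record<string, unknown> }[],
  keywordRows: { id: string; text: string }[],
  k = 60
): { text: string; score: number }[] {
  const scores = new Map<string, { text: string; score: number }>();
  const add = (id: string, text: string, rank: number) => {
    const entry = scores.get(id) ?? { text, score: 0 };
    entry.score += 1 / (k + rank + 1);
    scores.set(id, entry);
  };
  vectorMatches.forEach((m, rank) => add(m.id, String(m.metadata?.text ?? ''), rank));
  keywordRows.forEach((r, rank) => add(r.id, r.text, rank));
  return [...scores.values()].sort((a, b) => b.score - a.score);
}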
Pattern 4: Filtered RAG
Use when: Need to filter by category, date, or metadata
async function filteredRAG(query: string, filters: { category?: string; minDate?: number }, env: Env) {
// 1. Vector search
const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
const results = await env.VECTORIZE.query(queryEmbedding, { topK: 20 }); // Over-fetch so enough matches survive filtering
// 2. Filter in the application layer (alternatively, push the filter into Vectorize itself; see the sketch after this function)
const filtered = results.matches.filter(match => {
if (filters.category && match.metadata?.category !== filters.category) return false;
if (filters.minDate && match.metadata?.timestamp < filters.minDate) return false;
return true;
});
// 3. Take top 5 after filtering
const topResults = filtered.slice(0, 5);
// 4. Generate response
const context = topResults.map(r => r.metadata?.text).join('\n\n');
return await generateResponse(context, query, env.GEMINI_API_KEY);
}
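If the fields you filter on have metadata indexes (created with wrangler vectorize create-metadata-index), Vectorize can apply the filter at query time, which avoids over-fetching. A sketch that would replace steps 1-3 above; the range operator on timestamp assumes a number-typed metadata index:
// Build the filter only from the criteria actually supplied
const filter: Record<string, unknown> = {};
if (filters.category) filter.category = filters.category;
if (filters.minDate) filter.timestamp = { $gte: filters.minDate };
const nativeResults = await env.VECTORIZE.query(queryEmbedding, {
  topK: 5,
  returnMetadata: true,
  filter
});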
Pattern 5: Streaming RAG
Use when: Real-time responses with immediate feedback
async function streamingRAG(query: string, env: Env): Promise<ReadableStream> {
// 1. Embed query and search
const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });
const context = results.matches.map(m => m.metadata?.text).join('\n\n');
// 2. Stream response from Gemini
const response = await fetch(
'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent',
{
method: 'POST',
headers: {
'x-goog-api-key': env.GEMINI_API_KEY,
'Content-Type': 'application/json'
},
body: JSON.stringify({
contents: [{
parts: [{ text: `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:` }]
}]
})
}
);
return response.body!;
}
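Note that without a query parameter, streamGenerateContent returns a stream of JSON chunks; appending ?alt=sse to the URL switches it to server-sent events, which are easier to parse incrementally. A sketch that forwards only the generated text to the client, assuming the request above uses the alt=sse variant:
// Extract the text field from each SSE `data:` line and re-emit it
function extractTextStream(body: ReadableStream<Uint8Array>): ReadableStream<Uint8Array> {
  const decoder = new TextDecoder();
  const encoder = new TextEncoder();
  let buffer = '';
  return body.pipeThrough(new TransformStream({
    transform(chunk, controller) {
      buffer += decoder.decode(chunk, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop() ?? ''; // keep any partial line for the next chunk
      for (const line of lines) {
        if (!line.startsWith('data: ')) continue;
        try {
          const data = JSON.parse(line.slice(6));
          const text = data.candidates?.[0]?.content?.parts?.[0]?.text;
          if (text) controller.enqueue(encoder.encode(text));
        } catch {
          // ignore keep-alive or partial payloads
        }
      }
    }
  }));
}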
Pattern 6: Multi-Query RAG
Use when: Query might be ambiguous or multi-faceted
async function multiQueryRAG(query: string, env: Env) {
// 1. Generate multiple query variations
const queryVariations = await generateQueryVariations(query, env.GEMINI_API_KEY);
// Returns: ["original query", "rephrased version 1", "rephrased version 2"]
// 2. Search with each variation
const allResults = await Promise.all(
queryVariations.map(async q => {
const embedding = await generateEmbedding(q, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
return await env.VECTORIZE.query(embedding, { topK: 3 });
})
);
// 3. Merge and deduplicate
const uniqueResults = deduplicateById(allResults.flatMap(r => r.matches));
// 4. Generate response
const context = uniqueResults.slice(0, 5).map(r => r.metadata?.text).join('\n\n');
return await generateResponse(context, query, env.GEMINI_API_KEY);
}
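Both helpers used above are assumed. A minimal sketch of each; the rephrasing prompt is an assumption you should tune for your domain:
// Keep the first occurrence of each chunk id across all result lists
function deduplicateById<T extends { id: string }>(matches: T[]): T[] {
  const seen = new Set<string>();
  return matches.filter(m => !seen.has(m.id) && !!seen.add(m.id));
}
// Ask the model for rephrasings, one per line, and keep the original first
async function generateQueryVariations(query: string, apiKey: string): Promise<string[]> {
  const prompt =
    `Rewrite the following search query two different ways, one per line, ` +
    `keeping the original meaning. Output only the rewrites.\n\nQuery: ${query}`;
  const text = await generateResponse('', prompt, apiKey);
  const variations = text.split('\n').map(l => l.trim()).filter(Boolean);
  return [query, ...variations.slice(0, 2)];
}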
Pattern 7: Conversational RAG
Use when: Multi-turn conversations with context
interface ConversationHistory {
role: 'user' | 'assistant';
content: string;
}
async function conversationalRAG(
query: string,
history: ConversationHistory[],
env: Env
) {
// 1. Create contextualized query from history
const contextualizedQuery = await reformulateQuery(query, history, env.GEMINI_API_KEY);
// 2. Search with contextualized query
const embedding = await generateEmbedding(contextualizedQuery, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
const results = await env.VECTORIZE.query(embedding, { topK: 3 });
const retrievedContext = results.matches.map(m => m.metadata?.text).join('\n\n');
// 3. Generate response with conversation history
const prompt = `
Conversation history:
${history.map(h => `${h.role}: ${h.content}`).join('\n')}
Retrieved context:
${retrievedContext}
User: ${query}
Assistant:`;
return await generateResponse(prompt, query, env.GEMINI_API_KEY);
}
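reformulateQuery is assumed above. One sketch: use the model to rewrite the follow-up as a standalone query (e.g. "what about pricing?" becomes "what is the pricing of X?"), which makes the retrieval step history-aware:
async function reformulateQuery(
  query: string,
  history: ConversationHistory[],
  apiKey: string
): Promise<string> {
  if (history.length === 0) return query; // nothing to resolve against
  const prompt =
    `Given the conversation below, rewrite the final user message as a ` +
    `standalone search query. Output only the rewritten query.\n\n` +
    history.map(h => `${h.role}: ${h.content}`).join('\n') +
    `\nuser: ${query}`;
  const rewritten = await generateResponse('', prompt, apiKey);
  return rewritten.trim() || query;
}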
Pattern 8: Citation RAG
Use when: Need to cite sources in responses
async function citationRAG(query: string, env: Env) {
const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
const results = await env.VECTORIZE.query(queryEmbedding, { topK: 5, returnMetadata: true });
// Build context with citations
const contextWithCitations = results.matches.map((match, i) =>
`[${i + 1}] ${match.metadata?.text}\nSource: ${match.metadata?.url || match.id}`
).join('\n\n');
const prompt = `Answer the question using the provided sources. Include citations [1], [2], etc. in your answer.
Sources:
${contextWithCitations}
Question: ${query}
Answer (with citations):`;
const response = await generateResponse(prompt, query, env.GEMINI_API_KEY);
return {
answer: response,
sources: results.matches.map((m, i) => ({
citation: i + 1,
text: m.metadata?.text,
url: m.metadata?.url,
score: m.score
}))
};
}
Best Practices
1. Chunk Size Optimization
// Test different chunk sizes for your use case
const chunkSizes = [200, 500, 1000, 1500];
for (const size of chunkSizes) {
const accuracy = await testRetrievalAccuracy(size);
console.log(`Chunk size ${size}: ${accuracy}% accuracy`);
}
// Recommendation: 500-1000 words with 10% overlap
2. Context Window Management
// Don't exceed LLM context window
function truncateContext(chunks: string[], maxTokens = 4000): string {
let context = '';
let estimatedTokens = 0;
for (const chunk of chunks) {
const chunkTokens = chunk.split(/\s+/).length * 1.3; // Rough estimate
if (estimatedTokens + chunkTokens > maxTokens) break;
context += chunk + '\n\n';
estimatedTokens += chunkTokens;
}
return context;
}
3. Re-ranking
// Re-rank results after retrieval
function rerank(results: VectorizeMatch[], query: string): VectorizeMatch[] {
return results
.map(result => ({
...result,
rerankScore: calculateRelevance(result.metadata?.text, query)
}))
.sort((a, b) => b.rerankScore - a.rerankScore);
}
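calculateRelevance is left open above. A cheap lexical-overlap sketch (the fraction of query terms present in the chunk); a cross-encoder reranker would be more accurate but costs an extra model call per result:
function calculateRelevance(text: string | undefined, query: string): number {
  if (!text) return 0;
  const docTerms = new Set(text.toLowerCase().split(/\W+/));
  const queryTerms = query.toLowerCase().split(/\W+/).filter(Boolean);
  if (queryTerms.length === 0) return 0;
  const hits = queryTerms.filter(t => docTerms.has(t)).length;
  return hits / queryTerms.length;
}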
4. Fallback Strategies
async function ragWithFallback(query: string, env: Env) {
const results = await searchVectorize(query, env);
if (results.matches.length === 0 || results.matches[0].score < 0.7) {
// Fallback: Use LLM without RAG
return await generateResponse('', query, env.GEMINI_API_KEY);
}
// Normal RAG flow
const context = results.matches.map(m => m.metadata?.text).join('\n\n');
return await generateResponse(context, query, env.GEMINI_API_KEY);
}
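For completeness, a sketch of the assumed searchVectorize helper, which returns the raw matches so callers can inspect scores before deciding to fall back:
async function searchVectorize(query: string, env: Env) {
  const embedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  return await env.VECTORIZE.query(embedding, { topK: 5, returnMetadata: true });
}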
Performance Optimization
1. Caching
// Cache embeddings
const embeddingCache = new Map<string, number[]>();
async function getCachedEmbedding(text: string, apiKey: string) {
const key = hashText(text);
if (embeddingCache.has(key)) {
return embeddingCache.get(key)!;
}
const embedding = await generateEmbedding(text, apiKey, 'RETRIEVAL_QUERY');
embeddingCache.set(key, embedding);
return embedding;
}
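A module-level Map only lives as long as one Worker isolate, so cache hits are best-effort. For a cache that survives restarts and is shared across locations, one option is Workers KV. A sketch, assuming a hypothetical EMBEDDING_KV binding; note that crypto.subtle is async, unlike the synchronous hashText assumed above:
async function hashText(text: string): Promise<string> {
  const digest = await crypto.subtle.digest('SHA-256', new TextEncoder().encode(text));
  return [...new Uint8Array(digest)].map(b => b.toString(16).padStart(2, '0')).join('');
}
async function getKVCachedEmbedding(text: string, env: Env): Promise<number[]> {
  const key = `emb:${await hashText(text)}`;
  const cached = await env.EMBEDDING_KV.get<number[]>(key, 'json');
  if (cached) return cached;
  const embedding = await generateEmbedding(text, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  await env.EMBEDDING_KV.put(key, JSON.stringify(embedding), { expirationTtl: 86400 });
  return embedding;
}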
2. Batch Processing
// Ingest documents in parallel
async function batchIngest(documents: Document[], env: Env, concurrency = 5) {
for (let i = 0; i < documents.length; i += concurrency) {
const batch = documents.slice(i, i + concurrency);
await Promise.all(
batch.map(doc => ingestDocument(doc, env))
);
}
}
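Each generateEmbedding call above is a separate HTTP round-trip. The Gemini API also exposes a batchEmbedContents endpoint that embeds many texts in one request; a sketch, assuming the same gemini-embedding-001 model as earlier (check the current per-request limits before choosing a batch size):
async function embedBatch(texts: string[], apiKey: string): Promise<number[][]> {
  const res = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-001:batchEmbedContents',
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({
        requests: texts.map(text => ({
          model: 'models/gemini-embedding-001', // batch requests must name the model
          content: { parts: [{ text }] },
          taskType: 'RETRIEVAL_DOCUMENT'
        }))
      })
    }
  );
  if (!res.ok) throw new Error(`Batch embedding failed: ${res.status}`);
  const data = await res.json() as { embeddings: { values: number[] }[] };
  return data.embeddings.map(e => e.values);
}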
Common Pitfalls
❌ Don't: Use same task type for queries and documents
// Wrong
const embedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_DOCUMENT');
✅ Do: Use correct task types
// Correct
const queryEmbedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_QUERY');
const docEmbedding = await generateEmbedding(doc, apiKey, 'RETRIEVAL_DOCUMENT');
❌ Don't: Return too many or too few results
// Too few (might miss relevant info)
const results = await env.VECTORIZE.query(embedding, { topK: 1 });
// Too many (noise, cost)
const results = await env.VECTORIZE.query(embedding, { topK: 50 });
✅ Do: Find optimal topK for your use case
// Test different topK values
const topK = 5; // Good default for most use cases
const results = await env.VECTORIZE.query(embedding, { topK });
Complete Example
See templates/rag-with-vectorize.ts for a production-ready implementation combining these patterns.
Official Documentation
- Gemini Embeddings: https://ai.google.dev/gemini-api/docs/embeddings
- Vectorize: https://developers.cloudflare.com/vectorize/
- RAG Best Practices: https://ai.google.dev/gemini-api/docs/document-processing