# RAG Implementation Patterns

Complete guide to Retrieval-Augmented Generation (RAG) patterns using Gemini embeddings and Cloudflare Vectorize.

---

## RAG Workflow Overview

```
┌─────────────────────────────────────────────────────────┐
│              DOCUMENT INGESTION (Offline)               │
└─────────────────────────────────────────────────────────┘
  Documents
      ↓
  Chunking (500 words)
      ↓
  Generate Embeddings (RETRIEVAL_DOCUMENT)
      ↓
  Store in Vectorize + Metadata

┌─────────────────────────────────────────────────────────┐
│               QUERY PROCESSING (Runtime)                │
└─────────────────────────────────────────────────────────┘
  User Query
      ↓
  Generate Embedding (RETRIEVAL_QUERY)
      ↓
  Vector Search (top-K)
      ↓
  Retrieve Documents
      ↓
  Generate Response (LLM + Context)
      ↓
  Stream to User
```
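
Every pattern below calls two helpers, `generateEmbedding` and `generateResponse`, that are not defined elsewhere in this file. The sketch below shows one minimal way to implement them against the Gemini REST API; the model names (`gemini-embedding-001`, `gemini-2.5-flash`) and the error handling are assumptions to adapt to your own setup.

```typescript
// Minimal sketch of the shared helpers used throughout this guide.
// Model names are assumptions; swap in whichever Gemini models you actually use.

type EmbeddingTaskType = 'RETRIEVAL_QUERY' | 'RETRIEVAL_DOCUMENT';

const GEMINI_BASE = 'https://generativelanguage.googleapis.com/v1beta/models';

async function generateEmbedding(
  text: string,
  apiKey: string,
  taskType: EmbeddingTaskType
): Promise<number[]> {
  const res = await fetch(`${GEMINI_BASE}/gemini-embedding-001:embedContent`, {
    method: 'POST',
    headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
    body: JSON.stringify({
      content: { parts: [{ text }] },
      taskType // RETRIEVAL_QUERY for queries, RETRIEVAL_DOCUMENT for corpus chunks
    })
  });
  if (!res.ok) throw new Error(`Embedding request failed: ${res.status}`);
  const data = await res.json() as { embedding: { values: number[] } };
  return data.embedding.values;
}

async function generateResponse(context: string, query: string, apiKey: string): Promise<string> {
  // When no context is supplied, fall back to answering the query directly.
  const prompt = context
    ? `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:`
    : query;

  const res = await fetch(`${GEMINI_BASE}/gemini-2.5-flash:generateContent`, {
    method: 'POST',
    headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
    body: JSON.stringify({ contents: [{ parts: [{ text: prompt }] }] })
  });
  if (!res.ok) throw new Error(`Generation request failed: ${res.status}`);
  const data = await res.json() as {
    candidates?: { content: { parts: { text: string }[] } }[];
  };
  return data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
}
```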

---

## Pattern 1: Basic RAG

**Use when**: Simple Q&A over a knowledge base

```typescript
async function basicRAG(query: string, env: Env): Promise<string> {
  // 1. Embed query
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');

  // 2. Search Vectorize
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });

  // 3. Concatenate context
  const context = results.matches
    .map(m => m.metadata?.text)
    .join('\n\n');

  // 4. Generate response
  const response = await generateResponse(context, query, env.GEMINI_API_KEY);

  return response;
}
```
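
For context, here is one way `basicRAG` might be wired into a Worker. The `Env` shape and binding names (`VECTORIZE`, `GEMINI_API_KEY`, `D1`) are assumptions; match them to your own Wrangler configuration, with the binding types coming from `@cloudflare/workers-types`.

```typescript
// Hypothetical Worker entry point wiring up basicRAG.
// Binding names are assumptions; align them with your wrangler.toml.
interface Env {
  VECTORIZE: VectorizeIndex;
  GEMINI_API_KEY: string;
  D1: D1Database; // only needed for the hybrid-search pattern below
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const query = searchParams.get('q');
    if (!query) return new Response('Missing ?q= parameter', { status: 400 });

    const answer = await basicRAG(query, env);
    return Response.json({ query, answer });
  }
};
```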

---

## Pattern 2: Chunked RAG (Recommended)

**Use when**: Documents are longer than 2,048 tokens

### Chunking Strategies

```typescript
// Strategy A: Fixed-size chunks with overlap
function chunkWithOverlap(text: string, size = 500, overlap = 50): string[] {
  const words = text.split(/\s+/);
  const chunks: string[] = [];

  for (let i = 0; i < words.length; i += size - overlap) {
    chunks.push(words.slice(i, i + size).join(' '));
  }

  return chunks;
}

// Strategy B: Sentence-based chunks
function chunkBySentences(text: string, maxSentences = 10): string[] {
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [];
  const chunks: string[] = [];

  for (let i = 0; i < sentences.length; i += maxSentences) {
    chunks.push(sentences.slice(i, i + maxSentences).join(' '));
  }

  return chunks;
}

// Strategy C: Semantic chunks (preserves paragraphs)
function chunkByParagraphs(text: string): string[] {
  return text.split(/\n\n+/).filter(p => p.trim().length > 50);
}
```

### Implementation

```typescript
async function ingestWithChunking(doc: Document, env: Env) {
  const chunks = chunkWithOverlap(doc.text, 500, 50);

  const vectors = [];
  for (let i = 0; i < chunks.length; i++) {
    const embedding = await generateEmbedding(chunks[i], env.GEMINI_API_KEY, 'RETRIEVAL_DOCUMENT');

    vectors.push({
      id: `${doc.id}-chunk-${i}`,
      values: embedding,
      metadata: {
        documentId: doc.id,
        chunkIndex: i,
        text: chunks[i],
        title: doc.title
      }
    });
  }

  await env.VECTORIZE.insert(vectors);
}
```
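
Embedding one chunk per request works, but it costs one round trip per chunk. The Gemini API also exposes a batch embedding endpoint (`batchEmbedContents`); the sketch below shows how ingestion could use it. The endpoint path and payload shape are stated from memory, so verify them against the embeddings docs linked at the end of this guide.

```typescript
// Hedged sketch: embed all of a document's chunks in a single batch request.
// Assumes the Gemini batchEmbedContents endpoint; verify the request shape before relying on it.
async function embedChunksBatch(chunks: string[], apiKey: string): Promise<number[][]> {
  const model = 'models/gemini-embedding-001'; // assumed model name
  const res = await fetch(
    `https://generativelanguage.googleapis.com/v1beta/${model}:batchEmbedContents`,
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({
        requests: chunks.map(text => ({
          model,
          content: { parts: [{ text }] },
          taskType: 'RETRIEVAL_DOCUMENT'
        }))
      })
    }
  );
  if (!res.ok) throw new Error(`Batch embedding failed: ${res.status}`);
  const data = await res.json() as { embeddings: { values: number[] }[] };
  return data.embeddings.map(e => e.values);
}
```

This reduces ingestion to one embedding call per document instead of one per chunk.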

---

## Pattern 3: Hybrid Search (Keyword + Semantic)

**Use when**: You need both exact keyword matches and semantic understanding

```typescript
async function hybridSearch(query: string, env: Env) {
  // 1. Vector search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const vectorResults = await env.VECTORIZE.query(queryEmbedding, { topK: 10 });

  // 2. Keyword search (using metadata or D1; the `relevance` column assumes a
  //    full-text-search setup, since a plain LIKE query has no built-in ranking)
  const keywordResults = await env.D1.prepare(
    'SELECT * FROM documents WHERE text LIKE ? ORDER BY relevance DESC LIMIT 10'
  ).bind(`%${query}%`).all();

  // 3. Merge and re-rank
  const combined = mergeResults(vectorResults.matches, keywordResults.results);

  // 4. Generate response from top results
  const context = combined.slice(0, 5).map(r => r.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
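
`mergeResults` is not defined in this guide. A common way to combine a keyword ranking and a vector ranking is reciprocal rank fusion (RRF). The sketch below is a hypothetical implementation; it assumes each D1 row exposes `id` and `text` columns, which is an assumption about your schema.

```typescript
// Hypothetical mergeResults using reciprocal rank fusion (RRF).
// Items that rank highly in either list float to the top; k = 60 is the conventional constant.
interface RankedDoc { id: string; text: string; score: number; }

function mergeResults(
  vectorMatches: { id: string; metadata?: Record<string, unknown> }[],
  keywordRows: { id: string; text: string }[],
  k = 60
): RankedDoc[] {
  const fused = new Map<string, RankedDoc>();

  const addList = (items: { id: string; text: string }[]) => {
    items.forEach((item, rank) => {
      const existing = fused.get(item.id) ?? { id: item.id, text: item.text, score: 0 };
      existing.score += 1 / (k + rank + 1); // higher rank contributes more
      fused.set(item.id, existing);
    });
  };

  addList(vectorMatches.map(m => ({ id: m.id, text: String(m.metadata?.text ?? '') })));
  addList(keywordRows);

  return [...fused.values()].sort((a, b) => b.score - a.score);
}
```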

---

## Pattern 4: Filtered RAG

**Use when**: Need to filter by category, date, or metadata

```typescript
async function filteredRAG(query: string, filters: { category?: string; minDate?: number }, env: Env) {
  // 1. Vector search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 20 }); // Fetch extra, filter below

  // 2. Filter in the application layer (or use Vectorize's native metadata filtering; see the sketch below)
  const filtered = results.matches.filter(match => {
    if (filters.category && match.metadata?.category !== filters.category) return false;
    if (filters.minDate && (match.metadata?.timestamp as number) < filters.minDate) return false;
    return true;
  });

  // 3. Take top 5 after filtering
  const topResults = filtered.slice(0, 5);

  // 4. Generate response
  const context = topResults.map(r => r.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
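
The application-layer filter always works, but Vectorize can also filter at query time through a `filter` option when metadata indexes exist for the filtered fields (Wrangler has a command for creating them). The sketch below assumes that capability and the `$gte` range operator; verify the exact option and operator names against the Vectorize docs linked at the end of this guide.

```typescript
// Hedged sketch: native metadata filtering in Vectorize.
// Assumes metadata indexes exist for `category` and `timestamp`; operator syntax may differ.
async function filteredRAGNative(
  query: string,
  filters: { category?: string; minDate?: number },
  env: Env
) {
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');

  const results = await env.VECTORIZE.query(queryEmbedding, {
    topK: 5,
    returnMetadata: true,
    filter: {
      ...(filters.category ? { category: filters.category } : {}),
      ...(filters.minDate ? { timestamp: { $gte: filters.minDate } } : {})
    }
  });

  const context = results.matches.map(m => m.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```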

---

## Pattern 5: Streaming RAG

**Use when**: Real-time responses with immediate feedback

```typescript
async function streamingRAG(query: string, env: Env): Promise<ReadableStream> {
  // 1. Embed query and search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });

  const context = results.matches.map(m => m.metadata?.text).join('\n\n');

  // 2. Stream response from Gemini
  const response = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent',
    {
      method: 'POST',
      headers: {
        'x-goog-api-key': env.GEMINI_API_KEY,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        contents: [{
          parts: [{ text: `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:` }]
        }]
      })
    }
  );

  // Pass the upstream body straight through (chunked JSON; see the SSE note below)
  return response.body!;
}
```
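
Note that `streamGenerateContent` returns a stream of JSON chunks by default, so forwarding `response.body` as-is means the client has to parse that format. The Gemini API also supports server-sent events when `?alt=sse` is appended to the URL; the sketch below is one hedged way to turn that SSE stream into plain text deltas with a `TransformStream`.

```typescript
// Hedged sketch: forward only the generated text from a Gemini SSE stream (`?alt=sse`).
// Assumes each `data:` line is a JSON chunk shaped like a GenerateContentResponse.
function extractTextStream(upstream: ReadableStream<Uint8Array>): ReadableStream<Uint8Array> {
  const decoder = new TextDecoder();
  const encoder = new TextEncoder();
  let buffer = '';

  return upstream.pipeThrough(new TransformStream<Uint8Array, Uint8Array>({
    transform(chunk, controller) {
      buffer += decoder.decode(chunk, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop() ?? ''; // keep any partial line for the next chunk

      for (const line of lines) {
        if (!line.startsWith('data: ')) continue;
        try {
          const parsed = JSON.parse(line.slice(6));
          const text = parsed.candidates?.[0]?.content?.parts?.[0]?.text;
          if (text) controller.enqueue(encoder.encode(text));
        } catch {
          // ignore partial or non-JSON lines
        }
      }
    }
  }));
}

// Usage inside streamingRAG (assuming `?alt=sse` was added to the request URL):
// return extractTextStream(response.body!);
```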

---

## Pattern 6: Multi-Query RAG

**Use when**: Query might be ambiguous or multi-faceted

```typescript
async function multiQueryRAG(query: string, env: Env) {
  // 1. Generate multiple query variations
  const queryVariations = await generateQueryVariations(query, env.GEMINI_API_KEY);
  // Returns: ["original query", "rephrased version 1", "rephrased version 2"]

  // 2. Search with each variation
  const allResults = await Promise.all(
    queryVariations.map(async q => {
      const embedding = await generateEmbedding(q, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
      return await env.VECTORIZE.query(embedding, { topK: 3 });
    })
  );

  // 3. Merge and deduplicate
  const uniqueResults = deduplicateById(allResults.flatMap(r => r.matches));

  // 4. Generate response
  const context = uniqueResults.slice(0, 5).map(r => r.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```
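
Neither `generateQueryVariations` nor `deduplicateById` is defined above. `deduplicateById` is a one-liner; `generateQueryVariations` can be a single Gemini call that asks for rephrasings, one per line. Both sketches below are assumptions about how you might implement them, reusing the `generateResponse` helper sketched earlier.

```typescript
// Hypothetical helpers for the multi-query pattern.

// Keep only the first match seen for each vector id.
function deduplicateById<T extends { id: string }>(matches: T[]): T[] {
  const seen = new Set<string>();
  return matches.filter(m => (seen.has(m.id) ? false : (seen.add(m.id), true)));
}

// Ask the model for alternative phrasings; always include the original query.
async function generateQueryVariations(query: string, apiKey: string): Promise<string[]> {
  const prompt =
    `Rewrite the following search query in 2 different ways that preserve its meaning. ` +
    `Return one rewrite per line with no numbering.\n\nQuery: ${query}`;

  const raw = await generateResponse('', prompt, apiKey);
  const rewrites = raw.split('\n').map(s => s.trim()).filter(Boolean).slice(0, 2);

  return [query, ...rewrites];
}
```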

---

## Pattern 7: Conversational RAG

**Use when**: Multi-turn conversations with context

```typescript
interface ConversationHistory {
  role: 'user' | 'assistant';
  content: string;
}

async function conversationalRAG(
  query: string,
  history: ConversationHistory[],
  env: Env
) {
  // 1. Create contextualized query from history
  const contextualizedQuery = await reformulateQuery(query, history, env.GEMINI_API_KEY);

  // 2. Search with contextualized query
  const embedding = await generateEmbedding(contextualizedQuery, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(embedding, { topK: 3 });

  const retrievedContext = results.matches.map(m => m.metadata?.text).join('\n\n');

  // 3. Generate response with conversation history
  const prompt = `
Conversation history:
${history.map(h => `${h.role}: ${h.content}`).join('\n')}

Retrieved context:
${retrievedContext}

User: ${query}
Assistant:`;

  return await generateResponse(prompt, query, env.GEMINI_API_KEY);
}
```
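
`reformulateQuery` is the step that turns a follow-up like "what about its pricing?" into a standalone question before embedding. A hedged sketch, built on the `generateResponse` helper sketched earlier:

```typescript
// Hypothetical query reformulation: rewrite the latest user turn as a standalone question.
async function reformulateQuery(
  query: string,
  history: ConversationHistory[],
  apiKey: string
): Promise<string> {
  if (history.length === 0) return query;

  const prompt = `Given the conversation below, rewrite the user's last message as a single,
self-contained search query. Return only the rewritten query.

Conversation:
${history.map(h => `${h.role}: ${h.content}`).join('\n')}

Last message: ${query}`;

  const rewritten = await generateResponse('', prompt, apiKey);
  return rewritten.trim() || query;
}
```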

---

## Pattern 8: Citation RAG

**Use when**: Need to cite sources in responses

```typescript
async function citationRAG(query: string, env: Env) {
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 5, returnMetadata: true });

  // Build context with citations
  const contextWithCitations = results.matches.map((match, i) =>
    `[${i + 1}] ${match.metadata?.text}\nSource: ${match.metadata?.url || match.id}`
  ).join('\n\n');

  const prompt = `Answer the question using the provided sources. Include citations [1], [2], etc. in your answer.

Sources:
${contextWithCitations}

Question: ${query}

Answer (with citations):`;

  const response = await generateResponse(prompt, query, env.GEMINI_API_KEY);

  return {
    answer: response,
    sources: results.matches.map((m, i) => ({
      citation: i + 1,
      text: m.metadata?.text,
      url: m.metadata?.url,
      score: m.score
    }))
  };
}
```

---

## Best Practices

### 1. Chunk Size Optimization

```typescript
// Test different chunk sizes for your use case
const chunkSizes = [200, 500, 1000, 1500];

for (const size of chunkSizes) {
  const accuracy = await testRetrievalAccuracy(size);
  console.log(`Chunk size ${size}: ${accuracy}% accuracy`);
}

// Recommendation: 500-1000 words with ~10% overlap
```

### 2. Context Window Management

```typescript
// Don't exceed the LLM's context window
function truncateContext(chunks: string[], maxTokens = 4000): string {
  let context = '';
  let estimatedTokens = 0;

  for (const chunk of chunks) {
    const chunkTokens = chunk.split(/\s+/).length * 1.3; // Rough estimate: ~1.3 tokens per word
    if (estimatedTokens + chunkTokens > maxTokens) break;

    context += chunk + '\n\n';
    estimatedTokens += chunkTokens;
  }

  return context;
}
```
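
The ~1.3 tokens-per-word estimate is fine for budgeting, but the Gemini API also exposes a `countTokens` method when an exact count matters before sending a large context. A hedged sketch (the model name is an assumption):

```typescript
// Hedged sketch: exact token counting via the Gemini countTokens endpoint.
async function countTokens(text: string, apiKey: string): Promise<number> {
  const res = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:countTokens',
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({ contents: [{ parts: [{ text }] }] })
    }
  );
  if (!res.ok) throw new Error(`countTokens failed: ${res.status}`);
  const data = await res.json() as { totalTokens: number };
  return data.totalTokens;
}
```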

### 3. Re-ranking

```typescript
// Re-rank results after retrieval
function rerank(results: VectorizeMatch[], query: string): VectorizeMatch[] {
  return results
    .map(result => ({
      ...result,
      rerankScore: calculateRelevance(result.metadata?.text, query)
    }))
    .sort((a, b) => b.rerankScore - a.rerankScore);
}
```
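
`calculateRelevance` is left undefined above. A simple lexical-overlap score is enough to demonstrate the idea; real deployments typically use a dedicated reranker model or cross-encoder instead. The sketch below is a hypothetical keyword-overlap implementation.

```typescript
// Hypothetical lightweight relevance score: fraction of query terms present in the chunk.
function calculateRelevance(text: string | undefined, query: string): number {
  if (!text) return 0;

  const terms = query.toLowerCase().split(/\W+/).filter(t => t.length > 2);
  if (terms.length === 0) return 0;

  const haystack = text.toLowerCase();
  const hits = terms.filter(t => haystack.includes(t)).length;

  return hits / terms.length;
}
```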

### 4. Fallback Strategies

```typescript
async function ragWithFallback(query: string, env: Env) {
  const results = await searchVectorize(query, env);

  if (results.matches.length === 0 || results.matches[0].score < 0.7) {
    // Fallback: answer with the LLM alone, without retrieved context
    return await generateResponse('', query, env.GEMINI_API_KEY);
  }

  // Normal RAG flow
  const context = results.matches.map(m => m.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
```

---

## Performance Optimization

### 1. Caching

```typescript
// Cache embeddings in memory (per Worker isolate)
const embeddingCache = new Map<string, number[]>();

async function getCachedEmbedding(text: string, apiKey: string) {
  const key = hashText(text);

  if (embeddingCache.has(key)) {
    return embeddingCache.get(key)!;
  }

  const embedding = await generateEmbedding(text, apiKey, 'RETRIEVAL_QUERY');
  embeddingCache.set(key, embedding);

  return embedding;
}
```
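
`hashText` only needs to produce a stable cache key, and a non-cryptographic hash keeps it synchronous, matching how it is called above. Keep in mind that a module-level `Map` lives only as long as a single Worker isolate; Workers KV or the Cache API are options if the cache should survive across requests (a suggestion, not shown here).

```typescript
// Hypothetical hashText: 32-bit FNV-1a, returned as a hex string.
// Not cryptographic; it only needs to be a stable, cheap cache key.
function hashText(text: string): string {
  let hash = 0x811c9dc5;
  for (let i = 0; i < text.length; i++) {
    hash ^= text.charCodeAt(i);
    hash = Math.imul(hash, 0x01000193);
  }
  return (hash >>> 0).toString(16);
}
```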

### 2. Batch Processing

```typescript
// Ingest documents in parallel, a few at a time
async function batchIngest(documents: Document[], env: Env, concurrency = 5) {
  for (let i = 0; i < documents.length; i += concurrency) {
    const batch = documents.slice(i, i + concurrency);

    await Promise.all(
      batch.map(doc => ingestDocument(doc, env))
    );
  }
}
```

---

## Common Pitfalls

### ❌ Don't: Use the same task type for queries and documents

```typescript
// Wrong
const embedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_DOCUMENT');
```

### ✅ Do: Use the correct task types

```typescript
// Correct
const queryEmbedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_QUERY');
const docEmbedding = await generateEmbedding(doc, apiKey, 'RETRIEVAL_DOCUMENT');
```

### ❌ Don't: Return too many or too few results

```typescript
// Too few (might miss relevant info)
const results = await env.VECTORIZE.query(embedding, { topK: 1 });

// Too many (noise, cost)
const results = await env.VECTORIZE.query(embedding, { topK: 50 });
```

### ✅ Do: Find the optimal topK for your use case

```typescript
// Test different topK values
const topK = 5; // Good default for most use cases
const results = await env.VECTORIZE.query(embedding, { topK });
```

---

## Complete Example

See `templates/rag-with-vectorize.ts` for a production-ready implementation combining these patterns.

---

## Official Documentation

- **Gemini Embeddings**: https://ai.google.dev/gemini-api/docs/embeddings
- **Vectorize**: https://developers.cloudflare.com/vectorize/
- **RAG Best Practices**: https://ai.google.dev/gemini-api/docs/document-processing