RAG Implementation Patterns

A complete guide to Retrieval-Augmented Generation (RAG) patterns using Gemini embeddings and Cloudflare Vectorize.


RAG Workflow Overview

┌─────────────────────────────────────────────────────────┐
│              DOCUMENT INGESTION (Offline)                │
└─────────────────────────────────────────────────────────┘
   Documents
      ↓
   Chunking (500 words)
      ↓
   Generate Embeddings (RETRIEVAL_DOCUMENT)
      ↓
   Store in Vectorize + Metadata

┌─────────────────────────────────────────────────────────┐
│              QUERY PROCESSING (Runtime)                  │
└─────────────────────────────────────────────────────────┘
   User Query
      ↓
   Generate Embedding (RETRIEVAL_QUERY)
      ↓
   Vector Search (top-K)
      ↓
   Retrieve Documents
      ↓
   Generate Response (LLM + Context)
      ↓
   Stream to User
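
Every pattern below relies on two helpers, generateEmbedding and generateResponse, plus an Env binding object and a Document type, none of which are defined in this guide. The following is a minimal sketch of what they might look like, assuming the gemini-embedding-001 and gemini-2.5-flash REST endpoints and the binding/field names used in the examples; your model names, bindings, and prompt assembly may differ (types such as VectorizeIndex come from @cloudflare/workers-types).

interface Env {
  GEMINI_API_KEY: string;
  VECTORIZE: VectorizeIndex;  // Cloudflare Vectorize binding
  D1: D1Database;             // used by the hybrid-search pattern
}

// Fields inferred from how the ingestion examples use `doc`.
interface Document {
  id: string;
  title: string;
  text: string;
}

type GeminiTaskType = 'RETRIEVAL_QUERY' | 'RETRIEVAL_DOCUMENT';

// Embed a single piece of text with the Gemini embedContent endpoint.
async function generateEmbedding(
  text: string,
  apiKey: string,
  taskType: GeminiTaskType
): Promise<number[]> {
  const res = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-001:embedContent',
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({
        content: { parts: [{ text }] },
        taskType
      })
    }
  );

  if (!res.ok) throw new Error(`Embedding request failed: ${res.status}`);
  const data = await res.json() as { embedding: { values: number[] } };
  return data.embedding.values;
}

// Ask the LLM to answer `query` using the retrieved `context`.
// Some later patterns pass a fully assembled prompt as the first argument;
// this sketch simply concatenates whatever it is given.
async function generateResponse(context: string, query: string, apiKey: string): Promise<string> {
  const res = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent',
    {
      method: 'POST',
      headers: { 'x-goog-api-key': apiKey, 'Content-Type': 'application/json' },
      body: JSON.stringify({
        contents: [{
          parts: [{ text: `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:` }]
        }]
      })
    }
  );

  if (!res.ok) throw new Error(`Generation request failed: ${res.status}`);
  const data = await res.json() as {
    candidates: { content: { parts: { text: string }[] } }[];
  };
  return data.candidates[0].content.parts[0].text;
}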

Pattern 1: Basic RAG

Use when: Simple Q&A over a knowledge base

async function basicRAG(query: string, env: Env): Promise<string> {
  // 1. Embed query
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');

  // 2. Search Vectorize
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });

  // 3. Concatenate context
  const context = results.matches
    .map(m => m.metadata?.text)
    .join('\n\n');

  // 4. Generate response
  const response = await generateResponse(context, query, env.GEMINI_API_KEY);

  return response;
}

Pattern 2: Document Chunking

Use when: Documents are longer than 2,048 tokens

Chunking Strategies

// Strategy A: Fixed-size chunks with overlap
function chunkWithOverlap(text: string, size = 500, overlap = 50): string[] {
  const words = text.split(/\s+/);
  const chunks: string[] = [];

  for (let i = 0; i < words.length; i += size - overlap) {
    chunks.push(words.slice(i, i + size).join(' '));
  }

  return chunks;
}

// Strategy B: Sentence-based chunks
function chunkBySentences(text: string, maxSentences = 10): string[] {
  const sentences = text.match(/[^.!?]+[.!?]+/g) || [];
  const chunks: string[] = [];

  for (let i = 0; i < sentences.length; i += maxSentences) {
    chunks.push(sentences.slice(i, i + maxSentences).join(' '));
  }

  return chunks;
}

// Strategy C: Semantic chunks (preserves paragraphs)
function chunkByParagraphs(text: string): string[] {
  return text.split(/\n\n+/).filter(p => p.trim().length > 50);
}

Implementation

async function ingestWithChunking(doc: Document, env: Env) {
  const chunks = chunkWithOverlap(doc.text, 500, 50);

  const vectors = [];
  for (let i = 0; i < chunks.length; i++) {
    const embedding = await generateEmbedding(chunks[i], env.GEMINI_API_KEY, 'RETRIEVAL_DOCUMENT');

    vectors.push({
      id: `${doc.id}-chunk-${i}`,
      values: embedding,
      metadata: {
        documentId: doc.id,
        chunkIndex: i,
        text: chunks[i],
        title: doc.title
      }
    });
  }

  await env.VECTORIZE.insert(vectors);
}

Pattern 3: Hybrid Search (Keyword + Semantic)

Use when: You need both exact keyword matches and semantic understanding

async function hybridSearch(query: string, env: Env) {
  // 1. Vector search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const vectorResults = await env.VECTORIZE.query(queryEmbedding, { topK: 10 });

  // 2. Keyword search (using metadata or D1)
  const keywordResults = await env.D1.prepare(
    'SELECT * FROM documents WHERE text LIKE ? ORDER BY relevance DESC LIMIT 10'
  ).bind(`%${query}%`).all();

  // 3. Merge and re-rank
  const combined = mergeResults(vectorResults.matches, keywordResults.results);

  // 4. Generate response from top results
  const context = combined.slice(0, 5).map(r => r.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
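
mergeResults is assumed above but not defined in this guide. One reasonable implementation is reciprocal rank fusion, sketched here; it assumes the D1 documents table exposes id and text columns and that chunk text lives in metadata.text, as in the ingestion example.

// Merge vector and keyword hits with reciprocal rank fusion (RRF).
// Each list contributes 1 / (k + rank); items found by both lists rank higher.
function mergeResults(
  vectorMatches: VectorizeMatch[],
  keywordRows: { id: string; text: string }[],
  k = 60
): { id: string; text: string; score: number }[] {
  const scores = new Map<string, { id: string; text: string; score: number }>();

  const add = (id: string, text: string, rank: number) => {
    const entry = scores.get(id) ?? { id, text, score: 0 };
    entry.score += 1 / (k + rank);
    scores.set(id, entry);
  };

  vectorMatches.forEach((m, rank) => add(m.id, String(m.metadata?.text ?? ''), rank));
  keywordRows.forEach((row, rank) => add(row.id, row.text, rank));

  return [...scores.values()].sort((a, b) => b.score - a.score);
}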

Pattern 4: Filtered RAG

Use when: Need to filter by category, date, or metadata

async function filteredRAG(query: string, filters: { category?: string; minDate?: number }, env: Env) {
  // 1. Vector search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 20 }); // Fetch more

  // 2. Filter in the application layer (Vectorize can also filter at query time
  //    via the filter option when the field has a metadata index; see below)
  const filtered = results.matches.filter(match => {
    if (filters.category && match.metadata?.category !== filters.category) return false;
    if (filters.minDate && match.metadata?.timestamp < filters.minDate) return false;
    return true;
  });

  // 3. Take top 5 after filtering
  const topResults = filtered.slice(0, 5);

  // 4. Generate response
  const context = topResults.map(r => r.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
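
If the index has a metadata index on the field you filter by, the filter can be pushed down to Vectorize instead of applied in the application layer. A hedged sketch of that variant, assuming a metadata index exists for category (the function name filteredRAGNative is illustrative):

async function filteredRAGNative(
  query: string,
  filters: { category?: string },
  env: Env
): Promise<string> {
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');

  // Vectorize applies the equality filter before returning the top-K matches.
  const results = await env.VECTORIZE.query(queryEmbedding, {
    topK: 5,
    returnMetadata: true,
    filter: filters.category ? { category: filters.category } : undefined
  });

  const context = results.matches.map(m => m.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}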

Pattern 5: Streaming RAG

Use when: Real-time responses with immediate feedback

async function streamingRAG(query: string, env: Env): Promise<ReadableStream> {
  // 1. Embed query and search
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 3 });

  const context = results.matches.map(m => m.metadata?.text).join('\n\n');

  // 2. Stream response from Gemini (alt=sse returns server-sent events)
  const response = await fetch(
    'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse',
    {
      method: 'POST',
      headers: {
        'x-goog-api-key': env.GEMINI_API_KEY,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        contents: [{
          parts: [{ text: `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer:` }]
        }]
      })
    }
  );

  return response.body!;
}
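
A sketch of how a Worker might expose the streaming pattern, assuming the request body carries a query field. The raw Gemini SSE stream is passed straight through; the client is responsible for parsing the event chunks.

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { query } = (await request.json()) as { query: string };

    const stream = await streamingRAG(query, env);

    // Pass the upstream server-sent events through to the caller.
    return new Response(stream, {
      headers: { 'Content-Type': 'text/event-stream' }
    });
  }
};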

Pattern 6: Multi-Query RAG

Use when: Query might be ambiguous or multi-faceted

async function multiQueryRAG(query: string, env: Env) {
  // 1. Generate multiple query variations
  const queryVariations = await generateQueryVariations(query, env.GEMINI_API_KEY);
  // Returns: ["original query", "rephrased version 1", "rephrased version 2"]

  // 2. Search with each variation
  const allResults = await Promise.all(
    queryVariations.map(async q => {
      const embedding = await generateEmbedding(q, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
      return await env.VECTORIZE.query(embedding, { topK: 3 });
    })
  );

  // 3. Merge and deduplicate
  const uniqueResults = deduplicateById(allResults.flatMap(r => r.matches));

  // 4. Generate response
  const context = uniqueResults.slice(0, 5).map(r => r.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
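
generateQueryVariations and deduplicateById are assumed above; minimal sketches follow. The rewriting prompt and the line-splitting of the model output are illustrative choices, not a fixed API.

// Ask the LLM for a few rephrasings of the query, one per line,
// reusing generateResponse with an empty context.
async function generateQueryVariations(query: string, apiKey: string): Promise<string[]> {
  const prompt =
    `Rewrite the following search query in two different ways. ` +
    `Return one rewrite per line, with no numbering.\n\nQuery: ${query}`;

  const output = await generateResponse('', prompt, apiKey);

  const variations = output
    .split('\n')
    .map(line => line.trim())
    .filter(line => line.length > 0);

  return [query, ...variations.slice(0, 2)];
}

// Keep the first occurrence of each vector ID.
function deduplicateById(matches: VectorizeMatch[]): VectorizeMatch[] {
  const seen = new Set<string>();
  return matches.filter(m => {
    if (seen.has(m.id)) return false;
    seen.add(m.id);
    return true;
  });
}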

Pattern 7: Conversational RAG

Use when: Multi-turn conversations with context

interface ConversationHistory {
  role: 'user' | 'assistant';
  content: string;
}

async function conversationalRAG(
  query: string,
  history: ConversationHistory[],
  env: Env
) {
  // 1. Create contextualized query from history
  const contextualizedQuery = await reformulateQuery(query, history, env.GEMINI_API_KEY);

  // 2. Search with contextualized query
  const embedding = await generateEmbedding(contextualizedQuery, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(embedding, { topK: 3 });

  const retrievedContext = results.matches.map(m => m.metadata?.text).join('\n\n');

  // 3. Generate response with conversation history
  const prompt = `
Conversation history:
${history.map(h => `${h.role}: ${h.content}`).join('\n')}

Retrieved context:
${retrievedContext}

User: ${query}
Assistant:`;

  return await generateResponse(prompt, query, env.GEMINI_API_KEY);
}
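
reformulateQuery is assumed above. A minimal sketch that asks the model to rewrite the latest message as a standalone question, folding the history into the prompt via the same generateResponse helper:

// Turn a follow-up question ("what about pricing?") into a standalone query
// that can be embedded and searched on its own.
async function reformulateQuery(
  query: string,
  history: ConversationHistory[],
  apiKey: string
): Promise<string> {
  if (history.length === 0) return query;

  const prompt = `Given the conversation below, rewrite the user's last message as a
standalone search query. Return only the rewritten query.

${history.map(h => `${h.role}: ${h.content}`).join('\n')}
user: ${query}`;

  const rewritten = await generateResponse('', prompt, apiKey);
  return rewritten.trim() || query;
}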

Pattern 8: Citation RAG

Use when: Need to cite sources in responses

async function citationRAG(query: string, env: Env) {
  const queryEmbedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  const results = await env.VECTORIZE.query(queryEmbedding, { topK: 5, returnMetadata: true });

  // Build context with citations
  const contextWithCitations = results.matches.map((match, i) =>
    `[${i + 1}] ${match.metadata?.text}\nSource: ${match.metadata?.url || match.id}`
  ).join('\n\n');

  const prompt = `Answer the question using the provided sources. Include citations [1], [2], etc. in your answer.

Sources:
${contextWithCitations}

Question: ${query}

Answer (with citations):`;

  const response = await generateResponse(prompt, query, env.GEMINI_API_KEY);

  return {
    answer: response,
    sources: results.matches.map((m, i) => ({
      citation: i + 1,
      text: m.metadata?.text,
      url: m.metadata?.url,
      score: m.score
    }))
  };
}

Best Practices

1. Chunk Size Optimization

// Test different chunk sizes for your use case
const chunkSizes = [200, 500, 1000, 1500];

for (const size of chunkSizes) {
  const accuracy = await testRetrievalAccuracy(size);
  console.log(`Chunk size ${size}: ${accuracy}% accuracy`);
}

// Recommendation: 500-1000 words with 10% overlap
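
testRetrievalAccuracy is assumed above (the loop passes only the size for brevity). A hedged sketch of one way to implement it, given a small hand-labelled evaluation set; it assumes the corpus has already been re-ingested at the chunk size under test (for example into a fresh index or namespace), so it only measures whether the expected document comes back in the top-K.

interface EvalCase {
  question: string;
  expectedDocumentId: string; // documentId stored in chunk metadata at ingestion
}

// Percentage of eval questions whose expected document appears in the top-K.
async function testRetrievalAccuracy(
  evalCases: EvalCase[],
  env: Env,
  topK = 5
): Promise<number> {
  let hits = 0;

  for (const { question, expectedDocumentId } of evalCases) {
    const embedding = await generateEmbedding(question, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
    const results = await env.VECTORIZE.query(embedding, { topK, returnMetadata: true });

    if (results.matches.some(m => m.metadata?.documentId === expectedDocumentId)) {
      hits++;
    }
  }

  return (hits / evalCases.length) * 100;
}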

2. Context Window Management

// Don't exceed LLM context window
function truncateContext(chunks: string[], maxTokens = 4000): string {
  let context = '';
  let estimatedTokens = 0;

  for (const chunk of chunks) {
    const chunkTokens = chunk.split(/\s+/).length * 1.3; // Rough estimate
    if (estimatedTokens + chunkTokens > maxTokens) break;

    context += chunk + '\n\n';
    estimatedTokens += chunkTokens;
  }

  return context;
}

3. Re-ranking

// Re-rank results after retrieval
function rerank(results: VectorizeMatch[], query: string): VectorizeMatch[] {
  return results
    .map(result => ({
      ...result,
      rerankScore: calculateRelevance(result.metadata?.text, query)
    }))
    .sort((a, b) => b.rerankScore - a.rerankScore);
}
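
calculateRelevance is assumed above. A simple lexical-overlap score works as a lightweight sketch; a dedicated re-ranking model would normally do better.

// Fraction of query terms that appear in the chunk text.
function calculateRelevance(text: string | undefined, query: string): number {
  if (!text) return 0;

  const haystack = text.toLowerCase();
  const terms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
  if (terms.length === 0) return 0;

  const hits = terms.filter(term => haystack.includes(term)).length;
  return hits / terms.length;
}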

4. Fallback Strategies

async function ragWithFallback(query: string, env: Env) {
  const results = await searchVectorize(query, env);

  if (results.matches.length === 0 || results.matches[0].score < 0.7) {
    // Fallback: Use LLM without RAG
    return await generateResponse('', query, env.GEMINI_API_KEY);
  }

  // Normal RAG flow
  const context = results.matches.map(m => m.metadata?.text).join('\n\n');
  return await generateResponse(context, query, env.GEMINI_API_KEY);
}
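
searchVectorize is assumed above; it is simply the embed-then-query step shared by the other patterns.

// Embed the query and return the raw Vectorize matches.
async function searchVectorize(query: string, env: Env, topK = 5): Promise<VectorizeMatches> {
  const embedding = await generateEmbedding(query, env.GEMINI_API_KEY, 'RETRIEVAL_QUERY');
  return await env.VECTORIZE.query(embedding, { topK, returnMetadata: true });
}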

Performance Optimization

1. Caching

// Cache embeddings
const embeddingCache = new Map<string, number[]>();

async function getCachedEmbedding(text: string, apiKey: string) {
  const key = hashText(text);

  if (embeddingCache.has(key)) {
    return embeddingCache.get(key)!;
  }

  const embedding = await generateEmbedding(text, apiKey, 'RETRIEVAL_QUERY');
  embeddingCache.set(key, embedding);

  return embedding;
}
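
hashText is assumed above. Since the cache lookup is synchronous, a simple non-cryptographic hash such as FNV-1a is enough for a cache key. Note that the in-memory Map only lives as long as the Worker isolate; persistent caching would need KV or the Cache API.

// FNV-1a: a tiny, synchronous, non-cryptographic string hash used only as a cache key.
function hashText(text: string): string {
  let hash = 0x811c9dc5;
  for (let i = 0; i < text.length; i++) {
    hash ^= text.charCodeAt(i);
    hash = Math.imul(hash, 0x01000193);
  }
  return (hash >>> 0).toString(16);
}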

2. Batch Processing

// Ingest documents in parallel
async function batchIngest(documents: Document[], env: Env, concurrency = 5) {
  for (let i = 0; i < documents.length; i += concurrency) {
    const batch = documents.slice(i, i + concurrency);

    await Promise.all(
      batch.map(doc => ingestDocument(doc, env))
    );
  }
}

Common Pitfalls

Don't: Use same task type for queries and documents

// Wrong
const embedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_DOCUMENT');

Do: Use correct task types

// Correct
const queryEmbedding = await generateEmbedding(query, apiKey, 'RETRIEVAL_QUERY');
const docEmbedding = await generateEmbedding(doc, apiKey, 'RETRIEVAL_DOCUMENT');

Don't: Return too many or too few results

// Too few (might miss relevant info)
const results = await env.VECTORIZE.query(embedding, { topK: 1 });

// Too many (noise, cost)
const results = await env.VECTORIZE.query(embedding, { topK: 50 });

Do: Find optimal topK for your use case

// Test different topK values
const topK = 5; // Good default for most use cases
const results = await env.VECTORIZE.query(embedding, { topK });

Complete Example

See templates/rag-with-vectorize.ts for a production-ready implementation combining these patterns.


Official Documentation