commit b41966ed51cda5597292032152015ab6f78c7814 Author: Zhongwei Li Date: Sun Nov 30 08:24:38 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..bcfd1fb --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "cloudflare-workers-ai", + "description": "Run LLMs and AI models on Cloudflare's global GPU network with Workers AI. Includes Llama, Flux image generation, BGE embeddings, and streaming support with AI Gateway for caching and logging. Use when: implementing LLM inference, generating images with Flux/Stable Diffusion, building RAG with embeddings, streaming AI responses, using AI Gateway for cost tracking, or troubleshooting AI_ERROR, rate limits, model not found, token limits, or neurons exceeded.", + "version": "1.0.0", + "author": { + "name": "Jeremy Dawes", + "email": "jeremy@jezweb.net" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..61228cf --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# cloudflare-workers-ai + +Run LLMs and AI models on Cloudflare's global GPU network with Workers AI. Includes Llama, Flux image generation, BGE embeddings, and streaming support with AI Gateway for caching and logging. Use when: implementing LLM inference, generating images with Flux/Stable Diffusion, building RAG with embeddings, streaming AI responses, using AI Gateway for cost tracking, or troubleshooting AI_ERROR, rate limits, model not found, token limits, or neurons exceeded. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..51ff71c --- /dev/null +++ b/SKILL.md @@ -0,0 +1,341 @@ +--- +name: cloudflare-workers-ai +description: | + Run LLMs and AI models on Cloudflare's global GPU network with Workers AI. Includes Llama 4, Gemma 3, Mistral 3.1, + Flux image generation, BGE embeddings (2x faster, 2025), streaming support, and AI Gateway for cost tracking. + + Use when: implementing LLM inference, generating images, building RAG with embeddings, streaming AI responses, + using AI Gateway, troubleshooting max_tokens defaults (breaking change 2025), BGE pooling parameter (not backwards + compatible), or handling AI_ERROR, rate limits, model deprecations, token limits.
+ + Keywords: workers ai, cloudflare ai, ai bindings, llm workers, @cf/meta/llama-4-scout, @cf/google/gemma-3-12b-it, + @cf/mistralai/mistral-small-3.1-24b-instruct, @cf/openai/gpt-oss-120b, workers ai models, ai inference, + cloudflare llm, ai streaming, text generation ai, ai embeddings, bge pooling cls mean, image generation ai, + workers ai rag, ai gateway, llama workers, flux image generation, deepgram aura, leonardo image generation, + vision models ai, ai chat completion, AI_ERROR, rate limit ai, model not found, max_tokens breaking change, + bge pooling backwards compatibility, model deprecations october 2025, token limit exceeded, neurons exceeded, + workers ai hono, ai gateway workers, vercel ai sdk workers, openai compatible workers, workers ai vectorize, + workers-ai-provider v2, ai sdk v5, lora adapters rank 32 +license: MIT +--- + +# Cloudflare Workers AI + +**Status**: Production Ready ✅ +**Last Updated**: 2025-11-25 +**Dependencies**: cloudflare-worker-base (for Worker setup) +**Latest Versions**: wrangler@4.50.0, @cloudflare/workers-types@4.20251125.0 + +**Recent Updates (2025)**: +- **April 2025 - Performance**: Llama 3.3 70B 2-4x faster (speculative decoding, prefix caching), BGE embeddings 2x faster +- **April 2025 - Breaking Changes**: max_tokens now correctly defaults to 256 (was not respected), BGE pooling parameter (cls NOT backwards compatible with mean) +- **2025 - New Models (14)**: Mistral 3.1 24B (vision+tools), Gemma 3 12B (128K context), EmbeddingGemma 300M, Llama 4 Scout, GPT-OSS 120B/20B, Qwen models (QwQ 32B, Coder 32B), Leonardo image gen, Deepgram Aura 2, Whisper v3 Turbo, IBM Granite, Nova 3 +- **2025 - Platform**: Context windows API change (tokens not chars), unit-based pricing with per-model granularity, workers-ai-provider v2.0.0 (AI SDK v5), LoRA rank up to 32 (was 8), 100 adapters per account +- **October 2025**: Model deprecations (use Llama 4, GPT-OSS instead) + +--- + +## Quick Start (5 Minutes) + +```typescript +// 1. Add AI binding to wrangler.jsonc +{ "ai": { "binding": "AI" } } + +// 2. Run model with streaming (recommended) +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const stream = await env.AI.run('@cf/meta/llama-3.1-8b-instruct', { + messages: [{ role: 'user', content: 'Tell me a story' }], + stream: true, // Always stream for text generation! + }); + + return new Response(stream, { + headers: { 'content-type': 'text/event-stream' }, + }); + }, +}; +``` + +**Why streaming?** Prevents buffering in memory, faster time-to-first-token, avoids Worker timeout issues.
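+ +A minimal sketch of reading that stream on the client (assumes the Workers AI SSE wire format: `data:` lines carrying JSON chunks with a `response` field, terminated by `data: [DONE]`; verify against the current docs): + +```typescript +const res = await fetch('/api/chat', { method: 'POST', body: JSON.stringify({ prompt }) }); +const reader = res.body!.getReader(); +const decoder = new TextDecoder(); +let buffer = ''; +let text = ''; + +while (true) { + const { done, value } = await reader.read(); + if (done) break; + buffer += decoder.decode(value, { stream: true }); + const events = buffer.split('\n\n'); // SSE events are separated by blank lines + buffer = events.pop() ?? ''; // keep any partial event for the next read + for (const event of events) { + const data = event.replace(/^data: /, '').trim(); + if (!data || data === '[DONE]') continue; + text += JSON.parse(data).response ?? ''; // append new tokens to the UI + } +} +```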
+ +--- + +## API Reference + +```typescript +env.AI.run( + model: string, + inputs: ModelInputs, + options?: { gateway?: { id: string; skipCache?: boolean } } +): Promise<ModelOutput> +``` + +--- + +## Model Selection Guide (Updated 2025) + +### Text Generation (LLMs) + +| Model | Best For | Rate Limit | Size | Notes | +|-------|----------|------------|------|-------| +| **2025 Models** | +| `@cf/meta/llama-4-scout-17b-16e-instruct` | Latest Llama, general purpose | 300/min | 17B | NEW 2025 | +| `@cf/openai/gpt-oss-120b` | Largest open-source GPT | 300/min | 120B | NEW 2025 | +| `@cf/openai/gpt-oss-20b` | Smaller open-source GPT | 300/min | 20B | NEW 2025 | +| `@cf/google/gemma-3-12b-it` | 128K context, 140+ languages | 300/min | 12B | NEW 2025, vision | +| `@cf/mistralai/mistral-small-3.1-24b-instruct` | Vision + tool calling | 300/min | 24B | NEW 2025 | +| `@cf/qwen/qwq-32b` | Reasoning, complex tasks | 300/min | 32B | NEW 2025 | +| `@cf/qwen/qwen2.5-coder-32b-instruct` | Coding specialist | 300/min | 32B | NEW 2025 | +| `@cf/qwen/qwen3-30b-a3b-fp8` | Fast quantized | 300/min | 30B | NEW 2025 | +| `@cf/ibm-granite/granite-4.0-h-micro` | Small, efficient | 300/min | Micro | NEW 2025 | +| **Performance (2025)** | +| `@cf/meta/llama-3.3-70b-instruct-fp8-fast` | 2-4x faster (2025 update) | 300/min | 70B | Speculative decoding | +| `@cf/meta/llama-3.1-8b-instruct-fp8-fast` | Fast 8B variant | 300/min | 8B | - | +| **Standard Models** | +| `@cf/meta/llama-3.1-8b-instruct` | General purpose | 300/min | 8B | - | +| `@cf/meta/llama-3.2-1b-instruct` | Ultra-fast, simple tasks | 300/min | 1B | - | +| `@cf/deepseek-ai/deepseek-r1-distill-qwen-32b` | Coding, technical | 300/min | 32B | - | + +### Text Embeddings (2x Faster - 2025) + +| Model | Dimensions | Best For | Rate Limit | Notes | +|-------|-----------|----------|------------|-------| +| `@cf/google/embeddinggemma-300m` | 768 | Best-in-class RAG | 3000/min | **NEW 2025** | +| `@cf/baai/bge-base-en-v1.5` | 768 | General RAG (2x faster) | 3000/min | **pooling: "cls"** recommended | +| `@cf/baai/bge-large-en-v1.5` | 1024 | High accuracy (2x faster) | 1500/min | **pooling: "cls"** recommended | +| `@cf/baai/bge-small-en-v1.5` | 384 | Fast, low storage (2x faster) | 3000/min | **pooling: "cls"** recommended | +| `@cf/qwen/qwen3-embedding-0.6b` | 768 | Qwen embeddings | 3000/min | NEW 2025 | + +**CRITICAL (2025)**: BGE models now accept a `pooling: "cls"` parameter (recommended), but `cls` embeddings are NOT backwards compatible with the default `mean` pooling, so never mix the two in the same index.
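+ +A hedged sketch of the new parameter (the `pooling` field and the `data` output shape follow the BGE model docs; confirm before migrating an existing index): + +```typescript +// New indexes: request cls pooling explicitly, for BOTH indexing and querying. +const out = await env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: ['What is Workers AI?'], + pooling: 'cls', // keep the default 'mean' only for indexes already built with mean-pooled vectors +}); +const vector = out.data[0]; // 768-dimensional embedding +```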
+ +### Image Generation + +| Model | Best For | Rate Limit | Notes | +|-------|----------|------------|-------| +| `@cf/black-forest-labs/flux-1-schnell` | High quality, photorealistic | 720/min | - | +| `@cf/leonardo/lucid-origin` | Leonardo AI style | 720/min | NEW 2025 | +| `@cf/leonardo/phoenix-1.0` | Leonardo AI variant | 720/min | NEW 2025 | +| `@cf/stabilityai/stable-diffusion-xl-base-1.0` | General purpose | 720/min | - | + +### Vision Models + +| Model | Best For | Rate Limit | Notes | +|-------|----------|------------|-------| +| `@cf/meta/llama-3.2-11b-vision-instruct` | Image understanding | 720/min | - | +| `@cf/google/gemma-3-12b-it` | Vision + text (128K context) | 300/min | NEW 2025 | + +### Audio Models (2025) + +| Model | Type | Rate Limit | Notes | +|-------|------|------------|-------| +| `@cf/deepgram/aura-2-en` | Text-to-speech (English) | 720/min | NEW 2025 | +| `@cf/deepgram/aura-2-es` | Text-to-speech (Spanish) | 720/min | NEW 2025 | +| `@cf/deepgram/nova-3` | Speech-to-text (+ WebSocket) | 720/min | NEW 2025 | +| `@cf/openai/whisper-large-v3-turbo` | Speech-to-text (faster) | 720/min | NEW 2025 | + +--- + +## Common Patterns + +### RAG (Retrieval Augmented Generation) + +```typescript +// 1. Generate embeddings +const embeddings = await env.AI.run('@cf/baai/bge-base-en-v1.5', { text: [userQuery] }); + +// 2. Search Vectorize +const matches = await env.VECTORIZE.query(embeddings.data[0], { topK: 3 }); +const context = matches.matches.map((m) => m.metadata.text).join('\n\n'); + +// 3. Generate with context +const response = await env.AI.run('@cf/meta/llama-3.1-8b-instruct', { + messages: [ + { role: 'system', content: `Answer using this context:\n${context}` }, + { role: 'user', content: userQuery }, + ], + stream: true, +}); +``` + +--- + +### Structured Output with Zod + +```typescript +import { z } from 'zod'; + +const Schema = z.object({ name: z.string(), items: z.array(z.string()) }); + +const response = await env.AI.run('@cf/meta/llama-3.1-8b-instruct', { + messages: [{ + role: 'user', + content: 'Generate JSON with fields "name" (string) and "items" (array of strings). Reply with JSON only.' + }], +}); + +const validated = Schema.parse(JSON.parse(response.response)); +``` + +--- + +## AI Gateway Integration + +Provides caching, logging, cost tracking, and analytics for AI requests. + +```typescript +const response = await env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { prompt: 'Hello' }, + { gateway: { id: 'my-gateway', skipCache: false } } +); + +// Access logs and send feedback +const gateway = env.AI.gateway('my-gateway'); +await gateway.patchLog(env.AI.aiGatewayLogId, { + feedback: { rating: 1, comment: 'Great response' }, +}); +``` + +**Benefits:** Cost tracking, caching (reduces duplicate inference), logging, rate limiting, analytics.
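+ +A small helper tying these options together (a sketch; `my-gateway` is a placeholder gateway ID and the return shape is this example's own convention): + +```typescript +async function runViaGateway(env: Env, model: string, inputs: object) { + const result = await env.AI.run(model, inputs, { + gateway: { id: 'my-gateway' }, // cached, logged, and tracked in the dashboard + }); + // The log ID identifies this request for analytics and later patchLog() feedback. + return { result, logId: env.AI.aiGatewayLogId }; +} +```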
+ +--- + +## Rate Limits & Pricing (Updated 2025) + +### Rate Limits (per minute) + +| Task Type | Default Limit | Notes | +|-----------|---------------|-------| +| **Text Generation** | 300/min | Some fast models: 400-1500/min | +| **Text Embeddings** | 3000/min | BGE-large: 1500/min | +| **Image Generation** | 720/min | All image models | +| **Vision Models** | 720/min | Image understanding | +| **Audio (TTS/STT)** | 720/min | Deepgram, Whisper | +| **Translation** | 720/min | M2M100, Opus MT | +| **Classification** | 2000/min | Text classification | + +### Pricing (Unit-Based, Billed in Neurons - 2025) + +**Free Tier:** +- 10,000 neurons per day +- Resets daily at 00:00 UTC + +**Paid Tier ($0.011 per 1,000 neurons):** +- 10,000 neurons/day included +- Usage beyond the included allocation is billed at $0.011 per 1,000 neurons (no cap) + +**2025 Model Costs (per 1M tokens):** + +| Model | Input | Output | Notes | +|-------|-------|--------|-------| +| **2025 Models** | +| Llama 4 Scout 17B | $0.270 | $0.850 | NEW 2025 | +| GPT-OSS 120B | $0.350 | $0.750 | NEW 2025 | +| GPT-OSS 20B | $0.200 | $0.300 | NEW 2025 | +| Gemma 3 12B | $0.345 | $0.556 | NEW 2025 | +| Mistral 3.1 24B | $0.351 | $0.555 | NEW 2025 | +| Qwen QwQ 32B | $0.660 | $1.000 | NEW 2025 | +| Qwen Coder 32B | $0.660 | $1.000 | NEW 2025 | +| IBM Granite Micro | $0.017 | $0.112 | NEW 2025 | +| EmbeddingGemma 300M | $0.012 | N/A | NEW 2025 | +| Qwen3 Embedding 0.6B | $0.012 | N/A | NEW 2025 | +| **Performance (2025)** | +| Llama 3.3 70B Fast | $0.293 | $2.253 | 2-4x faster | +| Llama 3.1 8B FP8 Fast | $0.045 | $0.384 | Fast variant | +| **Standard Models** | +| Llama 3.2 1B | $0.027 | $0.201 | - | +| Llama 3.1 8B | $0.282 | $0.827 | - | +| Deepseek R1 32B | $0.497 | $4.881 | - | +| BGE-base (2x faster) | $0.067 | N/A | 2025 speedup | +| BGE-large (2x faster) | $0.204 | N/A | 2025 speedup | +| **Image Models (2025)** | +| Flux 1 Schnell | $0.0000528 per 512x512 tile | N/A | - | +| Leonardo Lucid | $0.006996 per 512x512 tile | N/A | NEW 2025 | +| Leonardo Phoenix | $0.005830 per 512x512 tile | N/A | NEW 2025 | +| **Audio Models (2025)** | +| Deepgram Aura 2 | $0.030 per 1k chars | N/A | NEW 2025 | +| Deepgram Nova 3 | $0.0052 per audio min | N/A | NEW 2025 | +| Whisper v3 Turbo | $0.0005 per audio min | N/A | NEW 2025 | + +--- + +## Error Handling with Retry + +```typescript +async function runAIWithRetry( + env: Env, + model: string, + inputs: any, + maxRetries = 3 +): Promise<any> { + let lastError: Error; + + for (let i = 0; i < maxRetries; i++) { + try { + return await env.AI.run(model, inputs); + } catch (error) { + lastError = error as Error; + + // Rate limit - retry with exponential backoff + if (lastError.message.toLowerCase().includes('rate limit')) { + await new Promise((resolve) => setTimeout(resolve, Math.pow(2, i) * 1000)); + continue; + } + + throw error; // Other errors - fail immediately + } + } + + throw lastError!; +} +``` + +--- + +## OpenAI Compatibility + +```typescript +import OpenAI from 'openai'; + +const openai = new OpenAI({ + apiKey: env.CLOUDFLARE_API_KEY, + baseURL: `https://api.cloudflare.com/client/v4/accounts/${env.ACCOUNT_ID}/ai/v1`, +}); + +// Chat completions +await openai.chat.completions.create({ + model: '@cf/meta/llama-3.1-8b-instruct', + messages: [{ role: 'user', content: 'Hello!'
}], +}); +``` + +**Endpoints:** `/v1/chat/completions`, `/v1/embeddings` + +--- + +## Vercel AI SDK Integration (workers-ai-provider v2.0.0) + +```typescript +import { createWorkersAI } from 'workers-ai-provider'; // v2.0.0 with AI SDK v5 +import { generateText, streamText } from 'ai'; + +const workersai = createWorkersAI({ binding: env.AI }); + +// Generate or stream +await generateText({ + model: workersai('@cf/meta/llama-3.1-8b-instruct'), + prompt: 'Write a poem', +}); +``` + +--- + +## References + +- [Workers AI Docs](https://developers.cloudflare.com/workers-ai/) +- [Models Catalog](https://developers.cloudflare.com/workers-ai/models/) +- [AI Gateway](https://developers.cloudflare.com/ai-gateway/) +- [Pricing](https://developers.cloudflare.com/workers-ai/platform/pricing/) +- [Changelog](https://developers.cloudflare.com/workers-ai/changelog/) +- [LoRA Adapters](https://developers.cloudflare.com/workers-ai/features/fine-tunes/loras/) +- **MCP Tool**: Use `mcp__cloudflare-docs__search_cloudflare_documentation` for latest docs diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..5f3d32f --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,77 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:jezweb/claude-skills:skills/cloudflare-workers-ai", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "c6c7cff5470919f90577b7a30ea3eeb6143e2569", + "treeHash": "f913bdf9a31445d82cd5c28b5af4dda3c82a3b1cc0b23d23a829f0169ce10929", + "generatedAt": "2025-11-28T10:18:56.601166Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "cloudflare-workers-ai", + "description": "Run LLMs and AI models on Cloudflare's global GPU network with Workers AI. Includes Llama, Flux image generation, BGE embeddings, and streaming support with AI Gateway for caching and logging.
Use when: implementing LLM inference, generating images with Flux/Stable Diffusion, building RAG with embeddings, streaming AI responses, using AI Gateway for cost tracking, or troubleshooting AI_ERROR, rate limits, model not found, token limits, or neurons exceeded.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "0b93156a6b43248434bbc80d931f1b8fe0e7fbd89bc00d5208751a29037fe268" + }, + { + "path": "SKILL.md", + "sha256": "e15ece7e98ff516abe2eb35395ac994378e199009d96b394cc512c8ecf8a1f64" + }, + { + "path": "references/best-practices.md", + "sha256": "127736f98f45c2b16691d702b2fdececcf7ace4f9deda400ad66c631c88395de" + }, + { + "path": "references/models-catalog.md", + "sha256": "a4326bf852d2b1146df5b29af14f501fc88b0877d6e895875303d9c813437d12" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "0f88205d95ea55547ab81c2d50a91e1eef419cf79f0899f824482bbd668c10f0" + }, + { + "path": "templates/ai-gateway-integration.ts", + "sha256": "9f1b79932e782d87c71932ec26653fd917e327e2b206f4ae7d7c64b5ef78c38f" + }, + { + "path": "templates/wrangler-ai-config.jsonc", + "sha256": "b398ebe0160c8e61e055f25720fe5d40e13a473ddde8edf09ceeb4c92c8f283a" + }, + { + "path": "templates/ai-vision-models.ts", + "sha256": "c1a712ba5d0c83611f3e25ed927d3059a55317e8d4ce9a8e9604492708befbff" + }, + { + "path": "templates/ai-embeddings-rag.ts", + "sha256": "48b8b9e515479cc946af8712d98807e5f8bc5dd9c5384d2c4f350a5228e2ff07" + }, + { + "path": "templates/ai-text-generation.ts", + "sha256": "a1710cdf6deebb3d70f870af076e87fd56174b730e14aa238b4777e4f41a830a" + }, + { + "path": "templates/ai-image-generation.ts", + "sha256": "90f82643ab1fec94c2c3cbe27ff4e9b3a2f6bf230026da89e3ea1ffb3b912a36" + } + ], + "dirSha256": "f913bdf9a31445d82cd5c28b5af4dda3c82a3b1cc0b23d23a829f0169ce10929" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/references/best-practices.md b/references/best-practices.md new file mode 100644 index 0000000..c488bf8 --- /dev/null +++ b/references/best-practices.md @@ -0,0 +1,524 @@ +# Cloudflare Workers AI - Best Practices + +Production-tested patterns for building reliable, cost-effective AI applications with Workers AI. + +--- + +## Table of Contents + +1. [Streaming Best Practices](#streaming-best-practices) +2. [Error Handling](#error-handling) +3. [Cost Optimization](#cost-optimization) +4. [Performance Optimization](#performance-optimization) +5. [Security](#security) +6. [Monitoring & Observability](#monitoring--observability) +7. 
[Production Checklist](#production-checklist) + +--- + +## Streaming Best Practices + +### Why Streaming is Essential + +**❌ Without streaming:** +- Buffers entire response in memory +- Higher latency (wait for complete response) +- Risk of Worker timeout (30s default) +- Poor user experience for long content + +**✅ With streaming:** +- Immediate first token +- Lower memory usage +- Better UX (progressive rendering) +- No timeout issues + +### Implementation + +```typescript +// Always use stream: true for text generation +const stream = await env.AI.run('@cf/meta/llama-3.1-8b-instruct', { + messages: [{ role: 'user', content: prompt }], + stream: true, // CRITICAL +}); + +return new Response(stream, { + headers: { 'content-type': 'text/event-stream' }, +}); +``` + +### Client-Side Handling + +```typescript +const response = await fetch('/api/chat', { + method: 'POST', + body: JSON.stringify({ prompt }), +}); + +const reader = response.body.getReader(); +const decoder = new TextDecoder(); + +while (true) { + const { done, value } = await reader.read(); + if (done) break; + + const chunk = decoder.decode(value); + // Update UI with chunk +} +``` + +--- + +## Error Handling + +### 1. Rate Limit Errors (429) + +**Pattern: Exponential Backoff** + +```typescript +async function runWithRetry( + ai: Ai, + model: string, + inputs: any, + maxRetries = 3 +): Promise<any> { + let delay = 1000; + + for (let i = 0; i < maxRetries; i++) { + try { + return await ai.run(model, inputs); + } catch (error) { + const message = (error as Error).message.toLowerCase(); + + if (message.includes('429') || message.includes('rate limit')) { + if (i < maxRetries - 1) { + await new Promise((resolve) => setTimeout(resolve, delay)); + delay *= 2; // Exponential backoff: 1s, 2s, 4s + continue; + } + } + + throw error; + } + } + + throw new Error('Max retries exceeded'); +} +``` + +### 2. Model Unavailable + +**Pattern: Fallback Models** + +```typescript +const models = [ + '@cf/meta/llama-3.1-8b-instruct', // Primary + '@cf/meta/llama-3.2-1b-instruct', // Fallback (faster) + '@cf/qwen/qwen1.5-7b-chat-awq', // Fallback (alternative) +]; + +async function runWithFallback(ai: Ai, inputs: any): Promise<any> { + for (const model of models) { + try { + return await ai.run(model, inputs); + } catch (error) { + const message = (error as Error).message.toLowerCase(); + if (!message.includes('unavailable')) throw error; + // Try next model + } + } + + throw new Error('All models unavailable'); +} +``` + +### 3. Token Limit Exceeded + +**Pattern: Input Validation** + +```typescript +function estimateTokens(text: string): number { + return Math.ceil(text.length / 4); +} + +function validateInput(text: string, maxTokens = 2048): void { + const tokens = estimateTokens(text); + + if (tokens > maxTokens) { + throw new Error( + `Input too long: ${tokens} tokens (max: ${maxTokens})` + ); + } +} + +// Usage +try { + validateInput(userInput); + const response = await env.AI.run(model, { prompt: userInput }); +} catch (error) { + return c.json({ error: (error as Error).message }, 400); +} +``` + +--- + +## Cost Optimization + +### 1. Use AI Gateway for Caching + +**Without AI Gateway:** +- Same prompt = new inference = cost + +**With AI Gateway:** +- Same prompt = cached response = free + +```typescript +const response = await env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { messages }, + { gateway: { id: 'my-gateway' } } // Enable caching +); +``` + +**Savings**: 50-90% for repeated queries + +### 2.
Choose the Right Model + +**Cost Comparison** (per 1M output tokens): + +| Model | Cost | Use Case | +|-------|------|----------| +| Llama 3.2 1B | $0.201 | Simple tasks, high volume | +| Llama 3.1 8B | $0.606 | General purpose | +| Qwen 1.5 14B | $1.20+ | Complex reasoning | + +**Strategy**: Use smallest model that meets quality requirements + +### 3. Limit Output Length + +```typescript +const response = await env.AI.run(model, { + messages, + max_tokens: 256, // Limit output (default: varies) +}); +``` + +### 4. Batch Embeddings + +```typescript +// ❌ Bad: 100 separate requests +for (const text of texts) { + await env.AI.run('@cf/baai/bge-base-en-v1.5', { text: [text] }); +} + +// ✅ Good: 1 batch request +await env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: texts, // Up to 100 texts per request +}); +``` + +### 5. Monitor Neurons Usage + +```typescript +app.use('*', async (c, next) => { + const start = Date.now(); + await next(); + + console.log({ + path: c.req.path, + duration: Date.now() - start, + logId: c.env.AI.aiGatewayLogId, // Check dashboard for neurons + }); +}); +``` + +--- + +## Performance Optimization + +### 1. Use Faster Models When Appropriate + +**Speed Ranking** (fastest to slowest): + +1. `@cf/qwen/qwen1.5-0.5b-chat` (1500/min limit) +2. `@cf/meta/llama-3.2-1b-instruct` +3. `@cf/tinyllama/tinyllama-1.1b-chat-v1.0` (720/min) +4. `@hf/thebloke/mistral-7b-instruct-v0.1-awq` (400/min) +5. `@cf/meta/llama-3.1-8b-instruct` +6. `@cf/qwen/qwen1.5-14b-chat-awq` (150/min) + +### 2. Parallel Requests + +```typescript +// Process multiple tasks in parallel +const [summary, keywords, sentiment] = await Promise.all([ + env.AI.run(model, { prompt: `Summarize: ${text}` }), + env.AI.run(model, { prompt: `Extract keywords: ${text}` }), + env.AI.run(model, { prompt: `Sentiment: ${text}` }), +]); +``` + +### 3. Edge Caching for Static Prompts + +```typescript +// Cache AI responses in KV +const cacheKey = `ai:${hash(prompt)}`; + +let response = await env.CACHE.get(cacheKey); +if (!response) { + const result = await env.AI.run(model, { prompt }); + response = result.response; + await env.CACHE.put(cacheKey, response, { expirationTtl: 3600 }); +} +``` + +--- + +## Security + +### 1. Never Expose API Keys + +**❌ Bad:** +```typescript +const openai = new OpenAI({ + apiKey: 'sk-1234...', // Hardcoded! +}); +``` + +**✅ Good:** +```typescript +const openai = new OpenAI({ + apiKey: env.OPENAI_API_KEY, // Environment variable +}); +``` + +### 2. Input Sanitization + +```typescript +function sanitizeInput(text: string): string { + // Remove potential prompt injection attempts + return text + .replace(/\{system\}/gi, '') + .replace(/\{assistant\}/gi, '') + .trim(); +} + +const prompt = sanitizeInput(userInput); +``` + +### 3. Rate Limiting Per User + +```typescript +import { RateLimiter } from '@/lib/rate-limiter'; + +const limiter = new RateLimiter({ + requests: 10, + window: 60, // 10 requests per minute +}); + +app.post('/chat', async (c) => { + const userId = c.req.header('x-user-id'); + + if (!await limiter.check(userId)) { + return c.json({ error: 'Rate limit exceeded' }, 429); + } + + // Process request... +}); +``` + +### 4. Content Filtering + +```typescript +const BLOCKED_PATTERNS = [ + /generate.*exploit/i, + /create.*malware/i, + /hack/i, +]; + +function isSafePrompt(prompt: string): boolean { + return !BLOCKED_PATTERNS.some((pattern) => pattern.test(prompt)); +} +``` + +--- + +## Monitoring & Observability + +### 1. 
Structured Logging + +```typescript +interface AILog { + timestamp: string; + model: string; + duration: number; + success: boolean; + error?: string; + logId?: string; +} + +async function logAIRequest(log: AILog): Promise<void> { + console.log(JSON.stringify(log)); + // Or send to logging service (Datadog, Sentry, etc.) +} +``` + +### 2. Error Tracking + +```typescript +app.onError((err, c) => { + console.error({ + error: err.message, + stack: err.stack, + path: c.req.path, + timestamp: new Date().toISOString(), + }); + + return c.json({ error: 'Internal server error' }, 500); +}); +``` + +### 3. Performance Metrics + +```typescript +const metrics = { + requests: 0, + errors: 0, + totalDuration: 0, +}; + +app.use('*', async (c, next) => { + metrics.requests++; + const start = Date.now(); + + try { + await next(); + } catch (error) { + metrics.errors++; + throw error; + } finally { + metrics.totalDuration += Date.now() - start; + } +}); + +app.get('/metrics', (c) => { + return c.json({ + ...metrics, + avgDuration: metrics.requests ? metrics.totalDuration / metrics.requests : 0, + errorRate: metrics.requests ? (metrics.errors / metrics.requests) * 100 : 0, + }); +}); +``` + +--- + +## Production Checklist + +### Before Deploying + +- [ ] **Streaming enabled** for all text generation endpoints +- [ ] **AI Gateway configured** for cost tracking and caching +- [ ] **Error handling** with retry logic for rate limits +- [ ] **Input validation** to prevent token limit errors +- [ ] **Rate limiting** implemented per user +- [ ] **Monitoring** and logging configured +- [ ] **Model selection** optimized for cost/quality balance +- [ ] **Fallback models** configured for high availability +- [ ] **Security** review completed (input sanitization, content filtering) +- [ ] **Load testing** completed with expected traffic +- [ ] **Cost estimation** based on expected usage +- [ ] **Documentation** for API endpoints and error codes + +### Cost Planning + +**Estimate your costs:** + +1. Expected requests/day: _____ +2. Avg tokens per request (input + output): _____ +3. Model neurons cost: _____ +4. Daily neurons = requests × tokens × neurons_per_token +5. Daily cost = (daily neurons - 10,000 free) / 1,000 × $0.011 + +**Example:** +- 10,000 requests/day +- 500 tokens/request (avg) +- Llama 3.1 8B: ~50 neurons/1K tokens +- Daily neurons: 10,000 × 0.5K × 50 = 250,000 +- Daily cost: (250,000 - 10,000) / 1,000 × $0.011 = **$2.64** + +### Performance Targets + +- **Time to first token**: <500ms +- **Avg response time**: <2s (streaming) +- **Error rate**: <1% +- **Cache hit rate**: >50% (with AI Gateway) + +--- + +## Common Patterns + +### 1. RAG Pattern + +```typescript +// 1. Generate query embedding +const embeddings = await env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: [query], +}); + +// 2. Search Vectorize +const results = await env.VECTORIZE.query(embeddings.data[0], { + topK: 3, +}); + +// 3. Build context +const context = results.matches.map((m) => m.metadata.text).join('\n\n'); + +// 4. Generate response with context +const stream = await env.AI.run('@cf/meta/llama-3.1-8b-instruct', { + messages: [ + { role: 'system', content: `Context:\n${context}` }, + { role: 'user', content: query }, + ], + stream: true, +}); +``` + +### 2. Multi-Model Consensus + +```typescript +const models = [ + '@cf/meta/llama-3.1-8b-instruct', + '@cf/qwen/qwen1.5-7b-chat-awq', + '@hf/thebloke/mistral-7b-instruct-v0.1-awq', +]; + +const responses = await Promise.all( + models.map((model) => env.AI.run(model, { prompt })) +); + +// Combine or compare responses +``` + +### 3.
Progressive Enhancement + +```typescript +// Start with fast model, upgrade if needed +let response = await env.AI.run('@cf/meta/llama-3.2-1b-instruct', { + prompt, +}); + +// Check quality (length, coherence, etc.) +if (response.response.length < 50) { + // Retry with better model + response = await env.AI.run('@cf/meta/llama-3.1-8b-instruct', { + prompt, + }); +} +``` + +--- + +## References + +- [Workers AI Documentation](https://developers.cloudflare.com/workers-ai/) +- [AI Gateway](https://developers.cloudflare.com/ai-gateway/) +- [Pricing](https://developers.cloudflare.com/workers-ai/platform/pricing/) +- [Limits](https://developers.cloudflare.com/workers-ai/platform/limits/) +- [Models Catalog](https://developers.cloudflare.com/workers-ai/models/) diff --git a/references/models-catalog.md b/references/models-catalog.md new file mode 100644 index 0000000..67a3592 --- /dev/null +++ b/references/models-catalog.md @@ -0,0 +1,245 @@ +# Cloudflare Workers AI - Models Catalog + +Complete catalog of Workers AI models organized by task type. + +**Last Updated**: 2025-10-21 +**Official Catalog**: https://developers.cloudflare.com/workers-ai/models/ + +--- + +## Text Generation (LLMs) + +### Meta Llama Models + +| Model ID | Size | Best For | Rate Limit | +|----------|------|----------|------------| +| `@cf/meta/llama-3.1-8b-instruct` | 8B | General purpose, balanced | 300/min | +| `@cf/meta/llama-3.1-8b-instruct-fast` | 8B | Faster inference | 300/min | +| `@cf/meta/llama-3.2-1b-instruct` | 1B | Ultra-fast, simple tasks | 300/min | +| `@cf/meta/llama-3.2-3b-instruct` | 3B | Fast, good quality | 300/min | +| `@cf/meta/llama-2-7b-chat-int8` | 7B | Legacy, reliable | 300/min | +| `@cf/meta/llama-2-13b-chat-awq` | 13B | Higher quality (slower) | 300/min | + +### Qwen Models + +| Model ID | Size | Best For | Rate Limit | +|----------|------|----------|------------| +| `@cf/qwen/qwen1.5-14b-chat-awq` | 14B | High quality, complex reasoning | 150/min | +| `@cf/qwen/qwen1.5-7b-chat-awq` | 7B | Balanced quality/speed | 300/min | +| `@cf/qwen/qwen1.5-1.8b-chat` | 1.8B | Fast, lightweight | 720/min | +| `@cf/qwen/qwen1.5-0.5b-chat` | 0.5B | Ultra-fast, ultra-lightweight | 1500/min | + +### Mistral Models + +| Model ID | Size | Best For | Rate Limit | +|----------|------|----------|------------| +| `@hf/thebloke/mistral-7b-instruct-v0.1-awq` | 7B | Fast, efficient | 400/min | +| `@hf/thebloke/openhermes-2.5-mistral-7b-awq` | 7B | Instruction following | 300/min | + +### DeepSeek Models + +| Model ID | Size | Best For | Rate Limit | +|----------|------|----------|------------| +| `@cf/deepseek-ai/deepseek-r1-distill-qwen-32b` | 32B | Coding, technical content | 300/min | +| `@cf/deepseek-ai/deepseek-coder-6.7b-instruct-awq` | 6.7B | Code generation | 300/min | + +### Other Models + +| Model ID | Size | Best For | Rate Limit | +|----------|------|----------|------------| +| `@cf/tinyllama/tinyllama-1.1b-chat-v1.0` | 1.1B | Extremely fast, limited capability | 720/min | +| `@cf/microsoft/phi-2` | 2.7B | Fast, efficient | 720/min | +| `@cf/google/gemma-2b-it-lora` | 2B | Instruction tuned | 300/min | +| `@cf/google/gemma-7b-it-lora` | 7B | Higher quality | 300/min | + +--- + +## Text Embeddings + +| Model ID | Dimensions | Best For | Rate Limit | +|----------|-----------|----------|------------| +| `@cf/baai/bge-base-en-v1.5` | 768 | General purpose RAG | 3000/min | +| `@cf/baai/bge-large-en-v1.5` | 1024 | High accuracy search | 1500/min | +| `@cf/baai/bge-small-en-v1.5` | 384 | Fast, low storage | 3000/min 
| +| `@cf/baai/bge-m3` | 1024 | Multilingual | 3000/min | + +**Use Case**: RAG, semantic search, similarity detection, clustering + +--- + +## Image Generation + +| Model ID | Type | Best For | Rate Limit | +|----------|------|----------|------------| +| `@cf/black-forest-labs/flux-1-schnell` | Text-to-Image | Photorealistic, high quality | 720/min | +| `@cf/stabilityai/stable-diffusion-xl-base-1.0` | Text-to-Image | General purpose | 720/min | +| `@cf/lykon/dreamshaper-8-lcm` | Text-to-Image | Artistic, stylized | 720/min | +| `@cf/runwayml/stable-diffusion-v1-5-img2img` | Image-to-Image | Transform images | 1500/min | +| `@cf/runwayml/stable-diffusion-v1-5-inpainting` | Inpainting | Edit specific areas | 1500/min | +| `@cf/bytedance/stable-diffusion-xl-lightning` | Text-to-Image | Fast generation | 720/min | + +**Output**: PNG images (~5 MB max) + +--- + +## Vision Models + +| Model ID | Task | Best For | Rate Limit | +|----------|------|----------|------------| +| `@cf/meta/llama-3.2-11b-vision-instruct` | Image Understanding | Q&A, captioning, analysis | 720/min | +| `@cf/unum/uform-gen2-qwen-500m` | Image Captioning | Fast captions | 720/min | + +**Input**: Base64-encoded images + +--- + +## Translation + +| Model ID | Languages | Rate Limit | +|----------|-----------|------------| +| `@cf/meta/m2m100-1.2b` | 100+ languages | 720/min | + +**Supported Language Pairs**: https://developers.cloudflare.com/workers-ai/models/m2m100-1.2b/ + +--- + +## Text Classification + +| Model ID | Task | Rate Limit | +|----------|------|------------| +| `@cf/huggingface/distilbert-sst-2-int8` | Sentiment analysis | 2000/min | +| `@hf/thebloke/openhermes-2.5-mistral-7b-awq` | General classification | 300/min | + +**Output**: Label + confidence score + +--- + +## Automatic Speech Recognition + +| Model ID | Best For | Rate Limit | +|----------|----------|------------| +| `@cf/openai/whisper` | General transcription | 720/min | +| `@cf/openai/whisper-tiny-en` | English only, fast | 720/min | + +**Input**: Audio files (MP3, WAV, etc.) + +--- + +## Object Detection + +| Model ID | Task | Rate Limit | +|----------|------|------------| +| `@cf/facebook/detr-resnet-50` | Object detection | 3000/min | + +**Output**: Bounding boxes + labels + +--- + +## Image Classification + +| Model ID | Classes | Rate Limit | +|----------|---------|------------| +| `@cf/microsoft/resnet-50` | 1000 ImageNet classes | 3000/min | + +**Output**: Top-5 predictions with probabilities + +--- + +## Summarization + +| Model ID | Best For | Rate Limit | +|----------|----------|------------| +| `@cf/facebook/bart-large-cnn` | News articles, documents | 1500/min | + +--- + +## Image-to-Image (Legacy) + +| Model ID | Type | Rate Limit | +|----------|------|------------| +| `@cf/stabilityai/stable-diffusion-v1-5-img2img` | Image-to-Image | 1500/min | + +--- + +## Model Selection Guide + +### For Text Generation + +**Speed Priority:** +1. `@cf/qwen/qwen1.5-0.5b-chat` (1500/min) +2. `@cf/meta/llama-3.2-1b-instruct` (300/min) +3. `@cf/tinyllama/tinyllama-1.1b-chat-v1.0` (720/min) + +**Quality Priority:** +1. `@cf/qwen/qwen1.5-14b-chat-awq` (150/min) +2. `@cf/deepseek-ai/deepseek-r1-distill-qwen-32b` (300/min) +3. `@cf/meta/llama-3.1-8b-instruct` (300/min) + +**Balanced:** +1. `@cf/meta/llama-3.1-8b-instruct` (300/min) +2. `@hf/thebloke/mistral-7b-instruct-v0.1-awq` (400/min) +3.
`@cf/qwen/qwen1.5-7b-chat-awq` (300/min) + +### For Embeddings + +**General Purpose RAG:** +- `@cf/baai/bge-base-en-v1.5` (768 dims, 3000/min) + +**High Accuracy:** +- `@cf/baai/bge-large-en-v1.5` (1024 dims, 1500/min) + +**Fast/Low Storage:** +- `@cf/baai/bge-small-en-v1.5` (384 dims, 3000/min) + +### For Image Generation + +**Best Quality:** +- `@cf/black-forest-labs/flux-1-schnell` + +**General Purpose:** +- `@cf/stabilityai/stable-diffusion-xl-base-1.0` + +**Artistic/Stylized:** +- `@cf/lykon/dreamshaper-8-lcm` + +**Fast:** +- `@cf/bytedance/stable-diffusion-xl-lightning` + +--- + +## Rate Limits Summary + +| Task Type | Default Limit | High-Speed Models | +|-----------|---------------|-------------------| +| Text Generation | 300/min | 400-1500/min | +| Text Embeddings | 3000/min | 1500/min (large) | +| Image Generation | 720/min | 720/min | +| Vision Models | 720/min | 720/min | +| Translation | 720/min | 720/min | +| Classification | 2000/min | 2000/min | +| Speech Recognition | 720/min | 720/min | +| Object Detection | 3000/min | 3000/min | + +--- + +## Pricing (Neurons) + +Pricing varies by model. Common examples: + +| Model | Input (1M tokens) | Output (1M tokens) | +|-------|-------------------|-------------------| +| Llama 3.2 1B | $0.027 | $0.201 | +| Llama 3.1 8B | $0.088 | $0.606 | +| BGE-base embeddings | $0.005 | N/A | +| Flux image gen | ~$0.011/image | N/A | + +**Free Tier**: 10,000 neurons/day +**Paid Tier**: $0.011 per 1,000 neurons + +--- + +## References + +- [Official Models Catalog](https://developers.cloudflare.com/workers-ai/models/) +- [Rate Limits](https://developers.cloudflare.com/workers-ai/platform/limits/) +- [Pricing](https://developers.cloudflare.com/workers-ai/platform/pricing/) diff --git a/templates/ai-embeddings-rag.ts b/templates/ai-embeddings-rag.ts new file mode 100644 index 0000000..69e565d --- /dev/null +++ b/templates/ai-embeddings-rag.ts @@ -0,0 +1,491 @@ +/** + * Cloudflare Workers AI - Embeddings & RAG Examples + * + * This template demonstrates: + * - Generating text embeddings with BGE models + * - Storing embeddings in Vectorize + * - Semantic search with vector similarity + * - Complete RAG (Retrieval Augmented Generation) pattern + * - Document chunking strategies + */ + +import { Hono } from 'hono'; + +type Bindings = { + AI: Ai; + VECTORIZE: Vectorize; + DB?: D1Database; +}; + +const app = new Hono<{ Bindings: Bindings }>(); + +// ============================================================================ +// Generate Embeddings +// ============================================================================ + +/** + * Generate embeddings for text + * BGE-base: 768 dimensions, good balance + * BGE-large: 1024 dimensions, higher accuracy + * BGE-small: 384 dimensions, faster/smaller + */ + +app.post('/embeddings', async (c) => { + try { + const { text } = await c.req.json<{ text: string | string[] }>(); + + const embeddings = await c.env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: Array.isArray(text) ? 
text : [text], + }); + + return c.json({ + success: true, + shape: embeddings.shape, // [batch_size, dimensions] + data: embeddings.data, // Array of vectors + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Batch Embeddings +// ============================================================================ + +/** + * Generate embeddings for multiple texts in one request + * More efficient than individual requests + */ + +app.post('/embeddings/batch', async (c) => { + try { + const { texts } = await c.req.json<{ texts: string[] }>(); + + if (!texts || texts.length === 0) { + return c.json({ error: 'texts array is required' }, 400); + } + + const embeddings = await c.env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: texts, + }); + + return c.json({ + success: true, + count: texts.length, + shape: embeddings.shape, + embeddings: embeddings.data, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Store Embeddings in Vectorize +// ============================================================================ + +app.post('/documents', async (c) => { + try { + const { id, text, metadata } = await c.req.json<{ + id: string; + text: string; + metadata?: Record; + }>(); + + // Generate embedding + const embeddings = await c.env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: [text], + }); + + const vector = embeddings.data[0]; + + // Store in Vectorize + await c.env.VECTORIZE.upsert([ + { + id, + values: vector, + metadata: { + text, + ...metadata, + createdAt: Date.now(), + }, + }, + ]); + + return c.json({ + success: true, + message: 'Document indexed', + id, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Semantic Search +// ============================================================================ + +app.post('/search', async (c) => { + try { + const { query, topK = 5 } = await c.req.json<{ + query: string; + topK?: number; + }>(); + + // Convert query to embedding + const embeddings = await c.env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: [query], + }); + + const vector = embeddings.data[0]; + + // Search Vectorize + const results = await c.env.VECTORIZE.query(vector, { + topK, + returnMetadata: true, + }); + + return c.json({ + success: true, + query, + results: results.matches.map((match) => ({ + id: match.id, + score: match.score, + text: match.metadata?.text, + metadata: match.metadata, + })), + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// RAG Pattern: Query with Context +// ============================================================================ + +app.post('/rag/ask', async (c) => { + try { + const { question, topK = 3 } = await c.req.json<{ + question: string; + topK?: number; + }>(); + + // Step 1: Convert question to embedding + const embeddings = await c.env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: [question], + }); + + const vector = embeddings.data[0]; + + // Step 2: Find relevant documents + const results = await 
c.env.VECTORIZE.query(vector, { + topK, + returnMetadata: true, + }); + + // Step 3: Build context from matches + const context = results.matches + .map((match) => match.metadata?.text) + .filter(Boolean) + .join('\n\n'); + + // Step 4: Generate answer with context + const stream = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', { + messages: [ + { + role: 'system', + content: `Answer the question using ONLY the following context. If the context doesn't contain relevant information, say "I don't have enough information to answer that."\n\nContext:\n${context}`, + }, + { + role: 'user', + content: question, + }, + ], + stream: true, + }); + + return new Response(stream, { + headers: { + 'content-type': 'text/event-stream', + 'x-sources': JSON.stringify( + results.matches.map((m) => ({ id: m.id, score: m.score })) + ), + }, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Document Chunking +// ============================================================================ + +/** + * Split long documents into chunks for better embedding quality + * Recommended: 200-500 tokens per chunk + */ + +function chunkText(text: string, chunkSize = 500, overlap = 50): string[] { + const words = text.split(/\s+/); + const chunks: string[] = []; + + for (let i = 0; i < words.length; i += chunkSize - overlap) { + const chunk = words.slice(i, i + chunkSize).join(' '); + chunks.push(chunk); + } + + return chunks; +} + +app.post('/documents/long', async (c) => { + try { + const { id, text, chunkSize = 500 } = await c.req.json<{ + id: string; + text: string; + chunkSize?: number; + }>(); + + // Split into chunks + const chunks = chunkText(text, chunkSize); + + // Generate embeddings for all chunks + const embeddings = await c.env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: chunks, + }); + + // Store each chunk in Vectorize + const vectors = chunks.map((chunk, index) => ({ + id: `${id}-chunk-${index}`, + values: embeddings.data[index], + metadata: { + documentId: id, + chunkIndex: index, + text: chunk, + totalChunks: chunks.length, + }, + })); + + await c.env.VECTORIZE.upsert(vectors); + + return c.json({ + success: true, + message: 'Document indexed with chunks', + documentId: id, + chunks: chunks.length, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// RAG with Citations +// ============================================================================ + +app.post('/rag/ask-with-citations', async (c) => { + try { + const { question } = await c.req.json<{ question: string }>(); + + // Find relevant chunks + const embeddings = await c.env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: [question], + }); + + const results = await c.env.VECTORIZE.query(embeddings.data[0], { + topK: 5, + returnMetadata: true, + }); + + // Build context with citations + const context = results.matches + .map( + (match, i) => + `[Source ${i + 1}] ${match.metadata?.text} (Relevance: ${(match.score * 100).toFixed(1)}%)` + ) + .join('\n\n'); + + // Generate answer + const response = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', { + messages: [ + { + role: 'system', + content: `Answer the question using the provided sources. Cite sources using [Source N] format. 
+ +${context}`, + }, + { + role: 'user', + content: question, + }, + ], + }); + + return c.json({ + success: true, + answer: response.response, + sources: results.matches.map((m, i) => ({ + id: i + 1, + documentId: m.metadata?.documentId, + text: m.metadata?.text, + score: m.score, + })), + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Hybrid Search (Keyword + Semantic) +// ============================================================================ + +/** + * Combine keyword search (D1) with semantic search (Vectorize) + * for better recall + */ + +app.post('/search/hybrid', async (c) => { + try { + const { query } = await c.req.json<{ query: string }>(); + + // Semantic search + const embeddings = await c.env.AI.run('@cf/baai/bge-base-en-v1.5', { + text: [query], + }); + + const vectorResults = await c.env.VECTORIZE.query(embeddings.data[0], { + topK: 5, + returnMetadata: true, + }); + + // Keyword search (if D1 available) + let keywordResults: any[] = []; + if (c.env.DB) { + const { results } = await c.env.DB.prepare( + 'SELECT id, text FROM documents WHERE text LIKE ? LIMIT 5' + ) + .bind(`%${query}%`) + .all(); + + keywordResults = results || []; + } + + // Combine and deduplicate + const combined = [ + ...vectorResults.matches.map((m) => ({ + id: m.id, + text: m.metadata?.text, + score: m.score, + source: 'vector', + })), + ...keywordResults.map((r) => ({ + id: r.id, + text: r.text, + score: 1.0, + source: 'keyword', + })), + ]; + + // Deduplicate by ID + const unique = Array.from(new Map(combined.map((item) => [item.id, item])).values()); + + return c.json({ + success: true, + query, + results: unique, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Delete Documents +// ============================================================================ + +app.delete('/documents/:id', async (c) => { + try { + const id = c.req.param('id'); + + // Delete from Vectorize + await c.env.VECTORIZE.deleteByIds([id]); + + return c.json({ + success: true, + message: 'Document deleted', + id, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Health Check +// ============================================================================ + +app.get('/health', (c) => { + return c.json({ + status: 'ok', + timestamp: new Date().toISOString(), + }); +}); + +export default app; diff --git a/templates/ai-gateway-integration.ts b/templates/ai-gateway-integration.ts new file mode 100644 index 0000000..02db4c8 --- /dev/null +++ b/templates/ai-gateway-integration.ts @@ -0,0 +1,432 @@ +/** + * Cloudflare AI Gateway - Integration Examples + * + * This template demonstrates: + * - AI Gateway setup and configuration + * - Caching AI responses + * - Logging and analytics + * - Cost tracking + * - Rate limiting + * - Feedback collection + */ + +import { Hono } from 'hono'; + +type Bindings = { + AI: Ai; +}; + +const app = new Hono<{ Bindings: Bindings }>(); + +// ============================================================================ +// Basic AI Gateway Usage +// 
============================================================================ + +/** + * Create a gateway at: https://dash.cloudflare.com/ai/ai-gateway + * Use the gateway ID in your requests + */ + +app.post('/gateway/basic', async (c) => { + try { + const { prompt } = await c.req.json<{ prompt: string }>(); + + const response = await c.env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { + messages: [{ role: 'user', content: prompt }], + }, + { + gateway: { + id: 'my-gateway', // Your gateway ID + }, + } + ); + + // Access log ID for analytics + const logId = c.env.AI.aiGatewayLogId; + + return c.json({ + success: true, + response: response.response, + logId, // Use for tracking and feedback + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// AI Gateway with Caching +// ============================================================================ + +/** + * AI Gateway can cache responses to reduce costs + * Same prompt = cached response (no inference cost) + */ + +app.post('/gateway/cached', async (c) => { + try { + const { prompt } = await c.req.json<{ prompt: string }>(); + + const response = await c.env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { + messages: [{ role: 'user', content: prompt }], + }, + { + gateway: { + id: 'my-gateway', + skipCache: false, // Use cache (default) + }, + } + ); + + return c.json({ + success: true, + response: response.response, + logId: c.env.AI.aiGatewayLogId, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Skip Cache for Dynamic Content +// ============================================================================ + +app.post('/gateway/no-cache', async (c) => { + try { + const { prompt } = await c.req.json<{ prompt: string }>(); + + const response = await c.env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { + messages: [{ role: 'user', content: prompt }], + }, + { + gateway: { + id: 'my-gateway', + skipCache: true, // Always fetch fresh response + }, + } + ); + + return c.json({ + success: true, + response: response.response, + logId: c.env.AI.aiGatewayLogId, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Send Feedback to AI Gateway +// ============================================================================ + +/** + * Track user satisfaction with AI responses + * Helps optimize prompts and model selection + */ + +app.post('/gateway/feedback', async (c) => { + try { + const { logId, rating, comment } = await c.req.json<{ + logId: string; + rating: number; // 1-5 + comment?: string; + }>(); + + const gateway = c.env.AI.gateway('my-gateway'); + + await gateway.patchLog(logId, { + feedback: { + rating, + comment, + }, + }); + + return c.json({ + success: true, + message: 'Feedback recorded', + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Track Cost Per Request +// ============================================================================ + +/** + * Monitor neurons usage per request + * AI Gateway logs 
show cost breakdown + */ + +app.post('/gateway/track-cost', async (c) => { + try { + const { prompt } = await c.req.json<{ prompt: string }>(); + + const start = Date.now(); + + const response = await c.env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { + messages: [{ role: 'user', content: prompt }], + }, + { + gateway: { + id: 'my-gateway', + }, + } + ); + + const duration = Date.now() - start; + const logId = c.env.AI.aiGatewayLogId; + + return c.json({ + success: true, + response: response.response, + metrics: { + logId, + duration, + // Check AI Gateway dashboard for neurons usage + }, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Multi-Model Gateway +// ============================================================================ + +/** + * Use different models through the same gateway + * Compare performance and costs + */ + +app.post('/gateway/multi-model', async (c) => { + try { + const { prompt, model = '@cf/meta/llama-3.1-8b-instruct' } = await c.req.json<{ + prompt: string; + model?: string; + }>(); + + const response = await c.env.AI.run( + model, + { + messages: [{ role: 'user', content: prompt }], + }, + { + gateway: { + id: 'my-gateway', + }, + } + ); + + return c.json({ + success: true, + model, + response: response.response, + logId: c.env.AI.aiGatewayLogId, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Streaming with AI Gateway +// ============================================================================ + +app.post('/gateway/stream', async (c) => { + try { + const { prompt } = await c.req.json<{ prompt: string }>(); + + const stream = await c.env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { + messages: [{ role: 'user', content: prompt }], + stream: true, + }, + { + gateway: { + id: 'my-gateway', + }, + } + ); + + // Log ID available after streaming starts + const logId = c.env.AI.aiGatewayLogId; + + return new Response(stream, { + headers: { + 'content-type': 'text/event-stream', + 'x-ai-gateway-log-id': logId || '', + }, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Request Analytics Middleware +// ============================================================================ + +/** + * Log all AI requests for analytics + */ + +app.use('/ai/*', async (c, next) => { + const start = Date.now(); + const path = c.req.path; + + await next(); + + const duration = Date.now() - start; + const logId = c.env.AI.aiGatewayLogId; + + // Log to console (or send to analytics service) + console.log({ + timestamp: new Date().toISOString(), + path, + duration, + logId, + status: c.res.status, + }); +}); + +app.post('/ai/chat', async (c) => { + const { prompt } = await c.req.json<{ prompt: string }>(); + + const response = await c.env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { + messages: [{ role: 'user', content: prompt }], + }, + { + gateway: { id: 'my-gateway' }, + } + ); + + return c.json({ + success: true, + response: response.response, + }); +}); + +// ============================================================================ +// Rate Limit Protection +// 
============================================================================ + +/** + * AI Gateway provides additional rate limiting + * Configure in dashboard: https://dash.cloudflare.com/ai/ai-gateway + */ + +app.post('/gateway/rate-limited', async (c) => { + try { + const { prompt } = await c.req.json<{ prompt: string }>(); + + const response = await c.env.AI.run( + '@cf/meta/llama-3.1-8b-instruct', + { + messages: [{ role: 'user', content: prompt }], + }, + { + gateway: { + id: 'my-gateway', + }, + } + ); + + return c.json({ + success: true, + response: response.response, + }); + } catch (error) { + const message = (error as Error).message; + + // Check for rate limit error + if (message.includes('429') || message.includes('rate limit')) { + return c.json( + { + success: false, + error: 'Rate limit exceeded. Please try again later.', + retryAfter: 60, // seconds + }, + 429 + ); + } + + return c.json( + { + success: false, + error: message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Health Check +// ============================================================================ + +app.get('/health', (c) => { + return c.json({ + status: 'ok', + timestamp: new Date().toISOString(), + }); +}); + +export default app; diff --git a/templates/ai-image-generation.ts b/templates/ai-image-generation.ts new file mode 100644 index 0000000..9d37155 --- /dev/null +++ b/templates/ai-image-generation.ts @@ -0,0 +1,391 @@ +/** + * Cloudflare Workers AI - Image Generation Examples + * + * This template demonstrates: + * - Text-to-image with Flux models (highest quality) + * - Stable Diffusion XL + * - Image storage in R2 + * - Base64 and binary responses + * - Custom prompts and parameters + */ + +import { Hono } from 'hono'; + +type Bindings = { + AI: Ai; + BUCKET?: R2Bucket; +}; + +const app = new Hono<{ Bindings: Bindings }>(); + +// ============================================================================ +// Flux - Text-to-Image (Highest Quality) +// ============================================================================ + +/** + * Flux 1 Schnell - Fast, high-quality image generation + * Best for: Photorealistic images, detailed artwork + * Rate limit: 720/min + */ + +app.post('/generate/flux', async (c) => { + try { + const { prompt } = await c.req.json<{ prompt: string }>(); + + const imageStream = await c.env.AI.run('@cf/black-forest-labs/flux-1-schnell', { + prompt, + }); + + return new Response(imageStream, { + headers: { 'content-type': 'image/png' }, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// ============================================================================ +// Stable Diffusion XL +// ============================================================================ + +app.post('/generate/sdxl', async (c) => { + try { + const { prompt, num_steps = 20, guidance = 7.5 } = await c.req.json<{ + prompt: string; + num_steps?: number; + guidance?: number; + }>(); + + const imageStream = await c.env.AI.run( + '@cf/stabilityai/stable-diffusion-xl-base-1.0', + { + prompt, + num_steps, // More steps = higher quality, slower + guidance, // CFG scale: higher = more prompt adherence + } + ); + + return new Response(imageStream, { + headers: { 'content-type': 'image/png' }, + }); + } catch (error) { + return c.json( + { + success: false, + error: (error as Error).message, + }, + 500 + ); + } +}); + +// 
// ============================================================================
// DreamShaper (Artistic/Stylized)
// ============================================================================

app.post('/generate/dreamshaper', async (c) => {
  try {
    const { prompt } = await c.req.json<{ prompt: string }>();

    const imageStream = await c.env.AI.run('@cf/lykon/dreamshaper-8-lcm', {
      prompt,
    });

    return new Response(imageStream, {
      headers: { 'content-type': 'image/png' },
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Generate and Store in R2
// ============================================================================

app.post('/generate/save', async (c) => {
  try {
    const { prompt, filename } = await c.req.json<{
      prompt: string;
      filename?: string;
    }>();

    if (!c.env.BUCKET) {
      return c.json({ error: 'R2 bucket not configured' }, 500);
    }

    // Generate image (flux returns base64 JSON, so decode before storing)
    const result = await c.env.AI.run('@cf/black-forest-labs/flux-1-schnell', {
      prompt,
    });

    const imageBytes = Uint8Array.from(atob(result.image), (char) => char.charCodeAt(0));

    // Generate filename
    const key = filename || `images/${Date.now()}.png`;

    // Store in R2
    await c.env.BUCKET.put(key, imageBytes, {
      httpMetadata: {
        contentType: 'image/png',
      },
      customMetadata: {
        prompt,
        generatedAt: new Date().toISOString(),
      },
    });

    return c.json({
      success: true,
      message: 'Image generated and saved',
      key,
      url: `https://your-domain.com/${key}`, // Update with your R2 public URL
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Return Base64 Encoded Image
// ============================================================================

app.post('/generate/base64', async (c) => {
  try {
    const { prompt } = await c.req.json<{ prompt: string }>();

    // flux-1-schnell already returns base64, so no re-encoding is needed
    const result = await c.env.AI.run('@cf/black-forest-labs/flux-1-schnell', {
      prompt,
    });

    return c.json({
      success: true,
      image: `data:image/png;base64,${result.image}`,
      prompt,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Image-to-Image (Stable Diffusion)
// ============================================================================

/**
 * Transform existing images based on prompts
 * Requires base64-encoded input image
 */

app.post('/generate/img2img', async (c) => {
  try {
    const { prompt, image, strength = 0.8 } = await c.req.json<{
      prompt: string;
      image: string; // Base64 encoded
      strength?: number; // 0.0-1.0, higher = more transformation
    }>();

    // Decode base64 image to array (renamed callback param to avoid shadowing Hono's `c`)
    const imageData = Uint8Array.from(atob(image.replace(/^data:image\/\w+;base64,/, '')), (char) =>
      char.charCodeAt(0)
    );

    const result = await c.env.AI.run('@cf/runwayml/stable-diffusion-v1-5-img2img', {
      prompt,
      image: Array.from(imageData),
      strength,
    });

    return new Response(result, {
      headers: { 'content-type': 'image/png' },
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
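// ============================================================================
// Serve a Stored Image from R2 (sketch)
// ============================================================================

/**
 * Companion to /generate/save above: stream a stored image back out of R2.
 * A minimal sketch assuming keys follow the images/<timestamp>.png
 * convention used by that endpoint.
 */

app.get('/images/:key{.+}', async (c) => {
  if (!c.env.BUCKET) {
    return c.json({ error: 'R2 bucket not configured' }, 500);
  }

  const object = await c.env.BUCKET.get(`images/${c.req.param('key')}`);
  if (!object) {
    return c.json({ error: 'Image not found' }, 404);
  }

  // Stream the object body directly; generated images never change, so
  // they can be cached aggressively
  return new Response(object.body, {
    headers: {
      'content-type': object.httpMetadata?.contentType ?? 'image/png',
      'cache-control': 'public, max-age=31536000, immutable',
    },
  });
});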
// ============================================================================
// Batch Generation
// ============================================================================

app.post('/generate/batch', async (c) => {
  try {
    const { prompts } = await c.req.json<{ prompts: string[] }>();

    if (!prompts || prompts.length === 0) {
      return c.json({ error: 'prompts array is required' }, 400);
    }

    if (prompts.length > 5) {
      return c.json({ error: 'Maximum 5 prompts per batch' }, 400);
    }

    const images = await Promise.all(
      prompts.map(async (prompt) => {
        // flux returns base64 directly, so the data URL needs no re-encoding
        const result = await c.env.AI.run('@cf/black-forest-labs/flux-1-schnell', {
          prompt,
        });

        return {
          prompt,
          image: `data:image/png;base64,${result.image}`,
        };
      })
    );

    return c.json({
      success: true,
      count: images.length,
      images,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
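// ============================================================================
// Batch with Partial Failures (sketch)
// ============================================================================

/**
 * Promise.all above rejects the whole batch if any single prompt fails.
 * A sketch using Promise.allSettled so successful generations still come
 * back; the /generate/batch-settled route name is illustrative.
 */

app.post('/generate/batch-settled', async (c) => {
  try {
    const { prompts } = await c.req.json<{ prompts: string[] }>();

    const settled = await Promise.allSettled(
      (prompts ?? []).slice(0, 5).map(async (prompt) => {
        const result = await c.env.AI.run('@cf/black-forest-labs/flux-1-schnell', { prompt });
        return { prompt, image: `data:image/png;base64,${result.image}` };
      })
    );

    // flatMap with a ternary narrows the discriminated union per element
    const images = settled.flatMap((s) => (s.status === 'fulfilled' ? [s.value] : []));
    const failures = settled.flatMap((s) => (s.status === 'rejected' ? [String(s.reason)] : []));

    return c.json({
      success: true,
      count: images.length,
      images,
      failures,
    });
  } catch (error) {
    return c.json({ success: false, error: (error as Error).message }, 500);
  }
});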
// ============================================================================
// Prompt Enhancement
// ============================================================================

/**
 * Use LLM to enhance user prompts for better image quality
 */

app.post('/generate/enhanced', async (c) => {
  try {
    const { userPrompt } = await c.req.json<{ userPrompt: string }>();

    // Step 1: Enhance prompt with LLM
    const enhancement = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [
        {
          role: 'system',
          content:
            'You are a Stable Diffusion prompt expert. Enhance the user prompt for image generation. Add details about style, lighting, quality, composition. Return ONLY the enhanced prompt, no explanations.',
        },
        {
          role: 'user',
          content: userPrompt,
        },
      ],
    });

    const enhancedPrompt = enhancement.response.trim();

    // Step 2: Generate image with enhanced prompt (flux returns base64 JSON)
    const result = await c.env.AI.run('@cf/black-forest-labs/flux-1-schnell', {
      prompt: enhancedPrompt,
    });

    return c.json({
      success: true,
      originalPrompt: userPrompt,
      enhancedPrompt,
      image: `data:image/png;base64,${result.image}`,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// List Generated Images (from R2)
// ============================================================================

app.get('/images', async (c) => {
  try {
    if (!c.env.BUCKET) {
      return c.json({ error: 'R2 bucket not configured' }, 500);
    }

    const listed = await c.env.BUCKET.list({
      prefix: 'images/',
      limit: 100,
    });

    const images = listed.objects.map((obj) => ({
      key: obj.key,
      size: obj.size,
      uploaded: obj.uploaded,
      url: `https://your-domain.com/${obj.key}`,
    }));

    return c.json({
      success: true,
      count: images.length,
      images,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Health Check
// ============================================================================

app.get('/health', (c) => {
  return c.json({
    status: 'ok',
    timestamp: new Date().toISOString(),
  });
});

export default app;
diff --git a/templates/ai-text-generation.ts b/templates/ai-text-generation.ts
new file mode 100644
index 0000000..0429c15
--- /dev/null
+++ b/templates/ai-text-generation.ts
@@ -0,0 +1,437 @@
/**
 * Cloudflare Workers AI - Text Generation Examples
 *
 * This template demonstrates:
 * - Basic text generation (prompt and messages)
 * - Streaming responses (RECOMMENDED for production)
 * - Chat completions with conversation history
 * - Structured output with JSON
 * - Error handling and retry logic
 * - Rate limit management
 */

import { Hono } from 'hono';

type Bindings = {
  AI: Ai;
};

const app = new Hono<{ Bindings: Bindings }>();

// ============================================================================
// Basic Text Generation
// ============================================================================

// Simple prompt (legacy pattern; prefer the messages format below)
app.post('/simple', async (c) => {
  try {
    const { prompt } = await c.req.json<{ prompt: string }>();

    const response = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      prompt,
    });

    return c.json({
      success: true,
      response: response.response,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
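// ============================================================================
// max_tokens Default (sketch)
// ============================================================================

/**
 * Since the April 2025 breaking change, max_tokens defaults to 256 and is
 * actually enforced, so longer answers get cut off unless you raise it
 * explicitly. A minimal sketch; the /simple/long-form route name is
 * illustrative.
 */

app.post('/simple/long-form', async (c) => {
  try {
    const { prompt } = await c.req.json<{ prompt: string }>();

    const response = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [{ role: 'user', content: prompt }],
      max_tokens: 2048, // default is 256; set this explicitly for long-form output
    });

    return c.json({
      success: true,
      response: response.response,
    });
  } catch (error) {
    return c.json({ success: false, error: (error as Error).message }, 500);
  }
});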
// ============================================================================
// Streaming Text Generation (RECOMMENDED)
// ============================================================================

/**
 * Streaming is ESSENTIAL for production:
 * - Prevents buffering large responses in memory
 * - Faster time-to-first-token
 * - Better user experience
 * - Avoids Worker timeout issues
 */

app.post('/stream', async (c) => {
  try {
    const { prompt } = await c.req.json<{ prompt: string }>();

    const stream = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [{ role: 'user', content: prompt }],
      stream: true, // Enable streaming
    });

    return new Response(stream, {
      headers: {
        'content-type': 'text/event-stream',
        'cache-control': 'no-cache',
        connection: 'keep-alive',
      },
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Chat Completions with History
// ============================================================================

app.post('/chat', async (c) => {
  try {
    const { messages } = await c.req.json<{
      messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>;
    }>();

    // Validate messages
    if (!messages || messages.length === 0) {
      return c.json({ error: 'Messages array is required' }, 400);
    }

    const stream = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages,
      stream: true,
      max_tokens: 512, // Limit response length
    });

    return new Response(stream, {
      headers: { 'content-type': 'text/event-stream' },
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Streaming with Custom Parameters
// ============================================================================

app.post('/stream/custom', async (c) => {
  try {
    const { prompt, temperature = 0.7, max_tokens = 512 } = await c.req.json<{
      prompt: string;
      temperature?: number;
      max_tokens?: number;
    }>();

    const stream = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [
        {
          role: 'system',
          content: 'You are a helpful AI assistant.',
        },
        {
          role: 'user',
          content: prompt,
        },
      ],
      stream: true,
      max_tokens,
      temperature, // Controls randomness (0.0-1.0)
    });

    return new Response(stream, {
      headers: { 'content-type': 'text/event-stream' },
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
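// ============================================================================
// Accumulating a Stream Server-Side (sketch)
// ============================================================================

/**
 * Sometimes you want the full text server-side (logging, moderation) while
 * still streaming to the client. A hedged sketch that tees the stream; it
 * assumes the "data: {...}" SSE format with a `response` text field that
 * Workers AI emits today, and the /stream/tee route name is illustrative.
 */

app.post('/stream/tee', async (c) => {
  const { prompt } = await c.req.json<{ prompt: string }>();

  const stream = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
    messages: [{ role: 'user', content: prompt }],
    stream: true,
  });

  const [toClient, toLogger] = (stream as ReadableStream<Uint8Array>).tee();

  // Accumulate in the background without blocking the response
  c.executionCtx.waitUntil(
    (async () => {
      const reader = toLogger.getReader();
      const decoder = new TextDecoder();
      let buffered = '';
      let full = '';
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buffered += decoder.decode(value, { stream: true });
        const lines = buffered.split('\n');
        buffered = lines.pop() ?? ''; // keep any partial line for the next chunk
        for (const line of lines) {
          if (!line.startsWith('data: ') || line.includes('[DONE]')) continue;
          try {
            full += JSON.parse(line.slice(6)).response ?? '';
          } catch {
            // ignore malformed fragments
          }
        }
      }
      console.log('Full response:', full);
    })()
  );

  return new Response(toClient, {
    headers: { 'content-type': 'text/event-stream' },
  });
});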
// ============================================================================
// Structured Output (JSON)
// ============================================================================

/**
 * Generate structured JSON output
 * Useful for extracting data, generating schemas, etc.
 */

app.post('/structured', async (c) => {
  try {
    const { topic } = await c.req.json<{ topic: string }>();

    const response = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [
        {
          role: 'system',
          content:
            'You are a helpful assistant that ONLY returns valid JSON. Never include explanations or markdown, just raw JSON.',
        },
        {
          role: 'user',
          content: `Generate a recipe for ${topic}. Return JSON with keys: name, ingredients (array), instructions (array), prepTime (number in minutes)`,
        },
      ],
      max_tokens: 1024,
    });

    // Parse JSON response
    const data = JSON.parse(response.response);

    return c.json({
      success: true,
      data,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Model Comparison
// ============================================================================

/**
 * Compare different models side-by-side
 */

app.post('/compare', async (c) => {
  try {
    const { prompt } = await c.req.json<{ prompt: string }>();

    const models = [
      '@cf/meta/llama-3.1-8b-instruct', // Balanced
      '@cf/meta/llama-3.2-1b-instruct', // Fast
      '@cf/qwen/qwen1.5-14b-chat-awq', // High quality (older model; check the October 2025 deprecation list)
    ];

    const results = await Promise.all(
      models.map(async (model) => {
        const start = Date.now();
        const response = await c.env.AI.run(model, {
          messages: [{ role: 'user', content: prompt }],
          max_tokens: 256,
        });
        const duration = Date.now() - start;

        return {
          model,
          response: response.response,
          duration,
        };
      })
    );

    return c.json({
      success: true,
      results,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
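// ============================================================================
// Robust JSON Extraction (sketch)
// ============================================================================

/**
 * Models sometimes wrap JSON in markdown code fences despite instructions,
 * which makes the bare JSON.parse() in /structured above throw. A small
 * hedged helper (not part of the original template) that strips fences
 * before parsing.
 */

function extractJson<T = unknown>(raw: string): T {
  const cleaned = raw
    .trim()
    .replace(/^```(?:json)?\s*/i, '') // leading ```json fence, if any
    .replace(/\s*```$/, ''); // trailing fence
  return JSON.parse(cleaned) as T;
}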
// ============================================================================
// Error Handling with Retry
// ============================================================================

/**
 * Retry logic for rate limits and transient errors
 */

async function runWithRetry(
  ai: Ai,
  model: string,
  inputs: any,
  maxRetries = 3
): Promise<any> {
  let lastError: Error;

  for (let i = 0; i < maxRetries; i++) {
    try {
      return await ai.run(model, inputs);
    } catch (error) {
      lastError = error as Error;
      const message = lastError.message.toLowerCase();

      // Rate limit (429) - retry with exponential backoff
      if (message.includes('429') || message.includes('rate limit')) {
        if (i < maxRetries - 1) {
          const delay = Math.pow(2, i) * 1000; // 1s, 2s, 4s
          console.log(`Rate limited. Retrying in ${delay}ms...`);
          await new Promise((resolve) => setTimeout(resolve, delay));
          continue;
        }
      }

      // Model unavailable - try fallback model
      if (message.includes('model') && message.includes('unavailable')) {
        if (i === 0) {
          console.log('Model unavailable, trying fallback...');
          model = '@cf/meta/llama-3.2-1b-instruct'; // Faster fallback
          continue;
        }
      }

      // Other errors - throw immediately
      throw error;
    }
  }

  throw lastError!;
}

app.post('/reliable', async (c) => {
  try {
    const { prompt } = await c.req.json<{ prompt: string }>();

    const response = await runWithRetry(c.env.AI, '@cf/meta/llama-3.1-8b-instruct', {
      messages: [{ role: 'user', content: prompt }],
    });

    return c.json({
      success: true,
      response: response.response,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Token Length Validation
// ============================================================================

/**
 * Validate input length to prevent token limit errors
 * Approximate: 1 token ≈ 4 characters
 */

function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}

app.post('/validate', async (c) => {
  try {
    const { prompt } = await c.req.json<{ prompt: string }>();

    const estimatedTokens = estimateTokens(prompt);
    const maxInputTokens = 2048; // Conservative cap; model context windows range from 2K to 128K tokens

    if (estimatedTokens > maxInputTokens) {
      return c.json(
        {
          success: false,
          error: `Input too long: ${estimatedTokens} tokens (max: ${maxInputTokens})`,
        },
        400
      );
    }

    const response = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [{ role: 'user', content: prompt }],
    });

    return c.json({
      success: true,
      response: response.response,
      estimatedTokens,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
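// ============================================================================
// KV-Cached Responses (sketch)
// ============================================================================

/**
 * A hedged sketch of caching deterministic answers in KV, matching the
 * optional CACHE namespace in wrangler-ai-config.jsonc. Assumes you extend
 * Bindings with `CACHE?: KVNamespace`; the /cached route name is
 * illustrative.
 */

app.post('/cached', async (c) => {
  const { prompt } = await c.req.json<{ prompt: string }>();
  const cache = (c.env as Bindings & { CACHE?: KVNamespace }).CACHE;

  const cacheKey = `ai:${prompt}`;
  const cached = cache && (await cache.get(cacheKey));
  if (cached) {
    return c.json({ success: true, response: cached, cached: true });
  }

  const response = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
    messages: [{ role: 'user', content: prompt }],
  });

  if (cache) {
    // Keep answers for an hour; tune the TTL to how fresh responses must be
    await cache.put(cacheKey, response.response, { expirationTtl: 3600 });
  }

  return c.json({ success: true, response: response.response, cached: false });
});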
// ============================================================================
// System Prompts & Personas
// ============================================================================

const PERSONAS = {
  helpful: 'You are a helpful AI assistant.',
  concise: 'You are a concise AI assistant. Keep responses brief.',
  technical: 'You are a technical AI assistant. Provide detailed, accurate information.',
  creative: 'You are a creative AI assistant. Be imaginative and original.',
};

app.post('/persona/:persona', async (c) => {
  try {
    const persona = c.req.param('persona') as keyof typeof PERSONAS;
    const { prompt } = await c.req.json<{ prompt: string }>();

    if (!PERSONAS[persona]) {
      return c.json({ error: 'Invalid persona' }, 400);
    }

    const stream = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [
        { role: 'system', content: PERSONAS[persona] },
        { role: 'user', content: prompt },
      ],
      stream: true,
    });

    return new Response(stream, {
      headers: { 'content-type': 'text/event-stream' },
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Health Check
// ============================================================================

app.get('/health', (c) => {
  return c.json({
    status: 'ok',
    timestamp: new Date().toISOString(),
  });
});

export default app;
diff --git a/templates/ai-vision-models.ts b/templates/ai-vision-models.ts
new file mode 100644
index 0000000..c34ea55
--- /dev/null
+++ b/templates/ai-vision-models.ts
@@ -0,0 +1,417 @@
/**
 * Cloudflare Workers AI - Vision Models Examples
 *
 * This template demonstrates:
 * - Llama 3.2 11B Vision Instruct for image understanding
 * - Image captioning and description
 * - Visual question answering
 * - Base64 image encoding
 * - Combining vision + text prompts
 */

import { Hono } from 'hono';

type Bindings = {
  AI: Ai;
};

const app = new Hono<{ Bindings: Bindings }>();

// ============================================================================
// Image Understanding
// ============================================================================

/**
 * Llama 3.2 11B Vision Instruct
 * - Understands images and answers questions
 * - Accepts base64-encoded images
 * - Rate limit: 720/min
 */

app.post('/vision/understand', async (c) => {
  try {
    const { image, question = 'What is in this image?' } = await c.req.json<{
      image: string; // Base64 data URL or base64 string
      question?: string;
    }>();

    // Ensure image has proper data URL prefix
    const imageUrl = image.startsWith('data:')
      ? image
      : `data:image/png;base64,${image}`;

    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: question },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    return c.json({
      success: true,
      question,
      answer: response.response,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
// ============================================================================
// Image Captioning
// ============================================================================

app.post('/vision/caption', async (c) => {
  try {
    const { image } = await c.req.json<{ image: string }>();

    const imageUrl = image.startsWith('data:')
      ? image
      : `data:image/png;base64,${image}`;

    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            {
              type: 'text',
              text: 'Generate a detailed caption for this image. Describe what you see, including objects, people, setting, mood, and any notable details.',
            },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    return c.json({
      success: true,
      caption: response.response,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Visual Question Answering
// ============================================================================

app.post('/vision/qa', async (c) => {
  try {
    const { image, questions } = await c.req.json<{
      image: string;
      questions: string[];
    }>();

    if (!questions || questions.length === 0) {
      return c.json({ error: 'questions array is required' }, 400);
    }

    const imageUrl = image.startsWith('data:')
      ? image
      : `data:image/png;base64,${image}`;

    // Answer all questions
    const answers = await Promise.all(
      questions.map(async (question) => {
        const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
          messages: [
            {
              role: 'user',
              content: [
                { type: 'text', text: question },
                { type: 'image_url', image_url: { url: imageUrl } },
              ],
            },
          ],
        });

        return {
          question,
          answer: response.response,
        };
      })
    );

    return c.json({
      success: true,
      count: answers.length,
      results: answers,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Image Analysis (Structured Output)
// ============================================================================

app.post('/vision/analyze', async (c) => {
  try {
    const { image } = await c.req.json<{ image: string }>();

    const imageUrl = image.startsWith('data:')
      ? image
      : `data:image/png;base64,${image}`;

    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            {
              type: 'text',
              text: `Analyze this image and return a JSON object with:
- objects: array of objects detected
- scene: description of the setting
- mood: emotional tone
- colors: dominant colors
- text: any visible text

Return ONLY valid JSON, no explanations.`,
            },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    // Parse JSON response (fall back to raw text if parsing fails)
    try {
      const analysis = JSON.parse(response.response);
      return c.json({
        success: true,
        analysis,
      });
    } catch {
      return c.json({
        success: true,
        raw: response.response,
      });
    }
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
// ============================================================================
// Image Comparison
// ============================================================================

app.post('/vision/compare', async (c) => {
  try {
    const { image1, image2, question = 'What are the differences between these images?' } =
      await c.req.json<{
        image1: string;
        image2: string;
        question?: string;
      }>();

    const imageUrl1 = image1.startsWith('data:')
      ? image1
      : `data:image/png;base64,${image1}`;
    const imageUrl2 = image2.startsWith('data:')
      ? image2
      : `data:image/png;base64,${image2}`;

    // Analyze first image
    const analysis1 = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: 'Describe this image in detail.' },
            { type: 'image_url', image_url: { url: imageUrl1 } },
          ],
        },
      ],
    });

    // Analyze second image
    const analysis2 = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: 'Describe this image in detail.' },
            { type: 'image_url', image_url: { url: imageUrl2 } },
          ],
        },
      ],
    });

    // Compare using text generation
    const comparison = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [
        {
          role: 'user',
          content: `Compare these two images based on their descriptions:

Image 1: ${analysis1.response}

Image 2: ${analysis2.response}

Question: ${question}`,
        },
      ],
    });

    return c.json({
      success: true,
      image1Description: analysis1.response,
      image2Description: analysis2.response,
      comparison: comparison.response,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Image Upload from URL
// ============================================================================

/**
 * Fetch image from URL, convert to base64, and analyze
 */

/**
 * Convert bytes to base64 in chunks.
 * btoa(String.fromCharCode(...bytes)) overflows the call stack on large
 * images because every byte becomes a function argument.
 */
function bytesToBase64(bytes: Uint8Array): string {
  let binary = '';
  const chunkSize = 0x8000; // 32KB chunks stay well under argument limits
  for (let i = 0; i < bytes.length; i += chunkSize) {
    binary += String.fromCharCode(...bytes.subarray(i, i + chunkSize));
  }
  return btoa(binary);
}

app.post('/vision/url', async (c) => {
  try {
    const { url, question = 'What is in this image?' } = await c.req.json<{
      url: string;
      question?: string;
    }>();

    // Fetch image
    const imageResponse = await fetch(url);
    if (!imageResponse.ok) {
      return c.json({ error: 'Failed to fetch image' }, 400);
    }

    // Convert to base64 (chunked to handle large images)
    const imageBytes = await imageResponse.bytes();
    const base64 = bytesToBase64(imageBytes);
    const contentType = imageResponse.headers.get('content-type') || 'image/png';
    const imageUrl = `data:${contentType};base64,${base64}`;

    // Analyze image
    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: question },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    return c.json({
      success: true,
      sourceUrl: url,
      question,
      answer: response.response,
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});
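// ============================================================================
// Direct Binary Upload (sketch)
// ============================================================================

/**
 * A minimal sketch of accepting raw image bytes as the request body instead
 * of JSON-wrapped base64, which inflates payloads by roughly a third. The
 * /vision/upload route name is illustrative; it reuses bytesToBase64() above.
 */

app.post('/vision/upload', async (c) => {
  try {
    const question = c.req.query('question') ?? 'What is in this image?';
    const contentType = c.req.header('content-type') ?? 'image/png';

    // Read the raw body and re-encode as a data URL for the model
    const imageBytes = new Uint8Array(await c.req.arrayBuffer());
    const imageUrl = `data:${contentType};base64,${bytesToBase64(imageBytes)}`;

    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: question },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    return c.json({ success: true, question, answer: response.response });
  } catch (error) {
    return c.json({ success: false, error: (error as Error).message }, 500);
  }
});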
// ============================================================================
// Accessibility: Alt Text Generation
// ============================================================================

app.post('/vision/alt-text', async (c) => {
  try {
    const { image } = await c.req.json<{ image: string }>();

    const imageUrl = image.startsWith('data:')
      ? image
      : `data:image/png;base64,${image}`;

    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            {
              type: 'text',
              text: 'Generate a concise, descriptive alt text for this image for accessibility purposes. Keep it under 125 characters.',
            },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    return c.json({
      success: true,
      altText: response.response.trim(),
    });
  } catch (error) {
    return c.json(
      {
        success: false,
        error: (error as Error).message,
      },
      500
    );
  }
});

// ============================================================================
// Health Check
// ============================================================================

app.get('/health', (c) => {
  return c.json({
    status: 'ok',
    timestamp: new Date().toISOString(),
  });
});

export default app;
diff --git a/templates/wrangler-ai-config.jsonc b/templates/wrangler-ai-config.jsonc
new file mode 100644
index 0000000..a8dae36
--- /dev/null
+++ b/templates/wrangler-ai-config.jsonc
@@ -0,0 +1,138 @@
/**
 * Cloudflare Workers AI - Wrangler Configuration
 *
 * This configuration file sets up the Workers AI binding for your Worker.
 * Place this in your project root as wrangler.jsonc
 */

{
  "name": "my-ai-worker",
  "main": "src/index.ts",
  "compatibility_date": "2025-10-21",

  /**
   * AI Binding
   * Provides access to Workers AI models via env.AI
   */
  "ai": {
    "binding": "AI" // Available in your Worker as env.AI
  },

  /**
   * Optional: AI Gateway Integration
   * Provides caching, logging, and analytics for AI requests
   * Create a gateway at: https://dash.cloudflare.com/ai/ai-gateway
   */
  // Note: AI Gateway is configured per-request, not in wrangler.jsonc
  // Use the gateway option in env.AI.run():
  // env.AI.run(model, inputs, { gateway: { id: 'my-gateway' } })

  /**
   * Optional: Vectorize Binding (for RAG patterns)
   * Store and search vector embeddings
   */
  "vectorize": [
    {
      "binding": "VECTORIZE",
      "index_name": "my-embeddings-index"
    }
  ],

  /**
   * Optional: D1 Database (for RAG document storage)
   */
  "d1_databases": [
    {
      "binding": "DB",
      "database_name": "my-database",
      "database_id": "YOUR_DATABASE_ID"
    }
  ],

  /**
   * Optional: R2 Bucket (for image storage)
   */
  "r2_buckets": [
    {
      "binding": "BUCKET",
      "bucket_name": "ai-generated-images"
    }
  ],

  /**
   * Optional: KV Namespace (for caching AI responses)
   */
  "kv_namespaces": [
    {
      "binding": "CACHE",
      "id": "YOUR_KV_NAMESPACE_ID"
    }
  ],

  /**
   * Environment Variables
   * Store API keys and configuration
   */
  "vars": {
    "ENVIRONMENT": "production"
  },

  /**
   * Secrets (use: wrangler secret put SECRET_NAME)
   * - CLOUDFLARE_API_KEY
   * - CLOUDFLARE_ACCOUNT_ID
   */

  /**
   * Workers Configuration
   */
  "limits": {
    "cpu_ms": 30000 // 30 seconds (increase for long AI operations)
  },

  /**
   * Local Development
   * Run: npx wrangler dev
   */
  "dev": {
    "port": 8787
  }
}

/**
 * TypeScript Types
 *
 * Add to your src/index.ts:
 *
 * export interface Env {
 *   AI: Ai;
 *   VECTORIZE?: Vectorize;
 *   DB?: D1Database;
 *   BUCKET?: R2Bucket;
 *   CACHE?: KVNamespace;
 *   CLOUDFLARE_API_KEY?: string;
 *   CLOUDFLARE_ACCOUNT_ID?: string;
 * }
 */

/**
 * Usage Examples
 *
 * Basic AI inference:
 * const response = await env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
 *   prompt: 'Hello!',
 * });
 *
 * With AI Gateway:
 * const response = await env.AI.run(
 *   '@cf/meta/llama-3.1-8b-instruct',
 *   { prompt: 'Hello!' },
 *   { gateway: { id: 'my-gateway' } }
 * );
 *
 * Streaming:
 * const stream = await env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
 *   messages: [{ role: 'user', content: 'Hello!' }],
 *   stream: true,
 * });
 */
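/**
 * Consuming the stream (client-side sketch)
 *
 * Workers AI streams Server-Sent Events. A minimal browser-side reader,
 * assuming a POST /stream endpoint like the one in ai-text-generation.ts
 * (names here are illustrative, not part of this config):
 *
 * const res = await fetch('/stream', {
 *   method: 'POST',
 *   body: JSON.stringify({ prompt: 'Hello!' }),
 * });
 * const reader = res.body!.getReader();
 * const decoder = new TextDecoder();
 * while (true) {
 *   const { done, value } = await reader.read();
 *   if (done) break;
 *   // Chunks arrive as "data: {...}" SSE lines, ending with "data: [DONE]"
 *   console.log(decoder.decode(value, { stream: true }));
 * }
 */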