AI SDK Core - Production Patterns
Best practices for deploying AI SDK Core in production environments.
Performance Optimization
1. Streaming for Long-Form Content
Always use streaming for user-facing long-form content:
// ✅ GOOD: User-facing (better perceived performance)
app.post('/chat', async (req, res) => {
  const result = streamText({
    model: openai('gpt-4'),
    prompt: req.body.message,
  });
  // In Express, pipe the stream into res instead of returning a Response object
  result.pipeDataStreamToResponse(res);
});

// ❌ BAD: User waits for entire response
app.post('/chat', async (req, res) => {
  const result = await generateText({
    model: openai('gpt-4'),
    prompt: req.body.message,
  });
  return res.json({ response: result.text });
});

// ✅ GOOD: Background tasks (no user waiting)
async function processDocument(doc: string) {
  const result = await generateText({
    model: openai('gpt-4'),
    prompt: `Analyze: ${doc}`,
  });
  await saveToDatabase(result.text);
}
2. Set Appropriate maxOutputTokens
// ✅ GOOD: Limit token usage based on use case
const shortSummary = await generateText({
  model: openai('gpt-4'),
  prompt: 'Summarize in 2 sentences',
  maxOutputTokens: 100, // Prevents over-generation
});

const article = await generateText({
  model: openai('gpt-4'),
  prompt: 'Write article',
  maxOutputTokens: 2000, // Appropriate for long-form
});

// ❌ BAD: No limit (can waste tokens/money)
const unlimited = await generateText({
  model: openai('gpt-4'),
  prompt: 'Write something',
  // No maxOutputTokens
});
3. Cache Provider Instances
// ✅ GOOD: Reuse provider instances
const gpt4 = openai('gpt-4-turbo');
const claude = anthropic('claude-3-5-sonnet-20241022');

app.post('/chat', async (req, res) => {
  const result = await generateText({
    model: gpt4, // Reuse
    prompt: req.body.message,
  });
  return res.json({ response: result.text });
});

// ❌ BAD: Create new instance every time
app.post('/chat', async (req, res) => {
  const result = await generateText({
    model: openai('gpt-4-turbo'), // New instance each call
    prompt: req.body.message,
  });
  return res.json({ response: result.text });
});
4. Optimize Zod Schemas (Especially in Workers)
// ❌ BAD: Complex schema at top level (slow startup)
const ComplexSchema = z.object({
  // 50+ fields with deep nesting
});

// ✅ GOOD: Define schemas inside functions
function generateStructuredData() {
  const schema = z.object({
    // Schema definition here
  });
  return generateObject({ model: openai('gpt-4'), schema, prompt: '...' });
}

// ✅ GOOD: Split into smaller reusable schemas
const AddressSchema = z.object({ street: z.string(), city: z.string() });
const PersonSchema = z.object({ name: z.string(), address: AddressSchema });
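In a Worker, the lazy pattern can be combined with memoization so the schema is built once, on the first request, rather than at startup. A minimal sketch (the generateProfile function and its fields are illustrative):
import { z } from 'zod';
import { generateObject } from 'ai';
import { openai } from '@ai-sdk/openai';

// Memoize: schema is built on first call and reused afterwards
let profileSchema: z.ZodTypeAny | undefined;

function getProfileSchema() {
  if (!profileSchema) {
    profileSchema = z.object({
      name: z.string(),
      email: z.string().email(),
      address: z.object({ street: z.string(), city: z.string() }),
    });
  }
  return profileSchema;
}

async function generateProfile(prompt: string) {
  return generateObject({
    model: openai('gpt-4'),
    schema: getProfileSchema(),
    prompt,
  });
}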
Error Handling
1. Wrap All AI Calls in Try-Catch
import { generateText, APICallError, NoContentGeneratedError } from 'ai';
import { openai } from '@ai-sdk/openai';

async function generateSafely(prompt: string) {
  try {
    const result = await generateText({
      model: openai('gpt-4'),
      prompt,
    });
    return { success: true, data: result.text };
  } catch (error) {
    if (APICallError.isInstance(error)) {
      console.error('API call failed:', error.statusCode, error.message);
      return { success: false, error: 'AI service temporarily unavailable' };
    } else if (NoContentGeneratedError.isInstance(error)) {
      console.error('No content generated');
      return { success: false, error: 'Unable to generate response' };
    } else {
      console.error('Unknown error:', error);
      return { success: false, error: 'An error occurred' };
    }
  }
}
2. Handle Specific Error Types
import {
  generateText,
  APICallError,
  NoContentGeneratedError,
  RetryError,
} from 'ai';
import { openai } from '@ai-sdk/openai';
import { anthropic } from '@ai-sdk/anthropic';
import { google } from '@ai-sdk/google';

const wait = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

async function robustGeneration(prompt: string) {
  try {
    return await generateText({ model: openai('gpt-4'), prompt });
  } catch (error) {
    if (APICallError.isInstance(error)) {
      if (error.statusCode === 429) {
        // Rate limit - wait, then retry once
        await wait(5000);
        return generateText({ model: openai('gpt-4'), prompt });
      }
      if ((error.statusCode ?? 0) >= 500) {
        // Provider issue - try fallback
        return generateText({ model: anthropic('claude-3-5-sonnet-20241022'), prompt });
      }
    } else if (RetryError.isInstance(error)) {
      // All retries failed - use fallback provider
      return generateText({ model: google('gemini-2.5-pro'), prompt });
    } else if (NoContentGeneratedError.isInstance(error)) {
      // Content filtered - return safe message
      return { text: 'Unable to generate response for this input.' };
    }
    throw error;
  }
}
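The fallback idea above generalizes to an ordered list of models. A sketch under the same provider imports (the withFallback helper is illustrative, not an SDK API):
import { generateText, type LanguageModel } from 'ai';

// Try each model in order until one succeeds
async function withFallback(models: LanguageModel[], prompt: string) {
  let lastError: unknown;
  for (const model of models) {
    try {
      return await generateText({ model, prompt });
    } catch (error) {
      lastError = error;
      console.warn('Model failed, trying next fallback');
    }
  }
  throw lastError;
}

// Usage: primary first, fallbacks after
const answer = await withFallback(
  [openai('gpt-4'), anthropic('claude-3-5-sonnet-20241022')],
  'Explain vector databases'
);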
3. Implement Retry Logic
async function generateWithRetry(prompt: string, maxRetries = 3) {
  for (let i = 0; i < maxRetries; i++) {
    try {
      return await generateText({
        model: openai('gpt-4'),
        prompt,
        maxRetries: 2, // SDK-level retries, so each loop iteration may make up to 3 attempts
      });
    } catch (error) {
      if (i === maxRetries - 1) throw error; // Last attempt failed
      // Exponential backoff
      const delay = Math.pow(2, i) * 1000;
      console.log(`Retry ${i + 1}/${maxRetries} after ${delay}ms`);
      await new Promise(resolve => setTimeout(resolve, delay));
    }
  }
}
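When many clients back off on the same schedule, retries can arrive in synchronized bursts; adding jitter spreads them out. A small sketch of the common full-jitter variant, which could replace the fixed Math.pow(2, i) * 1000 delay above:
// Full jitter: random delay up to the exponential cap
function backoffDelay(attempt: number, baseMs = 1000, capMs = 30_000) {
  const exp = Math.min(capMs, baseMs * 2 ** attempt);
  return Math.random() * exp;
}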
4. Log Errors Properly
function logAIError(error: any, context: Record<string, any>) {
  const errorLog = {
    timestamp: new Date().toISOString(),
    type: error.constructor.name,
    message: error.message,
    statusCode: error.statusCode,
    responseBody: error.responseBody,
    context,
    stack: error.stack,
  };
  // Send to monitoring service (e.g., Sentry, Datadog)
  console.error('AI SDK Error:', JSON.stringify(errorLog));
  // Track metrics
  metrics.increment('ai.error', {
    type: error.constructor.name,
    statusCode: error.statusCode,
  });
}

try {
  const result = await generateText({ model: openai('gpt-4'), prompt });
} catch (error) {
  logAIError(error, { prompt, model: 'gpt-4' });
  throw error;
}
Cost Optimization
1. Choose Appropriate Models
// Model selection based on task complexity
async function generateWithCostOptimization(prompt: string, complexity: 'simple' | 'medium' | 'complex') {
  const models = {
    simple: openai('gpt-3.5-turbo'), // ~$0.50 / 1M input tokens
    medium: openai('gpt-4-turbo'), // ~$10 / 1M input tokens
    complex: openai('gpt-4'), // ~$30 / 1M input tokens
  };
  return generateText({
    model: models[complexity],
    prompt,
  });
}

// Usage
await generateWithCostOptimization('Translate to Spanish', 'simple');
await generateWithCostOptimization('Analyze sentiment', 'medium');
await generateWithCostOptimization('Complex reasoning task', 'complex');
2. Set Token Limits
// Prevent runaway costs
const result = await generateText({
  model: openai('gpt-4'),
  prompt: 'Write essay',
  maxOutputTokens: 500, // Hard limit
});

// Adjust limits per use case
const limits = {
  chatMessage: 200,
  summary: 300,
  article: 2000,
  analysis: 1000,
};
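As a sketch of wiring that map into calls so budgets live in one place (the generateFor helper is illustrative):
// Pick the token budget from the use case
async function generateFor(useCase: keyof typeof limits, prompt: string) {
  return generateText({
    model: openai('gpt-4'),
    prompt,
    maxOutputTokens: limits[useCase],
  });
}

const summary = await generateFor('summary', 'Summarize this report: ...');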
3. Cache Results
import { createHash } from 'node:crypto';
import { LRUCache } from 'lru-cache';

const cache = new LRUCache<string, string>({
  max: 1000, // Max 1000 items
  ttl: 1000 * 60 * 60, // 1 hour TTL
});

// Stable key for a prompt (avoids storing long prompts as keys)
const hash = (s: string) => createHash('sha256').update(s).digest('hex');

async function generateWithCache(prompt: string) {
  const cacheKey = `ai:${hash(prompt)}`;
  // Check cache
  const cached = cache.get(cacheKey);
  if (cached) {
    console.log('Cache hit');
    return { text: cached, cached: true };
  }
  // Generate
  const result = await generateText({
    model: openai('gpt-4'),
    prompt,
  });
  // Store in cache
  cache.set(cacheKey, result.text);
  return { text: result.text, cached: false };
}
4. Monitor Usage
// Track token usage
let totalTokensUsed = 0;
let totalCost = 0;
async function generateWithTracking(prompt: string) {
  const result = await generateText({
    model: openai('gpt-4'),
    prompt,
  });
  // Track tokens
  totalTokensUsed += result.usage.totalTokens;
  // Rough estimate at GPT-4's ~$30/1M input-token rate (output tokens cost more)
  const cost = (result.usage.totalTokens / 1_000_000) * 30;
  totalCost += cost;
  console.log(`Tokens: ${result.usage.totalTokens}, Cost: $${cost.toFixed(4)}`);
  console.log(`Total tokens: ${totalTokensUsed}, Total cost: $${totalCost.toFixed(2)}`);
  return result;
}
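If you run several models, a pricing map keeps estimates in one place. A sketch with input and output rates separated (the figures are approximate and go stale quickly; check your provider's current price list):
// Approximate $/1M tokens
const pricing: Record<string, { input: number; output: number }> = {
  'gpt-4': { input: 30, output: 60 },
  'gpt-4-turbo': { input: 10, output: 30 },
  'gpt-3.5-turbo': { input: 0.5, output: 1.5 },
};

function estimateCost(modelId: string, promptTokens: number, completionTokens: number) {
  const rate = pricing[modelId];
  if (!rate) return 0;
  return (promptTokens / 1_000_000) * rate.input + (completionTokens / 1_000_000) * rate.output;
}

const result = await generateText({ model: openai('gpt-4'), prompt: 'Hello' });
console.log('Estimated cost:', estimateCost('gpt-4', result.usage.promptTokens, result.usage.completionTokens).toFixed(4));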
Cloudflare Workers Best Practices
1. Lazy Initialization
// ✅ GOOD: Import inside handler
export default {
  async fetch(request, env) {
    const { generateText } = await import('ai');
    const { createWorkersAI } = await import('workers-ai-provider');
    const workersai = createWorkersAI({ binding: env.AI });
    const result = await generateText({
      model: workersai('@cf/meta/llama-3.1-8b-instruct'),
      prompt: 'Hello',
    });
    return new Response(result.text);
  },
};

// ❌ BAD: Top-level imports (startup overhead)
import { generateText } from 'ai';
const workersai = createWorkersAI({ binding: env.AI }); // Runs at startup!
2. Monitor Startup Time
# Wrangler reports startup time
npx wrangler deploy
# Output shows:
# Startup Time: 287ms (must be <400ms)
3. Handle Streaming Properly
app.post('/chat/stream', async (c) => {
  const workersai = createWorkersAI({ binding: c.env.AI });
  const stream = streamText({
    model: workersai('@cf/meta/llama-3.1-8b-instruct'),
    prompt: 'Hello',
  });
  // textStream is a ReadableStream that Workers can return directly
  return new Response(stream.textStream, {
    headers: {
      'Content-Type': 'text/plain; charset=utf-8',
      'X-Content-Type-Options': 'nosniff',
    },
  });
});
Next.js / Vercel Best Practices
1. Server Actions for Mutations
// app/actions.ts
'use server';
import { generateText } from 'ai';
import { openai } from '@ai-sdk/openai';

export async function generateContent(input: string) {
  const result = await generateText({
    model: openai('gpt-4'),
    prompt: input,
    maxOutputTokens: 500,
  });
  return result.text;
}
// app/page.tsx (Client Component)
'use client';
import { useState } from 'react';
import { generateContent } from './actions';

export default function Page() {
  const [loading, setLoading] = useState(false);

  async function handleSubmit(formData: FormData) {
    setLoading(true);
    const result = await generateContent(formData.get('input') as string);
    setLoading(false);
  }

  return <form action={handleSubmit}>...</form>;
}
2. Server Components for Initial Loads
// app/page.tsx (Server Component)
export default async function Page() {
  // Generate on server
  const result = await generateText({
    model: openai('gpt-4'),
    prompt: 'Welcome message',
  });
  // No loading state needed
  return <div>{result.text}</div>;
}
3. API Routes for Streaming
// app/api/chat/route.ts
import { streamText } from 'ai';
import { openai } from '@ai-sdk/openai';

export async function POST(request: Request) {
  const { messages } = await request.json();
  const stream = streamText({
    model: openai('gpt-4-turbo'),
    messages,
  });
  return stream.toDataStreamResponse();
}
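On the client, a route like this typically pairs with a chat hook. A minimal sketch assuming the React bindings from @ai-sdk/react and their useChat hook, which defaults to posting to /api/chat (v4-style message shape with a content string):
// app/chat/page.tsx
'use client';
import { useChat } from '@ai-sdk/react';

export default function Chat() {
  const { messages, input, handleInputChange, handleSubmit } = useChat();

  return (
    <div>
      {messages.map(m => (
        <div key={m.id}>{m.role}: {m.content}</div>
      ))}
      <form onSubmit={handleSubmit}>
        <input value={input} onChange={handleInputChange} />
      </form>
    </div>
  );
}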
Monitoring and Logging
1. Track Key Metrics
// Token usage
metrics.gauge('ai.tokens.total', result.usage.totalTokens);
metrics.gauge('ai.tokens.prompt', result.usage.promptTokens);
metrics.gauge('ai.tokens.completion', result.usage.completionTokens);

// Response time
const startTime = Date.now();
const result = await generateText({ model: openai('gpt-4'), prompt });
metrics.timing('ai.response_time', Date.now() - startTime);

// Error rate
metrics.increment('ai.errors', { type: error.constructor.name });
2. Structured Logging
import winston from 'winston';

const logger = winston.createLogger({
  format: winston.format.json(),
  transports: [new winston.transports.Console()],
});

logger.info('AI generation started', {
  model: 'gpt-4',
  promptLength: prompt.length,
  userId: user.id,
});

const startTime = Date.now();
const result = await generateText({ model: openai('gpt-4'), prompt });

logger.info('AI generation completed', {
  model: 'gpt-4',
  tokensUsed: result.usage.totalTokens,
  responseLength: result.text.length,
  duration: Date.now() - startTime,
});
Rate Limiting
1. Queue Requests
import PQueue from 'p-queue';

// Limit: 50 requests per minute
const queue = new PQueue({
  concurrency: 5,
  interval: 60000,
  intervalCap: 50,
});

async function generateQueued(prompt: string) {
  return queue.add(() =>
    generateText({ model: openai('gpt-4'), prompt })
  );
}
2. Monitor Rate Limits
async function generateWithRateCheck(prompt: string) {
  const result = await generateText({
    model: openai('gpt-4'),
    prompt,
  });
  // Rate limit headers are exposed on the result's response metadata (provider-specific)
  console.log('Remaining requests:', result.response.headers?.['x-ratelimit-remaining']);
  console.log('Resets at:', result.response.headers?.['x-ratelimit-reset']);
  return result;
}
Security
1. Sanitize User Inputs
function sanitizePrompt(userInput: string): string {
  // Basic mitigations only - pattern filters cannot reliably stop prompt injection
  return userInput
    .replace(/system:/gi, '')
    .replace(/ignore previous/gi, '')
    .slice(0, 1000); // Limit length
}

const result = await generateText({
  model: openai('gpt-4'),
  prompt: sanitizePrompt(req.body.message),
});
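Because pattern filters are easy to bypass, also keep fixed instructions and untrusted input in separate message roles instead of interpolating user text into one prompt string. A minimal sketch:
const result = await generateText({
  model: openai('gpt-4'),
  // Fixed instructions stay in the system prompt
  system: 'You are a support assistant. Answer only questions about our product.',
  // Untrusted input goes in a user message, never into the instructions
  messages: [{ role: 'user', content: sanitizePrompt(req.body.message) }],
});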
2. Validate API Keys
// Startup validation
function validateEnv() {
  const required = ['OPENAI_API_KEY', 'ANTHROPIC_API_KEY'];
  for (const key of required) {
    const value = process.env[key];
    if (!value) {
      throw new Error(`Missing: ${key}`);
    }
    // Both OpenAI and Anthropic keys start with "sk-"
    if (!value.startsWith('sk-')) {
      throw new Error(`Invalid format: ${key}`);
    }
  }
}
validateEnv();
Deployment
See Vercel's official deployment documentation: https://vercel.com/docs/functions
For Cloudflare Workers: https://developers.cloudflare.com/workers/
Last Updated: 2025-10-21