// Source file: gh-jezweb-claude-skills-ski…/templates/ai-vision-models.ts
/**
* Cloudflare Workers AI - Vision Models Examples
*
* This template demonstrates:
* - Llama 3.2 11B Vision Instruct for image understanding
* - Image captioning and description
* - Visual question answering
* - Base64 image encoding
* - Combining vision + text prompts
*/
import { Hono } from 'hono';
type Bindings = {
AI: Ai;
};
const app = new Hono<{ Bindings: Bindings }>();
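/**
 * Illustrative usage (not part of the template): the routes below expect a JSON
 * body containing a base64-encoded image. Example request, assuming a
 * hypothetical deployment URL:
 *
 *   curl -X POST https://<your-worker>.workers.dev/vision/understand \
 *     -H 'Content-Type: application/json' \
 *     -d '{"image": "<base64 or data URL>", "question": "What is in this image?"}'
 *
 * The AI binding must be declared in wrangler.toml:
 *
 *   [ai]
 *   binding = "AI"
 */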
// ============================================================================
// Image Understanding
// ============================================================================
/**
* Llama 3.2 11B Vision Instruct
* - Understands images and answers questions
* - Accepts base64-encoded images
* - Rate limit: 720/min
*/
app.post('/vision/understand', async (c) => {
try {
const { image, question = 'What is in this image?' } = await c.req.json<{
image: string; // Base64 data URL or base64 string
question?: string;
}>();
// Ensure the image has a data URL prefix; raw base64 input is assumed to be PNG
const imageUrl = image.startsWith('data:')
? image
: `data:image/png;base64,${image}`;
const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
messages: [
{
role: 'user',
content: [
{ type: 'text', text: question },
{ type: 'image_url', image_url: { url: imageUrl } },
],
},
],
});
return c.json({
success: true,
question,
answer: response.response,
});
} catch (error) {
return c.json(
{
success: false,
error: (error as Error).message,
},
500
);
}
});
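/*
 * Producing the base64 payload in a browser (illustrative sketch, not used by
 * this Worker): FileReader.readAsDataURL yields a data: URL that the endpoints
 * above accept as-is.
 *
 *   const reader = new FileReader();
 *   reader.onload = () =>
 *     fetch('/vision/understand', {
 *       method: 'POST',
 *       headers: { 'Content-Type': 'application/json' },
 *       body: JSON.stringify({ image: reader.result, question: 'Describe this photo' }),
 *     });
 *   reader.readAsDataURL(file); // `file` is a File from an <input type="file">
 */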
// ============================================================================
// Image Captioning
// ============================================================================
app.post('/vision/caption', async (c) => {
try {
const { image } = await c.req.json<{ image: string }>();
const imageUrl = image.startsWith('data:')
? image
: `data:image/png;base64,${image}`;
const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: 'Generate a detailed caption for this image. Describe what you see, including objects, people, setting, mood, and any notable details.',
},
{ type: 'image_url', image_url: { url: imageUrl } },
],
},
],
});
return c.json({
success: true,
caption: response.response,
});
} catch (error) {
return c.json(
{
success: false,
error: (error as Error).message,
},
500
);
}
});
// ============================================================================
// Visual Question Answering
// ============================================================================
app.post('/vision/qa', async (c) => {
try {
const { image, questions } = await c.req.json<{
image: string;
questions: string[];
}>();
if (!questions || questions.length === 0) {
return c.json({ error: 'questions array is required' }, 400);
}
const imageUrl = image.startsWith('data:')
? image
: `data:image/png;base64,${image}`;
// Answer each question with its own model call (run in parallel; each call counts toward the model's rate limit)
const answers = await Promise.all(
questions.map(async (question) => {
const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
messages: [
{
role: 'user',
content: [
{ type: 'text', text: question },
{ type: 'image_url', image_url: { url: imageUrl } },
],
},
],
});
return {
question,
answer: response.response,
};
})
);
return c.json({
success: true,
count: answers.length,
results: answers,
});
} catch (error) {
return c.json(
{
success: false,
error: (error as Error).message,
},
500
);
}
});
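/*
 * Note: the /vision/qa handler above fires one model call per question in
 * parallel via Promise.all. For long question lists, a sequential loop inside
 * the handler keeps concurrent usage lower (illustrative sketch):
 *
 *   const answers = [];
 *   for (const question of questions) {
 *     const r = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
 *       messages: [{ role: 'user', content: [
 *         { type: 'text', text: question },
 *         { type: 'image_url', image_url: { url: imageUrl } },
 *       ]}],
 *     });
 *     answers.push({ question, answer: r.response });
 *   }
 */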
// ============================================================================
// Image Analysis (Structured Output)
// ============================================================================
app.post('/vision/analyze', async (c) => {
try {
const { image } = await c.req.json<{ image: string }>();
const imageUrl = image.startsWith('data:')
? image
: `data:image/png;base64,${image}`;
const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: `Analyze this image and return a JSON object with:
- objects: array of objects detected
- scene: description of the setting
- mood: emotional tone
- colors: dominant colors
- text: any visible text
Return ONLY valid JSON, no explanations.`,
},
{ type: 'image_url', image_url: { url: imageUrl } },
],
},
],
});
// Try to parse the model output as JSON; fall back to returning the raw text if the model adds prose around it
try {
const analysis = JSON.parse(response.response);
return c.json({
success: true,
analysis,
});
} catch {
return c.json({
success: true,
raw: response.response,
});
}
} catch (error) {
return c.json(
{
success: false,
error: (error as Error).message,
},
500
);
}
});
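/*
 * The /vision/analyze handler falls back to raw text when JSON.parse fails.
 * If the model wraps its JSON in a markdown code fence, stripping the fence
 * first can recover a parsable object (illustrative sketch):
 *
 *   const cleaned = response.response.replace(/^```(?:json)?\s*|\s*```$/g, '').trim();
 *   const analysis = JSON.parse(cleaned);
 */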
// ============================================================================
// Image Comparison
// ============================================================================
app.post('/vision/compare', async (c) => {
try {
const { image1, image2, question = 'What are the differences between these images?' } =
await c.req.json<{
image1: string;
image2: string;
question?: string;
}>();
const imageUrl1 = image1.startsWith('data:')
? image1
: `data:image/png;base64,${image1}`;
const imageUrl2 = image2.startsWith('data:')
? image2
: `data:image/png;base64,${image2}`;
// Analyze first image
const analysis1 = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
messages: [
{
role: 'user',
content: [
{ type: 'text', text: 'Describe this image in detail.' },
{ type: 'image_url', image_url: { url: imageUrl1 } },
],
},
],
});
// Analyze second image
const analysis2 = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
messages: [
{
role: 'user',
content: [
{ type: 'text', text: 'Describe this image in detail.' },
{ type: 'image_url', image_url: { url: imageUrl2 } },
],
},
],
});
// Compare the two descriptions with a text-only model (each vision call above handles a single image)
const comparison = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
messages: [
{
role: 'user',
content: `Compare these two images based on their descriptions:
Image 1: ${analysis1.response}
Image 2: ${analysis2.response}
Question: ${question}`,
},
],
});
return c.json({
success: true,
image1Description: analysis1.response,
image2Description: analysis2.response,
comparison: comparison.response,
});
} catch (error) {
return c.json(
{
success: false,
error: (error as Error).message,
},
500
);
}
});
// ============================================================================
// Image Upload from URL
// ============================================================================
/**
* Fetch image from URL, convert to base64, and analyze
*/
app.post('/vision/url', async (c) => {
try {
const { url, question = 'What is in this image?' } = await c.req.json<{
url: string;
question?: string;
}>();
// Fetch image
const imageResponse = await fetch(url);
if (!imageResponse.ok) {
return c.json({ error: 'Failed to fetch image' }, 400);
}
// Convert to base64 in chunks to avoid call-stack limits on large images
const imageBytes = new Uint8Array(await imageResponse.arrayBuffer());
let binary = '';
const chunkSize = 0x8000;
for (let i = 0; i < imageBytes.length; i += chunkSize) {
binary += String.fromCharCode(...imageBytes.subarray(i, i + chunkSize));
}
const base64 = btoa(binary);
const contentType = imageResponse.headers.get('content-type') || 'image/png';
const imageUrl = `data:${contentType};base64,${base64}`;
// Analyze image
const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
messages: [
{
role: 'user',
content: [
{ type: 'text', text: question },
{ type: 'image_url', image_url: { url: imageUrl } },
],
},
],
});
return c.json({
success: true,
sourceUrl: url,
question,
answer: response.response,
});
} catch (error) {
return c.json(
{
success: false,
error: (error as Error).message,
},
500
);
}
});
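/*
 * Hardening idea for /vision/url (illustrative sketch, placed right after the
 * fetch): check that the response is actually an image and not excessively
 * large before base64-encoding it. The 5 MB limit is arbitrary.
 *
 *   const contentType = imageResponse.headers.get('content-type') || '';
 *   if (!contentType.startsWith('image/')) {
 *     return c.json({ error: 'URL did not return an image' }, 400);
 *   }
 *   const length = Number(imageResponse.headers.get('content-length') || 0);
 *   if (length > 5 * 1024 * 1024) {
 *     return c.json({ error: 'Image too large' }, 413);
 *   }
 */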
// ============================================================================
// Accessibility: Alt Text Generation
// ============================================================================
app.post('/vision/alt-text', async (c) => {
try {
const { image } = await c.req.json<{ image: string }>();
const imageUrl = image.startsWith('data:')
? image
: `data:image/png;base64,${image}`;
const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
messages: [
{
role: 'user',
content: [
{
type: 'text',
text: 'Generate a concise, descriptive alt text for this image for accessibility purposes. Keep it under 125 characters.',
},
{ type: 'image_url', image_url: { url: imageUrl } },
],
},
],
});
return c.json({
success: true,
altText: response.response.trim(),
});
} catch (error) {
return c.json(
{
success: false,
error: (error as Error).message,
},
500
);
}
});
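/*
 * The alt-text prompt asks for at most 125 characters, but the model may
 * exceed it. A server-side guard can enforce the limit (illustrative sketch):
 *
 *   const altText = response.response.trim().slice(0, 125);
 */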
// ============================================================================
// Health Check
// ============================================================================
app.get('/health', (c) => {
return c.json({
status: 'ok',
timestamp: new Date().toISOString(),
});
});
export default app;