Initial commit
This commit is contained in:
417
templates/ai-vision-models.ts
Normal file
417
templates/ai-vision-models.ts
Normal file
@@ -0,0 +1,417 @@
|
||||
/**
 * Cloudflare Workers AI - Vision Models Examples
 *
 * This template demonstrates:
 * - Llama 3.2 11B Vision Instruct for image understanding
 * - Image captioning and description
 * - Visual question answering
 * - Base64 image encoding
 * - Combining vision + text prompts
 */
|
||||
|
||||
import { Hono } from 'hono';
|
||||
|
||||
/** Environment bindings exposed to route handlers through `c.env`. */
type Bindings = { AI: Ai };

/** Hono environment shape: only the bindings above are declared. */
type AppEnv = { Bindings: Bindings };

/** Application instance on which every route in this file is registered. */
const app = new Hono<AppEnv>();
|
||||
|
||||
// ============================================================================
|
||||
// Image Understanding
|
||||
// ============================================================================
|
||||
|
||||
/**
 * POST /vision/understand
 *
 * Llama 3.2 11B Vision Instruct
 * - Understands images and answers questions
 * - Accepts base64-encoded images
 * - Rate limit: 720/min
 *
 * Request body: `{ image: string; question?: string }`, where `image` is a
 * base64 data URL or a bare base64 string (bare strings are wrapped as a PNG
 * data URL) and `question` defaults to "What is in this image?".
 *
 * Responses:
 * - 200 `{ success: true, question, answer }`
 * - 400 `{ success: false, error }` when the body is not valid JSON or the
 *   fields have the wrong type
 * - 500 `{ success: false, error }` when the model call throws
 */
app.post('/vision/understand', async (c) => {
  try {
    // A body that fails to parse is treated like an empty one so that it is
    // reported as a client error below instead of surfacing as a 500.
    const body: unknown = await c.req.json().catch(() => undefined);
    const { image, question } = (body ?? {}) as { image?: unknown; question?: unknown };

    if (typeof image !== 'string' || image === '') {
      return c.json(
        { success: false, error: 'image is required (base64 string or data URL)' },
        400
      );
    }
    if (question !== undefined && typeof question !== 'string') {
      return c.json({ success: false, error: 'question must be a string' }, 400);
    }

    const prompt = question === undefined ? 'What is in this image?' : question;

    // Ensure image has proper data URL prefix
    const imageUrl = image.startsWith('data:') ? image : `data:image/png;base64,${image}`;

    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: prompt },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    return c.json({
      success: true,
      question: prompt,
      answer: response.response,
    });
  } catch (error) {
    // Anything can be thrown, so narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    return c.json({ success: false, error: message }, 500);
  }
});
|
||||
|
||||
// ============================================================================
|
||||
// Image Captioning
|
||||
// ============================================================================
|
||||
|
||||
/**
 * POST /vision/caption
 *
 * Generates a detailed caption for the image given in the JSON body
 * (`{ image }`). A value that does not start with `data:` is treated as bare
 * base64 and wrapped as a PNG data URL. Replies with
 * `{ success: true, caption }`; any thrown error becomes
 * `{ success: false, error }` with status 500.
 */
app.post('/vision/caption', async (ctx) => {
  const captionPrompt =
    'Generate a detailed caption for this image. Describe what you see, including objects, people, setting, mood, and any notable details.';

  try {
    const { image: source } = await ctx.req.json<{ image: string }>();

    // Data URLs pass through untouched; anything else gets the PNG prefix.
    let dataUrl = source;
    if (!source.startsWith('data:')) {
      dataUrl = `data:image/png;base64,${source}`;
    }

    const userMessage = {
      role: 'user',
      content: [
        { type: 'text', text: captionPrompt },
        { type: 'image_url', image_url: { url: dataUrl } },
      ],
    };

    const result = await ctx.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [userMessage],
    });

    return ctx.json({ success: true, caption: result.response });
  } catch (err) {
    const message = (err as Error).message;
    return ctx.json({ success: false, error: message }, 500);
  }
});
|
||||
|
||||
// ============================================================================
|
||||
// Visual Question Answering
|
||||
// ============================================================================
|
||||
|
||||
/**
 * POST /vision/qa
 *
 * Answers several questions about one image. Each question is sent to the
 * vision model as its own request and all requests run concurrently, so the
 * number of model calls equals the number of questions.
 *
 * Request body: `{ image: string; questions: string[] }`. `image` is a base64
 * data URL or a bare base64 string (wrapped as a PNG data URL).
 *
 * Responses:
 * - 200 `{ success: true, count, results: [{ question, answer }] }`
 * - 400 `{ success: false, error }` when `questions` is not a non-empty array
 *   of strings or `image` is missing
 * - 500 `{ success: false, error }` when any model call throws
 */
app.post('/vision/qa', async (c) => {
  try {
    // A body that fails to parse is handled as empty and rejected below.
    const body: unknown = await c.req.json().catch(() => undefined);
    const { image, questions } = (body ?? {}) as { image?: unknown; questions?: unknown };

    if (
      !Array.isArray(questions) ||
      questions.length === 0 ||
      !questions.every((q): q is string => typeof q === 'string')
    ) {
      // `success: false` matches the shape of every other error response.
      return c.json({ success: false, error: 'questions array is required' }, 400);
    }
    if (typeof image !== 'string' || image === '') {
      return c.json(
        { success: false, error: 'image is required (base64 string or data URL)' },
        400
      );
    }

    const imageUrl = image.startsWith('data:') ? image : `data:image/png;base64,${image}`;

    // Answer all questions
    const answers = await Promise.all(
      questions.map(async (question) => {
        const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
          messages: [
            {
              role: 'user',
              content: [
                { type: 'text', text: question },
                { type: 'image_url', image_url: { url: imageUrl } },
              ],
            },
          ],
        });

        return { question, answer: response.response };
      })
    );

    return c.json({
      success: true,
      count: answers.length,
      results: answers,
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return c.json({ success: false, error: message }, 500);
  }
});
|
||||
|
||||
// ============================================================================
|
||||
// Image Analysis (Structured Output)
|
||||
// ============================================================================
|
||||
|
||||
/**
 * POST /vision/analyze
 *
 * Asks the vision model for a JSON description of an image (objects, scene,
 * mood, colors, text) and tries to parse the reply.
 *
 * Request body: `{ image: string }` (base64 data URL or bare base64 string,
 * the latter wrapped as a PNG data URL).
 *
 * Responses:
 * - 200 `{ success: true, analysis }` when the reply could be parsed as JSON
 * - 200 `{ success: true, raw }` with the untouched model text otherwise
 * - 400 `{ success: false, error }` when `image` is missing
 * - 500 `{ success: false, error }` when the model call throws
 */
app.post('/vision/analyze', async (c) => {
  try {
    // A body that fails to parse is handled as empty and rejected below.
    const body: unknown = await c.req.json().catch(() => undefined);
    const { image } = (body ?? {}) as { image?: unknown };

    if (typeof image !== 'string' || image === '') {
      return c.json(
        { success: false, error: 'image is required (base64 string or data URL)' },
        400
      );
    }

    const imageUrl = image.startsWith('data:') ? image : `data:image/png;base64,${image}`;

    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            {
              type: 'text',
              text: `Analyze this image and return a JSON object with:
- objects: array of objects detected
- scene: description of the setting
- mood: emotional tone
- colors: dominant colors
- text: any visible text

Return ONLY valid JSON, no explanations.`,
            },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    const raw: unknown = response.response;
    const text = typeof raw === 'string' ? raw : '';

    // The reply is not guaranteed to be bare JSON, so besides the reply
    // itself also try the contents of a Markdown code fence and the
    // outermost {...} span. The first candidate that parses wins.
    const candidates: string[] = [text];
    const fenced = /```(?:json)?\s*([\s\S]*?)```/i.exec(text);
    if (fenced) {
      candidates.push(fenced[1]);
    }
    const firstBrace = text.indexOf('{');
    const lastBrace = text.lastIndexOf('}');
    if (firstBrace !== -1 && lastBrace > firstBrace) {
      candidates.push(text.slice(firstBrace, lastBrace + 1));
    }

    for (const candidate of candidates) {
      let analysis: unknown;
      try {
        analysis = JSON.parse(candidate.trim());
      } catch {
        continue; // not JSON; fall through to the next candidate
      }
      return c.json({ success: true, analysis });
    }

    // Nothing parsed: hand back the model text unchanged.
    return c.json({ success: true, raw });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return c.json({ success: false, error: message }, 500);
  }
});
|
||||
|
||||
// ============================================================================
|
||||
// Image Comparison
|
||||
// ============================================================================
|
||||
|
||||
/**
 * POST /vision/compare
 *
 * Compares two images in two stages: the vision model describes each image,
 * then a text model answers `question` from those two descriptions. The text
 * model is given only the descriptions, not the images.
 *
 * Request body: `{ image1: string; image2: string; question?: string }`.
 * Images are base64 data URLs or bare base64 strings (wrapped as PNG data
 * URLs). `question` defaults to "What are the differences between these
 * images?".
 *
 * Responses:
 * - 200 `{ success: true, image1Description, image2Description, comparison }`
 * - 400 `{ success: false, error }` when an image is missing or `question`
 *   is not a string
 * - 500 `{ success: false, error }` when a model call throws
 */
app.post('/vision/compare', async (c) => {
  try {
    // A body that fails to parse is handled as empty and rejected below.
    const body: unknown = await c.req.json().catch(() => undefined);
    const { image1, image2, question } = (body ?? {}) as {
      image1?: unknown;
      image2?: unknown;
      question?: unknown;
    };

    if (typeof image1 !== 'string' || image1 === '' || typeof image2 !== 'string' || image2 === '') {
      return c.json(
        { success: false, error: 'image1 and image2 are required (base64 string or data URL)' },
        400
      );
    }
    if (question !== undefined && typeof question !== 'string') {
      return c.json({ success: false, error: 'question must be a string' }, 400);
    }

    const prompt =
      question === undefined ? 'What are the differences between these images?' : question;

    const toDataUrl = (img: string): string =>
      img.startsWith('data:') ? img : `data:image/png;base64,${img}`;

    // One vision-model request describing a single image.
    const describeImage = (url: string) =>
      c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
        messages: [
          {
            role: 'user',
            content: [
              { type: 'text', text: 'Describe this image in detail.' },
              { type: 'image_url', image_url: { url } },
            ],
          },
        ],
      });

    // The two descriptions do not depend on each other, so request them
    // concurrently rather than one after the other.
    const [analysis1, analysis2] = await Promise.all([
      describeImage(toDataUrl(image1)),
      describeImage(toDataUrl(image2)),
    ]);

    // Compare using text generation
    const comparison = await c.env.AI.run('@cf/meta/llama-3.1-8b-instruct', {
      messages: [
        {
          role: 'user',
          content: `Compare these two images based on their descriptions:

Image 1: ${analysis1.response}

Image 2: ${analysis2.response}

Question: ${prompt}`,
        },
      ],
    });

    return c.json({
      success: true,
      image1Description: analysis1.response,
      image2Description: analysis2.response,
      comparison: comparison.response,
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return c.json({ success: false, error: message }, 500);
  }
});
|
||||
|
||||
// ============================================================================
|
||||
// Image Upload from URL
|
||||
// ============================================================================
|
||||
|
||||
/**
 * POST /vision/url
 *
 * Fetch image from URL, convert to base64, and analyze.
 *
 * Request body: `{ url: string; question?: string }`. `url` must be an
 * absolute http(s) URL; `question` defaults to "What is in this image?".
 *
 * Responses:
 * - 200 `{ success: true, sourceUrl, question, answer }`
 * - 400 `{ success: false, error }` when the body or URL is invalid, or the
 *   remote server answers with a non-2xx status
 * - 500 `{ success: false, error }` when the fetch or the model call throws
 *
 * NOTE(review): this endpoint fetches a caller-supplied URL. Only the scheme
 * is checked here; add a host allow-list before exposing it to untrusted
 * callers.
 */
app.post('/vision/url', async (c) => {
  try {
    // A body that fails to parse is handled as empty and rejected below.
    const body: unknown = await c.req.json().catch(() => undefined);
    const { url, question } = (body ?? {}) as { url?: unknown; question?: unknown };

    if (typeof url !== 'string' || url === '') {
      return c.json({ success: false, error: 'url is required' }, 400);
    }
    if (question !== undefined && typeof question !== 'string') {
      return c.json({ success: false, error: 'question must be a string' }, 400);
    }

    let target: URL;
    try {
      target = new URL(url);
    } catch {
      return c.json({ success: false, error: 'url must be an absolute URL' }, 400);
    }
    if (target.protocol !== 'http:' && target.protocol !== 'https:') {
      return c.json({ success: false, error: 'url must use http or https' }, 400);
    }

    const prompt = question === undefined ? 'What is in this image?' : question;

    // Fetch image
    const imageResponse = await fetch(target.toString());
    if (!imageResponse.ok) {
      return c.json({ success: false, error: 'Failed to fetch image' }, 400);
    }

    // Convert to base64. The bytes are turned into a binary string in
    // fixed-size chunks: spreading the whole image into a single
    // String.fromCharCode(...) call passes one argument per byte and throws
    // a RangeError once the image exceeds the engine's argument limit.
    const imageBytes = new Uint8Array(await imageResponse.arrayBuffer());
    const chunkSize = 0x8000;
    let binary = '';
    for (let offset = 0; offset < imageBytes.length; offset += chunkSize) {
      binary += String.fromCharCode(...imageBytes.subarray(offset, offset + chunkSize));
    }
    const base64 = btoa(binary);

    // Keep only the media type; parameters such as "; charset=..." do not
    // belong in front of ";base64" in the data URL built below.
    const headerValue = imageResponse.headers.get('content-type') ?? '';
    const contentType = headerValue.split(';')[0].trim() || 'image/png';
    const imageUrl = `data:${contentType};base64,${base64}`;

    // Analyze image
    const response = await c.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: prompt },
            { type: 'image_url', image_url: { url: imageUrl } },
          ],
        },
      ],
    });

    return c.json({
      success: true,
      sourceUrl: url,
      question: prompt,
      answer: response.response,
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return c.json({ success: false, error: message }, 500);
  }
});
|
||||
|
||||
// ============================================================================
|
||||
// Accessibility: Alt Text Generation
|
||||
// ============================================================================
|
||||
|
||||
/**
 * POST /vision/alt-text
 *
 * Asks the vision model for accessibility alt text for the image in the JSON
 * body (`{ image }`). A value that does not start with `data:` is treated as
 * bare base64 and wrapped as a PNG data URL. The prompt requests fewer than
 * 125 characters; the reply is only trimmed, not truncated. Replies with
 * `{ success: true, altText }`; any thrown error becomes
 * `{ success: false, error }` with status 500.
 */
app.post('/vision/alt-text', async (ctx) => {
  const altTextPrompt =
    'Generate a concise, descriptive alt text for this image for accessibility purposes. Keep it under 125 characters.';

  try {
    const { image: source } = await ctx.req.json<{ image: string }>();

    // Data URLs pass through untouched; anything else gets the PNG prefix.
    let dataUrl = source;
    if (!source.startsWith('data:')) {
      dataUrl = `data:image/png;base64,${source}`;
    }

    const userMessage = {
      role: 'user',
      content: [
        { type: 'text', text: altTextPrompt },
        { type: 'image_url', image_url: { url: dataUrl } },
      ],
    };

    const result = await ctx.env.AI.run('@cf/meta/llama-3.2-11b-vision-instruct', {
      messages: [userMessage],
    });

    const altText = result.response.trim();
    return ctx.json({ success: true, altText });
  } catch (err) {
    const message = (err as Error).message;
    return ctx.json({ success: false, error: message }, 500);
  }
});
|
||||
|
||||
// ============================================================================
|
||||
// Health Check
|
||||
// ============================================================================
|
||||
|
||||
/**
 * GET /health
 *
 * Liveness probe: replies with `{ status: 'ok', timestamp }`, where
 * `timestamp` is the current time formatted by `Date.prototype.toISOString`.
 */
app.get('/health', (ctx) => {
  const now = new Date();
  const payload = { status: 'ok', timestamp: now.toISOString() };
  return ctx.json(payload);
});
|
||||
|
||||
export default app;
|
||||
Reference in New Issue
Block a user