Initial commit
This commit is contained in:
139
templates/ai-enhanced-scraper.ts
Normal file
139
templates/ai-enhanced-scraper.ts
Normal file
@@ -0,0 +1,139 @@
|
||||
// AI-Enhanced Web Scraper
|
||||
// Combine Browser Rendering with Workers AI to extract structured data intelligently
|
||||
|
||||
import puppeteer from "@cloudflare/puppeteer";
|
||||
|
||||
interface Env {
|
||||
MYBROWSER: Fetcher;
|
||||
AI: Ai;
|
||||
}
|
||||
|
||||
interface ProductData {
|
||||
name: string;
|
||||
price: string;
|
||||
description: string;
|
||||
availability: string;
|
||||
[key: string]: any;
|
||||
}
|
||||
|
||||
export default {
|
||||
async fetch(request: Request, env: Env): Promise<Response> {
|
||||
const { searchParams } = new URL(request.url);
|
||||
const url = searchParams.get("url");
|
||||
|
||||
if (!url) {
|
||||
return new Response("Missing ?url parameter", { status: 400 });
|
||||
}
|
||||
|
||||
// Step 1: Scrape page content with browser
|
||||
const browser = await puppeteer.launch(env.MYBROWSER);
|
||||
|
||||
try {
|
||||
const page = await browser.newPage();
|
||||
|
||||
await page.goto(url, {
|
||||
waitUntil: "networkidle0",
|
||||
timeout: 30000,
|
||||
});
|
||||
|
||||
// Extract raw HTML content
|
||||
const bodyContent = await page.$eval("body", (el) => el.innerHTML);
|
||||
|
||||
await browser.close();
|
||||
|
||||
// Truncate to fit AI context (4000 chars)
|
||||
const truncatedContent = bodyContent.slice(0, 4000);
|
||||
|
||||
// Step 2: Extract structured data with AI
|
||||
const aiResponse = await env.AI.run("@cf/meta/llama-3.1-8b-instruct", {
|
||||
messages: [
|
||||
{
|
||||
role: "system",
|
||||
content:
|
||||
"You are a data extraction assistant. Extract product information from HTML and return ONLY valid JSON.",
|
||||
},
|
||||
{
|
||||
role: "user",
|
||||
content: `Extract product information from this HTML. Return JSON with these fields: name, price, description, availability. If any field is not found, use empty string.\n\nHTML:\n${truncatedContent}`,
|
||||
},
|
||||
],
|
||||
stream: false,
|
||||
});
|
||||
|
||||
// Parse AI response
|
||||
let productData: ProductData;
|
||||
try {
|
||||
const responseText = (aiResponse as any).response;
|
||||
// Try to extract JSON from response (AI might wrap it in markdown)
|
||||
const jsonMatch = responseText.match(/\{[\s\S]*\}/);
|
||||
if (jsonMatch) {
|
||||
productData = JSON.parse(jsonMatch[0]);
|
||||
} else {
|
||||
productData = JSON.parse(responseText);
|
||||
}
|
||||
} catch {
|
||||
productData = {
|
||||
name: "",
|
||||
price: "",
|
||||
description: "",
|
||||
availability: "",
|
||||
raw: (aiResponse as any).response,
|
||||
};
|
||||
}
|
||||
|
||||
return Response.json({
|
||||
url,
|
||||
product: productData,
|
||||
extractedAt: new Date().toISOString(),
|
||||
});
|
||||
} catch (error) {
|
||||
await browser.close();
|
||||
return Response.json(
|
||||
{
|
||||
error: error instanceof Error ? error.message : "AI-enhanced scraping failed",
|
||||
},
|
||||
{ status: 500 }
|
||||
);
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
/**
|
||||
* Setup:
|
||||
* Add AI binding to wrangler.jsonc:
|
||||
* {
|
||||
* "browser": { "binding": "MYBROWSER" },
|
||||
* "ai": { "binding": "AI" },
|
||||
* "compatibility_flags": ["nodejs_compat"]
|
||||
* }
|
||||
*
|
||||
* Usage:
|
||||
* GET /?url=https://example.com/product
|
||||
*
|
||||
* Response:
|
||||
* {
|
||||
* "url": "https://example.com/product",
|
||||
* "product": {
|
||||
* "name": "Example Product",
|
||||
* "price": "$99.99",
|
||||
* "description": "Product description...",
|
||||
* "availability": "In Stock"
|
||||
* },
|
||||
* "extractedAt": "2025-10-22T12:34:56.789Z"
|
||||
* }
|
||||
*
|
||||
* Benefits:
|
||||
* - No need to write custom CSS selectors for each site
|
||||
* - AI adapts to different page structures
|
||||
* - Extracts semantic information, not just raw HTML
|
||||
* - Handles variations in HTML structure
|
||||
*
|
||||
* Limitations:
|
||||
* - AI context limited to ~4000 chars of HTML
|
||||
* - May hallucinate if data not present
|
||||
* - Requires AI binding (uses neurons quota)
|
||||
*
|
||||
* See also:
|
||||
* - cloudflare-workers-ai skill for more AI patterns
|
||||
* - web-scraper-basic.ts for traditional CSS selector approach
|
||||
*/
|
||||
Reference in New Issue
Block a user