Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:24:08 +08:00
commit 7c90a3ac2b
18 changed files with 4579 additions and 0 deletions

View File

@@ -0,0 +1,139 @@
// AI-Enhanced Web Scraper
// Combine Browser Rendering with Workers AI to extract structured data intelligently
import puppeteer from "@cloudflare/puppeteer";
/**
 * Bindings this Worker reads from its environment.
 * The binding names match the wrangler configuration shown in the setup notes
 * at the bottom of this file.
 */
interface Env {
// Browser Rendering binding; handed to `puppeteer.launch()` to start a browser.
MYBROWSER: Fetcher;
// Workers AI binding; `AI.run()` is called with the scraped HTML to extract fields.
AI: Ai;
}
/**
 * Structured product information returned to the caller.
 *
 * The four named fields are always present; a field the model could not find
 * is an empty string. Additional keys are allowed (for example `raw`, which
 * carries the unparsed model output when no JSON could be extracted), but they
 * are typed `unknown` so that readers must narrow them before use instead of
 * silently opting out of type checking.
 */
interface ProductData {
  /** Product name, or "" when not found. */
  name: string;
  /** Price as displayed on the page (e.g. "$99.99"), or "" when not found. */
  price: string;
  /** Product description, or "" when not found. */
  description: string;
  /** Availability text (e.g. "In Stock"), or "" when not found. */
  availability: string;
  /** Any extra keys produced by the model, plus `raw` on parse failure. */
  [key: string]: unknown;
}
/** Workers AI model asked to turn raw HTML into structured product JSON. */
const EXTRACTION_MODEL = "@cf/meta/llama-3.1-8b-instruct";

/** Maximum number of HTML characters forwarded to the model. */
const MAX_HTML_CHARS = 4000;

/** Upper bound, in milliseconds, for page navigation. */
const NAVIGATION_TIMEOUT_MS = 30000;

/** Returns true for plain (non-null, non-array) objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Parses `raw` as an absolute http(s) URL; returns null for anything else. */
function parseTargetUrl(raw: string): URL | null {
  try {
    const parsed = new URL(raw);
    return parsed.protocol === "http:" || parsed.protocol === "https:" ? parsed : null;
  } catch {
    return null;
  }
}

/** Coerces a model-produced field value to the string the response contract promises. */
function toText(value: unknown): string {
  if (typeof value === "string") return value;
  if (value === null || value === undefined) return "";
  if (typeof value === "number" || typeof value === "boolean") return String(value);
  return JSON.stringify(value);
}

/** Reads the `response` member of a Workers AI result without assuming its shape. */
function readAiResponse(aiResult: unknown): unknown {
  return isRecord(aiResult) ? aiResult.response : undefined;
}

/**
 * Turns the model output into `ProductData`.
 *
 * Accepts either text (possibly wrapped in markdown or prose) or an already
 * parsed object. Anything that does not yield a JSON object produces the
 * empty-field fallback with the original output preserved under `raw`.
 */
function parseProductData(response: unknown): ProductData {
  let candidate: unknown = response;
  if (typeof response === "string") {
    // The model may wrap the JSON in markdown fences or commentary, so take the
    // outermost {...} span when there is one.
    const jsonMatch = response.match(/\{[\s\S]*\}/);
    try {
      candidate = JSON.parse(jsonMatch ? jsonMatch[0] : response);
    } catch {
      candidate = undefined;
    }
  }

  if (!isRecord(candidate)) {
    return { name: "", price: "", description: "", availability: "", raw: response };
  }

  return {
    ...candidate,
    name: toText(candidate.name),
    price: toText(candidate.price),
    description: toText(candidate.description),
    availability: toText(candidate.availability),
  };
}

/**
 * Renders `url` in a headless browser and returns the `<body>` inner HTML.
 * The browser is closed exactly once, whether or not the scrape succeeds.
 */
async function scrapeBodyHtml(env: Env, url: string): Promise<string> {
  const browser = await puppeteer.launch(env.MYBROWSER);
  try {
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: "networkidle0",
      timeout: NAVIGATION_TIMEOUT_MS,
    });
    return await page.$eval("body", (el: Element) => el.innerHTML);
  } finally {
    // Best-effort cleanup: a failing close must not replace the scrape result
    // or mask the error that is already propagating.
    await browser.close().catch(() => {});
  }
}

const worker = {
  /**
   * Handles `GET /?url=<page>`: renders the page, asks Workers AI to extract
   * product fields from its HTML, and returns them as JSON.
   *
   * @param request Incoming request; the target page is read from `?url=`.
   * @param env Worker bindings (browser and AI).
   * @returns 200 with `{ url, product, extractedAt }`; 400 when `url` is
   *   missing or not an absolute http(s) URL; 500 with `{ error }` when
   *   launching the browser, scraping, or the AI call fails.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url");
    if (!url) {
      return new Response("Missing ?url parameter", { status: 400 });
    }

    // The target is caller-controlled: reject anything that is not http(s)
    // before handing it to the browser.
    // NOTE(review): hosts are not filtered, so internal/private addresses are
    // still reachable if the browser can route to them.
    const target = parseTargetUrl(url);
    if (!target) {
      return new Response("Invalid ?url parameter: expected an absolute http(s) URL", {
        status: 400,
      });
    }

    try {
      // Step 1: scrape the page; the browser is released before the AI call.
      const bodyContent = await scrapeBodyHtml(env, target.href);
      const truncatedContent = bodyContent.slice(0, MAX_HTML_CHARS);

      // Step 2: extract structured data with AI.
      const aiResult: unknown = await env.AI.run(EXTRACTION_MODEL, {
        messages: [
          {
            role: "system",
            content:
              "You are a data extraction assistant. Extract product information from HTML and return ONLY valid JSON.",
          },
          {
            role: "user",
            content: `Extract product information from this HTML. Return JSON with these fields: name, price, description, availability. If any field is not found, use empty string.\n\nHTML:\n${truncatedContent}`,
          },
        ],
        stream: false,
      });

      return Response.json({
        url,
        product: parseProductData(readAiResponse(aiResult)),
        extractedAt: new Date().toISOString(),
      });
    } catch (error) {
      return Response.json(
        {
          error: error instanceof Error ? error.message : "AI-enhanced scraping failed",
        },
        { status: 500 }
      );
    }
  },
};

export default worker;
/**
* Setup:
* Add AI binding to wrangler.jsonc:
* {
* "browser": { "binding": "MYBROWSER" },
* "ai": { "binding": "AI" },
* "compatibility_flags": ["nodejs_compat"]
* }
*
* Usage:
* GET /?url=https://example.com/product
*
* Response:
* {
* "url": "https://example.com/product",
* "product": {
* "name": "Example Product",
* "price": "$99.99",
* "description": "Product description...",
* "availability": "In Stock"
* },
* "extractedAt": "2025-10-22T12:34:56.789Z"
* }
*
* Benefits:
* - No need to write custom CSS selectors for each site
* - AI adapts to different page structures
* - Extracts semantic information, not just raw HTML
* - Handles variations in HTML structure
*
* Limitations:
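* - Only the first 4000 characters of the page's body HTML are sent to the model; content beyond that is never seen
* - The model may hallucinate values when the data is not present on the page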
* - Requires AI binding (uses neurons quota)
*
* See also:
* - cloudflare-workers-ai skill for more AI patterns
* - web-scraper-basic.ts for traditional CSS selector approach
*/