Initial commit

2025-11-30 08:24:08 +08:00
commit 7c90a3ac2b
18 changed files with 4579 additions and 0 deletions
--- a/templates/ai-enhanced-scraper.ts
+++ b/templates/ai-enhanced-scraper.ts
@@ -0,0 +1,139 @@
+// AI-Enhanced Web Scraper
+// Combine Browser Rendering with Workers AI to extract structured data intelligently
+
+import puppeteer from "@cloudflare/puppeteer";
+
+/** Bindings this Worker reads from its environment (see the wrangler setup notes at the end of the file). */
+interface Env {
+  /** Workers AI binding; the handler calls `run` on it for the extraction step. */
+  AI: Ai;
+  /** Binding handed to `puppeteer.launch` to start the browser session. */
+  MYBROWSER: Fetcher;
+}
+
+/**
+ * Product fields the model is asked to extract from the page.
+ *
+ * The four named fields are the ones requested in the prompt. Any other key
+ * comes straight from model output (or is the `raw` reply attached when
+ * parsing fails), so it is typed `unknown` and must be narrowed before use.
+ */
+interface ProductData {
+  name: string;
+  price: string;
+  description: string;
+  availability: string;
+  [key: string]: unknown;
+}
+
+// Upper bound on how many characters of the page's HTML are placed in the prompt.
+const MAX_HTML_CHARS = 4000;
+
+/**
+ * Coerces one extracted field to text.
+ *
+ * Strings pass through, numbers and booleans are stringified, and anything
+ * else (missing, null, nested objects) becomes the empty string.
+ */
+function toFieldText(value: unknown): string {
+  if (typeof value === "string") {
+    return value;
+  }
+  if (typeof value === "number" || typeof value === "boolean") {
+    return String(value);
+  }
+  return "";
+}
+
+/**
+ * Turns the model's reply into a `ProductData` object.
+ *
+ * @param rawResponse - The `response` value from the model call; expected to be text containing JSON.
+ * @returns The parsed object with its four named fields coerced to strings.
+ *   When the reply is not text, is not valid JSON, or does not parse to a
+ *   plain object, returns empty fields with the original reply under `raw`.
+ */
+function parseProductData(rawResponse: unknown): ProductData {
+  const fallback: ProductData = {
+    name: "",
+    price: "",
+    description: "",
+    availability: "",
+    raw: rawResponse,
+  };
+
+  if (typeof rawResponse !== "string") {
+    return fallback;
+  }
+
+  let parsed: unknown;
+  try {
+    // The model may wrap the JSON in prose or a markdown fence, so take the
+    // span from the first "{" to the last "}" when there is one.
+    const jsonMatch = rawResponse.match(/\{[\s\S]*\}/);
+    parsed = JSON.parse(jsonMatch ? jsonMatch[0] : rawResponse);
+  } catch {
+    return fallback;
+  }
+
+  // Valid JSON is not necessarily an object (null, numbers, arrays, strings).
+  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
+    return fallback;
+  }
+
+  const fields = parsed as Record<string, unknown>;
+  return {
+    ...fields,
+    name: toFieldText(fields.name),
+    price: toFieldText(fields.price),
+    description: toFieldText(fields.description),
+    availability: toFieldText(fields.availability),
+  };
+}
+
+export default {
+  /**
+   * Renders the page named by the `?url=` query parameter in a browser, asks
+   * Workers AI to extract product fields from its HTML, and returns them as JSON.
+   *
+   * @param request - Incoming request; must carry an absolute http(s) `url` query parameter.
+   * @param env - Bindings: `MYBROWSER` for puppeteer and `AI` for the model call.
+   * @returns 200 with `{ url, product, extractedAt }`; 400 when `url` is
+   *   missing or is not an absolute http(s) URL; 500 with `{ error }` when
+   *   launching the browser, scraping, or the model call throws.
+   */
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { searchParams } = new URL(request.url);
+    const url = searchParams.get("url");
+
+    if (!url) {
+      return new Response("Missing ?url parameter", { status: 400 });
+    }
+
+    // The URL is caller-supplied and is handed to a real browser, so only
+    // absolute http(s) targets are let through.
+    let target: URL;
+    try {
+      target = new URL(url);
+    } catch {
+      return new Response("Invalid ?url parameter", { status: 400 });
+    }
+    if (target.protocol !== "http:" && target.protocol !== "https:") {
+      return new Response("Only http(s) URLs are supported", { status: 400 });
+    }
+
+    // Holds the browser only while it still needs closing; `finally` checks it.
+    let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;
+
+    try {
+      // Step 1: Scrape page content with browser
+      browser = await puppeteer.launch(env.MYBROWSER);
+      const page = await browser.newPage();
+
+      await page.goto(url, {
+        waitUntil: "networkidle0",
+        timeout: 30000,
+      });
+
+      // Extract raw HTML content
+      const bodyContent = await page.$eval("body", (el) => el.innerHTML);
+
+      // Release the browser before the model call. The variable is cleared
+      // first so `finally` cannot close the same browser a second time.
+      const finishedBrowser = browser;
+      browser = undefined;
+      await finishedBrowser.close();
+
+      // Only the first MAX_HTML_CHARS characters of the HTML go into the prompt
+      const truncatedContent = bodyContent.slice(0, MAX_HTML_CHARS);
+
+      // Step 2: Extract structured data with AI
+      const aiResponse = await env.AI.run("@cf/meta/llama-3.1-8b-instruct", {
+        messages: [
+          {
+            role: "system",
+            content:
+              "You are a data extraction assistant. Extract product information from HTML and return ONLY valid JSON.",
+          },
+          {
+            role: "user",
+            content: `Extract product information from this HTML. Return JSON with these fields: name, price, description, availability. If any field is not found, use empty string.\n\nHTML:\n${truncatedContent}`,
+          },
+        ],
+        stream: false,
+      });
+
+      // Read `response` without assuming the shape of the model output.
+      const aiResult: unknown = aiResponse;
+      const rawResponse =
+        typeof aiResult === "object" && aiResult !== null && "response" in aiResult
+          ? aiResult.response
+          : undefined;
+
+      return Response.json({
+        url,
+        product: parseProductData(rawResponse),
+        extractedAt: new Date().toISOString(),
+      });
+    } catch (error) {
+      return Response.json(
+        {
+          error: error instanceof Error ? error.message : "AI-enhanced scraping failed",
+        },
+        { status: 500 }
+      );
+    } finally {
+      if (browser) {
+        // Best-effort cleanup on the failure path: a throwing close() must not
+        // replace the response already chosen above.
+        try {
+          await browser.close();
+        } catch (closeError) {
+          console.warn("Failed to close browser session", closeError);
+        }
+      }
+    }
+  },
+};
+
+/**
+ * Setup:
+ *   Add AI binding to wrangler.jsonc:
+ *   {
+ *     "browser": { "binding": "MYBROWSER" },
+ *     "ai": { "binding": "AI" },
+ *     "compatibility_flags": ["nodejs_compat"]
+ *   }
+ *
+ * Usage:
+ *   GET /?url=https://example.com/product
+ *
+ * Response:
+ *   {
+ *     "url": "https://example.com/product",
+ *     "product": {
+ *       "name": "Example Product",
+ *       "price": "$99.99",
+ *       "description": "Product description...",
+ *       "availability": "In Stock"
+ *     },
+ *     "extractedAt": "2025-10-22T12:34:56.789Z"
+ *   }
+ *
+ * Benefits:
+ * - No need to write custom CSS selectors for each site
+ * - AI adapts to different page structures
+ * - Extracts semantic information, not just raw HTML
+ * - Handles variations in HTML structure
+ *
+ * Limitations:
+ * - Only the first 4000 characters of the page's <body> HTML are sent to the model; data further down the page is not seen
+ * - May hallucinate if data not present
+ * - Requires AI binding (uses neurons quota)
+ *
+ * See also:
+ * - cloudflare-workers-ai skill for more AI patterns
+ * - web-scraper-basic.ts for traditional CSS selector approach
+ */