Initial commit
templates/ai-enhanced-scraper.ts (new file, 139 lines)

// AI-Enhanced Web Scraper
// Combine Browser Rendering with Workers AI to extract structured data intelligently

import puppeteer from "@cloudflare/puppeteer";

interface Env {
  MYBROWSER: Fetcher;
  AI: Ai;
}

interface ProductData {
  name: string;
  price: string;
  description: string;
  availability: string;
  [key: string]: any;
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url");

    if (!url) {
      return new Response("Missing ?url parameter", { status: 400 });
    }

    // Step 1: Scrape page content with browser
    const browser = await puppeteer.launch(env.MYBROWSER);

    try {
      const page = await browser.newPage();

      await page.goto(url, {
        waitUntil: "networkidle0",
        timeout: 30000,
      });

      // Extract raw HTML content
      const bodyContent = await page.$eval("body", (el) => el.innerHTML);

      await browser.close();

      // Truncate to fit AI context (4000 chars)
      const truncatedContent = bodyContent.slice(0, 4000);

      // Step 2: Extract structured data with AI
      const aiResponse = await env.AI.run("@cf/meta/llama-3.1-8b-instruct", {
        messages: [
          {
            role: "system",
            content:
              "You are a data extraction assistant. Extract product information from HTML and return ONLY valid JSON.",
          },
          {
            role: "user",
            content: `Extract product information from this HTML. Return JSON with these fields: name, price, description, availability. If any field is not found, use empty string.\n\nHTML:\n${truncatedContent}`,
          },
        ],
        stream: false,
      });

      // Parse AI response
      let productData: ProductData;
      try {
        const responseText = (aiResponse as any).response;
        // Try to extract JSON from response (AI might wrap it in markdown)
        const jsonMatch = responseText.match(/\{[\s\S]*\}/);
        if (jsonMatch) {
          productData = JSON.parse(jsonMatch[0]);
        } else {
          productData = JSON.parse(responseText);
        }
      } catch {
        productData = {
          name: "",
          price: "",
          description: "",
          availability: "",
          raw: (aiResponse as any).response,
        };
      }

      return Response.json({
        url,
        product: productData,
        extractedAt: new Date().toISOString(),
      });
    } catch (error) {
      // The browser may already be closed if the failure happened after scraping,
      // so ignore errors from a second close.
      try {
        await browser.close();
      } catch {}
      return Response.json(
        {
          error: error instanceof Error ? error.message : "AI-enhanced scraping failed",
        },
        { status: 500 }
      );
    }
  },
};

/**
 * Setup:
 * Add the AI binding to wrangler.jsonc:
 * {
 *   "browser": { "binding": "MYBROWSER" },
 *   "ai": { "binding": "AI" },
 *   "compatibility_flags": ["nodejs_compat"]
 * }
 *
 * Usage:
 *   GET /?url=https://example.com/product
 *
 * Response:
 * {
 *   "url": "https://example.com/product",
 *   "product": {
 *     "name": "Example Product",
 *     "price": "$99.99",
 *     "description": "Product description...",
 *     "availability": "In Stock"
 *   },
 *   "extractedAt": "2025-10-22T12:34:56.789Z"
 * }
 *
 * Benefits:
 * - No need to write custom CSS selectors for each site
 * - AI adapts to different page structures
 * - Extracts semantic information, not just raw HTML
 * - Handles variations in HTML structure
 *
 * Limitations:
 * - AI context limited to ~4000 chars of HTML
 * - May hallucinate if data is not present
 * - Requires AI binding (uses neurons quota)
 *
 * See also:
 * - cloudflare-workers-ai skill for more AI patterns
 * - web-scraper-basic.ts for the traditional CSS selector approach
 */
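The "~4000 chars" limit noted above is spent mostly on markup, since innerHTML is passed through verbatim. A possible refinement (a sketch, not part of this template) is to strip scripts, styles, and comments and collapse whitespace before truncating, so more of the budget carries visible text:

// Sketch: compact the HTML before truncating it for the AI prompt (hypothetical helper)
function compactHtml(html: string, maxChars = 4000): string {
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, "") // drop inline scripts
    .replace(/<style[\s\S]*?<\/style>/gi, "") // drop inline styles
    .replace(/<!--[\s\S]*?-->/g, "") // drop HTML comments
    .replace(/\s+/g, " ") // collapse whitespace
    .slice(0, maxChars);
}

// In the template, this would replace bodyContent.slice(0, 4000):
// const truncatedContent = compactHtml(bodyContent);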
templates/basic-screenshot.ts (new file, 76 lines)

// Basic Screenshot Example
// Minimal example for taking screenshots with Cloudflare Browser Rendering

import puppeteer from "@cloudflare/puppeteer";

interface Env {
  MYBROWSER: Fetcher;
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url");

    if (!url) {
      return new Response("Missing ?url parameter. Example: ?url=https://example.com", {
        status: 400,
      });
    }

    let normalizedUrl: string;
    try {
      normalizedUrl = new URL(url).toString();
    } catch {
      return new Response("Invalid URL", { status: 400 });
    }

    // Launch browser
    const browser = await puppeteer.launch(env.MYBROWSER);

    try {
      // Create new page
      const page = await browser.newPage();

      // Navigate to URL
      await page.goto(normalizedUrl, {
        waitUntil: "networkidle0", // Wait for network to be idle
        timeout: 30000, // 30 second timeout
      });

      // Take screenshot
      const screenshot = await page.screenshot({
        fullPage: true, // Capture full scrollable page
        type: "png",
      });

      // Clean up
      await browser.close();

      return new Response(screenshot, {
        headers: {
          "content-type": "image/png",
          "cache-control": "public, max-age=3600", // Cache for 1 hour
        },
      });
    } catch (error) {
      // Always close browser on error
      await browser.close();
      throw error;
    }
  },
};

/**
 * Deploy:
 *   npx wrangler deploy
 *
 * Test:
 *   https://your-worker.workers.dev/?url=https://example.com
 *
 * Configuration (wrangler.jsonc):
 * {
 *   "browser": { "binding": "MYBROWSER" },
 *   "compatibility_flags": ["nodejs_compat"]
 * }
 */
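Screenshot dimensions follow Puppeteer's default viewport. If predictable sizing matters, the viewport can be fixed before navigating; a small sketch (the width and height values are arbitrary examples) that would sit just before the page.goto() call above:

// Sketch: fix the viewport before navigation so screenshots have consistent dimensions
// (place before page.goto(normalizedUrl, ...) in the template above)
await page.setViewport({ width: 1280, height: 720 });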
templates/pdf-generation.ts (new file, 127 lines)

// PDF Generation
// Generate PDFs from URLs or custom HTML content

import puppeteer from "@cloudflare/puppeteer";

interface Env {
  MYBROWSER: Fetcher;
}

interface PDFRequest {
  url?: string;
  html?: string;
  options?: {
    format?: "Letter" | "A4" | "A3" | "Legal";
    landscape?: boolean;
    margin?: {
      top?: string;
      right?: string;
      bottom?: string;
      left?: string;
    };
  };
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    if (request.method !== "POST") {
      return new Response("Method not allowed. Use POST with JSON body.", {
        status: 405,
      });
    }

    const body = await request.json<PDFRequest>();
    const { url, html, options = {} } = body;

    if (!url && !html) {
      return new Response('Missing "url" or "html" in request body', {
        status: 400,
      });
    }

    const browser = await puppeteer.launch(env.MYBROWSER);

    try {
      const page = await browser.newPage();

      // Load content
      if (html) {
        await page.setContent(html, { waitUntil: "networkidle0" });
      } else if (url) {
        await page.goto(url, {
          waitUntil: "networkidle0",
          timeout: 30000,
        });
      }

      // Generate PDF
      const pdf = await page.pdf({
        format: options.format || "A4",
        landscape: options.landscape || false,
        printBackground: true, // Include background colors/images
        margin: options.margin || {
          top: "1cm",
          right: "1cm",
          bottom: "1cm",
          left: "1cm",
        },
      });

      await browser.close();

      // Generate filename
      const filename = url
        ? `${new URL(url).hostname.replace(/\./g, "_")}.pdf`
        : "document.pdf";

      return new Response(pdf, {
        headers: {
          "content-type": "application/pdf",
          "content-disposition": `attachment; filename="${filename}"`,
        },
      });
    } catch (error) {
      await browser.close();
      return new Response(
        JSON.stringify({
          error: error instanceof Error ? error.message : "PDF generation failed",
        }),
        {
          status: 500,
          headers: { "content-type": "application/json" },
        }
      );
    }
  },
};

/**
 * Usage Examples:
 *
 * 1. PDF from URL:
 *    POST /
 *    Content-Type: application/json
 *    {
 *      "url": "https://example.com"
 *    }
 *
 * 2. PDF from custom HTML:
 *    POST /
 *    {
 *      "html": "<!DOCTYPE html><html><body><h1>Invoice</h1></body></html>"
 *    }
 *
 * 3. PDF with custom options:
 *    POST /
 *    {
 *      "url": "https://example.com",
 *      "options": {
 *        "format": "Letter",
 *        "landscape": true,
 *        "margin": {
 *          "top": "2cm",
 *          "bottom": "2cm"
 *        }
 *      }
 *    }
 */
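For reference, calling this endpoint from another Worker or script looks like the sketch below; the worker URL is a placeholder:

// Sketch: hypothetical client for the PDF endpoint above (URL is a placeholder)
async function requestPdf(): Promise<ArrayBuffer> {
  const res = await fetch("https://your-worker.workers.dev/", {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      url: "https://example.com",
      options: { format: "A4", landscape: false },
    }),
  });
  if (!res.ok) {
    throw new Error(`PDF request failed with status ${res.status}`);
  }
  return res.arrayBuffer(); // application/pdf bytes
}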
templates/playwright-example.ts (new file, 99 lines)

// Playwright Example
// Alternative to Puppeteer using @cloudflare/playwright

import { chromium } from "@cloudflare/playwright";

interface Env {
  BROWSER: Fetcher;
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url") || "https://example.com";

    // Launch browser (note: chromium.launch instead of puppeteer.launch)
    const browser = await chromium.launch(env.BROWSER);

    try {
      // Create new page
      const page = await browser.newPage();

      // Navigate to URL
      await page.goto(url, {
        waitUntil: "networkidle",
        timeout: 30000,
      });

      // Take screenshot
      const screenshot = await page.screenshot({
        fullPage: true,
        type: "png",
      });

      // Clean up
      await browser.close();

      return new Response(screenshot, {
        headers: {
          "content-type": "image/png",
          "cache-control": "public, max-age=3600",
        },
      });
    } catch (error) {
      await browser.close();
      return new Response(
        JSON.stringify({
          error: error instanceof Error ? error.message : "Screenshot failed",
        }),
        {
          status: 500,
          headers: { "content-type": "application/json" },
        }
      );
    }
  },
};

/**
 * Playwright vs Puppeteer:
 *
 * Similarities:
 * - Very similar API (page.goto, page.screenshot, etc.)
 * - Both support Chromium on Workers
 * - Same use cases (screenshots, PDFs, scraping)
 *
 * Differences:
 *
 * | Feature            | Puppeteer                                        | Playwright                                           |
 * |--------------------|--------------------------------------------------|------------------------------------------------------|
 * | Import             | `import puppeteer from "@cloudflare/puppeteer"`  | `import { chromium } from "@cloudflare/playwright"`  |
 * | Launch             | `puppeteer.launch(env.MYBROWSER)`                | `chromium.launch(env.BROWSER)`                       |
 * | Session Management | ✅ Advanced (sessions, history, limits)           | ⚠️ Basic                                              |
 * | Auto-waiting       | Manual waitForSelector()                         | Built-in auto-waiting                                 |
 * | Selectors          | CSS only                                         | CSS, text, XPath (via workaround)                     |
 * | Version            | @cloudflare/puppeteer@1.0.4                      | @cloudflare/playwright@1.0.0                          |
 *
 * When to use Playwright:
 * - Already using Playwright for testing
 * - Prefer auto-waiting behavior
 * - Don't need advanced session management
 *
 * When to use Puppeteer:
 * - Need session reuse for performance
 * - Want to check limits before launching
 * - More familiar with the Puppeteer API
 *
 * Installation:
 *   npm install @cloudflare/playwright
 *
 * Configuration (wrangler.jsonc):
 * {
 *   "browser": { "binding": "BROWSER" },
 *   "compatibility_flags": ["nodejs_compat"]
 * }
 *
 * Recommendation:
 *   Stick with Puppeteer for most use cases unless you have
 *   existing Playwright tests to migrate.
 */
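The auto-waiting row in the table can be made concrete with a locator. A sketch, assuming @cloudflare/playwright exposes the upstream Playwright locator API:

// Sketch: locators wait for the element automatically, so no explicit
// waitForSelector() call is needed (assumes the upstream Playwright locator API)
const page = await browser.newPage();
await page.goto(url, { waitUntil: "networkidle" });
const heading = await page.locator("h1").first().textContent();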
templates/screenshot-with-kv-cache.ts (new file, 107 lines)

// Screenshot with KV Caching
// Production-ready screenshot service with KV caching to reduce browser usage

import puppeteer from "@cloudflare/puppeteer";

interface Env {
  MYBROWSER: Fetcher;
  SCREENSHOT_CACHE: KVNamespace;
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url");
    const refresh = searchParams.get("refresh") === "true";

    if (!url) {
      return new Response("Missing ?url parameter", { status: 400 });
    }

    const normalizedUrl = new URL(url).toString();

    // Check cache (unless refresh requested)
    if (!refresh) {
      const cached = await env.SCREENSHOT_CACHE.get(normalizedUrl, {
        type: "arrayBuffer",
      });

      if (cached) {
        return new Response(cached, {
          headers: {
            "content-type": "image/png",
            "x-cache": "HIT",
            "cache-control": "public, max-age=3600",
          },
        });
      }
    }

    // Generate screenshot
    const browser = await puppeteer.launch(env.MYBROWSER);

    try {
      const page = await browser.newPage();

      await page.goto(normalizedUrl, {
        waitUntil: "networkidle0",
        timeout: 30000,
      });

      const screenshot = await page.screenshot({
        fullPage: true,
        type: "png",
      });

      await browser.close();

      // Cache for 24 hours
      await env.SCREENSHOT_CACHE.put(normalizedUrl, screenshot, {
        expirationTtl: 60 * 60 * 24, // 24 hours
      });

      return new Response(screenshot, {
        headers: {
          "content-type": "image/png",
          "x-cache": "MISS",
          "cache-control": "public, max-age=3600",
        },
      });
    } catch (error) {
      await browser.close();
      return new Response(
        JSON.stringify({
          error: error instanceof Error ? error.message : "Screenshot failed",
        }),
        {
          status: 500,
          headers: { "content-type": "application/json" },
        }
      );
    }
  },
};

/**
 * Setup:
 * 1. Create KV namespaces:
 *    npx wrangler kv namespace create SCREENSHOT_CACHE
 *    npx wrangler kv namespace create SCREENSHOT_CACHE --preview
 *
 * 2. Add to wrangler.jsonc:
 *    {
 *      "browser": { "binding": "MYBROWSER" },
 *      "compatibility_flags": ["nodejs_compat"],
 *      "kv_namespaces": [
 *        {
 *          "binding": "SCREENSHOT_CACHE",
 *          "id": "YOUR_KV_ID",
 *          "preview_id": "YOUR_PREVIEW_ID"
 *        }
 *      ]
 *    }
 *
 * Usage:
 *   New screenshot: ?url=https://example.com
 *   Force refresh:  ?url=https://example.com&refresh=true
 */
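One caveat for this pattern: KV keys are limited to 512 bytes, so very long URLs cannot be used directly as cache keys. A sketch of deriving a fixed-length key with the Web Crypto API available in Workers:

// Sketch: derive a fixed-length cache key (KV keys max out at 512 bytes)
async function cacheKeyFor(url: string): Promise<string> {
  const digest = await crypto.subtle.digest("SHA-256", new TextEncoder().encode(url));
  const hex = [...new Uint8Array(digest)]
    .map((b) => b.toString(16).padStart(2, "0"))
    .join("");
  return `screenshot:${hex}`;
}

// Usage in the template: const key = await cacheKeyFor(normalizedUrl);
// then pass `key` to SCREENSHOT_CACHE.get() / .put() instead of the raw URL.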
templates/session-reuse.ts (new file, 118 lines)

// Session Reuse Pattern
// Optimize performance by reusing browser sessions instead of launching new ones

import puppeteer, { Browser } from "@cloudflare/puppeteer";

interface Env {
  MYBROWSER: Fetcher;
}

/**
 * Get or create a browser instance.
 * Tries to connect to an existing session first; launches a new one if needed.
 */
async function getBrowser(env: Env): Promise<{ browser: Browser; launched: boolean }> {
  // Check for available sessions
  const sessions = await puppeteer.sessions(env.MYBROWSER);

  // Find sessions without active connections
  const freeSessions = sessions.filter((s) => !s.connectionId);

  if (freeSessions.length > 0) {
    // Try to connect to an existing session
    try {
      console.log("Connecting to existing session:", freeSessions[0].sessionId);
      const browser = await puppeteer.connect(env.MYBROWSER, freeSessions[0].sessionId);
      return { browser, launched: false };
    } catch (error) {
      console.log("Failed to connect, launching new browser:", error);
    }
  }

  // Check limits before launching
  const limits = await puppeteer.limits(env.MYBROWSER);
  if (limits.allowedBrowserAcquisitions === 0) {
    throw new Error(
      `Rate limit reached. Retry after ${limits.timeUntilNextAllowedBrowserAcquisition}ms`
    );
  }

  // Launch new session
  console.log("Launching new browser session");
  const browser = await puppeteer.launch(env.MYBROWSER);
  return { browser, launched: true };
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url") || "https://example.com";

    try {
      // Get or create browser
      const { browser, launched } = await getBrowser(env);
      const sessionId = browser.sessionId();

      console.log({
        sessionId,
        launched,
        message: launched ? "New browser launched" : "Reused existing session",
      });

      // Do work
      const page = await browser.newPage();
      await page.goto(url, {
        waitUntil: "networkidle0",
        timeout: 30000,
      });

      const screenshot = await page.screenshot();
      await page.close();

      // IMPORTANT: Disconnect (don't close) to keep the session alive for reuse
      await browser.disconnect();

      return new Response(screenshot, {
        headers: {
          "content-type": "image/png",
          "x-session-id": sessionId,
          "x-session-reused": launched ? "false" : "true",
        },
      });
    } catch (error) {
      return new Response(
        JSON.stringify({
          error: error instanceof Error ? error.message : "Unknown error",
        }),
        {
          status: 500,
          headers: { "content-type": "application/json" },
        }
      );
    }
  },
};

/**
 * Key Concepts:
 *
 * 1. puppeteer.sessions()  - List all active sessions
 * 2. puppeteer.connect()   - Connect to an existing session
 * 3. browser.disconnect()  - Disconnect WITHOUT closing (keeps session alive)
 * 4. browser.close()       - Terminate the session completely
 * 5. puppeteer.limits()    - Check rate limits before launching
 *
 * Benefits:
 * - Faster response times (no cold start)
 * - Lower concurrency usage
 * - Better resource utilization
 *
 * Trade-offs:
 * - Sessions time out after 60s idle (extend with keep_alive)
 * - Must handle connection failures gracefully
 * - Need to track which sessions are available
 *
 * Response Headers:
 * - x-session-id: Browser session ID
 * - x-session-reused: true if an existing session was reused
 */
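The keep_alive option mentioned under Trade-offs is passed at launch time. A sketch, assuming the option takes a millisecond value as described in the Browser Rendering docs:

// Sketch: extend the idle timeout beyond the default 60s so the session stays
// available for reuse (assumes keep_alive takes milliseconds)
const browser = await puppeteer.launch(env.MYBROWSER, {
  keep_alive: 600000, // keep the session alive for up to 10 minutes
});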
templates/web-scraper-basic.ts (new file, 116 lines)

// Basic Web Scraper
// Extract structured data from web pages

import puppeteer from "@cloudflare/puppeteer";

interface Env {
  MYBROWSER: Fetcher;
}

interface ScrapedData {
  url: string;
  title: string;
  description: string;
  headings: string[];
  links: Array<{ text: string; href: string }>;
  images: Array<{ alt: string; src: string }>;
  timestamp: string;
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url");

    if (!url) {
      return new Response("Missing ?url parameter", { status: 400 });
    }

    const normalizedUrl = new URL(url).toString();
    const browser = await puppeteer.launch(env.MYBROWSER);

    try {
      const page = await browser.newPage();

      // Navigate to page
      await page.goto(normalizedUrl, {
        waitUntil: "networkidle0",
        timeout: 30000,
      });

      // Wait for body to be present
      await page.waitForSelector("body");

      // Extract structured data
      const data = await page.evaluate((): ScrapedData => {
        // Get all headings
        const headings = Array.from(document.querySelectorAll("h1, h2, h3")).map(
          (el) => el.textContent?.trim() || ""
        );

        // Get all links
        const links = Array.from(document.querySelectorAll("a"))
          .filter((a) => a.href)
          .map((a) => ({
            text: a.textContent?.trim() || "",
            href: a.href,
          }))
          .slice(0, 50); // Limit to first 50 links

        // Get all images
        const images = Array.from(document.querySelectorAll("img"))
          .filter((img) => img.src)
          .map((img) => ({
            alt: img.alt || "",
            src: img.src,
          }))
          .slice(0, 20); // Limit to first 20 images

        return {
          url: window.location.href,
          title: document.title,
          description:
            document.querySelector('meta[name="description"]')?.getAttribute("content") ||
            "",
          headings,
          links,
          images,
          timestamp: new Date().toISOString(),
        };
      });

      await browser.close();

      return Response.json(data, {
        headers: {
          "cache-control": "public, max-age=3600",
        },
      });
    } catch (error) {
      await browser.close();
      return Response.json(
        {
          error: error instanceof Error ? error.message : "Scraping failed",
          url: normalizedUrl,
        },
        { status: 500 }
      );
    }
  },
};

/**
 * Usage:
 *   GET /?url=https://example.com
 *
 * Response:
 * {
 *   "url": "https://example.com",
 *   "title": "Example Domain",
 *   "description": "...",
 *   "headings": ["Example Domain"],
 *   "links": [{ "text": "More information...", "href": "..." }],
 *   "images": [],
 *   "timestamp": "2025-10-22T12:34:56.789Z"
 * }
 */
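Unlike basic-screenshot.ts above, this template calls new URL(url) without a guard, so a malformed ?url value throws before a response is returned. The same validation used there would apply here:

// Sketch: validate the URL before launching the browser (mirrors basic-screenshot.ts)
let normalizedUrl: string;
try {
  normalizedUrl = new URL(url).toString();
} catch {
  return new Response("Invalid URL", { status: 400 });
}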
templates/web-scraper-batch.ts (new file, 138 lines)

// Batch Web Scraper
// Scrape multiple URLs efficiently using browser tabs

import puppeteer, { Browser } from "@cloudflare/puppeteer";

interface Env {
  MYBROWSER: Fetcher;
}

interface ScrapeResult {
  url: string;
  success: boolean;
  data?: {
    title: string;
    description: string;
    textContent: string; // First 500 chars
  };
  error?: string;
}

async function scrapePage(browser: Browser, url: string): Promise<ScrapeResult> {
  const page = await browser.newPage();

  try {
    await page.goto(url, {
      waitUntil: "networkidle0",
      timeout: 30000,
    });

    const data = await page.evaluate(() => ({
      title: document.title,
      description:
        document.querySelector('meta[name="description"]')?.getAttribute("content") || "",
      textContent: document.body.innerText.slice(0, 500), // First 500 chars
    }));

    await page.close();

    return {
      url,
      success: true,
      data,
    };
  } catch (error) {
    await page.close();

    return {
      url,
      success: false,
      error: error instanceof Error ? error.message : "Unknown error",
    };
  }
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    if (request.method !== "POST") {
      return new Response("Method not allowed. Use POST with JSON body.", {
        status: 405,
      });
    }

    const { urls } = await request.json<{ urls: string[] }>();

    if (!urls || !Array.isArray(urls) || urls.length === 0) {
      return new Response('Missing "urls" array in request body', {
        status: 400,
      });
    }

    // Limit batch size
    if (urls.length > 20) {
      return new Response("Maximum 20 URLs per batch", { status: 400 });
    }

    // Launch single browser
    const browser = await puppeteer.launch(env.MYBROWSER);

    try {
      // Scrape all URLs in parallel (each in its own tab)
      const results = await Promise.all(urls.map((url) => scrapePage(browser, url)));

      await browser.close();

      const summary = {
        total: results.length,
        successful: results.filter((r) => r.success).length,
        failed: results.filter((r) => !r.success).length,
      };

      return Response.json({
        summary,
        results,
      });
    } catch (error) {
      await browser.close();
      return Response.json(
        {
          error: error instanceof Error ? error.message : "Batch scraping failed",
        },
        { status: 500 }
      );
    }
  },
};

/**
 * Usage:
 *   POST /
 *   Content-Type: application/json
 *   {
 *     "urls": [
 *       "https://example.com",
 *       "https://example.org",
 *       "https://example.net"
 *     ]
 *   }
 *
 * Response:
 * {
 *   "summary": {
 *     "total": 3,
 *     "successful": 3,
 *     "failed": 0
 *   },
 *   "results": [
 *     {
 *       "url": "https://example.com",
 *       "success": true,
 *       "data": { "title": "...", "description": "...", "textContent": "..." }
 *     }
 *   ]
 * }
 *
 * Note: Uses one browser with multiple tabs instead of multiple browsers.
 * This reduces concurrency usage and is more efficient.
 */
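Opening up to 20 tabs at once can still be heavy for a single browser instance. A possible refinement (a sketch, not part of this template) is to process the URLs in small chunks:

// Sketch: scrape in chunks of N tabs at a time instead of all at once
async function scrapeInChunks(
  browser: Browser,
  urls: string[],
  chunkSize = 5
): Promise<ScrapeResult[]> {
  const results: ScrapeResult[] = [];
  for (let i = 0; i < urls.length; i += chunkSize) {
    const chunk = urls.slice(i, i + chunkSize);
    results.push(...(await Promise.all(chunk.map((u) => scrapePage(browser, u)))));
  }
  return results;
}

// Usage in the template: const results = await scrapeInChunks(browser, urls);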
templates/wrangler-browser-config.jsonc (new file, 116 lines)

// Complete wrangler.jsonc configuration for Browser Rendering
{
  "name": "browser-worker",
  "main": "src/index.ts",
  "compatibility_date": "2023-03-14",

  // REQUIRED: nodejs_compat flag for Browser Rendering
  "compatibility_flags": [
    "nodejs_compat"
  ],

  // Browser binding (required)
  "browser": {
    "binding": "MYBROWSER"
    // Optional: Use a real headless browser during local development
    // "remote": true
  },

  // Optional: KV for caching screenshots/PDFs
  // Create with: npx wrangler kv namespace create SCREENSHOT_CACHE
  //              npx wrangler kv namespace create SCREENSHOT_CACHE --preview
  "kv_namespaces": [
    {
      "binding": "SCREENSHOT_CACHE",
      "id": "YOUR_KV_ID", // Replace with actual ID
      "preview_id": "YOUR_PREVIEW_ID" // Replace with actual preview ID
    }
  ],

  // Optional: R2 for storing generated files
  // Create with: npx wrangler r2 bucket create browser-files
  "r2_buckets": [
    {
      "binding": "BROWSER_FILES",
      "bucket_name": "browser-files"
    }
  ],

  // Optional: AI binding for AI-enhanced scraping
  "ai": {
    "binding": "AI"
  },

  // Optional: D1 for storing scraping results
  // Create with: npx wrangler d1 create browser-db
  "d1_databases": [
    {
      "binding": "DB",
      "database_name": "browser-db",
      "database_id": "YOUR_DB_ID"
    }
  ],

  // Optional: Environment variables
  "vars": {
    "ENVIRONMENT": "production"
  }

  // Secrets are not declared in this file; set them from the CLI:
  //   npx wrangler secret put SECRET_NAME

  // Optional: Custom routes for production
  // "routes": [
  //   {
  //     "pattern": "browser.example.com/*",
  //     "zone_name": "example.com"
  //   }
  // ]
}

/**
 * Key Configuration Notes:
 *
 * 1. nodejs_compat flag is REQUIRED
 *    - Browser Rendering needs Node.js APIs
 *    - Automatically enables nodejs_compat_v2 if compatibility_date >= 2024-09-23
 *
 * 2. Browser binding name
 *    - Use "MYBROWSER" or any name you prefer
 *    - Reference in code: env.MYBROWSER
 *
 * 3. Remote binding for local development
 *    - "remote": true connects to a real headless browser
 *    - Useful if hitting the 1MB request limit in local dev
 *    - Remove for production (not needed)
 *
 * 4. KV for caching
 *    - Highly recommended for production screenshot services
 *    - Reduces browser usage and costs
 *    - Cache TTL: typically 1-24 hours
 *
 * 5. R2 for file storage
 *    - Store generated PDFs or screenshots long-term
 *    - Cheaper than KV for large files
 *    - Use presigned URLs for downloads
 *
 * 6. AI binding
 *    - Optional: for AI-enhanced scraping
 *    - Requires Workers Paid plan
 *    - See cloudflare-workers-ai skill
 *
 * 7. D1 database
 *    - Optional: store scraping metadata
 *    - Track URLs, timestamps, status
 *    - See cloudflare-d1 skill
 *
 * Commands:
 *   npx wrangler dev      # Local development
 *   npx wrangler deploy   # Deploy to production
 *   npx wrangler tail     # View logs
 *
 * See also:
 * - cloudflare-worker-base skill for complete Worker setup
 * - cloudflare-kv skill for KV caching patterns
 * - cloudflare-r2 skill for R2 storage patterns
 */
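For reference, a Worker using every binding declared in this configuration would pair it with an Env interface along these lines (a sketch; the binding names come from the config above):

// Sketch: Env interface matching the bindings declared in wrangler-browser-config.jsonc
interface Env {
  MYBROWSER: Fetcher; // "browser" binding
  SCREENSHOT_CACHE: KVNamespace; // "kv_namespaces"
  BROWSER_FILES: R2Bucket; // "r2_buckets"
  AI: Ai; // "ai" binding
  DB: D1Database; // "d1_databases"
  ENVIRONMENT: string; // "vars"
}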