Initial commit

Zhongwei Li
2025-11-30 08:24:08 +08:00
commit 7c90a3ac2b
18 changed files with 4579 additions and 0 deletions


@@ -0,0 +1,139 @@
// AI-Enhanced Web Scraper
// Combine Browser Rendering with Workers AI to extract structured data intelligently
import puppeteer from "@cloudflare/puppeteer";
interface Env {
MYBROWSER: Fetcher;
AI: Ai;
}
interface ProductData {
name: string;
price: string;
description: string;
availability: string;
[key: string]: any;
}
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const { searchParams } = new URL(request.url);
const url = searchParams.get("url");
if (!url) {
return new Response("Missing ?url parameter", { status: 400 });
}
// Step 1: Scrape page content with browser
const browser = await puppeteer.launch(env.MYBROWSER);
try {
const page = await browser.newPage();
await page.goto(url, {
waitUntil: "networkidle0",
timeout: 30000,
});
// Extract raw HTML content
const bodyContent = await page.$eval("body", (el) => el.innerHTML);
await browser.close();
// Truncate to fit AI context (4000 chars)
const truncatedContent = bodyContent.slice(0, 4000);
// Step 2: Extract structured data with AI
const aiResponse = await env.AI.run("@cf/meta/llama-3.1-8b-instruct", {
messages: [
{
role: "system",
content:
"You are a data extraction assistant. Extract product information from HTML and return ONLY valid JSON.",
},
{
role: "user",
content: `Extract product information from this HTML. Return JSON with these fields: name, price, description, availability. If any field is not found, use empty string.\n\nHTML:\n${truncatedContent}`,
},
],
stream: false,
});
// Parse AI response
let productData: ProductData;
try {
const responseText = (aiResponse as any).response;
// Try to extract JSON from response (AI might wrap it in markdown)
const jsonMatch = responseText.match(/\{[\s\S]*\}/);
if (jsonMatch) {
productData = JSON.parse(jsonMatch[0]);
} else {
productData = JSON.parse(responseText);
}
} catch {
productData = {
name: "",
price: "",
description: "",
availability: "",
raw: (aiResponse as any).response,
};
}
return Response.json({
url,
product: productData,
extractedAt: new Date().toISOString(),
});
} catch (error) {
await browser.close();
return Response.json(
{
error: error instanceof Error ? error.message : "AI-enhanced scraping failed",
},
{ status: 500 }
);
}
},
};
/**
* Setup:
* Add AI binding to wrangler.jsonc:
* {
* "browser": { "binding": "MYBROWSER" },
* "ai": { "binding": "AI" },
* "compatibility_flags": ["nodejs_compat"]
* }
*
* Usage:
* GET /?url=https://example.com/product
*
* Response:
* {
* "url": "https://example.com/product",
* "product": {
* "name": "Example Product",
* "price": "$99.99",
* "description": "Product description...",
* "availability": "In Stock"
* },
* "extractedAt": "2025-10-22T12:34:56.789Z"
* }
*
* Benefits:
* - No need to write custom CSS selectors for each site
* - AI adapts to different page structures
* - Extracts semantic information, not just raw HTML
* - Handles variations in HTML structure
*
* Limitations:
 * - AI context limited to ~4000 chars of HTML (see the pre-cleaning sketch after this comment)
* - May hallucinate if data not present
* - Requires AI binding (uses neurons quota)
*
* See also:
* - cloudflare-workers-ai skill for more AI patterns
* - web-scraper-basic.ts for traditional CSS selector approach
*/
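
One way to soften the ~4000-character limit noted above is to strip bulky markup before truncating, so more of the budget goes to visible product content. A minimal sketch; reduceHtml is a hypothetical helper, not part of the template above:

// Hypothetical helper (not in the template above): strip bulky markup before
// truncation so more of the ~4000-char AI budget covers product content.
function reduceHtml(html: string, limit = 4000): string {
const cleaned = html
.replace(/<(script|style|svg|noscript)[\s\S]*?<\/\1>/gi, "") // drop script/style/svg blocks
.replace(/<!--[\s\S]*?-->/g, "") // drop HTML comments
.replace(/\s+/g, " "); // collapse whitespace runs
return cleaned.slice(0, limit);
}
// Usage inside the handler, replacing the plain slice:
// const truncatedContent = reduceHtml(bodyContent);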


@@ -0,0 +1,76 @@
// Basic Screenshot Example
// Minimal example for taking screenshots with Cloudflare Browser Rendering
import puppeteer from "@cloudflare/puppeteer";
interface Env {
MYBROWSER: Fetcher;
}
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const { searchParams } = new URL(request.url);
const url = searchParams.get("url");
if (!url) {
return new Response("Missing ?url parameter. Example: ?url=https://example.com", {
status: 400,
});
}
let normalizedUrl: string;
try {
normalizedUrl = new URL(url).toString();
} catch {
return new Response("Invalid URL", { status: 400 });
}
// Launch browser
const browser = await puppeteer.launch(env.MYBROWSER);
try {
// Create new page
const page = await browser.newPage();
// Navigate to URL
await page.goto(normalizedUrl, {
waitUntil: "networkidle0", // Wait for network to be idle
timeout: 30000, // 30 second timeout
});
// Take screenshot
const screenshot = await page.screenshot({
fullPage: true, // Capture full scrollable page
type: "png",
});
// Clean up
await browser.close();
return new Response(screenshot, {
headers: {
"content-type": "image/png",
"cache-control": "public, max-age=3600", // Cache for 1 hour
},
});
} catch (error) {
// Always close browser on error
await browser.close();
throw error;
}
},
};
/**
* Deploy:
* npx wrangler deploy
*
* Test:
* https://your-worker.workers.dev/?url=https://example.com
*
* Configuration (wrangler.jsonc):
* {
* "browser": { "binding": "MYBROWSER" },
* "compatibility_flags": ["nodejs_compat"]
* }
*/
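
If a fixed-size capture is preferable to a full-page one, a viewport can be set before taking the screenshot. A minimal variation of the template above; the dimensions are arbitrary assumptions:

// Optional variation: capture a fixed 1280x720 viewport instead of the full page.
await page.setViewport({ width: 1280, height: 720 });
const screenshot = await page.screenshot({
fullPage: false,
type: "png",
});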

templates/pdf-generation.ts

@@ -0,0 +1,127 @@
// PDF Generation
// Generate PDFs from URLs or custom HTML content
import puppeteer from "@cloudflare/puppeteer";
interface Env {
MYBROWSER: Fetcher;
}
interface PDFRequest {
url?: string;
html?: string;
options?: {
format?: "Letter" | "A4" | "A3" | "Legal";
landscape?: boolean;
margin?: {
top?: string;
right?: string;
bottom?: string;
left?: string;
};
};
}
export default {
async fetch(request: Request, env: Env): Promise<Response> {
if (request.method !== "POST") {
return new Response("Method not allowed. Use POST with JSON body.", {
status: 405,
});
}
const body = await request.json<PDFRequest>();
const { url, html, options = {} } = body;
if (!url && !html) {
return new Response('Missing "url" or "html" in request body', {
status: 400,
});
}
const browser = await puppeteer.launch(env.MYBROWSER);
try {
const page = await browser.newPage();
// Load content
if (html) {
await page.setContent(html, { waitUntil: "networkidle0" });
} else if (url) {
await page.goto(url, {
waitUntil: "networkidle0",
timeout: 30000,
});
}
// Generate PDF
const pdf = await page.pdf({
format: options.format || "A4",
landscape: options.landscape || false,
printBackground: true, // Include background colors/images
margin: options.margin || {
top: "1cm",
right: "1cm",
bottom: "1cm",
left: "1cm",
},
});
await browser.close();
// Generate filename
const filename = url
? `${new URL(url).hostname.replace(/\./g, "_")}.pdf`
: "document.pdf";
return new Response(pdf, {
headers: {
"content-type": "application/pdf",
"content-disposition": `attachment; filename="${filename}"`,
},
});
} catch (error) {
await browser.close();
return new Response(
JSON.stringify({
error: error instanceof Error ? error.message : "PDF generation failed",
}),
{
status: 500,
headers: { "content-type": "application/json" },
}
);
}
},
};
/**
* Usage Examples:
*
* 1. PDF from URL:
* POST /
* Content-Type: application/json
* {
* "url": "https://example.com"
* }
*
* 2. PDF from custom HTML:
* POST /
* {
* "html": "<!DOCTYPE html><html><body><h1>Invoice</h1></body></html>"
* }
*
* 3. PDF with custom options:
* POST /
* {
* "url": "https://example.com",
* "options": {
* "format": "Letter",
* "landscape": true,
* "margin": {
* "top": "2cm",
* "bottom": "2cm"
* }
* }
* }
*/
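
A sketch of calling this endpoint from another Worker or script; the hostname is a placeholder for your deployed Worker:

// Example client call (placeholder hostname).
const res = await fetch("https://your-worker.workers.dev/", {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
url: "https://example.com",
options: { format: "Letter", landscape: true },
}),
});
const pdfBytes = await res.arrayBuffer(); // application/pdf on success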


@@ -0,0 +1,99 @@
// Playwright Example
// Alternative to Puppeteer using @cloudflare/playwright
import { chromium } from "@cloudflare/playwright";
interface Env {
BROWSER: Fetcher;
}
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const { searchParams } = new URL(request.url);
const url = searchParams.get("url") || "https://example.com";
// Launch browser (note: chromium.launch instead of puppeteer.launch)
const browser = await chromium.launch(env.BROWSER);
try {
// Create new page
const page = await browser.newPage();
// Navigate to URL
await page.goto(url, {
waitUntil: "networkidle",
timeout: 30000,
});
// Take screenshot
const screenshot = await page.screenshot({
fullPage: true,
type: "png",
});
// Clean up
await browser.close();
return new Response(screenshot, {
headers: {
"content-type": "image/png",
"cache-control": "public, max-age=3600",
},
});
} catch (error) {
await browser.close();
return new Response(
JSON.stringify({
error: error instanceof Error ? error.message : "Screenshot failed",
}),
{
status: 500,
headers: { "content-type": "application/json" },
}
);
}
},
};
/**
* Playwright vs Puppeteer:
*
* Similarities:
* - Very similar API (page.goto, page.screenshot, etc.)
* - Both support Chromium on Workers
* - Same use cases (screenshots, PDFs, scraping)
*
* Differences:
*
* | Feature | Puppeteer | Playwright |
* |---------|-----------|------------|
* | Import | `import puppeteer from "@cloudflare/puppeteer"` | `import { chromium } from "@cloudflare/playwright"` |
* | Launch | `puppeteer.launch(env.MYBROWSER)` | `chromium.launch(env.BROWSER)` |
* | Session Management | ✅ Advanced (sessions, history, limits) | ⚠️ Basic |
* | Auto-waiting | Manual waitForSelector() | Built-in auto-waiting |
* | Selectors | CSS only | CSS, text, XPath (via workaround) |
* | Version | @cloudflare/puppeteer@1.0.4 | @cloudflare/playwright@1.0.0 |
*
* When to use Playwright:
* - Already using Playwright for testing
 * - Prefer auto-waiting behavior (see the locator sketch after this block)
* - Don't need advanced session management
*
* When to use Puppeteer:
* - Need session reuse for performance
* - Want to check limits before launching
* - More familiar with Puppeteer API
*
* Installation:
* npm install @cloudflare/playwright
*
* Configuration (wrangler.jsonc):
* {
* "browser": { "binding": "BROWSER" },
* "compatibility_flags": ["nodejs_compat"]
* }
*
* Recommendation:
* Stick with Puppeteer for most use cases unless you have
* existing Playwright tests to migrate.
*/
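
A short sketch of the auto-waiting behavior noted in the table, assuming @cloudflare/playwright exposes the standard Playwright locator API; the selectors are illustrative only:

// locator() calls auto-wait for the element, so no explicit waitForSelector is needed.
const heading = await page.locator("h1").first().textContent();
// Text selectors are built in; ignore the click if the element doesn't exist.
await page
.locator("text=Accept cookies")
.click({ timeout: 5000 })
.catch(() => {});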


@@ -0,0 +1,107 @@
// Screenshot with KV Caching
// Production-ready screenshot service with KV caching to reduce browser usage
import puppeteer from "@cloudflare/puppeteer";
interface Env {
MYBROWSER: Fetcher;
SCREENSHOT_CACHE: KVNamespace;
}
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const { searchParams } = new URL(request.url);
const url = searchParams.get("url");
const refresh = searchParams.get("refresh") === "true";
if (!url) {
return new Response("Missing ?url parameter", { status: 400 });
}
let normalizedUrl: string;
try {
normalizedUrl = new URL(url).toString();
} catch {
return new Response("Invalid URL", { status: 400 });
}
// Check cache (unless refresh requested)
if (!refresh) {
const cached = await env.SCREENSHOT_CACHE.get(normalizedUrl, {
type: "arrayBuffer",
});
if (cached) {
return new Response(cached, {
headers: {
"content-type": "image/png",
"x-cache": "HIT",
"cache-control": "public, max-age=3600",
},
});
}
}
// Generate screenshot
const browser = await puppeteer.launch(env.MYBROWSER);
try {
const page = await browser.newPage();
await page.goto(normalizedUrl, {
waitUntil: "networkidle0",
timeout: 30000,
});
const screenshot = await page.screenshot({
fullPage: true,
type: "png",
});
await browser.close();
// Cache for 24 hours
await env.SCREENSHOT_CACHE.put(normalizedUrl, screenshot, {
expirationTtl: 60 * 60 * 24, // 24 hours
});
return new Response(screenshot, {
headers: {
"content-type": "image/png",
"x-cache": "MISS",
"cache-control": "public, max-age=3600",
},
});
} catch (error) {
await browser.close();
return new Response(
JSON.stringify({
error: error instanceof Error ? error.message : "Screenshot failed",
}),
{
status: 500,
headers: { "content-type": "application/json" },
}
);
}
},
};
/**
* Setup:
* 1. Create KV namespace:
* npx wrangler kv namespace create SCREENSHOT_CACHE
* npx wrangler kv namespace create SCREENSHOT_CACHE --preview
*
* 2. Add to wrangler.jsonc:
* {
* "browser": { "binding": "MYBROWSER" },
* "compatibility_flags": ["nodejs_compat"],
* "kv_namespaces": [
* {
* "binding": "SCREENSHOT_CACHE",
* "id": "YOUR_KV_ID",
* "preview_id": "YOUR_PREVIEW_ID"
* }
* ]
* }
*
* Usage:
* New screenshot: ?url=https://example.com
* Force refresh: ?url=https://example.com&refresh=true
*/
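
If capture options (viewport, full page, format) are added later, including them in the cache key prevents variants from overwriting each other. A sketch; buildCacheKey is a hypothetical helper, not part of the template above:

// Hypothetical helper: derive a cache key from the URL plus capture options.
function buildCacheKey(url: string, opts: { fullPage: boolean; width: number }): string {
return `${url}|fullPage=${opts.fullPage}|w=${opts.width}`;
}
// const key = buildCacheKey(normalizedUrl, { fullPage: true, width: 1280 });
// await env.SCREENSHOT_CACHE.put(key, screenshot, { expirationTtl: 60 * 60 * 24 });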

templates/session-reuse.ts

@@ -0,0 +1,118 @@
// Session Reuse Pattern
// Optimize performance by reusing browser sessions instead of launching new ones
import puppeteer, { Browser } from "@cloudflare/puppeteer";
interface Env {
MYBROWSER: Fetcher;
}
/**
* Get or create a browser instance
* Tries to connect to existing session first, launches new one if needed
*/
async function getBrowser(env: Env): Promise<{ browser: Browser; launched: boolean }> {
// Check for available sessions
const sessions = await puppeteer.sessions(env.MYBROWSER);
// Find sessions without active connections
const freeSessions = sessions.filter((s) => !s.connectionId);
if (freeSessions.length > 0) {
// Try to connect to existing session
try {
console.log("Connecting to existing session:", freeSessions[0].sessionId);
const browser = await puppeteer.connect(env.MYBROWSER, freeSessions[0].sessionId);
return { browser, launched: false };
} catch (error) {
console.log("Failed to connect, launching new browser:", error);
}
}
// Check limits before launching
const limits = await puppeteer.limits(env.MYBROWSER);
if (limits.allowedBrowserAcquisitions === 0) {
throw new Error(
`Rate limit reached. Retry after ${limits.timeUntilNextAllowedBrowserAcquisition}ms`
);
}
// Launch new session
console.log("Launching new browser session");
const browser = await puppeteer.launch(env.MYBROWSER);
return { browser, launched: true };
}
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const { searchParams } = new URL(request.url);
const url = searchParams.get("url") || "https://example.com";
try {
// Get or create browser
const { browser, launched } = await getBrowser(env);
const sessionId = browser.sessionId();
console.log({
sessionId,
launched,
message: launched ? "New browser launched" : "Reused existing session",
});
// Do work
const page = await browser.newPage();
await page.goto(url, {
waitUntil: "networkidle0",
timeout: 30000,
});
const screenshot = await page.screenshot();
await page.close();
// IMPORTANT: Disconnect (don't close) to keep session alive for reuse
await browser.disconnect();
return new Response(screenshot, {
headers: {
"content-type": "image/png",
"x-session-id": sessionId,
"x-session-reused": launched ? "false" : "true",
},
});
} catch (error) {
return new Response(
JSON.stringify({
error: error instanceof Error ? error.message : "Unknown error",
}),
{
status: 500,
headers: { "content-type": "application/json" },
}
);
}
},
};
/**
* Key Concepts:
*
* 1. puppeteer.sessions() - List all active sessions
* 2. puppeteer.connect() - Connect to existing session
* 3. browser.disconnect() - Disconnect WITHOUT closing (keeps session alive)
* 4. browser.close() - Terminate session completely
* 5. puppeteer.limits() - Check rate limits before launching
*
* Benefits:
* - Faster response times (no cold start)
* - Lower concurrency usage
* - Better resource utilization
*
* Trade-offs:
 * - Sessions time out after 60s idle (extend with keep_alive; see the sketch after this block)
* - Must handle connection failures gracefully
* - Need to track which sessions are available
*
* Response Headers:
* - x-session-id: Browser session ID
* - x-session-reused: true if reused existing session
*/
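
A sketch of the keep_alive extension mentioned in the trade-offs; Cloudflare's launch() accepts a keep_alive value in milliseconds, but verify the maximum allowed for your plan:

// Keep an idle session alive longer than the 60s default (value in milliseconds).
const browser = await puppeteer.launch(env.MYBROWSER, {
keep_alive: 600000, // up to ~10 minutes of idle time
});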


@@ -0,0 +1,116 @@
// Basic Web Scraper
// Extract structured data from web pages
import puppeteer from "@cloudflare/puppeteer";
interface Env {
MYBROWSER: Fetcher;
}
interface ScrapedData {
url: string;
title: string;
description: string;
headings: string[];
links: Array<{ text: string; href: string }>;
images: Array<{ alt: string; src: string }>;
timestamp: string;
}
export default {
async fetch(request: Request, env: Env): Promise<Response> {
const { searchParams } = new URL(request.url);
const url = searchParams.get("url");
if (!url) {
return new Response("Missing ?url parameter", { status: 400 });
}
let normalizedUrl: string;
try {
normalizedUrl = new URL(url).toString();
} catch {
return new Response("Invalid URL", { status: 400 });
}
const browser = await puppeteer.launch(env.MYBROWSER);
try {
const page = await browser.newPage();
// Navigate to page
await page.goto(normalizedUrl, {
waitUntil: "networkidle0",
timeout: 30000,
});
// Wait for body to be present
await page.waitForSelector("body");
// Extract structured data
const data = await page.evaluate((): ScrapedData => {
// Get all headings
const headings = Array.from(document.querySelectorAll("h1, h2, h3")).map(
(el) => el.textContent?.trim() || ""
);
// Get all links
const links = Array.from(document.querySelectorAll("a"))
.filter((a) => a.href)
.map((a) => ({
text: a.textContent?.trim() || "",
href: a.href,
}))
.slice(0, 50); // Limit to first 50 links
// Get all images
const images = Array.from(document.querySelectorAll("img"))
.filter((img) => img.src)
.map((img) => ({
alt: img.alt || "",
src: img.src,
}))
.slice(0, 20); // Limit to first 20 images
return {
url: window.location.href,
title: document.title,
description:
document.querySelector('meta[name="description"]')?.getAttribute("content") ||
"",
headings,
links,
images,
timestamp: new Date().toISOString(),
};
});
await browser.close();
return Response.json(data, {
headers: {
"cache-control": "public, max-age=3600",
},
});
} catch (error) {
await browser.close();
return Response.json(
{
error: error instanceof Error ? error.message : "Scraping failed",
url: normalizedUrl,
},
{ status: 500 }
);
}
},
};
/**
* Usage:
* GET /?url=https://example.com
*
* Response:
* {
* "url": "https://example.com",
* "title": "Example Domain",
* "description": "...",
* "headings": ["Example Domain"],
* "links": [{ "text": "More information...", "href": "..." }],
* "images": [],
* "timestamp": "2025-10-22T12:34:56.789Z"
* }
*/
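
Many pages only expose Open Graph metadata, so a fallback to og:description (and og:title) can improve coverage. A sketch meant to sit inside the page.evaluate() callback above; the meta helper is hypothetical:

// Hypothetical helper inside page.evaluate(): read a <meta> tag by name or property.
const meta = (name: string, attr: "name" | "property" = "name") =>
document.querySelector(`meta[${attr}="${name}"]`)?.getAttribute("content") || "";
const description = meta("description") || meta("og:description", "property");
const title = document.title || meta("og:title", "property");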


@@ -0,0 +1,138 @@
// Batch Web Scraper
// Scrape multiple URLs efficiently using browser tabs
import puppeteer, { Browser } from "@cloudflare/puppeteer";
interface Env {
MYBROWSER: Fetcher;
}
interface ScrapeResult {
url: string;
success: boolean;
data?: {
title: string;
description: string;
textContent: string; // First 500 chars
};
error?: string;
}
async function scrapePage(browser: Browser, url: string): Promise<ScrapeResult> {
const page = await browser.newPage();
try {
await page.goto(url, {
waitUntil: "networkidle0",
timeout: 30000,
});
const data = await page.evaluate(() => ({
title: document.title,
description:
document.querySelector('meta[name="description"]')?.getAttribute("content") ||
"",
textContent: document.body.innerText.slice(0, 500), // First 500 chars
}));
await page.close();
return {
url,
success: true,
data,
};
} catch (error) {
await page.close();
return {
url,
success: false,
error: error instanceof Error ? error.message : "Unknown error",
};
}
}
export default {
async fetch(request: Request, env: Env): Promise<Response> {
if (request.method !== "POST") {
return new Response("Method not allowed. Use POST with JSON body.", {
status: 405,
});
}
const { urls } = await request.json<{ urls: string[] }>();
if (!urls || !Array.isArray(urls) || urls.length === 0) {
return new Response('Missing "urls" array in request body', {
status: 400,
});
}
// Limit batch size
if (urls.length > 20) {
return new Response("Maximum 20 URLs per batch", { status: 400 });
}
// Launch single browser
const browser = await puppeteer.launch(env.MYBROWSER);
try {
// Scrape all URLs in parallel (each in its own tab)
const results = await Promise.all(urls.map((url) => scrapePage(browser, url)));
await browser.close();
const summary = {
total: results.length,
successful: results.filter((r) => r.success).length,
failed: results.filter((r) => !r.success).length,
};
return Response.json({
summary,
results,
});
} catch (error) {
await browser.close();
return Response.json(
{
error: error instanceof Error ? error.message : "Batch scraping failed",
},
{ status: 500 }
);
}
},
};
/**
* Usage:
* POST /
* Content-Type: application/json
* {
* "urls": [
* "https://example.com",
* "https://example.org",
* "https://example.net"
* ]
* }
*
* Response:
* {
* "summary": {
* "total": 3,
* "successful": 3,
* "failed": 0
* },
* "results": [
* {
* "url": "https://example.com",
* "success": true,
* "data": { "title": "...", "description": "...", "textContent": "..." }
* }
* ]
* }
*
* Note: Uses 1 browser with multiple tabs instead of multiple browsers.
* This reduces concurrency usage and is more efficient.
*/
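
If opening all 20 tabs at once is too heavy, the URLs can be processed in smaller chunks. An alternative sketch using the scrapePage helper from this template; the chunk size of 5 is an arbitrary assumption:

// Alternative: open at most `chunkSize` tabs at a time instead of all URLs at once.
async function scrapeInChunks(browser: Browser, urls: string[], chunkSize = 5): Promise<ScrapeResult[]> {
const results: ScrapeResult[] = [];
for (let i = 0; i < urls.length; i += chunkSize) {
const chunk = urls.slice(i, i + chunkSize);
results.push(...(await Promise.all(chunk.map((u) => scrapePage(browser, u)))));
}
return results;
}
// const results = await scrapeInChunks(browser, urls);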


@@ -0,0 +1,116 @@
// Complete wrangler.jsonc configuration for Browser Rendering
{
"name": "browser-worker",
"main": "src/index.ts",
"compatibility_date": "2023-03-14",
// REQUIRED: nodejs_compat flag for Browser Rendering
"compatibility_flags": [
"nodejs_compat"
],
// Browser binding (required)
"browser": {
"binding": "MYBROWSER"
// Optional: Use real headless browser during local development
// "remote": true
},
// Optional: KV for caching screenshots/PDFs
// Create with: npx wrangler kv namespace create SCREENSHOT_CACHE
// npx wrangler kv namespace create SCREENSHOT_CACHE --preview
"kv_namespaces": [
{
"binding": "SCREENSHOT_CACHE",
"id": "YOUR_KV_ID", // Replace with actual ID
"preview_id": "YOUR_PREVIEW_ID" // Replace with actual preview ID
}
],
// Optional: R2 for storing generated files
// Create with: npx wrangler r2 bucket create browser-files
"r2_buckets": [
{
"binding": "BROWSER_FILES",
"bucket_name": "browser-files"
}
],
// Optional: AI binding for AI-enhanced scraping
"ai": {
"binding": "AI"
},
// Optional: D1 for storing scraping results
// Create with: npx wrangler d1 create browser-db
"d1_databases": [
{
"binding": "DB",
"database_name": "browser-db",
"database_id": "YOUR_DB_ID"
}
],
// Optional: Environment variables
"vars": {
"ENVIRONMENT": "production"
},
// Optional: Secrets are set via the CLI, not declared in this file:
//   npx wrangler secret put API_KEY
// Optional: Custom routes for production
// "routes": [
// {
// "pattern": "browser.example.com/*",
// "zone_name": "example.com"
// }
// ]
}
/**
* Key Configuration Notes:
*
* 1. nodejs_compat flag is REQUIRED
* - Browser Rendering needs Node.js APIs
* - Automatically enables nodejs_compat_v2 if compatibility_date >= 2024-09-23
*
* 2. Browser binding name
* - Use "MYBROWSER" or any name you prefer
* - Reference in code: env.MYBROWSER
*
* 3. Remote binding for local development
* - "remote": true connects to real headless browser
* - Useful if hitting 1MB request limit in local dev
* - Remove for production (not needed)
*
* 4. KV for caching
* - Highly recommended for production screenshot services
* - Reduces browser usage and costs
* - Cache TTL: typically 1-24 hours
*
 * 5. R2 for file storage (usage sketch after these notes)
* - Store generated PDFs or screenshots long-term
* - Cheaper than KV for large files
* - Use presigned URLs for downloads
*
* 6. AI binding
* - Optional: for AI-enhanced scraping
* - Requires Workers Paid plan
* - See cloudflare-workers-ai skill
*
* 7. D1 database
* - Optional: store scraping metadata
* - Track URLs, timestamps, status
* - See cloudflare-d1 skill
*
* Commands:
* npx wrangler dev # Local development
* npx wrangler deploy # Deploy to production
* npx wrangler tail # View logs
*
* See also:
* - cloudflare-worker-base skill for complete Worker setup
* - cloudflare-kv skill for KV caching patterns
* - cloudflare-r2 skill for R2 storage patterns
*/
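
A sketch of how the optional R2 and D1 bindings above might be used from a Worker; the object key and table name are illustrative assumptions:

// R2 (note 5): persist a generated PDF under an illustrative key.
await env.BROWSER_FILES.put(`pdfs/${Date.now()}.pdf`, pdf);
// D1 (note 7): record scrape metadata (assumes a "scrapes" table already exists).
await env.DB.prepare("INSERT INTO scrapes (url, created_at) VALUES (?, ?)")
.bind(url, new Date().toISOString())
.run();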