commit 7c90a3ac2b01e50a6cbd8b208e91f4e53a62370f Author: Zhongwei Li Date: Sun Nov 30 08:24:08 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..ed16dcc --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "cloudflare-browser-rendering", + "description": "Add headless Chrome automation with Puppeteer/Playwright on Cloudflare Workers. Use when: taking screenshots, generating PDFs, web scraping, crawling sites, browser automation, or troubleshooting XPath errors, browser timeouts, binding not passed errors, or session limits.", + "version": "1.0.0", + "author": { + "name": "Jeremy Dawes", + "email": "jeremy@jezweb.net" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..80f80c1 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# cloudflare-browser-rendering + +Add headless Chrome automation with Puppeteer/Playwright on Cloudflare Workers. Use when: taking screenshots, generating PDFs, web scraping, crawling sites, browser automation, or troubleshooting XPath errors, browser timeouts, binding not passed errors, or session limits. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..4d25c73 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,783 @@ +--- +name: cloudflare-browser-rendering +description: | + Add headless Chrome automation with Puppeteer/Playwright on Cloudflare Workers. Use when: taking screenshots, generating PDFs, web scraping, crawling sites, browser automation, or troubleshooting XPath errors, browser timeouts, binding not passed errors, or session limits. +license: MIT +--- + +# Cloudflare Browser Rendering - Complete Reference + +Production-ready knowledge domain for building browser automation workflows with Cloudflare Browser Rendering. 
+ +**Status**: Production Ready ✅ +**Last Updated**: 2025-11-23 +**Dependencies**: cloudflare-worker-base (for Worker setup) +**Latest Versions**: @cloudflare/puppeteer@1.0.4 (July 2025), @cloudflare/playwright@1.0.0 (Playwright v1.55 GA Sept 2025), wrangler@4.50.0 + +**Recent Updates (2025)**: +- **Sept 2025**: Playwright v1.55 GA, Stagehand framework support (Workers AI), /links excludeExternalLinks param +- **Aug 2025**: Billing GA (Aug 20), /sessions endpoint in local dev, X-Browser-Ms-Used header +- **July 2025**: Playwright v1.54.1 + MCP v0.0.30, Playwright local dev support (wrangler@4.26.0+), Puppeteer v22.13.1 sync, /content returns title, /json custom_ai param, /screenshot viewport 1920x1080 default +- **June 2025**: Web Bot Auth headers auto-included +- **April 2025**: Playwright support launched, free tier introduced + +--- + +## Table of Contents + +1. [Quick Start (5 minutes)](#quick-start-5-minutes) +2. [Browser Rendering Overview](#browser-rendering-overview) +3. [Puppeteer API Reference](#puppeteer-api-reference) +4. [Playwright API Reference](#playwright-api-reference) +5. [Session Management](#session-management) +6. [Common Patterns](#common-patterns) +7. [Pricing & Limits](#pricing--limits) +8. [Known Issues Prevention](#known-issues-prevention) +9. [Production Checklist](#production-checklist) + +--- + +## Quick Start (5 minutes) + +### 1. Add Browser Binding + +**wrangler.jsonc:** +```jsonc +{ + "name": "browser-worker", + "main": "src/index.ts", + "compatibility_date": "2023-03-14", + "compatibility_flags": ["nodejs_compat"], + "browser": { + "binding": "MYBROWSER" + } +} +``` + +**Why nodejs_compat?** Browser Rendering requires Node.js APIs and polyfills. + +### 2. Install Puppeteer + +```bash +npm install @cloudflare/puppeteer +``` + +### 3. 
Take Your First Screenshot + +```typescript +import puppeteer from "@cloudflare/puppeteer"; + +interface Env { + MYBROWSER: Fetcher; +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const { searchParams } = new URL(request.url); + const url = searchParams.get("url") || "https://example.com"; + + // Launch browser + const browser = await puppeteer.launch(env.MYBROWSER); + const page = await browser.newPage(); + + // Navigate and capture + await page.goto(url); + const screenshot = await page.screenshot(); + + // Clean up + await browser.close(); + + return new Response(screenshot, { + headers: { "content-type": "image/png" } + }); + } +}; +``` + +### 4. Deploy + +```bash +npx wrangler deploy +``` + +Test at: `https://your-worker.workers.dev/?url=https://example.com` + +**CRITICAL:** +- Always pass `env.MYBROWSER` to `puppeteer.launch()` (not undefined) +- Always call `browser.close()` when done (or use `browser.disconnect()` for session reuse) +- Use `nodejs_compat` compatibility flag + +--- + +## Browser Rendering Overview + +### What is Browser Rendering? + +Cloudflare Browser Rendering provides headless Chromium browsers running on Cloudflare's global network. Use familiar tools like Puppeteer and Playwright to automate browser tasks: + +- **Screenshots** - Capture visual snapshots of web pages +- **PDF Generation** - Convert HTML/URLs to PDFs +- **Web Scraping** - Extract content from dynamic websites +- **Testing** - Automate frontend tests +- **Crawling** - Navigate multi-page workflows + +### Two Integration Methods + +| Method | Best For | Complexity | +|--------|----------|-----------| +| **Workers Bindings** | Complex automation, custom workflows, session management | Advanced | +| **REST API** | Simple screenshot/PDF tasks | Simple | + +**This skill covers Workers Bindings** (the advanced method with full Puppeteer/Playwright APIs). 
+ +### Puppeteer vs Playwright + +| Feature | Puppeteer | Playwright | +|---------|-----------|------------| +| **API Familiarity** | Most popular | Growing adoption | +| **Package** | `@cloudflare/puppeteer@1.0.4` | `@cloudflare/playwright@1.0.0` | +| **Session Management** | ✅ Advanced APIs | ⚠️ Basic | +| **Browser Support** | Chromium only | Chromium only (Firefox/Safari not yet supported) | +| **Best For** | Screenshots, PDFs, scraping | Testing, frontend automation | + +**Recommendation**: Use Puppeteer for most use cases. Playwright is ideal if you're already using it for testing. + +--- + +## Puppeteer API Reference + +**Core APIs** (complete reference: https://pptr.dev/api/): + +**Global Functions:** +- `puppeteer.launch(env.MYBROWSER, options?)` - Launch new browser (CRITICAL: must pass binding) +- `puppeteer.connect(env.MYBROWSER, sessionId)` - Connect to existing session +- `puppeteer.sessions(env.MYBROWSER)` - List running sessions +- `puppeteer.history(env.MYBROWSER)` - List recent sessions (open + closed) +- `puppeteer.limits(env.MYBROWSER)` - Check account limits + +**Browser Methods:** +- `browser.newPage()` - Create new tab (preferred over launching new browsers) +- `browser.sessionId()` - Get session ID for reuse +- `browser.close()` - Terminate session +- `browser.disconnect()` - Keep session alive for reuse +- `browser.createBrowserContext()` - Isolated incognito context (separate cookies/cache) + +**Page Methods:** +- `page.goto(url, { waitUntil, timeout })` - Navigate (use `"networkidle0"` for dynamic content) +- `page.screenshot({ fullPage, type, quality, clip })` - Capture image +- `page.pdf({ format, printBackground, margin })` - Generate PDF +- `page.evaluate(() => ...)` - Execute JS in browser (data extraction, XPath workaround) +- `page.content()` / `page.setContent(html)` - Get/set HTML +- `page.waitForSelector(selector)` - Wait for element +- `page.type(selector, text)` / `page.click(selector)` - Form interaction + +**Critical 
Patterns:** +```typescript +// Must pass binding +const browser = await puppeteer.launch(env.MYBROWSER); // ✅ +// const browser = await puppeteer.launch(); // ❌ Error! + +// Session reuse for performance +const sessions = await puppeteer.sessions(env.MYBROWSER); +const freeSessions = sessions.filter(s => !s.connectionId); +if (freeSessions.length > 0) { + browser = await puppeteer.connect(env.MYBROWSER, freeSessions[0].sessionId); +} + +// Keep session alive +await browser.disconnect(); // Don't close + +// XPath workaround (not directly supported) +const data = await page.evaluate(() => { + return new XPathEvaluator() + .createExpression("/html/body/div/h1") + .evaluate(document, XPathResult.FIRST_ORDERED_NODE_TYPE) + .singleNodeValue.innerHTML; +}); +``` + +--- + +## Playwright API Reference + +**Status**: GA (Sept 2025) - Playwright v1.55, MCP v0.0.30 support, local dev support (wrangler@4.26.0+) + +**Installation:** +```bash +npm install @cloudflare/playwright +``` + +**Configuration Requirements (2025 Update):** +```jsonc +{ + "compatibility_flags": ["nodejs_compat"], + "compatibility_date": "2025-09-15" // Required for Playwright v1.55 +} +``` + +**Basic Usage:** +```typescript +import { chromium } from "@cloudflare/playwright"; + +const browser = await chromium.launch(env.MYBROWSER); +const page = await browser.newPage(); +await page.goto("https://example.com"); +const screenshot = await page.screenshot(); +await browser.close(); +``` + +**Puppeteer vs Playwright:** +- **Import**: `puppeteer` vs `{ chromium }` from "@cloudflare/playwright" +- **Session API**: Puppeteer has advanced session management (sessions/history/limits), Playwright basic +- **Auto-waiting**: Playwright has built-in auto-waiting, Puppeteer requires manual `waitForSelector()` +- **MCP Support**: Playwright MCP v0.0.30 (July 2025), Playwright MCP server available + +**Recommendation**: Use Puppeteer for session reuse patterns. 
Use Playwright if you are migrating existing tests or need MCP integration. + +**Official Docs**: https://developers.cloudflare.com/browser-rendering/playwright/ + +--- + +## Session Management + +**Why**: Launching new browsers is slow and consumes concurrency limits. Reuse sessions for faster response, lower concurrency usage, better resource utilization. + +### Session Reuse Pattern (Critical) + +```typescript +async function getBrowser(env: Env): Promise<Browser> { + const sessions = await puppeteer.sessions(env.MYBROWSER); + const freeSessions = sessions.filter(s => !s.connectionId); + + if (freeSessions.length > 0) { + try { + return await puppeteer.connect(env.MYBROWSER, freeSessions[0].sessionId); + } catch (e) { + console.log("Failed to connect, launching new browser"); + } + } + + return await puppeteer.launch(env.MYBROWSER); +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const browser = await getBrowser(env); + + try { + const page = await browser.newPage(); + await page.goto("https://example.com"); + const screenshot = await page.screenshot(); + + await browser.disconnect(); // ✅ Keep alive for reuse + + return new Response(screenshot, { + headers: { "content-type": "image/png" } + }); + } catch (error) { + await browser.close(); // ❌ Close on error + throw error; + } + } +}; +``` + +**Key Rules:** +- ✅ `browser.disconnect()` - Keep session alive for reuse +- ❌ `browser.close()` - Only on errors or when truly done +- ✅ Always handle connection failures + +### Browser Contexts (Cookie/Cache Isolation) + +Use `browser.createBrowserContext()` to share browser but isolate cookies/cache: + +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); +const context1 = await browser.createBrowserContext(); // User 1 +const context2 = await browser.createBrowserContext(); // User 2 + +const page1 = await context1.newPage(); +const page2 = await context2.newPage(); +// Separate cookies/cache per context +``` + +### Multiple Tabs Pattern + 
+**❌ Bad**: Launch 10 browsers for 10 URLs (wastes concurrency) +**✅ Good**: 1 browser, 10 tabs via `Promise.all()` + `browser.newPage()` + +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); +const results = await Promise.all( + urls.map(async (url) => { + const page = await browser.newPage(); + await page.goto(url); + const data = await page.evaluate(() => ({ title: document.title })); + await page.close(); + return { url, data }; + }) +); +await browser.close(); +``` + +--- + +## Common Patterns + +### Screenshot with KV Caching + +Cache screenshots to reduce browser usage and improve performance: + +```typescript +interface Env { + MYBROWSER: Fetcher; + CACHE: KVNamespace; +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const { searchParams } = new URL(request.url); + const url = searchParams.get("url"); + if (!url) return new Response("Missing ?url parameter", { status: 400 }); + + const normalizedUrl = new URL(url).toString(); + + // Check cache first + let screenshot = await env.CACHE.get(normalizedUrl, { type: "arrayBuffer" }); + + if (!screenshot) { + const browser = await puppeteer.launch(env.MYBROWSER); + const page = await browser.newPage(); + await page.goto(normalizedUrl); + screenshot = await page.screenshot(); + await browser.close(); + + // Cache for 24 hours + await env.CACHE.put(normalizedUrl, screenshot, { expirationTtl: 60 * 60 * 24 }); + } + + return new Response(screenshot, { headers: { "content-type": "image/png" } }); + } +}; +``` + +### AI-Enhanced Scraping + +Combine Browser Rendering with Workers AI for structured data extraction: + +```typescript +interface Env { + MYBROWSER: Fetcher; + AI: Ai; +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const { searchParams } = new URL(request.url); + const url = searchParams.get("url"); + + // Scrape page content + const browser = await puppeteer.launch(env.MYBROWSER); + const page = await browser.newPage(); + await 
page.goto(url!, { waitUntil: "networkidle0" }); + const bodyContent = await page.$eval("body", el => el.innerHTML); + await browser.close(); + + // Extract structured data with AI + const response = await env.AI.run("@cf/meta/llama-3.1-8b-instruct", { + messages: [{ + role: "user", + content: `Extract product info as JSON from this HTML. Include: name, price, description.\n\nHTML:\n${bodyContent.slice(0, 4000)}` + }] + }); + + return Response.json({ url, product: JSON.parse(response.response) }); + } +}; +``` + +**Other Common Patterns**: PDF generation (`page.pdf()`), structured scraping (`page.evaluate()`), form automation (`page.type()` + `page.click()`). See bundled `templates/` directory. + +--- + +## Pricing & Limits + +**Billing GA**: August 20, 2025 + +**Free Tier**: 10 min/day, 3 concurrent, 3 launches/min, 60s timeout +**Paid Tier**: 10 hrs/month included ($0.09/hr after), 10 concurrent avg ($2.00/browser after), 30 launches/min, 60s-10min timeout + +**Concurrency Calculation**: Monthly average of daily peak usage (e.g., 15 browsers avg = (15 - 10 included) × $2.00 = $10.00/mo) + +**Rate Limiting**: Enforced per-second (180 req/min = 3 req/sec, not bursty). 
Check `puppeteer.limits(env.MYBROWSER)` before launching: + +```typescript +const limits = await puppeteer.limits(env.MYBROWSER); +if (limits.allowedBrowserAcquisitions === 0) { + const delay = limits.timeUntilNextAllowedBrowserAcquisition || 1000; + await new Promise(resolve => setTimeout(resolve, delay)); +} +``` + +--- + +## Known Issues Prevention + +This skill prevents **6 documented issues**: + +--- + +### Issue #1: XPath Selectors Not Supported + +**Error:** "XPath selector not supported" or selector failures +**Source:** https://developers.cloudflare.com/browser-rendering/faq/#why-cant-i-use-an-xpath-selector-when-using-browser-rendering-with-puppeteer +**Why It Happens:** XPath poses a security risk to Workers +**Prevention:** Use CSS selectors or `page.evaluate()` with XPathEvaluator + +**Solution:** +```typescript +// ❌ Don't use XPath directly (not supported) +// await page.$x('/html/body/div/h1') + +// ✅ Use CSS selector +const heading = await page.$("div > h1"); + +// ✅ Or use XPath in page.evaluate() +const innerHtml = await page.evaluate(() => { + return new XPathEvaluator() + .createExpression("/html/body/div/h1") + .evaluate(document, XPathResult.FIRST_ORDERED_NODE_TYPE) + .singleNodeValue.innerHTML; +}); +``` + +--- + +### Issue #2: Browser Binding Not Passed + +**Error:** "Cannot read properties of undefined (reading 'fetch')" +**Source:** https://developers.cloudflare.com/browser-rendering/faq/#cannot-read-properties-of-undefined-reading-fetch +**Why It Happens:** `puppeteer.launch()` called without browser binding +**Prevention:** Always pass `env.MYBROWSER` to launch + +**Solution:** +```typescript +// ❌ Missing browser binding +const browser = await puppeteer.launch(); // Error! 
+ +// ✅ Pass binding +const browser = await puppeteer.launch(env.MYBROWSER); +``` + +--- + +### Issue #3: Browser Timeout (60 seconds) + +**Error:** Browser closes unexpectedly after 60 seconds +**Source:** https://developers.cloudflare.com/browser-rendering/platform/limits/#note-on-browser-timeout +**Why It Happens:** Default timeout is 60 seconds of inactivity +**Prevention:** Use `keep_alive` option to extend up to 10 minutes + +**Solution:** +```typescript +// Extend timeout to 5 minutes for long-running tasks +const browser = await puppeteer.launch(env.MYBROWSER, { + keep_alive: 300000 // 5 minutes = 300,000 ms +}); +``` + +**Note:** Browser closes if no devtools commands for the specified duration. + +--- + +### Issue #4: Concurrency Limits Reached + +**Error:** "Rate limit exceeded" or new browser launch fails +**Source:** https://developers.cloudflare.com/browser-rendering/platform/limits/ +**Why It Happens:** Exceeded concurrent browser limit (3 free, 10-30 paid) +**Prevention:** Reuse sessions, use tabs instead of multiple browsers, check limits before launching + +**Solutions:** +```typescript +// 1. Check limits before launching +const limits = await puppeteer.limits(env.MYBROWSER); +if (limits.allowedBrowserAcquisitions === 0) { + return new Response("Concurrency limit reached", { status: 429 }); +} + +// 2. Reuse sessions +const sessions = await puppeteer.sessions(env.MYBROWSER); +const freeSessions = sessions.filter(s => !s.connectionId); +if (freeSessions.length > 0) { + const browser = await puppeteer.connect(env.MYBROWSER, freeSessions[0].sessionId); +} + +// 3. 
Use tabs instead of multiple browsers +const browser = await puppeteer.launch(env.MYBROWSER); +const page1 = await browser.newPage(); +const page2 = await browser.newPage(); // Same browser, different tabs +``` + +--- + +### Issue #5: Local Development Request Size Limit + +**Error:** Request larger than 1MB fails in `wrangler dev` +**Source:** https://developers.cloudflare.com/browser-rendering/faq/#does-local-development-support-all-browser-rendering-features +**Why It Happens:** Local development limitation +**Prevention:** Use `remote: true` in browser binding for local dev + +**Solution:** +```jsonc +// wrangler.jsonc for local development +{ + "browser": { + "binding": "MYBROWSER", + "remote": true // Use real headless browser during dev + } +} +``` + +--- + +### Issue #6: Bot Protection Always Triggered + +**Error:** Website blocks requests as bot traffic +**Source:** https://developers.cloudflare.com/browser-rendering/faq/#will-browser-rendering-bypass-cloudflares-bot-protection +**Why It Happens:** Browser Rendering requests always identified as bots +**Prevention:** Cannot bypass; if scraping your own zone, create WAF skip rule + +**Solution:** +```typescript +// ❌ Cannot bypass bot protection +// Requests will always be identified as bots + +// ✅ If scraping your own Cloudflare zone: +// 1. Go to Security > WAF > Custom rules +// 2. Create skip rule with custom header: +// Header: X-Custom-Auth +// Value: your-secret-token +// 3. 
Pass header in your scraping requests + +// Note: Automatic headers are included: +// - cf-biso-request-id +// - cf-biso-devtools +``` + +--- + +## Production Checklist + +Before deploying Browser Rendering Workers to production: + +### Configuration +- [ ] **Browser binding configured** in wrangler.jsonc +- [ ] **nodejs_compat flag enabled** (required for Browser Rendering) +- [ ] **Keep-alive timeout set** if tasks take > 60 seconds +- [ ] **Remote binding enabled** for local development if needed + +### Error Handling +- [ ] **Retry logic implemented** for rate limits +- [ ] **Timeout handling** for page.goto() +- [ ] **Browser cleanup** in try-finally blocks +- [ ] **Concurrency limit checks** before launching browsers +- [ ] **Graceful degradation** when browser unavailable + +### Performance +- [ ] **Session reuse implemented** for high-traffic routes +- [ ] **Multiple tabs used** instead of multiple browsers +- [ ] **Incognito contexts** for session isolation +- [ ] **KV caching** for repeated screenshots/PDFs +- [ ] **Batch operations** to maximize browser utilization + +### Monitoring +- [ ] **Log browser session IDs** for debugging +- [ ] **Track browser duration** for billing estimates +- [ ] **Monitor concurrency usage** with puppeteer.limits() +- [ ] **Alert on rate limit errors** +- [ ] **Dashboard monitoring** at https://dash.cloudflare.com/?to=/:account/workers/browser-rendering + +### Security +- [ ] **Input validation** for URLs (prevent SSRF) +- [ ] **Timeout limits** to prevent abuse +- [ ] **Rate limiting** on public endpoints +- [ ] **Authentication** for sensitive scraping endpoints +- [ ] **WAF rules** if scraping your own zone + +### Testing +- [ ] **Test screenshot capture** with various page sizes +- [ ] **Test PDF generation** with custom HTML +- [ ] **Test scraping** with dynamic content (networkidle0) +- [ ] **Test error scenarios** (invalid URLs, timeouts) +- [ ] **Load test** concurrency limits + +--- + +## Error Handling Best 
Practices + +**Production Pattern** - Use try-catch with proper cleanup: + +```typescript +async function withBrowser<T>(env: Env, fn: (browser: Browser) => Promise<T>): Promise<T> { + let browser: Browser | null = null; + + try { + // 1. Check limits before launching + const limits = await puppeteer.limits(env.MYBROWSER); + if (limits.allowedBrowserAcquisitions === 0) { + throw new Error("Rate limit reached"); + } + + // 2. Try session reuse first + const sessions = await puppeteer.sessions(env.MYBROWSER); + const freeSessions = sessions.filter(s => !s.connectionId); + browser = freeSessions.length > 0 + ? await puppeteer.connect(env.MYBROWSER, freeSessions[0].sessionId) + : await puppeteer.launch(env.MYBROWSER); + + // 3. Execute user function + const result = await fn(browser); + + // 4. Disconnect (keep alive) + await browser.disconnect(); + return result; + } catch (error) { + // 5. Close on error + if (browser) await browser.close(); + throw error; + } +} +``` + +**Key Principles**: Check limits → Reuse sessions → Execute → Disconnect on success, close on error + +--- + +## Using Bundled Resources + +### Templates (templates/) + +Ready-to-use code templates for common patterns: + +- `basic-screenshot.ts` - Minimal screenshot example +- `screenshot-with-kv-cache.ts` - Screenshot with KV caching +- `pdf-generation.ts` - Generate PDFs from HTML or URLs +- `web-scraper-basic.ts` - Basic web scraping pattern +- `web-scraper-batch.ts` - Batch scrape multiple URLs +- `session-reuse.ts` - Session reuse for performance +- `ai-enhanced-scraper.ts` - Scraping with Workers AI +- `playwright-example.ts` - Playwright alternative example +- `wrangler-browser-config.jsonc` - Browser binding configuration + +**Usage:** +```bash +# Copy template to your project +cp ~/.claude/skills/cloudflare-browser-rendering/templates/basic-screenshot.ts src/index.ts +``` + +### References (references/) + +Deep-dive documentation: + +- `session-management.md` - Complete session reuse guide +- 
`pricing-and-limits.md` - Detailed pricing breakdown +- `common-errors.md` - All known issues and solutions +- `puppeteer-vs-playwright.md` - Feature comparison and migration + +**When to load:** Reference when implementing advanced patterns or debugging specific issues. + +--- + +## Dependencies + +**Required:** +- `@cloudflare/puppeteer@1.0.4` - Puppeteer for Workers +- `wrangler@4.43.0+` - Cloudflare CLI + +**Optional:** +- `@cloudflare/playwright@1.0.0` - Playwright for Workers (alternative) +- `@cloudflare/workers-types@4.20251014.0+` - TypeScript types + +**Related Skills:** +- `cloudflare-worker-base` - Worker setup with Hono +- `cloudflare-kv` - KV caching for screenshots +- `cloudflare-r2` - R2 storage for generated files +- `cloudflare-workers-ai` - AI-enhanced scraping + +--- + +## Official Documentation + +- **Browser Rendering Docs**: https://developers.cloudflare.com/browser-rendering/ +- **Puppeteer API**: https://pptr.dev/api/ +- **Playwright API**: https://playwright.dev/docs/api/class-playwright +- **Cloudflare Puppeteer Fork**: https://github.com/cloudflare/puppeteer +- **Cloudflare Playwright Fork**: https://github.com/cloudflare/playwright +- **Pricing**: https://developers.cloudflare.com/browser-rendering/platform/pricing/ +- **Limits**: https://developers.cloudflare.com/browser-rendering/platform/limits/ + +--- + +## Package Versions (Verified 2025-10-22) + +```json +{ + "dependencies": { + "@cloudflare/puppeteer": "^1.0.4" + }, + "devDependencies": { + "@cloudflare/workers-types": "^4.20251014.0", + "wrangler": "^4.43.0" + } +} +``` + +**Alternative (Playwright):** +```json +{ + "dependencies": { + "@cloudflare/playwright": "^1.0.0" + } +} +``` + +--- + +## Troubleshooting + +### Problem: "Cannot read properties of undefined (reading 'fetch')" +**Solution:** Pass browser binding to puppeteer.launch(): +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); // Not just puppeteer.launch() +``` + +### Problem: XPath selectors not 
working +**Solution:** Use CSS selectors or page.evaluate() with XPathEvaluator (see Issue #1) + +### Problem: Browser closes after 60 seconds +**Solution:** Extend timeout with keep_alive: +```typescript +const browser = await puppeteer.launch(env.MYBROWSER, { keep_alive: 300000 }); +``` + +### Problem: Rate limit reached +**Solution:** Reuse sessions, use tabs, check limits before launching (see Issue #4) + +### Problem: Local dev request > 1MB fails +**Solution:** Enable remote binding in wrangler.jsonc: +```jsonc +{ "browser": { "binding": "MYBROWSER", "remote": true } } +``` + +### Problem: Website blocks as bot +**Solution:** Cannot bypass. If your own zone, create WAF skip rule (see Issue #6) + +--- + +**Questions? Issues?** + +1. Check `references/common-errors.md` for detailed solutions +2. Review `references/session-management.md` for performance optimization +3. Verify browser binding is configured in wrangler.jsonc +4. Check official docs: https://developers.cloudflare.com/browser-rendering/ +5. 
Ensure `nodejs_compat` compatibility flag is enabled diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..d58dbe5 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,101 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:jezweb/claude-skills:skills/cloudflare-browser-rendering", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "f6b65a162ae4f587ce6a8e81896247875b63e3e5", + "treeHash": "e3ef8d6debe9616a1cbe302e85dc18cb49d96c36872a8c07c7acae4cf142113d", + "generatedAt": "2025-11-28T10:18:58.576479Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "cloudflare-browser-rendering", + "description": "Add headless Chrome automation with Puppeteer/Playwright on Cloudflare Workers. 
Use when: taking screenshots, generating PDFs, web scraping, crawling sites, browser automation, or troubleshooting XPath errors, browser timeouts, binding not passed errors, or session limits.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "00452a543d02e6fe8c0f8e46d746fdb0a16787aa06f850460200fd92135065cb" + }, + { + "path": "SKILL.md", + "sha256": "f0d36b31ffbdbeb1b5411f9e3ab488f5a6fd4b8751a5882792059ad9b060746d" + }, + { + "path": "references/common-errors.md", + "sha256": "7395079a6f3573ed4858df1c5b653341911f9d4293c01f0384359059b2100171" + }, + { + "path": "references/pricing-and-limits.md", + "sha256": "c38d89c4a3dd11d8564695eaeaab584c0d3cfd64e03fa33d1878715f74c416b1" + }, + { + "path": "references/puppeteer-vs-playwright.md", + "sha256": "44ceb27acff58f2216d42b69f2902c2f6365a56de347a9e6a2605696858e1744" + }, + { + "path": "references/session-management.md", + "sha256": "78467d521547a60ce85e464f5237bb5313dc6c19127267b5492da93a11167131" + }, + { + "path": "scripts/check-versions.sh", + "sha256": "7101b170427b9183cb1375263790732b9c11ff84df86ef09504a04148794173d" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "9891b4c3cbdfbd2a5833ef02a25165152e768782cbbb25e2b781702428a64bb9" + }, + { + "path": "templates/session-reuse.ts", + "sha256": "42a96c01227e25aa2cb0c2e9b9fdb83ade99fb4288a3bf616760645c727ca4b4" + }, + { + "path": "templates/wrangler-browser-config.jsonc", + "sha256": "b587dc298f75a82dff9ba343ad7edb555a25cc9b2621d393839f378f04d7b0a1" + }, + { + "path": "templates/pdf-generation.ts", + "sha256": "cdfd88c037ace52984185a023555cc6a852c2c0bd9036c1a0d08756d4dd849a7" + }, + { + "path": "templates/web-scraper-basic.ts", + "sha256": "3434f82fdd25d8cd4f0a16ff5b12437d0833ec571fcf849feb69332ed2a7b60c" + }, + { + "path": "templates/playwright-example.ts", + "sha256": "c575c7e163675c819bdccce6624e18cb8695c3c7e19dc1107618d946949eb5a0" + }, + { + "path": "templates/screenshot-with-kv-cache.ts", + "sha256": 
"16d841deba9b8376cb2e80c8a32c0389de44248422350fc12fe1e6b9e46c3ce1" + }, + { + "path": "templates/basic-screenshot.ts", + "sha256": "36564d257d9dd1721f0a1a7e2168d9e719d386c8494c766890fa8b04ac97ff51" + }, + { + "path": "templates/web-scraper-batch.ts", + "sha256": "626904560ecf736f1f39a55fc1a9d6f302e865ead02d32f317573370921e1a25" + }, + { + "path": "templates/ai-enhanced-scraper.ts", + "sha256": "523872273fbe1c4bf850e0243b3b97a7f16a581355c6897e4741b1a1a586590e" + } + ], + "dirSha256": "e3ef8d6debe9616a1cbe302e85dc18cb49d96c36872a8c07c7acae4cf142113d" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/references/common-errors.md b/references/common-errors.md new file mode 100644 index 0000000..e5df4c4 --- /dev/null +++ b/references/common-errors.md @@ -0,0 +1,632 @@ +# Common Errors and Solutions + +Complete reference for all known Browser Rendering errors with sources, root causes, and solutions. + +--- + +## Error 1: "Cannot read properties of undefined (reading 'fetch')" + +**Full Error:** +``` +TypeError: Cannot read properties of undefined (reading 'fetch') +``` + +**Source**: https://developers.cloudflare.com/browser-rendering/faq/#cannot-read-properties-of-undefined-reading-fetch + +**Root Cause**: Browser binding not passed to `puppeteer.launch()` + +**Why It Happens:** +```typescript +// ❌ Missing browser binding +const browser = await puppeteer.launch(); +// ^ undefined - no binding passed! +``` + +**Solution:** +```typescript +// ✅ Pass browser binding +const browser = await puppeteer.launch(env.MYBROWSER); +// ^^^^^^^^^^^^^^^^ binding from env +``` + +**Prevention**: Always pass `env.MYBROWSER` (or your configured binding name) to `puppeteer.launch()`. 
+ +--- + +## Error 2: XPath Selector Not Supported + +**Full Error:** +``` +Error: XPath selectors are not supported in Browser Rendering +``` + +**Source**: https://developers.cloudflare.com/browser-rendering/faq/#why-cant-i-use-an-xpath-selector-when-using-browser-rendering-with-puppeteer + +**Root Cause**: XPath poses security risk to Workers + +**Why It Happens:** +```typescript +// ❌ XPath selectors not directly supported +const elements = await page.$x('/html/body/div/h1'); +``` + +**Solution 1: Use CSS Selectors** +```typescript +// ✅ Use CSS selector instead +const element = await page.$("div > h1"); +const elements = await page.$$("div > h1"); +``` + +**Solution 2: Use XPath in page.evaluate()** +```typescript +// ✅ Use XPath inside page.evaluate() +const innerHtml = await page.evaluate(() => { + return ( + // @ts-ignore - runs in browser context + new XPathEvaluator() + .createExpression("/html/body/div/h1") + // @ts-ignore + .evaluate(document, XPathResult.FIRST_ORDERED_NODE_TYPE) + .singleNodeValue.innerHTML + ); +}); +``` + +**Prevention**: Use CSS selectors by default. Only use XPath via `page.evaluate()` if absolutely necessary. + +--- + +## Error 3: Browser Timeout + +**Full Error:** +``` +Error: Browser session closed due to inactivity +``` + +**Source**: https://developers.cloudflare.com/browser-rendering/platform/limits/#note-on-browser-timeout + +**Root Cause**: Default 60 second idle timeout + +**Why It Happens:** +- No devtools commands sent for 60 seconds +- Browser automatically closes to free resources + +**Solution: Extend Timeout** +```typescript +// ✅ Extend timeout to 5 minutes +const browser = await puppeteer.launch(env.MYBROWSER, { + keep_alive: 300000 // 5 minutes = 300,000 ms +}); +``` + +**Maximum**: 600,000ms (10 minutes) + +**Use Cases for Extended Timeout:** +- Multi-step workflows +- Long-running scraping +- Session reuse across requests + +**Prevention**: Only extend if actually needed. Longer timeout = more billable hours. 
+ +--- + +## Error 4: Rate Limit Exceeded + +**Full Error:** +``` +Error: Rate limit exceeded. Too many concurrent browsers. +``` + +**Source**: https://developers.cloudflare.com/browser-rendering/platform/limits/ + +**Root Cause**: Exceeded concurrent browser limit + +**Limits:** +- Free tier: 3 concurrent browsers +- Paid tier: 10-30 concurrent browsers + +**Solution 1: Check Limits Before Launching** +```typescript +const limits = await puppeteer.limits(env.MYBROWSER); + +if (limits.allowedBrowserAcquisitions === 0) { + return new Response( + JSON.stringify({ + error: "Rate limit reached", + retryAfter: limits.timeUntilNextAllowedBrowserAcquisition + }), + { status: 429 } + ); +} + +const browser = await puppeteer.launch(env.MYBROWSER); +``` + +**Solution 2: Reuse Sessions** +```typescript +// Try to connect to existing session first +const sessions = await puppeteer.sessions(env.MYBROWSER); +const freeSession = sessions.find(s => !s.connectionId); + +if (freeSession) { + try { + return await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + } catch { + // Session closed, launch new + } +} + +return await puppeteer.launch(env.MYBROWSER); +``` + +**Solution 3: Use Multiple Tabs** +```typescript +// ❌ Bad: 10 browsers +for (const url of urls) { + const browser = await puppeteer.launch(env.MYBROWSER); + // ... +} + +// ✅ Good: 1 browser, 10 tabs +const browser = await puppeteer.launch(env.MYBROWSER); +await Promise.all(urls.map(async url => { + const page = await browser.newPage(); + // ... + await page.close(); +})); +await browser.close(); +``` + +**Prevention**: Monitor concurrency usage, implement session reuse, use tabs instead of multiple browsers. 
+ +--- + +## Error 5: Local Development Request Size Limit + +**Full Error:** +``` +Error: Request payload too large (>1MB) +``` + +**Source**: https://developers.cloudflare.com/browser-rendering/faq/#does-local-development-support-all-browser-rendering-features + +**Root Cause**: Local development limitation (requests >1MB fail) + +**Solution: Use Remote Binding** +```jsonc +// wrangler.jsonc +{ + "browser": { + "binding": "MYBROWSER", + "remote": true // ← Use real headless browser during dev + } +} +``` + +**With Remote Binding:** +- Connects to actual Cloudflare browser (not local simulation) +- No 1MB request limit +- Counts toward your quota + +**Prevention**: Enable `remote: true` for local development if working with large payloads. + +--- + +## Error 6: Bot Protection Triggered + +**Full Error:** +``` +Blocked by bot protection / CAPTCHA challenge +``` + +**Source**: https://developers.cloudflare.com/browser-rendering/faq/#will-browser-rendering-bypass-cloudflares-bot-protection + +**Root Cause**: Browser Rendering requests always identified as bots + +**Why It Happens:** +- Cloudflare automatically identifies Browser Rendering traffic +- Cannot bypass bot protection +- Automatic headers added: `cf-biso-request-id`, `cf-biso-devtools` + +**Solution (If Scraping Your Own Zone):** +Create WAF skip rule: + +1. Go to Security > WAF > Custom rules +2. Create skip rule with custom header: + - Header: `X-Custom-Auth` + - Value: `your-secret-token` +3. Add header in your Worker: + +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); +const page = await browser.newPage(); + +// Set custom header +await page.setExtraHTTPHeaders({ + "X-Custom-Auth": "your-secret-token" +}); + +await page.goto(url); +``` + +**Solution (If Scraping External Sites):** +- Cannot bypass bot protection +- Some sites will block Browser Rendering traffic +- Consider using site's official API instead + +**Prevention**: Use official APIs when available. 
Only scrape your own zones if possible. + +--- + +## Error 7: Navigation Timeout + +**Full Error:** +``` +TimeoutError: Navigation timeout of 30000 ms exceeded +``` + +**Root Cause**: Page failed to load within timeout + +**Why It Happens:** +- Slow website +- Large page assets +- Network issues +- Page never reaches desired load state + +**Solution 1: Increase Timeout** +```typescript +await page.goto(url, { + timeout: 60000 // 60 seconds +}); +``` + +**Solution 2: Change Wait Condition** +```typescript +// ❌ Strict (waits for all network requests) +await page.goto(url, { waitUntil: "networkidle0" }); + +// ✅ Most lenient (waits for DOMContentLoaded only) +await page.goto(url, { waitUntil: "domcontentloaded" }); + +// ✅ Moderate (waits for the load event; this is Puppeteer's default) +await page.goto(url, { waitUntil: "load" }); +``` + +**Solution 3: Handle Timeout Gracefully** +```typescript +try { + await page.goto(url, { timeout: 30000 }); +} catch (error) { + if (error instanceof Error && error.name === "TimeoutError") { + console.log("Navigation timeout, taking screenshot anyway"); + const screenshot = await page.screenshot(); + return screenshot; + } + throw error; +} +``` + +**Prevention**: Set appropriate timeouts for your use case. Use lenient wait conditions for slow sites. + +--- + +## Error 8: Memory Limit Exceeded + +**Full Error:** +``` +Error: Browser exceeded its memory limit +``` + +**Root Cause**: Page too large or too many tabs open + +**Why It Happens:** +- Opening many tabs simultaneously +- Large pages with many assets +- Memory leaks from not closing pages + +**Solution 1: Close Pages** +```typescript +const page = await browser.newPage(); +// ... use page ... +await page.close(); // ← Don't forget! 
+``` + +**Solution 2: Limit Concurrent Tabs** +```typescript +import PQueue from "p-queue"; + +const browser = await puppeteer.launch(env.MYBROWSER); +const queue = new PQueue({ concurrency: 5 }); // Max 5 tabs + +await Promise.all(urls.map(url => + queue.add(async () => { + const page = await browser.newPage(); + await page.goto(url); + // ... + await page.close(); + }) +)); +``` + +**Solution 3: Use Smaller Viewports** +```typescript +await page.setViewport({ + width: 1280, + height: 720 // Smaller than default +}); +``` + +**Prevention**: Always close pages when done. Limit concurrent tabs. Process URLs in batches. + +--- + +## Error 9: Failed to Connect to Session + +**Full Error:** +``` +Error: Failed to connect to browser session +``` + +**Root Cause**: Session closed between `.sessions()` and `.connect()` calls + +**Why It Happens:** +- Session timed out (60s idle) +- Session closed by another Worker +- Session terminated unexpectedly + +**Solution: Handle Connection Failures** +```typescript +const sessions = await puppeteer.sessions(env.MYBROWSER); +const freeSession = sessions.find(s => !s.connectionId); + +if (freeSession) { + try { + const browser = await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + return browser; + } catch (error) { + console.log("Failed to connect to session, launching new browser"); + } +} + +// Fall back to launching new browser +return await puppeteer.launch(env.MYBROWSER); +``` + +**Prevention**: Always wrap `puppeteer.connect()` in try-catch. Have fallback to `puppeteer.launch()`. 
+ +--- + +## Error 10: Too Many Requests Per Minute + +**Full Error:** +``` +Error: Too many browser launches per minute +``` + +**Root Cause**: Exceeded "new browsers per minute" limit + +**Limits:** +- Free tier: 3 per minute (1 every 20 seconds) +- Paid tier: 30 per minute (1 every 2 seconds) + +**Solution: Implement Rate Limiting** +```typescript +async function launchWithRateLimit(env: Env): Promise<Browser> { + const limits = await puppeteer.limits(env.MYBROWSER); + + if (limits.allowedBrowserAcquisitions === 0) { + const delay = limits.timeUntilNextAllowedBrowserAcquisition || 2000; + console.log(`Rate limited, waiting ${delay}ms`); + await new Promise(resolve => setTimeout(resolve, delay)); + } + + return await puppeteer.launch(env.MYBROWSER); +} +``` + +**Prevention**: Check limits before launching. Implement exponential backoff. Reuse sessions instead of launching new browsers. + +--- + +## Error 11: Binding Not Configured + +**Full Error:** +``` +Error: Browser binding not found +``` + +**Root Cause**: Browser binding not configured in wrangler.jsonc + +**Solution: Add Browser Binding** +```jsonc +// wrangler.jsonc +{ + "browser": { + "binding": "MYBROWSER" + }, + "compatibility_flags": ["nodejs_compat"] +} +``` + +**Also Add to TypeScript Types:** +```typescript +interface Env { + MYBROWSER: Fetcher; +} +``` + +**Prevention**: Always configure browser binding and nodejs_compat flag. + +--- + +## Error 12: nodejs_compat Flag Missing + +**Full Error:** +``` +Error: Node.js APIs not available +``` + +**Root Cause**: `nodejs_compat` compatibility flag not enabled + +**Solution: Add Compatibility Flag** +```jsonc +// wrangler.jsonc +{ + "compatibility_flags": ["nodejs_compat"] +} +``` + +**Why It's Required:** +Browser Rendering needs Node.js APIs and polyfills to work. + +**Prevention**: Always include `nodejs_compat` when using Browser Rendering. 
+ +--- + +## Error Handling Template + +Complete error handling for production use: + +```typescript +import puppeteer, { Browser } from "@cloudflare/puppeteer"; + +interface Env { + MYBROWSER: Fetcher; +} + +async function withBrowser<T>( + env: Env, + fn: (browser: Browser) => Promise<T> +): Promise<T> { + let browser: Browser | null = null; + + try { + // Check limits + const limits = await puppeteer.limits(env.MYBROWSER); + if (limits.allowedBrowserAcquisitions === 0) { + throw new Error( + `Rate limit reached. Retry after ${limits.timeUntilNextAllowedBrowserAcquisition}ms` + ); + } + + // Try to reuse session + const sessions = await puppeteer.sessions(env.MYBROWSER); + const freeSession = sessions.find(s => !s.connectionId); + + if (freeSession) { + try { + browser = await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + } catch (error) { + console.log("Failed to connect, launching new browser"); + browser = await puppeteer.launch(env.MYBROWSER); + } + } else { + browser = await puppeteer.launch(env.MYBROWSER); + } + + // Execute user function + const result = await fn(browser); + + // Disconnect (keep session alive) + await browser.disconnect(); + + return result; + } catch (error) { + // Close on error + if (browser) { + await browser.close(); + } + + // Re-throw with context + if (error instanceof Error) { + error.message = `Browser operation failed: ${error.message}`; + } + throw error; + } +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + try { + const screenshot = await withBrowser(env, async (browser) => { + const page = await browser.newPage(); + + try { + await page.goto("https://example.com", { + waitUntil: "networkidle0", + timeout: 30000 + }); + } catch (error) { + if (error instanceof Error && error.name === "TimeoutError") { + console.log("Navigation timeout, taking screenshot anyway"); + } else { + throw error; + } + } + + return await page.screenshot(); + }); + + return new Response(screenshot, { + headers: { 
"content-type": "image/png" } + }); + } catch (error) { + console.error("Request failed:", error); + + return new Response( + JSON.stringify({ + error: error instanceof Error ? error.message : "Unknown error" + }), + { + status: 500, + headers: { "content-type": "application/json" } + } + ); + } + } +}; +``` + +--- + +## Debugging Checklist + +When encountering browser errors: + +1. **Check browser binding** + - [ ] Binding configured in wrangler.jsonc? + - [ ] nodejs_compat flag enabled? + - [ ] Binding passed to puppeteer.launch()? + +2. **Check limits** + - [ ] Within concurrent browser limit? + - [ ] Within new browsers/minute limit? + - [ ] Call puppeteer.limits() to verify? + +3. **Check timeouts** + - [ ] Navigation timeout appropriate? + - [ ] Browser keep_alive set if needed? + - [ ] Timeout errors handled gracefully? + +4. **Check session management** + - [ ] browser.close() called on errors? + - [ ] Pages closed when done? + - [ ] Session reuse implemented correctly? + +5. **Check network** + - [ ] Target URL accessible? + - [ ] No CORS/bot protection issues? + - [ ] Appropriate wait conditions used? + +--- + +## References + +- **FAQ**: https://developers.cloudflare.com/browser-rendering/faq/ +- **Limits**: https://developers.cloudflare.com/browser-rendering/platform/limits/ +- **GitHub Issues**: https://github.com/cloudflare/puppeteer/issues +- **Discord**: https://discord.cloudflare.com/ + +--- + +**Last Updated**: 2025-10-22 diff --git a/references/pricing-and-limits.md b/references/pricing-and-limits.md new file mode 100644 index 0000000..40e5fb8 --- /dev/null +++ b/references/pricing-and-limits.md @@ -0,0 +1,593 @@ +# Pricing and Limits Reference + +Complete breakdown of Cloudflare Browser Rendering pricing, limits, and cost optimization strategies. + +--- + +## Pricing Overview + +Browser Rendering is billed on **two metrics**: + +1. **Duration** - Total browser hours used +2. 
**Concurrency** - Monthly average of concurrent browsers (Workers Bindings only) + +--- + +## Free Tier (Workers Free Plan) + +| Feature | Limit | +|---------|-------| +| **Browser Duration** | 10 minutes per day | +| **Concurrent Browsers** | 3 per account | +| **New Browsers per Minute** | 3 per minute | +| **REST API Requests** | 6 per minute | +| **Browser Timeout (Idle)** | 60 seconds | +| **Max Session Duration** | No hard limit (closes on idle timeout) | + +### Free Tier Use Cases + +**Good for:** +- Development and testing +- Personal projects +- Low-traffic screenshot services (<100 requests/day) +- Learning and experimentation + +**Not suitable for:** +- Production applications +- High-traffic services +- Long-running scraping jobs +- Batch operations (>3 concurrent browsers) + +--- + +## Paid Tier (Workers Paid Plan) + +### Included Limits + +| Feature | Included | +|---------|----------| +| **Browser Duration** | 10 hours per month | +| **Concurrent Browsers** | 10 (monthly average) | +| **New Browsers per Minute** | 30 per minute | +| **REST API Requests** | 180 per minute | +| **Max Concurrent Browsers** | 30 per account | +| **Browser Timeout** | 60 seconds (extendable to 10 minutes with keep_alive) | + +### Beyond Included Limits + +| Metric | Price | +|--------|-------| +| **Additional Browser Hours** | $0.09 per hour | +| **Additional Concurrent Browsers** | $2.00 per browser (monthly average) | + +### Requesting Higher Limits + +If you need more than: +- 30 concurrent browsers +- 30 new browsers per minute +- 180 REST API requests per minute + +**Request higher limits**: https://forms.gle/CdueDKvb26mTaepa9 + +--- + +## Rate Limits + +### Per-Second Enforcement + +Rate limits are enforced **per-second**, not per-minute. 
+ +**Example**: 180 requests per minute = 3 requests per second + +**This means:** +- ❌ Cannot send all 180 requests at once +- ✅ Must spread evenly over the minute (3/second) + +**Implementation:** +```typescript +async function rateLimitedLaunch(env: Env): Promise<Browser> { + const limits = await puppeteer.limits(env.MYBROWSER); + + if (limits.allowedBrowserAcquisitions === 0) { + const delay = limits.timeUntilNextAllowedBrowserAcquisition; + await new Promise(resolve => setTimeout(resolve, delay)); + } + + return await puppeteer.launch(env.MYBROWSER); +} +``` + +### Free Tier Rate Limits + +- **Concurrent browsers**: 3 +- **New browsers/minute**: 3 (= 1 every 20 seconds) +- **REST API requests/minute**: 6 (= 1 every 10 seconds) + +### Paid Tier Rate Limits + +- **Concurrent browsers**: 30 (default, can request higher) +- **New browsers/minute**: 30 (= 1 every 2 seconds) +- **REST API requests/minute**: 180 (= 3 per second) + +--- + +## Duration Billing + +### How It Works + +1. **Daily Totals**: Cloudflare sums all browser usage each day (in seconds) +2. **Monthly Total**: Sum of all daily totals +3. **Rounded to Hours**: Total rounded to nearest hour +4. **Billed**: Total hours minus 10 included hours + +**Example:** +- Day 1: 60 seconds (1 minute) +- Day 2: 120 seconds (2 minutes) +- ... +- Day 30: 90 seconds (1.5 minutes) +- **Monthly Total**: 45 minutes = 0.75 hours (rounded to 1 hour) +- **Billable**: 1 hour - 10 included = 0 hours (still within free allowance) + +### Failed Requests + +**Failed requests are NOT billed** if they fail with `waitForTimeout` error. + +**Example:** +```typescript +try { + await page.goto(url, { timeout: 30000 }); +} catch (error) { + // If this times out, browser time is NOT charged + console.log("Navigation timeout - not billed"); +} +``` + +### Duration Optimization + +**Minimize browser time:** + +1. **Close browsers promptly** + ```typescript + await browser.close(); // Don't leave hanging + ``` + +2. 
**Use session reuse** + ```typescript + // Reuse session instead of launching new browser + const browser = await puppeteer.connect(env.MYBROWSER, sessionId); + ``` + +3. **Timeout management** + ```typescript + // Set appropriate timeouts (don't wait forever) + await page.goto(url, { timeout: 30000 }); + ``` + +4. **Cache aggressively** + ```typescript + // Cache screenshots in KV to avoid re-rendering + const cached = await env.KV.get(url, { type: "arrayBuffer" }); + if (cached) return new Response(cached); + ``` + +--- + +## Concurrency Billing + +### How It Works + +1. **Daily Peak**: Cloudflare records highest concurrent browsers each day +2. **Monthly Average**: Average of all daily peaks +3. **Billed**: Average - 10 included browsers + +**Formula:** +``` +monthly_average = sum(daily_peaks) / days_in_month +billable = max(0, monthly_average - 10) +cost = billable * $2.00 +``` + +**Example:** +- Days 1-15: 10 concurrent browsers (daily peak) +- Days 16-30: 20 concurrent browsers (daily peak) +- Monthly average: ((10 × 15) + (20 × 15)) / 30 = 15 browsers +- Billable: 15 - 10 = 5 browsers +- **Cost**: 5 × $2.00 = **$10.00** + +### Concurrency vs Duration + +| Scenario | Concurrency Impact | Duration Impact | +|----------|-------------------|-----------------| +| 1 browser for 10 hours | 1 concurrent browser | 10 browser hours | +| 10 browsers for 1 hour | 10 concurrent browsers | 10 browser hours | +| 100 browsers for 6 minutes | 100 concurrent browsers (!!) | 10 browser hours | + +**Key Insight**: Short bursts of high concurrency are EXPENSIVE. + +### Concurrency Optimization + +**Minimize concurrent browsers:** + +1. **Use multiple tabs** + ```typescript + // ❌ Bad: 10 browsers + for (const url of urls) { + const browser = await puppeteer.launch(env.MYBROWSER); + // ... + } + + // ✅ Good: 1 browser, 10 tabs + const browser = await puppeteer.launch(env.MYBROWSER); + await Promise.all(urls.map(async url => { + const page = await browser.newPage(); + // ... 
+ })); + ``` + +2. **Session reuse** + ```typescript + // Maintain pool of warm browsers + // Reuse instead of launching new ones + ``` + +3. **Queue requests** + ```typescript + // Limit concurrent operations + const queue = new PQueue({ concurrency: 3 }); + await Promise.all(urls.map(url => queue.add(() => process(url)))); + ``` + +4. **Incognito contexts** + ```typescript + // Share browser, isolate sessions + const context1 = await browser.createBrowserContext(); + const context2 = await browser.createBrowserContext(); + ``` + +--- + +## Cost Examples + +### Example 1: Screenshot Service + +**Scenario:** +- 10,000 screenshots per month +- 3 second average per screenshot +- No caching, no session reuse + +**Duration:** +- 10,000 × 3 seconds = 30,000 seconds = 8.33 hours +- Billable: 8.33 - 10 = 0 hours (within free allowance) +- **Duration Cost**: $0.00 + +**Concurrency:** +- Assume 100 requests/hour during peak (9am-5pm weekdays) +- 100 requests/hour ÷ 3600 seconds = 0.028 requests/second (× 3 seconds each ≈ 0.08 browsers on average) +- Peak: ~3 concurrent browsers (conservative allowance for bursts of simultaneous requests) +- Daily peak (weekdays): 3 browsers +- Daily peak (weekends): 1 browser +- Monthly average: ((3 × 22) + (1 × 8)) / 30 = 2.5 browsers +- Billable: 2.5 - 10 = 0 (within free allowance) +- **Concurrency Cost**: $0.00 + +**Total: $0.00** (within the Paid plan's included allowance!) 
+ +--- + +### Example 2: Heavy Scraping + +**Scenario:** +- 1,000 URLs per day +- 10 seconds average per URL +- Batch processing (10 concurrent browsers) + +**Duration:** +- 1,000 × 10 seconds × 30 days = 300,000 seconds = 83.33 hours +- Billable: 83.33 - 10 = 73.33 hours +- **Duration Cost**: 73.33 × $0.09 = **$6.60** + +**Concurrency:** +- Daily peak: 10 concurrent browsers (every day) +- Monthly average: 10 browsers +- Billable: 10 - 10 = 0 (within free allowance) +- **Concurrency Cost**: $0.00 + +**Total: $6.60/month** + +--- + +### Example 3: Burst Traffic + +**Scenario:** +- Newsletter sent monthly with screenshot links +- 10,000 screenshots in 1 hour +- Each screenshot: 3 seconds + +**Duration:** +- 10,000 × 3 seconds = 30,000 seconds = 8.33 hours +- Billable: 8.33 - 10 = 0 hours +- **Duration Cost**: $0.00 + +**Concurrency:** +- 10,000 screenshots in 1 hour = 166 requests/minute +- At 3 seconds each: ~8.3 concurrent browsers +- But limited to 30 max, so likely queueing +- Daily peak: 30 browsers (rate limit) +- Monthly average: (30 × 1 day + 1 × 29 days) / 30 = 1.97 browsers +- Billable: 1.97 - 10 = 0 +- **Concurrency Cost**: $0.00 + +**Total: $0.00** + +**Note**: Would hit rate limits. Better to spread over longer period or request higher limits. + +--- + +### Example 4: Production API (Optimized) + +**Scenario:** +- 100,000 screenshots per month +- Session reuse + KV caching (90% cache hit rate) +- 10,000 actual browser renderings +- 5 seconds average per render +- Maintain pool of 5 warm browsers + +**Duration:** +- 10,000 × 5 seconds = 50,000 seconds = 13.89 hours +- Billable: 13.89 - 10 = 3.89 hours +- **Duration Cost**: 3.89 × $0.09 = **$0.35** + +**Concurrency:** +- Maintain pool of 5 browsers (keep_alive) +- Daily peak: 5 browsers +- Monthly average: 5 browsers +- Billable: 5 - 10 = 0 +- **Concurrency Cost**: $0.00 + +**Total: $0.35/month** for 100k requests! + +**ROI**: $0.0000035 per screenshot + +--- + +## Cost Optimization Strategies + +### 1. 
Aggressive Caching + +**Strategy**: Cache screenshots/PDFs in KV or R2 + +**Impact**: +- Reduces browser hours by 80-95% +- Reduces concurrency needs +- Faster response times + +**Implementation**: +```typescript +// Check cache first +const cached = await env.KV.get(url, { type: "arrayBuffer" }); +if (cached) return new Response(cached); + +// Generate and cache +const screenshot = await generateScreenshot(url); +await env.KV.put(url, screenshot, { expirationTtl: 86400 }); +``` + +**Cost Savings**: 80-95% reduction + +--- + +### 2. Session Reuse + +**Strategy**: Maintain pool of warm browsers, reuse sessions + +**Impact**: +- Reduces cold start time +- Lower concurrency charges +- Better throughput + +**Implementation**: See `session-reuse.ts` template + +**Cost Savings**: 30-50% reduction + +--- + +### 3. Multiple Tabs + +**Strategy**: Use tabs instead of multiple browsers + +**Impact**: +- 10-50x reduction in concurrency +- Minimal duration increase +- Much cheaper + +**Implementation**: +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); +await Promise.all(urls.map(async url => { + const page = await browser.newPage(); + // process + await page.close(); +})); +await browser.close(); +``` + +**Cost Savings**: 90%+ reduction in concurrency charges + +--- + +### 4. Appropriate Timeouts + +**Strategy**: Set reasonable timeouts, don't wait forever + +**Impact**: +- Prevents hanging browsers +- Reduces wasted duration +- Better error handling + +**Implementation**: +```typescript +await page.goto(url, { + timeout: 30000, // 30 second max + waitUntil: "networkidle0" +}); +``` + +**Cost Savings**: 20-40% reduction + +--- + +### 5. 
Request Queueing + +**Strategy**: Limit concurrent operations to stay within limits + +**Impact**: +- Avoid rate limit errors +- Predictable costs +- Better resource utilization + +**Implementation**: +```typescript +import PQueue from "p-queue"; + +const queue = new PQueue({ concurrency: 5 }); + +await Promise.all(urls.map(url => + queue.add(() => processUrl(url)) +)); +``` + +**Cost Savings**: Avoids rate limit charges + +--- + +## Monitoring Usage + +### Dashboard + +View usage in Cloudflare Dashboard: + +https://dash.cloudflare.com/?to=/:account/workers/browser-rendering + +**Metrics available:** +- Total browser hours used +- REST API requests +- Concurrent browsers (graph) +- Cost estimates + +### Response Headers + +REST API returns browser time used: + +``` +X-Browser-Ms-Used: 2340 +``` + +(Browser time in milliseconds for that request) + +### Custom Tracking + +```typescript +interface UsageMetrics { + date: string; + browserHours: number; + peakConcurrency: number; + requests: number; + cacheHitRate: number; +} + +// Track in D1 or Analytics Engine +await env.ANALYTICS.writeDataPoint({ + indexes: [date], + blobs: ["browser_usage"], + doubles: [browserHours, peakConcurrency, requests] +}); +``` + +--- + +## Cost Alerts + +### Set Up Alerts + +1. **Monitor daily peaks** + ```typescript + const limits = await puppeteer.limits(env.MYBROWSER); + if (limits.activeSessions.length > 15) { + console.warn("High concurrency detected:", limits.activeSessions.length); + } + ``` + +2. **Track hourly usage** + ```typescript + const usage = await getHourlyUsage(); + if (usage.browserHours > 1) { + console.warn("High browser usage this hour:", usage.browserHours); + } + ``` + +3. **Set budget limits** + ```typescript + const monthlyBudget = 50; // $50/month + const currentCost = await estimateCurrentCost(); + if (currentCost > monthlyBudget * 0.8) { + console.warn("Approaching monthly budget:", currentCost); + } + ``` + +--- + +## Best Practices Summary + +1. 
**Always cache** screenshots/PDFs in KV or R2 +2. **Reuse sessions** instead of launching new browsers +3. **Use multiple tabs** instead of multiple browsers +4. **Set appropriate timeouts** to prevent hanging +5. **Monitor usage** in dashboard and logs +6. **Queue requests** to stay within rate limits +7. **Test caching** to optimize hit rate +8. **Profile operations** to identify slow requests +9. **Use incognito contexts** for session isolation +10. **Request higher limits** if needed for production + +--- + +## Common Questions + +### Q: Are failed requests billed? + +**A**: No. Requests that fail with `waitForTimeout` error are NOT billed. + +### Q: How is concurrency calculated? + +**A**: Monthly average of daily peak concurrent browsers. + +### Q: Can I reduce my bill? + +**A**: Yes! Use caching, session reuse, and multiple tabs. See optimization strategies above. + +### Q: What if I hit limits? + +**A**: Implement queueing, or request higher limits: https://forms.gle/CdueDKvb26mTaepa9 + +### Q: Is there a free tier? + +**A**: Yes! 10 minutes/day browser time, 3 concurrent browsers. + +### Q: How do I estimate costs? 
+ +**A**: Monitor usage in dashboard, then calculate: +- Duration: (hours - 10) × $0.09 +- Concurrency: (avg - 10) × $2.00 + +--- + +## References + +- **Official Pricing Docs**: https://developers.cloudflare.com/browser-rendering/platform/pricing/ +- **Limits Docs**: https://developers.cloudflare.com/browser-rendering/platform/limits/ +- **Dashboard**: https://dash.cloudflare.com/?to=/:account/workers/browser-rendering +- **Request Higher Limits**: https://forms.gle/CdueDKvb26mTaepa9 + +--- + +**Last Updated**: 2025-10-22 diff --git a/references/puppeteer-vs-playwright.md b/references/puppeteer-vs-playwright.md new file mode 100644 index 0000000..d62e408 --- /dev/null +++ b/references/puppeteer-vs-playwright.md @@ -0,0 +1,627 @@ +# Puppeteer vs Playwright Comparison + +Complete comparison guide for choosing between @cloudflare/puppeteer and @cloudflare/playwright. + +--- + +## Quick Recommendation + +**Use Puppeteer if:** +- ✅ Starting a new project +- ✅ Need session management features +- ✅ Want to optimize performance/costs +- ✅ Building screenshot/PDF services +- ✅ Web scraping workflows + +**Use Playwright if:** +- ✅ Already have Playwright tests to migrate +- ✅ Prefer auto-waiting behavior +- ✅ Don't need advanced session features +- ✅ Want cross-browser APIs (even if only Chromium supported now) + +**Bottom Line**: **Puppeteer is recommended** for most Browser Rendering use cases. 
+ +--- + +## Package Installation + +### Puppeteer +```bash +npm install @cloudflare/puppeteer +``` + +**Version**: 1.0.4 (based on Puppeteer v22.13.1) + +### Playwright +```bash +npm install @cloudflare/playwright +``` + +**Version**: 1.0.0 (based on Playwright v1.55.0) + +--- + +## API Comparison + +### Launching a Browser + +**Puppeteer:** +```typescript +import puppeteer from "@cloudflare/puppeteer"; + +const browser = await puppeteer.launch(env.MYBROWSER); +``` + +**Playwright:** +```typescript +import { chromium } from "@cloudflare/playwright"; + +const browser = await chromium.launch(env.BROWSER); +``` + +**Key Difference**: Playwright uses `chromium.launch()` (browser-specific), Puppeteer uses `puppeteer.launch()` (generic). + +--- + +### Basic Screenshot Example + +**Puppeteer:** +```typescript +import puppeteer from "@cloudflare/puppeteer"; + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const browser = await puppeteer.launch(env.MYBROWSER); + const page = await browser.newPage(); + await page.goto("https://example.com"); + const screenshot = await page.screenshot(); + await browser.close(); + + return new Response(screenshot, { + headers: { "content-type": "image/png" } + }); + } +}; +``` + +**Playwright:** +```typescript +import { chromium } from "@cloudflare/playwright"; + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const browser = await chromium.launch(env.BROWSER); + const page = await browser.newPage(); + await page.goto("https://example.com"); + const screenshot = await page.screenshot(); + await browser.close(); + + return new Response(screenshot, { + headers: { "content-type": "image/png" } + }); + } +}; +``` + +**Key Difference**: Nearly identical! Main difference is import and launch method. 
+ +--- + +## Feature Comparison + +| Feature | Puppeteer | Playwright | Notes | +|---------|-----------|------------|-------| +| **Basic Screenshots** | ✅ Yes | ✅ Yes | Both support PNG/JPEG | +| **PDF Generation** | ✅ Yes | ✅ Yes | Identical API | +| **Page Navigation** | ✅ Yes | ✅ Yes | Similar API | +| **Element Selectors** | CSS only | CSS, text | Playwright has more selector types | +| **Auto-waiting** | ❌ Manual | ✅ Built-in | Playwright waits for elements automatically | +| **Session Management** | ✅ Advanced | ⚠️ Basic | Puppeteer has .sessions(), .history(), .limits() | +| **Session Reuse** | ✅ Yes | ⚠️ Limited | Puppeteer has .connect() with sessionId | +| **Browser Contexts** | ✅ Yes | ✅ Yes | Both support incognito contexts | +| **Multiple Tabs** | ✅ Yes | ✅ Yes | Both support multiple pages | +| **Network Interception** | ✅ Yes | ✅ Yes | Similar APIs | +| **Geolocation** | ✅ Yes | ✅ Yes | Similar APIs | +| **Emulation** | ✅ Yes | ✅ Yes | Device emulation, viewport | +| **Browser Support** | Chromium only | Chromium only | Firefox/Safari not yet supported | +| **TypeScript Types** | ✅ Yes | ✅ Yes | Both fully typed | + +--- + +## Session Management + +### Puppeteer (Advanced) + +```typescript +// List active sessions +const sessions = await puppeteer.sessions(env.MYBROWSER); + +// Find free session +const freeSession = sessions.find(s => !s.connectionId); + +// Connect to existing session +if (freeSession) { + const browser = await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); +} + +// Check limits +const limits = await puppeteer.limits(env.MYBROWSER); +console.log("Can launch:", limits.allowedBrowserAcquisitions > 0); + +// View history +const history = await puppeteer.history(env.MYBROWSER); +``` + +**Puppeteer APIs:** +- ✅ `puppeteer.sessions()` - List active sessions +- ✅ `puppeteer.connect()` - Connect to session by ID +- ✅ `puppeteer.history()` - View recent sessions +- ✅ `puppeteer.limits()` - Check account limits +- ✅ 
`browser.sessionId()` - Get current session ID +- ✅ `browser.disconnect()` - Disconnect without closing + +--- + +### Playwright (Basic) + +```typescript +// Launch browser +const browser = await chromium.launch(env.BROWSER); + +// Get session info (basic) +// Note: No .sessions(), .history(), or .limits() APIs +``` + +**Playwright APIs:** +- ❌ No `chromium.sessions()` equivalent +- ❌ No session reuse APIs +- ❌ No limits checking +- ❌ No session history + +**Workaround**: Use Puppeteer-style session management via REST API (more complex). + +--- + +## Auto-Waiting Behavior + +### Puppeteer (Manual) + +```typescript +// Must explicitly wait for elements +await page.goto("https://example.com"); +await page.waitForSelector("button#submit"); +await page.click("button#submit"); +``` + +**Pros**: Fine-grained control + +**Cons**: More verbose, easy to forget waits + +--- + +### Playwright (Auto-waiting) + +```typescript +// Automatically waits for elements +await page.goto("https://example.com"); +await page.click("button#submit"); // Waits automatically! 
+``` + +**Pros**: Less boilerplate, fewer timing issues + +**Cons**: Less control over wait behavior + +--- + +## Selector Support + +### Puppeteer + +**Supported:** +- CSS selectors: `"button#submit"`, `"div > p"` +- `:visible`, `:hidden` pseudo-classes +- `page.$()`, `page.$$()` for querying + +**Not Supported:** +- XPath selectors (use `page.evaluate()` workaround) +- Text selectors +- Layout selectors + +**Example:** +```typescript +// CSS selector +const button = await page.$("button#submit"); + +// XPath workaround +const heading = await page.evaluate(() => { + return new XPathEvaluator() + .createExpression("//h1[@class='title']") + .evaluate(document, XPathResult.FIRST_ORDERED_NODE_TYPE) + .singleNodeValue.textContent; +}); +``` + +--- + +### Playwright + +**Supported:** +- CSS selectors: `"button#submit"` +- Text selectors: `"text=Submit"` +- XPath selectors: `"xpath=//button"` +- Layout selectors: `"button :right-of(:text('Cancel'))"` + +**Example:** +```typescript +// CSS selector +await page.click("button#submit"); + +// Text selector +await page.click("text=Submit"); + +// Combined selector +await page.click("button >> text=Submit"); +``` + +**Advantage**: More flexible selector options + +--- + +## Performance & Cost + +### Puppeteer (Optimized) + +**Session Reuse:** +```typescript +// Reuse sessions to reduce costs +const sessions = await puppeteer.sessions(env.MYBROWSER); +const browser = await puppeteer.connect(env.MYBROWSER, sessionId); +await browser.disconnect(); // Keep alive +``` + +**Cost Impact:** +- ✅ Reduce cold starts by 50-70% +- ✅ Lower concurrency charges +- ✅ Better throughput + +--- + +### Playwright (Limited Optimization) + +**No Session Reuse:** +```typescript +// Must launch new browser each time +const browser = await chromium.launch(env.BROWSER); +await browser.close(); // Cannot keep alive for reuse +``` + +**Cost Impact:** +- ❌ Higher browser hours (cold starts every request) +- ❌ Higher concurrency usage +- ❌ Lower throughput 
+ +**Difference**: ~30-50% higher costs with Playwright vs optimized Puppeteer. + +--- + +## API Differences + +| Operation | Puppeteer | Playwright | +|-----------|-----------|------------| +| **Import** | `import puppeteer from "@cloudflare/puppeteer"` | `import { chromium } from "@cloudflare/playwright"` | +| **Launch** | `puppeteer.launch(env.MYBROWSER)` | `chromium.launch(env.BROWSER)` | +| **Connect** | `puppeteer.connect(env.MYBROWSER, sessionId)` | ❌ Not available | +| **Sessions** | `puppeteer.sessions(env.MYBROWSER)` | ❌ Not available | +| **Limits** | `puppeteer.limits(env.MYBROWSER)` | ❌ Not available | +| **Goto** | `page.goto(url, { waitUntil: "networkidle0" })` | `page.goto(url, { waitUntil: "networkidle" })` | +| **Screenshot** | `page.screenshot({ fullPage: true })` | `page.screenshot({ fullPage: true })` | +| **PDF** | `page.pdf({ format: "A4" })` | `page.pdf({ format: "A4" })` | +| **Wait** | `page.waitForSelector("button")` | `page.locator("button").waitFor()` | +| **Click** | `page.click("button")` | `page.click("button")` (auto-waits) | + +--- + +## Migration Guide + +### Puppeteer → Playwright + +```typescript +// Before (Puppeteer) +import puppeteer from "@cloudflare/puppeteer"; + +const browser = await puppeteer.launch(env.MYBROWSER); +const page = await browser.newPage(); +await page.goto(url, { waitUntil: "networkidle0" }); +await page.waitForSelector("button#submit"); +await page.click("button#submit"); +const screenshot = await page.screenshot(); +await browser.close(); +``` + +```typescript +// After (Playwright) +import { chromium } from "@cloudflare/playwright"; + +const browser = await chromium.launch(env.BROWSER); +const page = await browser.newPage(); +await page.goto(url, { waitUntil: "networkidle" }); +// No waitForSelector needed - auto-waits +await page.click("button#submit"); +const screenshot = await page.screenshot(); +await browser.close(); +``` + +**Changes:** +1. Import: `puppeteer` → `{ chromium }` +2. 
Launch: `puppeteer.launch()` → `chromium.launch()` +3. Wait: `networkidle0` → `networkidle` +4. Remove explicit `waitForSelector()` (auto-waits) + +--- + +### Playwright → Puppeteer + +```typescript +// Before (Playwright) +import { chromium } from "@cloudflare/playwright"; + +const browser = await chromium.launch(env.BROWSER); +const page = await browser.newPage(); +await page.goto(url); +await page.click("button#submit"); // Auto-waits +``` + +```typescript +// After (Puppeteer) +import puppeteer from "@cloudflare/puppeteer"; + +const browser = await puppeteer.launch(env.MYBROWSER); +const page = await browser.newPage(); +await page.goto(url, { waitUntil: "networkidle0" }); +await page.waitForSelector("button#submit"); // Explicit wait +await page.click("button#submit"); +``` + +**Changes:** +1. Import: `{ chromium }` → `puppeteer` +2. Launch: `chromium.launch()` → `puppeteer.launch()` +3. Add explicit waits: `page.waitForSelector()` +4. Specify wait conditions: `waitUntil: "networkidle0"` + +--- + +## Use Case Recommendations + +### Screenshot Service +**Winner**: **Puppeteer** + +**Reason**: Session reuse reduces costs by 30-50% + +```typescript +// Puppeteer: Reuse sessions +const sessions = await puppeteer.sessions(env.MYBROWSER); +const browser = await puppeteer.connect(env.MYBROWSER, sessionId); +await browser.disconnect(); // Keep alive +``` + +--- + +### PDF Generation +**Winner**: **Tie** + +**Reason**: Identical API, no session reuse benefit + +```typescript +// Both have same API +const pdf = await page.pdf({ format: "A4" }); +``` + +--- + +### Web Scraping +**Winner**: **Puppeteer** + +**Reason**: Session management + limit checking + +```typescript +// Puppeteer: Check limits before scraping +const limits = await puppeteer.limits(env.MYBROWSER); +if (limits.allowedBrowserAcquisitions === 0) { + await delay(limits.timeUntilNextAllowedBrowserAcquisition); +} +``` + +--- + +### Test Migration +**Winner**: **Playwright** + +**Reason**: Easier to migrate 
existing Playwright tests + +```typescript +// Minimal changes needed +// Just update imports and launch +``` + +--- + +### Interactive Automation +**Winner**: **Tie** + +**Reason**: Both support form filling, clicking, etc. + +--- + +## Configuration + +### wrangler.jsonc (Puppeteer) + +```jsonc +{ + "browser": { + "binding": "MYBROWSER" + }, + "compatibility_flags": ["nodejs_compat"] +} +``` + +```typescript +interface Env { + MYBROWSER: Fetcher; +} +``` + +--- + +### wrangler.jsonc (Playwright) + +```jsonc +{ + "browser": { + "binding": "BROWSER" + }, + "compatibility_flags": ["nodejs_compat"] +} +``` + +```typescript +interface Env { + BROWSER: Fetcher; +} +``` + +**Note**: Binding name is arbitrary, but convention is `MYBROWSER` for Puppeteer and `BROWSER` for Playwright. + +--- + +## Production Considerations + +### Puppeteer Advantages +- ✅ Session reuse (30-50% cost savings) +- ✅ Limit checking (`puppeteer.limits()`) +- ✅ Session monitoring (`puppeteer.sessions()`, `.history()`) +- ✅ Better performance optimization options +- ✅ More mature Cloudflare fork + +### Playwright Advantages +- ✅ Auto-waiting (less code) +- ✅ More selector types +- ✅ Better cross-browser APIs (future-proof) +- ✅ Easier migration from existing tests + +--- + +## Recommendation Summary + +| Scenario | Recommended | Reason | +|----------|-------------|--------| +| New project | **Puppeteer** | Session management + cost optimization | +| Screenshot service | **Puppeteer** | Session reuse saves 30-50% | +| PDF generation | **Tie** | Identical API | +| Web scraping | **Puppeteer** | Limit checking + session management | +| Migrating Playwright tests | **Playwright** | Minimal changes needed | +| High traffic production | **Puppeteer** | Better performance optimization | +| Quick prototype | **Tie** | Both easy to start with | + +--- + +## Code Examples + +### Puppeteer (Production-Optimized) + +```typescript +import puppeteer, { Browser } from "@cloudflare/puppeteer"; + +async function 
getBrowser(env: Env): Promise { + // Check limits + const limits = await puppeteer.limits(env.MYBROWSER); + if (limits.allowedBrowserAcquisitions === 0) { + throw new Error("Rate limit reached"); + } + + // Try to reuse session + const sessions = await puppeteer.sessions(env.MYBROWSER); + const freeSession = sessions.find(s => !s.connectionId); + + if (freeSession) { + try { + return await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + } catch { + // Session closed, launch new + } + } + + return await puppeteer.launch(env.MYBROWSER); +} + +export default { + async fetch(request: Request, env: Env): Promise { + const browser = await getBrowser(env); + + try { + const page = await browser.newPage(); + await page.goto("https://example.com", { + waitUntil: "networkidle0", + timeout: 30000 + }); + const screenshot = await page.screenshot(); + + // Disconnect (keep alive) + await browser.disconnect(); + + return new Response(screenshot, { + headers: { "content-type": "image/png" } + }); + } catch (error) { + await browser.close(); + throw error; + } + } +}; +``` + +--- + +### Playwright (Simple) + +```typescript +import { chromium } from "@cloudflare/playwright"; + +export default { + async fetch(request: Request, env: Env): Promise { + const browser = await chromium.launch(env.BROWSER); + + try { + const page = await browser.newPage(); + await page.goto("https://example.com", { + waitUntil: "networkidle", + timeout: 30000 + }); + const screenshot = await page.screenshot(); + + await browser.close(); + + return new Response(screenshot, { + headers: { "content-type": "image/png" } + }); + } catch (error) { + await browser.close(); + throw error; + } + } +}; +``` + +--- + +## References + +- **Puppeteer Docs**: https://pptr.dev/ +- **Playwright Docs**: https://playwright.dev/ +- **Cloudflare Puppeteer Fork**: https://github.com/cloudflare/puppeteer +- **Cloudflare Playwright Fork**: https://github.com/cloudflare/playwright +- **Browser Rendering Docs**: 
https://developers.cloudflare.com/browser-rendering/ + +--- + +**Last Updated**: 2025-10-22 diff --git a/references/session-management.md b/references/session-management.md new file mode 100644 index 0000000..0ffcc97 --- /dev/null +++ b/references/session-management.md @@ -0,0 +1,739 @@ +# Session Management Guide + +Complete guide to browser session management for performance optimization and concurrency handling. + +--- + +## Why Session Management Matters + +**The Problem:** +- Launching new browsers is slow (~2-3 seconds cold start) +- Each launch consumes concurrency quota +- Free tier: Only 3 concurrent browsers +- Paid tier: 10-30 concurrent browsers (costs $2/browser beyond included) + +**The Solution:** +- Reuse browser sessions across requests +- Use multiple tabs instead of multiple browsers +- Check limits before launching +- Disconnect (don't close) to keep sessions alive + +**Benefits:** +- ⚡ **50-70% faster** (no cold start) +- 💰 **Lower costs** (reduced concurrency charges) +- 📊 **Better utilization** (one browser, many tabs) + +--- + +## Session Lifecycle + +``` +1. Launch → Browser session created (session ID assigned) +2. Connected → Worker actively using browser +3. Disconnected → Session idle, available for reuse +4. Timeout → Session closed after 60s idle (configurable) +5. Closed → Session terminated (must launch new one) +``` + +### Session States + +| State | Description | Can Connect? | +|-------|-------------|--------------| +| **Active with connection** | Worker is using browser | ❌ No (occupied) | +| **Active without connection** | Browser idle, waiting | ✅ Yes (available) | +| **Closed** | Session terminated | ❌ No (gone) | + +--- + +## Session Management API + +### puppeteer.sessions() + +List all currently running browser sessions. 
+ +**Signature:** +```typescript +await puppeteer.sessions(binding: Fetcher): Promise<SessionInfo[]> +``` + +**Response:** +```typescript +interface SessionInfo { + sessionId: string; // Unique session ID + startTime: number; // Unix timestamp (ms) + connectionId?: string; // Present if worker is connected + connectionStartTime?: number; +} +``` + +**Example:** +```typescript +const sessions = await puppeteer.sessions(env.MYBROWSER); + +// Find free sessions (no active connection) +const freeSessions = sessions.filter(s => !s.connectionId); + +// Find occupied sessions +const occupiedSessions = sessions.filter(s => s.connectionId); + +console.log({ + total: sessions.length, + available: freeSessions.length, + occupied: occupiedSessions.length +}); +``` + +**Example `sessions` value (raw array returned by `puppeteer.sessions()`):** +```json +[ + { + "sessionId": "478f4d7d-e943-40f6-a414-837d3736a1dc", + "startTime": 1711621703708, + "connectionId": "2a2246fa-e234-4dc1-8433-87e6cee80145", + "connectionStartTime": 1711621704607 + }, + { + "sessionId": "565e05fb-4d2a-402b-869b-5b65b1381db7", + "startTime": 1711621703808 + } +] +``` + +**Interpretation:** +- Session `478f4d...` is **occupied** (has connectionId) +- Session `565e05...` is **available** (no connectionId) + +--- + +### puppeteer.history() + +List recent sessions, both open and closed. + +**Signature:** +```typescript +await puppeteer.history(binding: Fetcher): Promise<HistoryEntry[]> +``` + +**Response:** +```typescript +interface HistoryEntry { + sessionId: string; + startTime: number; + endTime?: number; // Present if closed + closeReason?: number; // Numeric close code + closeReasonText?: string; // Human-readable reason +} +``` + +**Close Reasons:** +- `"NormalClosure"` - Explicitly closed with browser.close() +- `"BrowserIdle"` - Timeout due to 60s idle period +- `"Unknown"` - Unexpected closure + +**Example:** +```typescript +const history = await puppeteer.history(env.MYBROWSER); + +history.forEach(entry => { + const duration = entry.endTime + ? 
(entry.endTime - entry.startTime) / 1000 + : 'still running'; + + console.log({ + sessionId: entry.sessionId, + duration: `${duration}s`, + closeReason: entry.closeReasonText || 'N/A' + }); +}); +``` + +**Use Cases:** +- Monitor browser usage patterns +- Debug unexpected closures +- Track session lifetimes +- Estimate costs + +--- + +### puppeteer.limits() + +Check current account limits and session availability. + +**Signature:** +```typescript +await puppeteer.limits(binding: Fetcher): Promise +``` + +**Response:** +```typescript +interface LimitsInfo { + activeSessions: Array<{ id: string }>; + maxConcurrentSessions: number; + allowedBrowserAcquisitions: number; // Can launch now? + timeUntilNextAllowedBrowserAcquisition: number; // ms to wait +} +``` + +**Example:** +```typescript +const limits = await puppeteer.limits(env.MYBROWSER); + +console.log({ + active: limits.activeSessions.length, + max: limits.maxConcurrentSessions, + canLaunch: limits.allowedBrowserAcquisitions > 0, + waitTime: limits.timeUntilNextAllowedBrowserAcquisition +}); +``` + +**Output:** +```json +{ + "activeSessions": [ + { "id": "478f4d7d-e943-40f6-a414-837d3736a1dc" }, + { "id": "565e05fb-4d2a-402b-869b-5b65b1381db7" } + ], + "allowedBrowserAcquisitions": 1, + "maxConcurrentSessions": 10, + "timeUntilNextAllowedBrowserAcquisition": 0 +} +``` + +**Interpretation:** +- 2 sessions currently active +- Maximum 10 concurrent sessions allowed +- Can launch 1 more browser now +- No wait time required + +--- + +### puppeteer.connect() + +Connect to an existing browser session. 
+ +**Signature:** +```typescript +await puppeteer.connect(binding: Fetcher, sessionId: string): Promise<Browser> +``` + +**Example:** +```typescript +const sessions = await puppeteer.sessions(env.MYBROWSER); +const freeSession = sessions.find(s => !s.connectionId); + +if (freeSession) { + try { + const browser = await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + console.log("Connected to existing session:", browser.sessionId()); + } catch (error) { + console.log("Connection failed, session may have closed"); + } +} +``` + +**Error Handling:** +Session may close between `.sessions()` call and `.connect()` call. Always wrap in try-catch. + +--- + +### browser.sessionId() + +Get the current browser's session ID. + +**Signature:** +```typescript +browser.sessionId(): string +``` + +**Example:** +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); +const sessionId = browser.sessionId(); +console.log("Current session:", sessionId); +``` + +--- + +### browser.disconnect() + +Disconnect from browser WITHOUT closing it. + +**Signature:** +```typescript +await browser.disconnect(): Promise<void> +``` + +**When to use:** +- Want to reuse session later +- Keep browser warm for next request +- Reduce cold start times + +**Example:** +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); +const sessionId = browser.sessionId(); + +// Do work +const page = await browser.newPage(); +await page.goto("https://example.com"); + +// Disconnect (keep alive) +await browser.disconnect(); + +// Later: reconnect +const browserAgain = await puppeteer.connect(env.MYBROWSER, sessionId); +``` + +**Important:** +- Browser will still timeout after 60s idle (use `keep_alive` to extend) +- Session remains in your concurrent browser count +- Other workers CAN connect to this session + +--- + +### browser.close() + +Close the browser and terminate the session. 
+ +**Signature:** +```typescript +await browser.close(): Promise<void> +``` + +**When to use:** +- Done with browser completely +- Want to free concurrency slot +- Error occurred during processing + +**Example:** +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); + +try { + // Do work +} catch (error) { + await browser.close(); // Clean up on error + throw error; +} + +await browser.close(); // Normal cleanup +``` + +--- + +## Session Reuse Patterns + +### Pattern 1: Simple Reuse + +```typescript +async function getBrowser(env: Env): Promise<Browser> { + // Try to connect to existing session + const sessions = await puppeteer.sessions(env.MYBROWSER); + const freeSession = sessions.find(s => !s.connectionId); + + if (freeSession) { + try { + return await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + } catch { + // Session closed, launch new one + } + } + + // Launch new browser + return await puppeteer.launch(env.MYBROWSER); +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const browser = await getBrowser(env); + + // Do work + const page = await browser.newPage(); + // ... + + // Disconnect (keep alive) + await browser.disconnect(); + + return response; + } +}; +``` + +--- + +### Pattern 2: Reuse with Limits Check + +```typescript +async function getBrowserSafe(env: Env): Promise<Browser> { + const sessions = await puppeteer.sessions(env.MYBROWSER); + const freeSession = sessions.find(s => !s.connectionId); + + if (freeSession) { + try { + return await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + } catch { + // Continue to launch + } + } + + // Check limits before launching + const limits = await puppeteer.limits(env.MYBROWSER); + + if (limits.allowedBrowserAcquisitions === 0) { + throw new Error( + `Rate limit reached. 
Retry after ${limits.timeUntilNextAllowedBrowserAcquisition}ms` + ); + } + + return await puppeteer.launch(env.MYBROWSER); +} +``` + +--- + +### Pattern 3: Retry with Backoff + +```typescript +async function getBrowserWithRetry( + env: Env, + maxRetries = 3 +): Promise<Browser> { + for (let i = 0; i < maxRetries; i++) { + try { + // Try existing session first + const sessions = await puppeteer.sessions(env.MYBROWSER); + const freeSession = sessions.find(s => !s.connectionId); + + if (freeSession) { + try { + return await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + } catch { + // Continue to launch + } + } + + // Check limits + const limits = await puppeteer.limits(env.MYBROWSER); + + if (limits.allowedBrowserAcquisitions > 0) { + return await puppeteer.launch(env.MYBROWSER); + } + + // Rate limited, wait and retry + if (i < maxRetries - 1) { + // Wait at least as long as the platform requires, and never less than the backoff step + const delay = Math.max( + limits.timeUntilNextAllowedBrowserAcquisition, + Math.pow(2, i) * 1000 // Exponential backoff + ); + await new Promise(resolve => setTimeout(resolve, delay)); + } + } catch (error) { + if (i === maxRetries - 1) throw error; + } + } + + throw new Error("Failed to acquire browser after retries"); +} +``` + +--- + +## Browser Timeout Management + +### Default Timeout + +Browsers close after **60 seconds of inactivity** (no devtools commands). 
+ +**Inactivity means:** +- No `page.goto()` +- No `page.screenshot()` +- No `page.evaluate()` +- No other browser/page operations + +### Extending Timeout with keep_alive + +```typescript +const browser = await puppeteer.launch(env.MYBROWSER, { + keep_alive: 300000 // 5 minutes = 300,000 ms +}); +``` + +**Maximum:** 600,000ms (10 minutes) + +**Use Cases:** +- Long-running scraping workflows +- Multi-step form automation +- Session reuse across multiple requests + +**Cost Impact:** +- Longer keep_alive = more browser hours billed +- Only extend if actually needed + +--- + +## Incognito Browser Contexts + +Use browser contexts to isolate cookies/cache while sharing a browser. + +**Benefits:** +- 1 concurrent browser instead of N +- Separate cookies/cache per context +- Test multi-user scenarios +- Session isolation + +**Example:** +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); + +// Create isolated contexts +const context1 = await browser.createBrowserContext(); +const context2 = await browser.createBrowserContext(); + +// Each context has separate state +const page1 = await context1.newPage(); +const page2 = await context2.newPage(); + +await page1.goto("https://app.example.com"); // User 1 +await page2.goto("https://app.example.com"); // User 2 + +// page1 and page2 have separate cookies +await context1.close(); +await context2.close(); +await browser.close(); +``` + +--- + +## Multiple Tabs vs Multiple Browsers + +### ❌ Bad: Multiple Browsers + +```typescript +// Uses 10 concurrent browsers! 
+for (const url of urls) { + const browser = await puppeteer.launch(env.MYBROWSER); + const page = await browser.newPage(); + await page.goto(url); + await browser.close(); +} +``` + +**Problems:** +- 10x concurrency usage +- 10x cold start delays +- May hit concurrency limits + +--- + +### ✅ Good: Multiple Tabs + +```typescript +// Uses 1 concurrent browser +const browser = await puppeteer.launch(env.MYBROWSER); + +const results = await Promise.all( + urls.map(async (url) => { + const page = await browser.newPage(); + await page.goto(url); + const data = await page.evaluate(() => ({ + title: document.title + })); + await page.close(); + return data; + }) +); + +await browser.close(); +``` + +**Benefits:** +- 1 concurrent browser (10x reduction) +- Faster (no repeated cold starts) +- Cheaper (reduced concurrency charges) + +--- + +## Monitoring and Debugging + +### Log Session Activity + +```typescript +const browser = await puppeteer.launch(env.MYBROWSER); +const sessionId = browser.sessionId(); + +console.log({ + event: "browser_launched", + sessionId, + timestamp: Date.now() +}); + +// Do work + +await browser.disconnect(); + +console.log({ + event: "browser_disconnected", + sessionId, + timestamp: Date.now() +}); +``` + +### Track Session Metrics + +```typescript +interface SessionMetrics { + sessionId: string; + launched: boolean; // true = new, false = reused + duration: number; // ms + operations: number; // page navigations +} + +async function trackSession(env: Env, fn: (browser: Browser) => Promise) { + const start = Date.now(); + const sessions = await puppeteer.sessions(env.MYBROWSER); + const freeSession = sessions.find(s => !s.connectionId); + + let browser: Browser; + let launched: boolean; + + if (freeSession) { + browser = await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + launched = false; + } else { + browser = await puppeteer.launch(env.MYBROWSER); + launched = true; + } + + await fn(browser); + + const metrics: SessionMetrics = { 
+ sessionId: browser.sessionId(), + launched, + duration: Date.now() - start, + operations: 1 // Track actual operations in production + }; + + await browser.disconnect(); + + return metrics; +} +``` + +--- + +## Production Best Practices + +1. **Always Check Limits** + - Call `puppeteer.limits()` before launching + - Handle rate limit errors gracefully + - Implement retry with backoff + +2. **Prefer Session Reuse** + - Try `puppeteer.connect()` first + - Fall back to `puppeteer.launch()` only if needed + - Use `browser.disconnect()` instead of `browser.close()` + +3. **Use Multiple Tabs** + - One browser, many tabs + - Reduces concurrency usage 10-50x + - Faster than multiple browsers + +4. **Set Appropriate Timeouts** + - Default 60s is fine for most use cases + - Extend only if actually needed (keep_alive) + - Remember: longer timeout = more billable hours + +5. **Handle Errors** + - Always `browser.close()` on errors + - Wrap `puppeteer.connect()` in try-catch + - Gracefully handle rate limits + +6. **Monitor Usage** + - Log session IDs + - Track reuse rate + - Monitor concurrency in dashboard + +7. 
**Use Incognito Contexts** + - Isolate sessions while sharing browser + - Better than multiple browsers + - Test multi-user scenarios safely + +--- + +## Cost Optimization + +### Scenario: Screenshot Service (1000 requests/hour) + +**Bad Approach (No Session Reuse):** +- Launch new browser for each request +- 1000 browsers/hour +- Average session: 5 seconds +- Browser hours: (1000 * 5) / 3600 = 1.39 hours +- Average concurrency: ~14 browsers +- **Cost**: 1.39 hours = $0.13 + (14-10) * $2 = $8.13/hour + +**Good Approach (Session Reuse):** +- Maintain pool of 3-5 warm browsers +- Reuse sessions across requests +- Average session: 1 hour (keep_alive) +- Browser hours: 5 hours (5 browsers * 1 hour) +- Average concurrency: 5 browsers +- **Cost**: 5 hours = $0.45/hour + +**Savings: 94%** ($8.13 → $0.45) + +--- + +## Common Issues + +### Issue: "Failed to connect to session" + +**Cause:** Session closed between `.sessions()` and `.connect()` calls + +**Solution:** +```typescript +const freeSession = sessions.find(s => !s.connectionId); +if (freeSession) { + try { + return await puppeteer.connect(env.MYBROWSER, freeSession.sessionId); + } catch (error) { + console.log("Session closed, launching new browser"); + return await puppeteer.launch(env.MYBROWSER); + } +} +``` + +### Issue: Sessions timing out too quickly + +**Cause:** Default 60s idle timeout + +**Solution:** Extend with keep_alive: +```typescript +const browser = await puppeteer.launch(env.MYBROWSER, { + keep_alive: 300000 // 5 minutes +}); +``` + +### Issue: Rate limit reached + +**Cause:** Too many concurrent browsers or launches per minute + +**Solution:** Check limits before launching: +```typescript +const limits = await puppeteer.limits(env.MYBROWSER); +if (limits.allowedBrowserAcquisitions === 0) { + return new Response("Rate limit reached", { status: 429 }); +} +``` + +--- + +## Reference + +- **Official Docs**: https://developers.cloudflare.com/browser-rendering/workers-bindings/reuse-sessions/ +- 
**Limits**: https://developers.cloudflare.com/browser-rendering/platform/limits/ +- **Pricing**: https://developers.cloudflare.com/browser-rendering/platform/pricing/ + +--- + +**Last Updated**: 2025-10-22 diff --git a/scripts/check-versions.sh b/scripts/check-versions.sh new file mode 100755 index 0000000..8238010 --- /dev/null +++ b/scripts/check-versions.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# check-versions.sh +# Verify package versions for Cloudflare Browser Rendering skill + +set -e + +echo "Checking Cloudflare Browser Rendering package versions..." +echo "" + +# Function to check package version +check_package() { + local package=$1 + local current=$2 + + echo "📦 $package" + echo " Current in skill: $current" + + if command -v npm &> /dev/null; then + latest=$(npm view $package version 2>/dev/null || echo "N/A") + echo " Latest on npm: $latest" + + if [ "$current" != "$latest" ] && [ "$latest" != "N/A" ]; then + echo " ⚠️ Update available!" + else + echo " ✅ Up to date" + fi + else + echo " ⚠️ npm not found, skipping latest version check" + fi + + echo "" +} + +echo "=== Core Packages ===" +echo "" + +check_package "@cloudflare/puppeteer" "1.0.4" +check_package "@cloudflare/playwright" "1.0.0" + +echo "=== Related Packages ===" +echo "" + +check_package "wrangler" "4.43.0" +check_package "@cloudflare/workers-types" "4.20251014.0" + +echo "=== Verification Complete ===" +echo "" +echo "To update a package version in this skill:" +echo "1. Update the version in SKILL.md" +echo "2. Update templates if API changes" +echo "3. Test all template files" +echo "4. Update 'Last Updated' date" +echo "5. 
Commit changes" diff --git a/templates/ai-enhanced-scraper.ts b/templates/ai-enhanced-scraper.ts new file mode 100644 index 0000000..ecc995e --- /dev/null +++ b/templates/ai-enhanced-scraper.ts @@ -0,0 +1,139 @@ +// AI-Enhanced Web Scraper +// Combine Browser Rendering with Workers AI to extract structured data intelligently + +import puppeteer from "@cloudflare/puppeteer"; + +interface Env { + MYBROWSER: Fetcher; + AI: Ai; +} + +interface ProductData { + name: string; + price: string; + description: string; + availability: string; + [key: string]: any; +} + +export default { + async fetch(request: Request, env: Env): Promise { + const { searchParams } = new URL(request.url); + const url = searchParams.get("url"); + + if (!url) { + return new Response("Missing ?url parameter", { status: 400 }); + } + + // Step 1: Scrape page content with browser + const browser = await puppeteer.launch(env.MYBROWSER); + + try { + const page = await browser.newPage(); + + await page.goto(url, { + waitUntil: "networkidle0", + timeout: 30000, + }); + + // Extract raw HTML content + const bodyContent = await page.$eval("body", (el) => el.innerHTML); + + await browser.close(); + + // Truncate to fit AI context (4000 chars) + const truncatedContent = bodyContent.slice(0, 4000); + + // Step 2: Extract structured data with AI + const aiResponse = await env.AI.run("@cf/meta/llama-3.1-8b-instruct", { + messages: [ + { + role: "system", + content: + "You are a data extraction assistant. Extract product information from HTML and return ONLY valid JSON.", + }, + { + role: "user", + content: `Extract product information from this HTML. Return JSON with these fields: name, price, description, availability. 
If any field is not found, use empty string.\n\nHTML:\n${truncatedContent}`, + }, + ], + stream: false, + }); + + // Parse AI response + let productData: ProductData; + try { + const responseText = (aiResponse as any).response; + // Try to extract JSON from response (AI might wrap it in markdown) + const jsonMatch = responseText.match(/\{[\s\S]*\}/); + if (jsonMatch) { + productData = JSON.parse(jsonMatch[0]); + } else { + productData = JSON.parse(responseText); + } + } catch { + productData = { + name: "", + price: "", + description: "", + availability: "", + raw: (aiResponse as any).response, + }; + } + + return Response.json({ + url, + product: productData, + extractedAt: new Date().toISOString(), + }); + } catch (error) { + await browser.close(); + return Response.json( + { + error: error instanceof Error ? error.message : "AI-enhanced scraping failed", + }, + { status: 500 } + ); + } + }, +}; + +/** + * Setup: + * Add AI binding to wrangler.jsonc: + * { + * "browser": { "binding": "MYBROWSER" }, + * "ai": { "binding": "AI" }, + * "compatibility_flags": ["nodejs_compat"] + * } + * + * Usage: + * GET /?url=https://example.com/product + * + * Response: + * { + * "url": "https://example.com/product", + * "product": { + * "name": "Example Product", + * "price": "$99.99", + * "description": "Product description...", + * "availability": "In Stock" + * }, + * "extractedAt": "2025-10-22T12:34:56.789Z" + * } + * + * Benefits: + * - No need to write custom CSS selectors for each site + * - AI adapts to different page structures + * - Extracts semantic information, not just raw HTML + * - Handles variations in HTML structure + * + * Limitations: + * - AI context limited to ~4000 chars of HTML + * - May hallucinate if data not present + * - Requires AI binding (uses neurons quota) + * + * See also: + * - cloudflare-workers-ai skill for more AI patterns + * - web-scraper-basic.ts for traditional CSS selector approach + */ diff --git a/templates/basic-screenshot.ts 
b/templates/basic-screenshot.ts
new file mode 100644
index 0000000..59fb163
--- /dev/null
+++ b/templates/basic-screenshot.ts
@@ -0,0 +1,82 @@
+// Basic Screenshot Example
+// Minimal example for taking screenshots with Cloudflare Browser Rendering
+
+import puppeteer from "@cloudflare/puppeteer";
+
+interface Env {
+  MYBROWSER: Fetcher;
+}
+
+export default {
+  /**
+   * Render the page given by ?url= and return it as a full-page PNG.
+   * Responds 400 when the parameter is missing, malformed, or not http(s);
+   * navigation and screenshot errors propagate to the runtime.
+   */
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { searchParams } = new URL(request.url);
+    const url = searchParams.get("url");
+
+    if (!url) {
+      return new Response("Missing ?url parameter. Example: ?url=https://example.com", {
+        status: 400,
+      });
+    }
+
+    let normalizedUrl: string;
+    try {
+      const parsed = new URL(url);
+      // Only web pages are valid targets; reject file:, data:, javascript:, ...
+      if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
+        return new Response("Invalid URL", { status: 400 });
+      }
+      normalizedUrl = parsed.toString();
+    } catch {
+      return new Response("Invalid URL", { status: 400 });
+    }
+
+    // Launch browser
+    const browser = await puppeteer.launch(env.MYBROWSER);
+
+    try {
+      // Create new page
+      const page = await browser.newPage();
+
+      // Navigate to URL
+      await page.goto(normalizedUrl, {
+        waitUntil: "networkidle0", // Wait for network to be idle
+        timeout: 30000, // 30 second timeout
+      });
+
+      // Take screenshot
+      const screenshot = await page.screenshot({
+        fullPage: true, // Capture full scrollable page
+        type: "png",
+      });
+
+      return new Response(screenshot, {
+        headers: {
+          "content-type": "image/png",
+          "cache-control": "public, max-age=3600", // Cache for 1 hour
+        },
+      });
+    } finally {
+      // Always release the browser session, on success and on error
+      await browser.close();
+    }
+  },
+};
+
+/**
+ * Deploy:
+ *   npx wrangler deploy
+ *
+ * Test:
+ *   https://your-worker.workers.dev/?url=https://example.com
+ *
+ * Configuration (wrangler.jsonc):
+ * {
+ *   "browser": { "binding": "MYBROWSER" },
+ *   "compatibility_flags": ["nodejs_compat"]
+ * }
+ */
diff --git a/templates/pdf-generation.ts b/templates/pdf-generation.ts
new file mode 100644
index 0000000..d68bfd8
--- /dev/null
+++ b/templates/pdf-generation.ts
@@ -0,0 +1,152 @@
+// PDF Generation
+// Generate PDFs from URLs or custom HTML content
+
+import puppeteer from "@cloudflare/puppeteer";
+
+interface Env {
+  MYBROWSER: Fetcher;
+}
+
+interface PDFRequest {
+  url?: string;
+  html?: string;
+  options?: {
+    format?: "Letter" | "A4" | "A3" | "Legal";
+    landscape?: boolean;
+    margin?: {
+      top?: string;
+      right?: string;
+      bottom?: string;
+      left?: string;
+    };
+  };
+}
+
+export default {
+  /**
+   * Render a PDF from the "html" or "url" field of a JSON POST body
+   * ("html" wins when both are given). Responds 405 for other methods,
+   * 400 for an unusable body, and 500 with a JSON error when rendering fails.
+   */
+  async fetch(request: Request, env: Env): Promise<Response> {
+    if (request.method !== "POST") {
+      return new Response("Method not allowed. Use POST with JSON body.", {
+        status: 405,
+      });
+    }
+
+    // A malformed body is a client error, not a crash
+    let body: PDFRequest;
+    try {
+      body = (await request.json<PDFRequest | null>()) ?? {};
+    } catch {
+      return new Response("Request body must be valid JSON", { status: 400 });
+    }
+
+    const { url, html } = body;
+    const options: NonNullable<PDFRequest["options"]> = body.options ?? {}; // also covers an explicit null
+
+    if (!url && !html) {
+      return new Response('Missing "url" or "html" in request body', {
+        status: 400,
+      });
+    }
+
+    // Parse the URL up front: it is the navigation target and names the download
+    let target: URL | undefined;
+    if (url) {
+      try {
+        target = new URL(url);
+      } catch {
+        return new Response('Invalid "url" in request body', { status: 400 });
+      }
+    }
+
+    try {
+      const browser = await puppeteer.launch(env.MYBROWSER);
+
+      try {
+        const page = await browser.newPage();
+
+        // Load content
+        if (html) {
+          await page.setContent(html, { waitUntil: "networkidle0" });
+        } else if (target) {
+          await page.goto(target.toString(), {
+            waitUntil: "networkidle0",
+            timeout: 30000,
+          });
+        }
+
+        // Generate PDF
+        const pdf = await page.pdf({
+          format: options.format || "A4",
+          landscape: options.landscape ?? false,
+          printBackground: true, // Include background colors/images
+          margin: options.margin ?? {
+            top: "1cm",
+            right: "1cm",
+            bottom: "1cm",
+            left: "1cm",
+          },
+        });
+
+        // Generate filename (hostname reduced to filename-safe characters)
+        const filename = target
+          ? `${target.hostname.replace(/[^a-zA-Z0-9-]/g, "_")}.pdf`
+          : "document.pdf";
+
+        return new Response(pdf, {
+          headers: {
+            "content-type": "application/pdf",
+            "content-disposition": `attachment; filename="${filename}"`,
+          },
+        });
+      } finally {
+        // Close exactly once, on success and on error
+        await browser.close();
+      }
+    } catch (error) {
+      return new Response(
+        JSON.stringify({
+          error: error instanceof Error ? error.message : "PDF generation failed",
+        }),
+        {
+          status: 500,
+          headers: { "content-type": "application/json" },
+        }
+      );
+    }
+  },
+};
+
+/**
+ * Usage Examples:
+ *
+ * 1.
PDF from URL: + * POST / + * Content-Type: application/json + * { + * "url": "https://example.com" + * } + * + * 2. PDF from custom HTML: + * POST / + * { + * "html": "

<h1>Invoice</h1>

"
+ *    }
+ *
+ * 3. PDF with custom options:
+ *    POST /
+ *    {
+ *      "url": "https://example.com",
+ *      "options": {
+ *        "format": "Letter",
+ *        "landscape": true,
+ *        "margin": {
+ *          "top": "2cm",
+ *          "bottom": "2cm"
+ *        }
+ *      }
+ *    }
+ */
diff --git a/templates/playwright-example.ts b/templates/playwright-example.ts
new file mode 100644
index 0000000..24b3adb
--- /dev/null
+++ b/templates/playwright-example.ts
@@ -0,0 +1,116 @@
+// Playwright Example
+// Alternative to Puppeteer using @cloudflare/playwright
+
+import { chromium } from "@cloudflare/playwright";
+
+interface Env {
+  BROWSER: Fetcher;
+}
+
+export default {
+  /**
+   * Screenshot the page given by ?url= (default: https://example.com).
+   * Responds 400 for a malformed or non-http(s) URL and 500 with a JSON
+   * error when launching, navigating, or capturing fails.
+   */
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { searchParams } = new URL(request.url);
+    const url = searchParams.get("url") || "https://example.com";
+
+    let targetUrl: string;
+    try {
+      const parsed = new URL(url);
+      if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
+        return new Response("Only http(s) URLs are supported", { status: 400 });
+      }
+      targetUrl = parsed.toString();
+    } catch {
+      return new Response("Invalid URL", { status: 400 });
+    }
+
+    try {
+      // Launch browser (note: chromium.launch instead of puppeteer.launch)
+      const browser = await chromium.launch(env.BROWSER);
+
+      try {
+        // Create new page
+        const page = await browser.newPage();
+
+        // Navigate to URL
+        await page.goto(targetUrl, {
+          waitUntil: "networkidle",
+          timeout: 30000,
+        });
+
+        // Take screenshot
+        const screenshot = await page.screenshot({
+          fullPage: true,
+          type: "png",
+        });
+
+        return new Response(screenshot, {
+          headers: {
+            "content-type": "image/png",
+            "cache-control": "public, max-age=3600",
+          },
+        });
+      } finally {
+        // Clean up exactly once, on success and on error
+        await browser.close();
+      }
+    } catch (error) {
+      return new Response(
+        JSON.stringify({
+          error: error instanceof Error ? error.message : "Screenshot failed",
+        }),
+        {
+          status: 500,
+          headers: { "content-type": "application/json" },
+        }
+      );
+    }
+  },
+};
+
+/**
+ * Playwright vs Puppeteer:
+ *
+ * Similarities:
+ * - Very similar API (page.goto, page.screenshot, etc.)
+ * - Both support Chromium on Workers + * - Same use cases (screenshots, PDFs, scraping) + * + * Differences: + * + * | Feature | Puppeteer | Playwright | + * |---------|-----------|------------| + * | Import | `import puppeteer from "@cloudflare/puppeteer"` | `import { chromium } from "@cloudflare/playwright"` | + * | Launch | `puppeteer.launch(env.MYBROWSER)` | `chromium.launch(env.BROWSER)` | + * | Session Management | ✅ Advanced (sessions, history, limits) | ⚠️ Basic | + * | Auto-waiting | Manual waitForSelector() | Built-in auto-waiting | + * | Selectors | CSS only | CSS, text, XPath (via workaround) | + * | Version | @cloudflare/puppeteer@1.0.4 | @cloudflare/playwright@1.0.0 | + * + * When to use Playwright: + * - Already using Playwright for testing + * - Prefer auto-waiting behavior + * - Don't need advanced session management + * + * When to use Puppeteer: + * - Need session reuse for performance + * - Want to check limits before launching + * - More familiar with Puppeteer API + * + * Installation: + * npm install @cloudflare/playwright + * + * Configuration (wrangler.jsonc): + * { + * "browser": { "binding": "BROWSER" }, + * "compatibility_flags": ["nodejs_compat"] + * } + * + * Recommendation: + * Stick with Puppeteer for most use cases unless you have + * existing Playwright tests to migrate. 
+ */
diff --git a/templates/screenshot-with-kv-cache.ts b/templates/screenshot-with-kv-cache.ts
new file mode 100644
index 0000000..2eed7c3
--- /dev/null
+++ b/templates/screenshot-with-kv-cache.ts
@@ -0,0 +1,142 @@
+// Screenshot with KV Caching
+// Production-ready screenshot service with KV caching to reduce browser usage
+
+import puppeteer from "@cloudflare/puppeteer";
+
+interface Env {
+  MYBROWSER: Fetcher;
+  SCREENSHOT_CACHE: KVNamespace;
+}
+
+/**
+ * Derive a fixed-length KV key from a URL.
+ * KV keys are limited to 512 bytes, which long URLs can exceed, so the
+ * SHA-256 hex digest of the URL is used instead of the URL itself.
+ */
+async function cacheKeyFor(url: string): Promise<string> {
+  const digest = await crypto.subtle.digest("SHA-256", new TextEncoder().encode(url));
+  return Array.from(new Uint8Array(digest), (byte) =>
+    byte.toString(16).padStart(2, "0")
+  ).join("");
+}
+
+/** Capture a full-page PNG of `url`, closing the browser whether or not it succeeds. */
+async function captureScreenshot(env: Env, url: string) {
+  const browser = await puppeteer.launch(env.MYBROWSER);
+
+  try {
+    const page = await browser.newPage();
+
+    await page.goto(url, {
+      waitUntil: "networkidle0",
+      timeout: 30000,
+    });
+
+    return await page.screenshot({
+      fullPage: true,
+      type: "png",
+    });
+  } finally {
+    await browser.close();
+  }
+}
+
+export default {
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { searchParams } = new URL(request.url);
+    const url = searchParams.get("url");
+    const refresh = searchParams.get("refresh") === "true";
+
+    if (!url) {
+      return new Response("Missing ?url parameter", { status: 400 });
+    }
+
+    // new URL() throws on malformed input; answer 400 instead of crashing
+    let normalizedUrl: string;
+    try {
+      const parsed = new URL(url);
+      if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
+        return new Response("Only http(s) URLs are supported", { status: 400 });
+      }
+      normalizedUrl = parsed.toString();
+    } catch {
+      return new Response("Invalid URL", { status: 400 });
+    }
+
+    try {
+      const cacheKey = await cacheKeyFor(normalizedUrl);
+
+      // Check cache (unless refresh requested)
+      if (!refresh) {
+        const cached = await env.SCREENSHOT_CACHE.get(cacheKey, {
+          type: "arrayBuffer",
+        });
+
+        if (cached) {
+          return new Response(cached, {
+            headers: {
+              "content-type": "image/png",
+              "x-cache": "HIT",
+              "cache-control": "public, max-age=3600",
+            },
+          });
+        }
+      }
+
+      // Generate screenshot
+      const screenshot = await captureScreenshot(env, normalizedUrl);
+
+      // Cache for 24 hours. A failed write (for example an image above the KV
+      // value size limit) must not turn a successful render into an error.
+      try {
+        await env.SCREENSHOT_CACHE.put(cacheKey, screenshot, {
+          expirationTtl: 60 * 60 * 24, // 24 hours
+        });
+      } catch (cacheError) {
+        console.error("Failed to cache screenshot:", cacheError);
+      }
+
+      return new Response(screenshot, {
+        headers: {
+          "content-type": "image/png",
+          "x-cache": "MISS",
+          "cache-control": "public, max-age=3600",
+        },
+      });
+    } catch (error) {
+      return new Response(
+        JSON.stringify({
+          error: error instanceof Error ? error.message : "Screenshot failed",
+        }),
+        {
+          status: 500,
+          headers: { "content-type": "application/json" },
+        }
+      );
+    }
+  },
+};
+
+/**
+ * Setup:
+ * 1. Create KV namespace:
+ *    npx wrangler kv namespace create SCREENSHOT_CACHE
+ *    npx wrangler kv namespace create SCREENSHOT_CACHE --preview
+ *
+ * 2. Add to wrangler.jsonc:
+ *    {
+ *      "browser": { "binding": "MYBROWSER" },
+ *      "compatibility_flags": ["nodejs_compat"],
+ *      "kv_namespaces": [
+ *        {
+ *          "binding": "SCREENSHOT_CACHE",
+ *          "id": "YOUR_KV_ID",
+ *          "preview_id": "YOUR_PREVIEW_ID"
+ *        }
+ *      ]
+ *    }
+ *
+ * Usage:
+ * New screenshot: ?url=https://example.com
+ * Force refresh: ?url=https://example.com&refresh=true
+ */
diff --git a/templates/session-reuse.ts b/templates/session-reuse.ts
new file mode 100644
index 0000000..847f331
--- /dev/null
+++ b/templates/session-reuse.ts
@@ -0,0 +1,143 @@
+// Session Reuse Pattern
+// Optimize performance by reusing browser sessions instead of launching new ones
+
+import puppeteer, { Browser } from "@cloudflare/puppeteer";
+
+interface Env {
+  MYBROWSER: Fetcher;
+}
+
+/** Thrown when no new browser may be acquired yet; carries the wait in milliseconds. */
+class RateLimitError extends Error {
+  constructor(readonly retryAfterMs: number) {
+    super(`Rate limit reached. Retry after ${retryAfterMs}ms`);
+    this.name = "RateLimitError";
+  }
+}
+
+/**
+ * Get or create a browser instance
+ * Tries every idle session in turn (another request may claim one first)
+ * and launches a new browser only when none can be attached.
+ */
+async function getBrowser(env: Env): Promise<{ browser: Browser; launched: boolean }> {
+  // Check for available sessions
+  const sessions = await puppeteer.sessions(env.MYBROWSER);
+
+  // Find sessions without active connections
+  const freeSessions = sessions.filter((s) => !s.connectionId);
+
+  for (const { sessionId } of freeSessions) {
+    try {
+      console.log("Connecting to existing session:", sessionId);
+      const browser = await puppeteer.connect(env.MYBROWSER, sessionId);
+      return { browser, launched: false };
+    } catch (error) {
+      console.log("Failed to connect, trying next option:", error);
+    }
+  }
+
+  // Check limits before launching
+  const limits = await puppeteer.limits(env.MYBROWSER);
+  if (limits.allowedBrowserAcquisitions === 0) {
+    throw new RateLimitError(limits.timeUntilNextAllowedBrowserAcquisition);
+  }
+
+  // Launch new session
+  console.log("Launching new browser session");
+  const browser = await puppeteer.launch(env.MYBROWSER);
+  return { browser, launched: true };
+}
+
+export default {
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { searchParams } = new URL(request.url);
+    const url = searchParams.get("url") || "https://example.com";
+
+    try {
+      // Get or create browser
+      const { browser, launched } = await getBrowser(env);
+
+      try {
+        const sessionId = browser.sessionId();
+
+        console.log({
+          sessionId,
+          launched,
+          message: launched ? "New browser launched" : "Reused existing session",
+        });
+
+        // Do work
+        const page = await browser.newPage();
+
+        try {
+          await page.goto(url, {
+            waitUntil: "networkidle0",
+            timeout: 30000,
+          });
+
+          const screenshot = await page.screenshot();
+
+          return new Response(screenshot, {
+            headers: {
+              "content-type": "image/png",
+              "x-session-id": sessionId,
+              "x-session-reused": launched ? "false" : "true",
+            },
+          });
+        } finally {
+          // Close the tab even on failure so reused sessions do not accumulate pages
+          await page.close();
+        }
+      } finally {
+        // IMPORTANT: Disconnect (don't close) to keep session alive for reuse.
+        // Done on failure too, so an errored request releases its connection.
+        await browser.disconnect();
+      }
+    } catch (error) {
+      if (error instanceof RateLimitError) {
+        return new Response(JSON.stringify({ error: error.message }), {
+          status: 429,
+          headers: {
+            "content-type": "application/json",
+            "retry-after": String(Math.ceil(error.retryAfterMs / 1000)),
+          },
+        });
+      }
+
+      return new Response(
+        JSON.stringify({
+          error: error instanceof Error ? error.message : "Unknown error",
+        }),
+        {
+          status: 500,
+          headers: { "content-type": "application/json" },
+        }
+      );
+    }
+  },
+};
+
+/**
+ * Key Concepts:
+ *
+ * 1. puppeteer.sessions() - List all active sessions
+ * 2. puppeteer.connect() - Connect to existing session
+ * 3. browser.disconnect() - Disconnect WITHOUT closing (keeps session alive)
+ * 4. browser.close() - Terminate session completely
+ * 5. puppeteer.limits() - Check rate limits before launching
+ *
+ * Benefits:
+ * - Faster response times (no cold start)
+ * - Lower concurrency usage
+ * - Better resource utilization
+ *
+ * Trade-offs:
+ * - Sessions time out after 60s idle (extend with keep_alive)
+ * - Must handle connection failures gracefully
+ * - Need to track which sessions are available
+ *
+ * Response Headers:
+ * - x-session-id: Browser session ID
+ * - x-session-reused: true if reused existing session
+ */
diff --git a/templates/web-scraper-basic.ts b/templates/web-scraper-basic.ts
new file mode 100644
index 0000000..46122dc
--- /dev/null
+++ b/templates/web-scraper-basic.ts
@@ -0,0 +1,129 @@
+// Basic Web Scraper
+// Extract structured data from web pages
+
+import puppeteer from "@cloudflare/puppeteer";
+
+interface Env {
+  MYBROWSER: Fetcher;
+}
+
+interface ScrapedData {
+  url: string;
+  title: string;
+  description: string;
+  headings: string[];
+  links: Array<{ text: string; href: string }>;
+  images: Array<{ alt: string; src: string }>;
+  timestamp: string;
+}
+
+export default {
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { searchParams } = new URL(request.url);
+    const url = searchParams.get("url");
+
+    if (!url) {
+      return new Response("Missing ?url parameter", { status: 400 });
+    }
+
+    // new URL() throws on malformed input; answer 400 instead of crashing
+    let normalizedUrl: string;
+    try {
+      const parsed = new URL(url);
+      if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
+        return new Response("Only http(s) URLs are supported", { status: 400 });
+      }
+      normalizedUrl = parsed.toString();
+    } catch {
+      return new Response("Invalid URL", { status: 400 });
+    }
+
+    try {
+      const browser = await puppeteer.launch(env.MYBROWSER);
+
+      try {
+        const page = await browser.newPage();
+
+        // Navigate to page
+        await page.goto(normalizedUrl, {
+          waitUntil: "networkidle0",
+          timeout: 30000,
+        });
+
+        // Wait for body to be present
+        await page.waitForSelector("body");
+
+        // Extract structured data (this callback runs inside the page)
+        const data: ScrapedData = await page.evaluate(() => {
+          // Get all headings
+          const headings = Array.from(document.querySelectorAll("h1, h2, h3")).map(
+            (el) => el.textContent?.trim() || ""
+          );
+
+          // Get all links
+          const links = Array.from(document.querySelectorAll("a"))
+            .filter((a) => a.href)
+            .map((a) => ({
+              text: a.textContent?.trim() || "",
+              href: a.href,
+            }))
+            .slice(0, 50); // Limit to first 50 links
+
+          // Get all images
+          const images = Array.from(document.querySelectorAll("img"))
+            .filter((img) => img.src)
+            .map((img) => ({
+              alt: img.alt || "",
+              src: img.src,
+            }))
+            .slice(0, 20); // Limit to first 20 images
+
+          return {
+            url: window.location.href,
+            title: document.title,
+            description:
+              document.querySelector('meta[name="description"]')?.getAttribute("content") ||
+              "",
+            headings,
+            links,
+            images,
+            timestamp: new Date().toISOString(),
+          };
+        });
+
+        return Response.json(data, {
+          headers: {
+            "cache-control": "public, max-age=3600",
+          },
+        });
+      } finally {
+        // Close exactly once, on success and on error
+        await browser.close();
+      }
+    } catch (error) {
+      return Response.json(
+        {
+          error: error instanceof Error ? error.message : "Scraping failed",
+          url: normalizedUrl,
+        },
+        { status: 500 }
+      );
+    }
+  },
+};
+
+/**
+ * Usage:
+ * GET /?url=https://example.com
+ *
+ * Response:
+ * {
+ *   "url": "https://example.com",
+ *   "title": "Example Domain",
+ *   "description": "...",
+ *   "headings": ["Example Domain"],
+ *   "links": [{ "text": "More information...", "href": "..." }],
+ *   "images": [],
+ *   "timestamp": "2025-10-22T12:34:56.789Z"
+ * }
+ */
diff --git a/templates/web-scraper-batch.ts b/templates/web-scraper-batch.ts
new file mode 100644
index 0000000..7dfa9cd
--- /dev/null
+++ b/templates/web-scraper-batch.ts
@@ -0,0 +1,166 @@
+// Batch Web Scraper
+// Scrape multiple URLs efficiently using browser tabs
+
+import puppeteer, { Browser } from "@cloudflare/puppeteer";
+
+interface Env {
+  MYBROWSER: Fetcher;
+}
+
+interface ScrapeResult {
+  url: string;
+  success: boolean;
+  data?: {
+    title: string;
+    description: string;
+    textContent: string; // First 500 chars
+  };
+  error?: string;
+}
+
+// Largest number of URLs accepted per request
+const MAX_BATCH_SIZE = 20;
+
+/**
+ * Scrape one URL in its own tab.
+ * Never rejects: any failure, including failing to open the tab, becomes a
+ * `success: false` result so one bad URL cannot fail the whole batch.
+ */
+async function scrapePage(browser: Browser, url: string): Promise<ScrapeResult> {
+  let page: Awaited<ReturnType<Browser["newPage"]>> | undefined;
+
+  try {
+    page = await browser.newPage();
+
+    await page.goto(url, {
+      waitUntil: "networkidle0",
+      timeout: 30000,
+    });
+
+    const data = await page.evaluate(() => ({
+      title: document.title,
+      description:
+        document.querySelector('meta[name="description"]')?.getAttribute("content") ||
+        "",
+      textContent: document.body.innerText.slice(0, 500), // First 500 chars
+    }));
+
+    return {
+      url,
+      success: true,
+      data,
+    };
+  } catch (error) {
+    return {
+      url,
+      success: false,
+      error: error instanceof Error ? error.message : "Unknown error",
+    };
+  } finally {
+    // Best effort: a tab that fails to close must not discard the result
+    await page?.close().catch(() => undefined);
+  }
+}
+
+export default {
+  async fetch(request: Request, env: Env): Promise<Response> {
+    if (request.method !== "POST") {
+      return new Response("Method not allowed. Use POST with JSON body.", {
+        status: 405,
+      });
+    }
+
+    // A malformed body is a client error, not a crash
+    let payload: unknown;
+    try {
+      payload = await request.json();
+    } catch {
+      return new Response("Request body must be valid JSON", { status: 400 });
+    }
+
+    const urls: unknown =
+      typeof payload === "object" && payload !== null && "urls" in payload
+        ? payload.urls
+        : undefined;
+
+    if (!Array.isArray(urls) || urls.length === 0) {
+      return new Response('Missing "urls" array in request body', {
+        status: 400,
+      });
+    }
+
+    // Limit batch size
+    if (urls.length > MAX_BATCH_SIZE) {
+      return new Response(`Maximum ${MAX_BATCH_SIZE} URLs per batch`, { status: 400 });
+    }
+
+    const targets = urls.filter((entry): entry is string => typeof entry === "string");
+    if (targets.length !== urls.length) {
+      return new Response('"urls" must contain only strings', { status: 400 });
+    }
+
+    try {
+      // Launch single browser
+      const browser = await puppeteer.launch(env.MYBROWSER);
+      let results: ScrapeResult[];
+
+      try {
+        // Scrape all URLs in parallel (each in its own tab)
+        results = await Promise.all(targets.map((target) => scrapePage(browser, target)));
+      } finally {
+        // Close exactly once, on success and on error
+        await browser.close();
+      }
+
+      const summary = {
+        total: results.length,
+        successful: results.filter((r) => r.success).length,
+        failed: results.filter((r) => !r.success).length,
+      };
+
+      return Response.json({
+        summary,
+        results,
+      });
+    } catch (error) {
+      return Response.json(
+        {
+          error: error instanceof Error ? error.message : "Batch scraping failed",
+        },
+        { status: 500 }
+      );
+    }
+  },
+};
+
+/**
+ * Usage:
+ * POST /
+ * Content-Type: application/json
+ * {
+ *   "urls": [
+ *     "https://example.com",
+ *     "https://example.org",
+ *     "https://example.net"
+ *   ]
+ * }
+ *
+ * Response:
+ * {
+ *   "summary": {
+ *     "total": 3,
+ *     "successful": 3,
+ *     "failed": 0
+ *   },
+ *   "results": [
+ *     {
+ *       "url": "https://example.com",
+ *       "success": true,
+ *       "data": { "title": "...", "description": "...", "textContent": "..." }
+ *     }
+ *   ]
+ * }
+ *
+ * Note: Uses 1 browser with multiple tabs instead of multiple browsers.
+ * This reduces concurrency usage and is more efficient.
+ */ diff --git a/templates/wrangler-browser-config.jsonc b/templates/wrangler-browser-config.jsonc new file mode 100644 index 0000000..e564e61 --- /dev/null +++ b/templates/wrangler-browser-config.jsonc @@ -0,0 +1,116 @@ +// Complete wrangler.jsonc configuration for Browser Rendering +{ + "name": "browser-worker", + "main": "src/index.ts", + "compatibility_date": "2023-03-14", + + // REQUIRED: nodejs_compat flag for Browser Rendering + "compatibility_flags": [ + "nodejs_compat" + ], + + // Browser binding (required) + "browser": { + "binding": "MYBROWSER" + // Optional: Use real headless browser during local development + // "remote": true + }, + + // Optional: KV for caching screenshots/PDFs + // Create with: npx wrangler kv namespace create SCREENSHOT_CACHE + // npx wrangler kv namespace create SCREENSHOT_CACHE --preview + "kv_namespaces": [ + { + "binding": "SCREENSHOT_CACHE", + "id": "YOUR_KV_ID", // Replace with actual ID + "preview_id": "YOUR_PREVIEW_ID" // Replace with actual preview ID + } + ], + + // Optional: R2 for storing generated files + // Create with: npx wrangler r2 bucket create browser-files + "r2_buckets": [ + { + "binding": "BROWSER_FILES", + "bucket_name": "browser-files" + } + ], + + // Optional: AI binding for AI-enhanced scraping + "ai": { + "binding": "AI" + }, + + // Optional: D1 for storing scraping results + // Create with: npx wrangler d1 create browser-db + "d1_databases": [ + { + "binding": "DB", + "database_name": "browser-db", + "database_id": "YOUR_DB_ID" + } + ], + + // Optional: Environment variables + "vars": { + "ENVIRONMENT": "production" + }, + + // Optional: Secrets (set with: npx wrangler secret put SECRET_NAME) + // "secrets": ["API_KEY"] + + // Optional: Custom routes for production + // "routes": [ + // { + // "pattern": "browser.example.com/*", + // "zone_name": "example.com" + // } + // ] +} + +/** + * Key Configuration Notes: + * + * 1. 
nodejs_compat flag is REQUIRED + * - Browser Rendering needs Node.js APIs + * - Automatically enables nodejs_compat_v2 if compatibility_date >= 2024-09-23 + * + * 2. Browser binding name + * - Use "MYBROWSER" or any name you prefer + * - Reference in code: env.MYBROWSER + * + * 3. Remote binding for local development + * - "remote": true connects to real headless browser + * - Useful if hitting 1MB request limit in local dev + * - Remove for production (not needed) + * + * 4. KV for caching + * - Highly recommended for production screenshot services + * - Reduces browser usage and costs + * - Cache TTL: typically 1-24 hours + * + * 5. R2 for file storage + * - Store generated PDFs or screenshots long-term + * - Cheaper than KV for large files + * - Use presigned URLs for downloads + * + * 6. AI binding + * - Optional: for AI-enhanced scraping + * - Requires Workers Paid plan + * - See cloudflare-workers-ai skill + * + * 7. D1 database + * - Optional: store scraping metadata + * - Track URLs, timestamps, status + * - See cloudflare-d1 skill + * + * Commands: + * npx wrangler dev # Local development + * npx wrangler deploy # Deploy to production + * npx wrangler tail # View logs + * + * See also: + * - cloudflare-worker-base skill for complete Worker setup + * - cloudflare-kv skill for KV caching patterns + * - cloudflare-r2 skill for R2 storage patterns + */