Files
gh-jezweb-claude-skills-ski…/templates/web-scraper-basic.ts
2025-11-30 08:24:08 +08:00

117 lines
2.9 KiB
TypeScript

// Basic Web Scraper
// Extract structured data from web pages
import puppeteer from "@cloudflare/puppeteer";
/**
 * Environment object received by the `fetch` handler as its second argument.
 */
interface Env {
// Passed to `puppeteer.launch()` to start the browser used for scraping.
// NOTE(review): presumably a Browser Rendering binding configured for this
// Worker — confirm the binding name against the deployment configuration.
MYBROWSER: Fetcher;
}
/**
 * JSON payload describing one scraped page, as produced by the `fetch`
 * handler in this file.
 */
interface ScrapedData {
  /** Final location of the page (`window.location.href`) after navigation. */
  url: string;
  /** Contents of `document.title`. */
  title: string;
  /** `content` of `<meta name="description">`, or an empty string if absent. */
  description: string;
  /** Trimmed text of every `h1`, `h2` and `h3` element, in document order. */
  headings: string[];
  /** Anchors that have an `href`; the handler keeps at most the first 50. */
  links: { text: string; href: string }[];
  /** Images that have a `src`; the handler keeps at most the first 20. */
  images: { alt: string; src: string }[];
  /** ISO-8601 time at which the data was extracted. */
  timestamp: string;
}
export default {
  /**
   * Scrapes the page named by the `?url=` query parameter and returns its
   * title, meta description, headings, links and images as JSON.
   *
   * Responses:
   * - 400 (plain text) when `url` is missing, cannot be parsed as an absolute
   *   URL, or uses a scheme other than http/https.
   * - 200 (JSON `ScrapedData`, `cache-control: public, max-age=3600`) on success.
   * - 500 (JSON `{ error, url }`) when launching the browser, navigating, or
   *   extracting data fails.
   *
   * @param request Incoming request; only its query string is read.
   * @param env Bindings; `env.MYBROWSER` is handed to `puppeteer.launch`.
   * @returns The response described above.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url");
    if (!url) {
      return new Response("Missing ?url parameter", { status: 400 });
    }

    // `new URL` throws on malformed input; that is a client error, not a
    // server failure, so report it as 400 instead of letting it escape.
    let target: URL;
    try {
      target = new URL(url);
    } catch {
      return new Response("Invalid ?url parameter", { status: 400 });
    }

    // The URL comes from an untrusted caller: only let the browser load web
    // pages, never file:, chrome:, javascript:, data:, etc.
    // NOTE(review): this does not block http(s) requests to internal hosts;
    // add an allow/deny list if that matters for the deployment.
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return new Response("Only http and https URLs are supported", {
        status: 400,
      });
    }
    const normalizedUrl = target.toString();

    // Declared outside the `try` so `finally` can close it; stays undefined
    // if the launch itself fails.
    let browser: Awaited<ReturnType<typeof puppeteer.launch>> | undefined;
    try {
      browser = await puppeteer.launch(env.MYBROWSER);
      const page = await browser.newPage();

      // Navigate to page
      await page.goto(normalizedUrl, {
        waitUntil: "networkidle0",
        timeout: 30000,
      });

      // Wait for body to be present
      await page.waitForSelector("body");

      // Extract structured data. The callback is executed inside the page, so
      // it must not reference anything from this module's scope. The return
      // type is annotated on the callback because `evaluate`'s type
      // parameters describe the function, not its result.
      const data = await page.evaluate((): ScrapedData => {
        // Get all headings
        const headings = Array.from(
          document.querySelectorAll("h1, h2, h3")
        ).map((el) => el.textContent?.trim() || "");

        // Get all links
        const links = Array.from(document.querySelectorAll("a"))
          .filter((a) => a.href)
          .map((a) => ({
            text: a.textContent?.trim() || "",
            href: a.href,
          }))
          .slice(0, 50); // Limit to first 50 links

        // Get all images
        const images = Array.from(document.querySelectorAll("img"))
          .filter((img) => img.src)
          .map((img) => ({
            alt: img.alt || "",
            src: img.src,
          }))
          .slice(0, 20); // Limit to first 20 images

        return {
          url: window.location.href,
          title: document.title,
          description:
            document
              .querySelector('meta[name="description"]')
              ?.getAttribute("content") || "",
          headings,
          links,
          images,
          timestamp: new Date().toISOString(),
        };
      });

      return Response.json(data, {
        headers: {
          "cache-control": "public, max-age=3600",
        },
      });
    } catch (error) {
      return Response.json(
        {
          error: error instanceof Error ? error.message : "Scraping failed",
          url: normalizedUrl,
        },
        { status: 500 }
      );
    } finally {
      // Single cleanup point for both the success and failure paths, so the
      // browser session is released exactly once.
      await browser?.close();
    }
  },
};
/**
* Usage:
* GET /?url=https://example.com
*
* Response:
* {
* "url": "https://example.com",
* "title": "Example Domain",
* "description": "...",
* "headings": ["Example Domain"],
* "links": [{ "text": "More information...", "href": "..." }],
* "images": [],
* "timestamp": "2025-10-22T12:34:56.789Z"
* }
*/