Initial commit
templates/web-scraper-basic.ts | 116 lines | new file
@@ -0,0 +1,116 @@
// Basic Web Scraper
// Extract structured data from web pages

import puppeteer from "@cloudflare/puppeteer";

interface Env {
  MYBROWSER: Fetcher;
}
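
// Note: MYBROWSER is this template's binding name; it must match a Browser
// Rendering binding configured for the Worker, e.g.
// `browser = { binding = "MYBROWSER" }` in the Wrangler config.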

interface ScrapedData {
  url: string;
  title: string;
  description: string;
  headings: string[];
  links: Array<{ text: string; href: string }>;
  images: Array<{ alt: string; src: string }>;
  timestamp: string;
}

export default {
  async fetch(request: Request, env: Env): Promise<Response> {
    const { searchParams } = new URL(request.url);
    const url = searchParams.get("url");

    if (!url) {
      return new Response("Missing ?url parameter", { status: 400 });
    }

    // Reject malformed URLs early instead of letting new URL() throw
    let normalizedUrl: string;
    try {
      normalizedUrl = new URL(url).toString();
    } catch {
      return new Response("Invalid ?url parameter", { status: 400 });
    }

    const browser = await puppeteer.launch(env.MYBROWSER);

    try {
      const page = await browser.newPage();

      // Navigate and wait until the network has been idle for 500 ms (networkidle0)
      await page.goto(normalizedUrl, {
        waitUntil: "networkidle0",
        timeout: 30000,
      });

      // Wait for the body element to be present
      await page.waitForSelector("body");

      // Extract structured data
      const data = await page.evaluate((): ScrapedData => {
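        // This callback runs inside the page, not in the Worker: it can use
        // the DOM, but cannot capture variables from the surrounding scope
        // (those would have to be passed as extra arguments to evaluate()).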
        // Get all headings
        const headings = Array.from(document.querySelectorAll("h1, h2, h3")).map(
          (el) => el.textContent?.trim() || ""
        );

        // Get all links
        const links = Array.from(document.querySelectorAll("a"))
          .filter((a) => a.href)
          .map((a) => ({
            text: a.textContent?.trim() || "",
            href: a.href,
          }))
          .slice(0, 50); // Limit to first 50 links

        // Get all images
        const images = Array.from(document.querySelectorAll("img"))
          .filter((img) => img.src)
          .map((img) => ({
            alt: img.alt || "",
            src: img.src,
          }))
          .slice(0, 20); // Limit to first 20 images

        return {
          url: window.location.href,
          title: document.title,
          description:
            document.querySelector('meta[name="description"]')?.getAttribute("content") ||
            "",
          headings,
          links,
          images,
          timestamp: new Date().toISOString(),
        };
      });

      await browser.close();

      return Response.json(data, {
        headers: {
          "cache-control": "public, max-age=3600",
        },
      });
    } catch (error) {
      await browser.close();
      return Response.json(
        {
          error: error instanceof Error ? error.message : "Scraping failed",
          url: normalizedUrl,
        },
        { status: 500 }
      );
    }
  },
};

/**
 * Usage:
 * GET /?url=https://example.com
 *
 * Response:
 * {
 *   "url": "https://example.com",
 *   "title": "Example Domain",
 *   "description": "...",
 *   "headings": ["Example Domain"],
 *   "links": [{ "text": "More information...", "href": "..." }],
 *   "images": [],
 *   "timestamp": "2025-10-22T12:34:56.789Z"
 * }
 */
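
To call the deployed Worker from a script or another service, a minimal client sketch (run as an ES module); the scraper.example.workers.dev hostname is hypothetical, and the response shape follows the ScrapedData interface above:

// Hypothetical client; adjust the endpoint to your own deployment.
const endpoint = "https://scraper.example.workers.dev";
const target = "https://example.com";

const res = await fetch(`${endpoint}/?url=${encodeURIComponent(target)}`);
if (!res.ok) {
  throw new Error(`Scrape failed with status ${res.status}`);
}
const data = (await res.json()) as { title: string; links: unknown[] };
console.log(`${data.title}: ${data.links.length} links captured`);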