// Basic Web Scraper
// Extract structured data from web pages

import puppeteer from "@cloudflare/puppeteer";
|
|
|
|
/** Bindings the Worker runtime passes to the `fetch` handler. */
interface Env {
  /** Binding handed to `puppeteer.launch` to start a browser session. */
  MYBROWSER: Fetcher;
}
|
|
|
|
/** An anchor found on the page: its trimmed text and resolved `href`. */
interface ScrapedLink {
  text: string;
  href: string;
}

/** An image found on the page: its `alt` text ("" when absent) and `src`. */
interface ScrapedImage {
  alt: string;
  src: string;
}

/** JSON payload returned for a successfully scraped page. */
interface ScrapedData {
  /** Location of the page after navigation (`window.location.href`). */
  url: string;
  /** The document's title. */
  title: string;
  /** Content of `<meta name="description">`, or "" when the tag is missing. */
  description: string;
  /** Trimmed text of every `h1`, `h2` and `h3` element. */
  headings: string[];
  /** Anchors that have an `href`; the scraper keeps at most the first 50. */
  links: ScrapedLink[];
  /** Images that have a `src`; the scraper keeps at most the first 20. */
  images: ScrapedImage[];
  /** ISO 8601 time at which the data was extracted. */
  timestamp: string;
}
|
|
|
|
export default {
  /**
   * Scrapes the page named by the `url` query parameter and returns its
   * structured data as JSON.
   *
   * @param request - Incoming request; the target page is read from `?url=`.
   * @param env - Worker bindings; `env.MYBROWSER` is used to launch the browser.
   * @returns
   *   - 200 with a `ScrapedData` JSON body (cacheable for one hour) on success.
   *   - 400 (plain text) when `?url=` is missing, malformed, or not http(s).
   *   - 500 with `{ error, url }` JSON when navigation or extraction fails.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    const rawUrl = new URL(request.url).searchParams.get("url");

    if (!rawUrl) {
      return new Response("Missing ?url parameter", { status: 400 });
    }

    // `new URL` throws a TypeError on malformed input. That is a client
    // mistake, so answer 400 rather than letting the handler reject.
    let target: URL;
    try {
      target = new URL(rawUrl);
    } catch {
      return new Response("Invalid ?url parameter", { status: 400 });
    }

    // The URL is untrusted input: only let the browser navigate to web pages,
    // never to schemes such as file:, chrome: or javascript:.
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return new Response("Only http and https URLs are supported", {
        status: 400,
      });
    }

    const normalizedUrl = target.toString();
    const browser = await puppeteer.launch(env.MYBROWSER);

    try {
      const page = await browser.newPage();

      // Navigate and wait until the network has gone quiet (30 s budget).
      await page.goto(normalizedUrl, {
        waitUntil: "networkidle0",
        timeout: 30000,
      });

      // Wait for body to be present
      await page.waitForSelector("body");

      // The callback is executed inside the page, so it must be self-contained
      // and may only use DOM globals. Annotating its return type (instead of
      // passing a type argument to `evaluate`) keeps the result checked
      // against ScrapedData.
      const data = await page.evaluate((): ScrapedData => {
        const headings = Array.from(
          document.querySelectorAll("h1, h2, h3")
        ).map((el) => el.textContent?.trim() || "");

        const links = Array.from(document.querySelectorAll("a"))
          .filter((a) => a.href)
          .map((a) => ({
            text: a.textContent?.trim() || "",
            href: a.href,
          }))
          .slice(0, 50); // Limit to first 50 links

        const images = Array.from(document.querySelectorAll("img"))
          .filter((img) => img.src)
          .map((img) => ({
            alt: img.alt || "",
            src: img.src,
          }))
          .slice(0, 20); // Limit to first 20 images

        return {
          url: window.location.href,
          title: document.title,
          description:
            document
              .querySelector('meta[name="description"]')
              ?.getAttribute("content") || "",
          headings,
          links,
          images,
          timestamp: new Date().toISOString(),
        };
      });

      return Response.json(data, {
        headers: {
          "cache-control": "public, max-age=3600",
        },
      });
    } catch (error) {
      return Response.json(
        {
          error: error instanceof Error ? error.message : "Scraping failed",
          url: normalizedUrl,
        },
        { status: 500 }
      );
    } finally {
      // Close exactly once on every path. A failure to close must not replace
      // the response that is already on its way out, so it is only logged.
      try {
        await browser.close();
      } catch (closeError) {
        console.error("Failed to close browser", closeError);
      }
    }
  },
};
|
|
|
|
/**
 * Usage:
 * GET /?url=https://example.com
 *
 * Response:
 * {
 *   "url": "https://example.com",
 *   "title": "Example Domain",
 *   "description": "...",
 *   "headings": ["Example Domain"],
 *   "links": [{ "text": "More information...", "href": "..." }],
 *   "images": [],
 *   "timestamp": "2025-10-22T12:34:56.789Z"
 * }
 */