1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72import type { APIGatewayProxyEventV2, APIGatewayProxyResultV2 } from "aws-lambda";
import Sitemapper from "sitemapper";
/**
 * Shape of the JSON request body read by the crawl handler.
 */
interface CrawlRequest {
// Site to look up. The handler rejects the request when this is missing and
// prepends `https://` when the value carries no http(s) scheme.
url: string;
// Optional depth value. NOTE(review): not read anywhere in the visible code —
// confirm whether it is reserved for future use.
depth?: number;
}
/**
 * CORS headers that `respond` merges into every response it builds.
 * Allows any origin, the `Content-Type` request header, and the POST and
 * OPTIONS methods.
 */
const corsHeaders = {
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Headers": "Content-Type",
"Access-Control-Allow-Methods": "POST, OPTIONS",
};
/**
 * Builds a JSON `APIGatewayProxyResultV2`.
 *
 * @param statusCode - HTTP status code to return.
 * @param body - Value serialized with `JSON.stringify` into the response body.
 * @returns A result whose headers are `Content-Type: application/json`
 *   followed by the shared CORS headers.
 */
const respond = (statusCode: number, body: object): APIGatewayProxyResultV2 => {
  // CORS headers are spread last, matching the order callers already observe.
  const headers = { "Content-Type": "application/json", ...corsHeaders };
  const serialized = JSON.stringify(body);
  return { statusCode, headers, body: serialized };
};
/** Matches a leading `http://` or `https://` scheme, ignoring case. */
const HTTP_SCHEME = /^https?:\/\//i;

/** Narrows parsed JSON to an object carrying a string `url`; `depth` is deliberately not inspected. */
const hasStringUrl = (value: unknown): value is Pick<CrawlRequest, "url"> =>
  typeof value === "object" &&
  value !== null &&
  typeof (value as { url?: unknown }).url === "string";

/** Derives the sitemap location for a page URL: same path plus `/sitemap.xml`, with query and fragment dropped. */
const toSitemapUrl = (page: URL): string => {
  const sitemap = new URL(page.href);
  sitemap.search = "";
  sitemap.hash = "";
  // Strip a single trailing slash so the result never contains `//sitemap.xml`.
  sitemap.pathname = `${page.pathname.replace(/\/$/, "")}/sitemap.xml`;
  return sitemap.href;
};

/**
 * Lambda entry point: discovers the URLs of a site through its `sitemap.xml`.
 *
 * Behavior:
 * - `OPTIONS` requests are answered with `200 {}` (CORS preflight).
 * - `400` when the body is not valid JSON, has no non-blank string `url`,
 *   or the `url` cannot be parsed as a URL.
 * - `200 { url, method: "sitemap", urls }` when the sitemap lists at least one URL.
 * - `200 { url, method: "none", urls: [url] }` when the sitemap yields nothing.
 * - `500 { error: "Crawl failed", message }` when the sitemap lookup throws.
 *
 * @param event - Incoming request; only `requestContext.http.method` and `body` are read.
 * @returns The JSON response produced by `respond`.
 */
export const handler = async (
  event: APIGatewayProxyEventV2
): Promise<APIGatewayProxyResultV2> => {
  // Handle CORS preflight
  if (event.requestContext?.http?.method === "OPTIONS") {
    return respond(200, {});
  }

  // Malformed input is a client error: answer 400 instead of letting it
  // surface as a 500 from the crawl error path below.
  let payload: unknown;
  try {
    // `||` (not `??`) so an empty-string body is treated like a missing one.
    payload = JSON.parse(event.body || "{}");
  } catch {
    return respond(400, { error: "Request body must be valid JSON" });
  }
  if (!hasStringUrl(payload) || payload.url.trim() === "") {
    return respond(400, { error: "url is required" });
  }

  // Normalize URL — ensure it has a protocol (scheme match is case-insensitive)
  const trimmedUrl = payload.url.trim();
  const targetUrl = HTTP_SCHEME.test(trimmedUrl) ? trimmedUrl : `https://${trimmedUrl}`;

  let parsedTarget: URL;
  try {
    parsedTarget = new URL(targetUrl);
  } catch {
    return respond(400, { error: "url is not a valid URL" });
  }

  try {
    // NOTE(review): the target is caller-supplied and fetched from this
    // function's network position (SSRF exposure) — confirm whether internal
    // or link-local hosts need to be blocked before this goes public.
    // Try sitemap discovery first
    const sitemapper = new Sitemapper({
      url: toSitemapUrl(parsedTarget),
      timeout: 15000,
    });
    const { sites } = await sitemapper.fetch();

    if (sites.length > 0) {
      return respond(200, {
        url: targetUrl,
        method: "sitemap",
        urls: sites,
      });
    }

    // No sitemap found — return just the original URL
    return respond(200, {
      url: targetUrl,
      method: "none",
      urls: [targetUrl],
    });
  } catch (error) {
    console.error("Crawl error:", error);
    return respond(500, {
      error: "Crawl failed",
      message: error instanceof Error ? error.message : String(error),
    });
  }
};