import { writeFileSync } from "node:fs";
import { join } from "node:path";
import { extractJson } from "../util/json.ts";
import { feedbackPreamble, loadImage, type PipelineContext } from "./context.ts";
import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts";
import type { Fragment } from "./fragment.ts";
// Agent name passed to the router and recorded in the agent-call log for the whole-page pass.
const PAGE_AGENT = "page";
// Single coherent extraction: one vision call converts the WHOLE page into one
// accessible-HTML fragment. This replaces fanning the page out to many
// content agents that each re-rendered it (which produced duplicated output for
// nested structures like forms). The specialist agents in agents/ remain in the
// repo for the contribution/refinement story; this is the primary path.
//
// System prompt for the page agent. It asks for a JSON reply with "html", "log"
// and an optional "suggested_agent". The em dashes below are real U+2014
// characters; they had previously been mis-encoded as a stray Thai character.
const SYSTEM_PROMPT = `You convert an ENTIRE document page (provided as an image) into a single, coherent,
accessible HTML fragment that meets WCAG 2.2 AA. You see the whole page and produce ONE
faithful representation of it. NEVER duplicate content or render the same thing two ways
(for example, do not output both a <form> and a <table> for the same fields) — choose the
single structure that best matches the source.
Output ONLY the body content (no <html>, <head>, or <body> wrapper). Use the most appropriate
semantic structure for what the page actually is: headings in correct nesting order,
paragraphs, lists, tables with <caption>/<thead>/<th scope>, forms with
<label>/<fieldset>/<legend>, figures with <figcaption>, footnotes, etc. Transcribe visible
text faithfully and do not invent content. If content is cut off at a page edge, note it in
the "log" field.
If — and only if — this page contains a content type that a DEDICATED specialist agent would
handle clearly better than this general pass (something beyond the common types: paragraph,
heading, list, table, form field, image, quote, caption, footnote), include a
"suggested_agent". Suggest sparingly; omit it (or null) otherwise.
Respond with ONLY this JSON:
{ "html": "<accessible HTML for the whole page — body content only, no duplication>",
"log": "notes, e.g. content cut off at an edge",
"suggested_agent": { "name": "lowerCamelCase", "reason": "why a specialist is warranted" } }`;
/** Everything a single extraction run hands back to its caller. */
export interface ExtractionResult {
  /** HTML fragments produced by the run. */
  fragments: Array<Fragment>;
  /** Specialist-agent suggestions, each tagged with the image it came from. */
  suggestions: Array<{
    name: string;
    reason: string;
    image: string;
  }>;
}
/**
 * Extracts the contents of the first Markdown code fence in `t`, or returns
 * `t` itself when it contains no fence. The result is always trimmed.
 *
 * The fence may carry an optional `html` language tag (case-insensitive).
 * An opening fence with no closing fence (e.g. output truncated mid-reply) is
 * treated as running to the end of the text, so the fence marker never leaks
 * into the returned HTML.
 *
 * @param t Raw model output.
 * @returns The fenced content if a fence is present, otherwise the whole text.
 */
function stripFences(t: string): string {
  // `$` has no `m` flag, so it only matches end-of-input: the lazy body stops
  // at the first closing fence, or runs to the end when there is none.
  const fenced = /```(?:html)?\s*([\s\S]*?)(?:```|$)/i.exec(t);
  return (fenced?.[1] ?? t).trim();
}
// JSON shape requested from the page agent; every field may be absent.
type PageAgentReply = {
  html?: string;
  log?: string;
  suggested_agent?: { name?: string; reason?: string };
};

/**
 * Runs the whole-page extraction: one vision call per entry of `ctx.images`,
 * awaited one after another in array order, producing one fragment per page.
 *
 * For each page the raw reply is logged via `ctx.log.agentCall`, then parsed
 * with `extractJson`. When the parsed reply has no `html`, the fence-stripped
 * raw text is used as the fragment HTML instead. A `suggested_agent` with a
 * truthy `name` is collected as a suggestion for that image.
 *
 * After all pages are processed the fragments are written, pretty-printed, to
 * `fragments.json` inside `ctx.paths.sessionFragments(ctx.sessionId)`.
 *
 * @param ctx Pipeline context supplying images, router, logger, paths and session id.
 * @returns The collected fragments and specialist-agent suggestions.
 */
export async function runExtraction(ctx: PipelineContext): Promise<ExtractionResult> {
  const result: ExtractionResult = { fragments: [], suggestions: [] };

  for (const page of ctx.images) {
    const pageIntro = `Convert this document page image (filename: ${page.name}, page ${page.order} of ${ctx.images.length}) `;
    const prompt = `${pageIntro}to accessible HTML.\n\n${ACCESSIBILITY_REQUIREMENTS}${feedbackPreamble(ctx)}`;

    const reply = await ctx.router.complete(
      PAGE_AGENT,
      "vision",
      [
        { role: "system", content: SYSTEM_PROMPT },
        { role: "user", content: prompt },
      ],
      { images: [loadImage(page)] },
    );

    // Log the raw reply before any parsing is attempted.
    ctx.log.agentCall({
      agent: {
        name: PAGE_AGENT,
        file: "page.md",
        content: SYSTEM_PROMPT,
        capabilities: ["vision"],
        sha: null,
        sessionBuilt: false,
      },
      phase: "extraction",
      image: page.name,
      output: reply.text,
    });

    const parsed = extractJson<PageAgentReply>(reply.text);

    // Key order is kept stable because this object is serialised to disk below.
    const fragment: Fragment = {
      image: page.name,
      order: page.order,
      agent: "page.md",
      region: "page",
      innerHtml: parsed?.html ?? stripFences(reply.text),
      edges: [],
      log: parsed?.log ?? "",
    };
    result.fragments.push(fragment);

    const suggestion = parsed?.suggested_agent;
    if (suggestion?.name) {
      result.suggestions.push({
        name: suggestion.name,
        reason: suggestion.reason ?? "",
        image: page.name,
      });
    }
  }

  const outFile = join(ctx.paths.sessionFragments(ctx.sessionId), "fragments.json");
  writeFileSync(outFile, JSON.stringify(result.fragments, null, 2));

  return result;
}