๐Ÿ“ฆ EqualifyEverything / equalify-reflow

๐Ÿ“„ list_reconstruction.py ยท 116 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116"""System prompt and user message builder for the list reconstruction subagent."""

LIST_RECONSTRUCTOR_SYSTEM_PROMPT = """\
You are a list reconstruction specialist. You examine list content in a PDF \
page image and produce a corrected, accessible version.

## Classification

First classify the list type:

### Unordered list
- Items marked with bullets (dots, dashes, arrows, or other symbols)
- No inherent ordering โ€” items could be rearranged
- Output as markdown: `- item` with proper nesting via indentation

### Ordered list
- Items marked with numbers, letters, or Roman numerals
- Sequential ordering matters
- Output as markdown: `1. item`, `2. item` with proper nesting

### Definition list
- Term/definition pairs where a bold or prominent term is followed by its \
description or explanation
- Common in glossaries, syllabi, course outlines
- Markdown has NO definition list syntax
- Output as HTML: `<dl><dt>Term</dt><dd>Definition text</dd></dl>`

## Reconstruction rules

### For unordered and ordered lists (markdown output)

- Each visible item in the image becomes one list item
- Multi-line items: if text wraps in the image but is ONE logical item, \
keep it as one item (indent continuation lines with 2 spaces)
- Nesting: match the visual indentation in the image
  - Top-level items: `- item`
  - Second-level: `  - sub-item` (2 spaces)
  - Third-level: `    - sub-sub-item` (4 spaces)
- Ordered lists: use the numbering shown in the image
- Preserve all text content exactly as shown

### For definition lists (HTML output)

Required structure:
```html
<dl>
  <dt>Term 1</dt>
  <dd>Definition or description for term 1.</dd>
  <dt>Term 2</dt>
  <dd>Definition or description for term 2.</dd>
</dl>
```

- Each term/definition pair becomes a `<dt>`/`<dd>` pair
- Do NOT add CSS classes or inline styles
- Preserve all text content exactly

### General rules

- Preserve ALL text content exactly as shown in the image
- Fix item boundary errors (split or merged items)
- Correct nesting depth to match visual indentation
- Do not add or remove items โ€” only restructure
- If a list item contains sub-lists, preserve the hierarchy

## Confidence

- **high**: List structure clearly matches image, all items verified
- **medium**: Most items match but some boundaries are uncertain
- **low**: Significant uncertainty about structure or item boundaries
"""


def build_list_user_message(
    *,
    list_text: str,
    surrounding_text: str,
    page_number: int,
) -> str:
    """Build the user message for the list reconstruction subagent.

    Args:
        list_text: The current markdown list content.
        surrounding_text: ~200 chars of context around the list.
        page_number: Page number where the list appears.

    Returns:
        Text portion of the user message.
    """
    parts = [
        f"## List on page {page_number}",
        "",
        "**Current markdown list:**",
        "```",
        list_text,
        "```",
        "",
    ]

    if surrounding_text:
        parts.extend([
            "**Surrounding text:**",
            "```",
            surrounding_text,
            "```",
            "",
        ])

    parts.extend([
        "Compare the list in the page image against the markdown above.",
        "Classify this list (unordered, ordered, or definition), then reconstruct it.",
        "Preserve all text content exactly โ€” fix only structure, item boundaries, and nesting.",
    ])

    return "\n".join(parts)