1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
"""Document context models for the remediation pipeline.

These models provide semantic context to downstream agents, enabling them
to make better decisions and generate AutoCorrections directly.

DocumentSummary captures:
- Topic understanding (what the document is about)
- Key entities (names, projects, technical terms for OCR detection)
- Domain vocabulary (helps catch OCR errors)
- Expected elements (abstract, references, figures)
- Audience level (academic, student, general)

ObservationContext provides full context for processing a single observation.
"""
from typing import TYPE_CHECKING
from pydantic import BaseModel, ConfigDict, Field
if TYPE_CHECKING:
from .observation import Observation
from .remediation import HeadingTree
class DocumentSummary(BaseModel):
    """Document-level semantic context shared with every downstream agent.

    The summary agent produces this during Phase 1 (Analysis); later agents
    read it to make better-informed decisions.

    Typical uses:
    - OCR error detection: ``key_entities`` helps catch misreads such as
      "Exxon" where "Enzo" was meant
    - Structure validation: ``expected_elements`` guides heading validation
    - Alt text generation: ``topic_summary`` gives descriptions their context

    Attributes:
        title: Document title (required).
        document_type: Classification such as research_paper, syllabus or
            exam (required).
        topic_summary: One or two sentences about the content (required).
        structure_summary: Layout description, e.g. "9 pages, two-column".
            Defaults to an empty string.
        key_entities: Names, projects and technical terms used for OCR
            detection. Defaults to an empty list.
        domain_terms: Domain-specific vocabulary. Defaults to an empty list.
        expected_elements: Structural elements the document is expected to
            contain. Defaults to an empty list.
        audience_level: Target audience (academic, student, general).
            Defaults to "general".

    Example:
        >>> summary = DocumentSummary(
        ...     title="yt: An Open Source Framework",
        ...     document_type="research_paper",
        ...     topic_summary="Research paper about yt, an open-source analysis framework",
        ...     structure_summary="9 pages, two-column, abstract + 6 sections",
        ...     key_entities=["yt", "Enzo", "Matthew Turk", "DVCS"],
        ...     domain_terms=["parallelization", "MPI", "OpenMP"],
        ...     expected_elements=["abstract", "introduction", "references"],
        ...     audience_level="academic"
        ... )
    """

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "title": "yt: An Open Source Framework for Analysis",
                "document_type": "research_paper",
                "topic_summary": (
                    "Research paper describing yt, an open-source "
                    "Python framework for analyzing astrophysical simulations."
                ),
                "structure_summary": "9 pages, two-column layout, abstract + 6 sections + references",
                "key_entities": ["yt", "Enzo", "Matthew Turk", "DVCS", "NumPy"],
                "domain_terms": ["parallelization", "MPI", "OpenMP", "simulation", "AMR"],
                "expected_elements": [
                    "abstract",
                    "introduction",
                    "methodology",
                    "results",
                    "references",
                ],
                "audience_level": "academic",
            },
        },
    )

    # --- Identification (required) ---
    title: str = Field(description="Document title")
    document_type: str = Field(
        description="Document classification: research_paper, syllabus, exam, etc.",
    )

    # --- Semantic context ---
    topic_summary: str = Field(description="1-2 sentences about document content")
    structure_summary: str = Field(
        "",
        description="Layout description (e.g., '9 pages, two-column, abstract + 6 sections')",
    )

    # --- Vocabulary used for OCR detection and context ---
    key_entities: list[str] = Field(
        default_factory=list,
        description="Names, projects, technical terms: ['yt', 'Enzo', 'Turk']",
    )
    domain_terms: list[str] = Field(
        default_factory=list,
        description="Domain vocabulary: ['parallelization', 'MPI', 'OpenMP']",
    )

    # --- Expectations about the document ---
    expected_elements: list[str] = Field(
        default_factory=list,
        description="Expected structural elements: ['abstract', 'references', 'figures']",
    )
    audience_level: str = Field(
        "general",
        description="Target audience: academic, student, general",
    )
class ObservationContext(BaseModel):
    """Everything an agent needs to decide on one observation.

    Bundles the observation itself with the document summary, the heading
    structure, an optional page image, and the text surrounding the issue, so
    that decisions about a single observation can be context-aware.

    Attributes:
        observation: The observation being processed (required).
        document_summary: Semantic context from the analysis phase (required).
        heading_tree: Document structure for hierarchy validation (required).
        page_image_base64: The specific page as a base64 PNG, for visual
            comparison. Defaults to None.
        markdown_excerpt: Roughly 1000 characters centered on the issue
            (required).
        before_context: Roughly 500 characters before the issue. Defaults to
            an empty string.
        after_context: Roughly 500 characters after the issue. Defaults to an
            empty string.
        page_num: Page number where the observation occurs; must be >= 1
            (required).
        line_range: Optional (start, end) line range in the markdown.
            Defaults to None.

    Example:
        >>> context = ObservationContext(
        ...     observation=obs,
        ...     document_summary=summary,
        ...     heading_tree=tree,
        ...     markdown_excerpt="## Introduction\\n\\nThis paper presents...",
        ...     before_context="# Title\\n\\n",
        ...     after_context="\\n\\n## Methods\\n\\n",
        ...     page_num=1
        ... )
    """

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "observation": {"id": "obs-123", "job_id": "job-456"},
                "document_summary": {
                    "title": "Example Document",
                    "document_type": "research_paper",
                    "topic_summary": "A research paper about...",
                },
                "heading_tree": {"document_title": "Example", "sections": []},
                "page_image_base64": None,
                "markdown_excerpt": "## Introduction\n\nThis paper presents...",
                "before_context": "# Title\n\n",
                "after_context": "\n\n## Methods\n\n",
                "page_num": 1,
                "line_range": [10, 25],
            },
        },
    )

    # --- Subject and document-level context ---
    # "Observation" and "HeadingTree" are written as string annotations because
    # their classes are imported only under TYPE_CHECKING (circular-import
    # guard); they are resolved when the model is rebuilt
    # (see _update_forward_refs).
    observation: "Observation" = Field(description="The observation being processed")
    document_summary: DocumentSummary = Field(
        description="Semantic context from analysis phase",
    )
    heading_tree: "HeadingTree" = Field(
        description="Document structure for hierarchy validation",
    )

    # --- Visual context ---
    page_image_base64: str | None = Field(
        None,
        description="The specific page as base64 PNG",
    )

    # --- Textual context (trimmed excerpts, not the full document) ---
    markdown_excerpt: str = Field(description="~1000 chars centered on the issue")
    before_context: str = Field("", description="~500 chars before the issue")
    after_context: str = Field("", description="~500 chars after the issue")

    # --- Location helpers ---
    page_num: int = Field(
        ge=1,
        description="Page number where observation occurs",
    )
    line_range: tuple[int, int] | None = Field(
        None,
        description="Line range in markdown (start, end)",
    )
# Forward-reference resolution.
# ObservationContext annotates two fields with string names whose classes are
# imported only under TYPE_CHECKING, so they must be supplied at rebuild time.
def _update_forward_refs() -> None:
    """Resolve the string annotations on ObservationContext.

    ``Observation`` and ``HeadingTree`` are imported at the top of this module
    only under ``TYPE_CHECKING``, so neither name exists in the module
    namespace at runtime and a bare ``model_rebuild()`` has nothing to resolve
    them against. They are therefore imported here, at call time (which keeps
    the module import itself free of the circular dependency), and handed to
    Pydantic explicitly.

    Intended to be called once all model modules of the package have been
    imported.

    Raises:
        ImportError: If ``.observation`` or ``.remediation`` cannot be
            imported when this function is called.
    """
    # Deferred imports: importing these at module level is what the
    # TYPE_CHECKING guard above is avoiding.
    from .observation import Observation
    from .remediation import HeadingTree

    ObservationContext.model_rebuild(
        _types_namespace={
            "Observation": Observation,
            "HeadingTree": HeadingTree,
        },
    )
# Public API of this module; _update_forward_refs is deliberately not exported.
__all__ = ["DocumentSummary", "ObservationContext"]