"""Tests for text cleanup utilities."""
from src.utils.text_cleanup import (
cleanup_markdown,
collapse_letter_spacing,
fix_excessive_whitespace,
fix_url_formatting,
normalize_quotes,
normalize_unicode,
validate_urls,
)
class TestNormalizeUnicode:
    """Tests for unicode normalization."""

    def test_normalizes_composed_characters(self) -> None:
        """A letter followed by a combining accent normalizes to "café"."""
        # "cafe" followed by U+0301 COMBINING ACUTE ACCENT
        text = "cafe\u0301"
        result = normalize_unicode(text)
        assert result == "café"

    def test_handles_already_normalized_text(self) -> None:
        """Plain ASCII text is returned unchanged."""
        text = "Hello World"
        assert normalize_unicode(text) == text

    def test_handles_diacritics(self) -> None:
        """Different encodings of the same accented name normalize identically."""
        plain = "Prlic"
        precomposed = "Prli\u0107"  # U+0107 LATIN SMALL LETTER C WITH ACUTE
        decomposed = "Prlic\u0301"  # "c" + U+0301 COMBINING ACUTE ACCENT

        result = normalize_unicode(plain)
        assert isinstance(result, str)

        # The two spellings are canonically equivalent, so whichever
        # normalization form the helper applies, both must come out the same.
        # A bare type check on ASCII input would not exercise diacritics at all.
        assert normalize_unicode(decomposed) == normalize_unicode(precomposed)
class TestCollapseLetterSpacing:
    """Exercise collapse_letter_spacing on letter-spaced and ordinary text."""

    def test_collapses_spaced_word(self) -> None:
        """One letter-spaced word inside a sentence is joined back together."""
        collapsed = collapse_letter_spacing("the r e q u i r e m e n t s are met")
        assert collapsed == "the requirements are met"

    def test_collapses_multiple_spaced_words(self) -> None:
        """Only the long spaced run is joined; the short one stays spaced."""
        # Expectation encoded below: the 3-letter "H a s" is under the
        # threshold and is preserved, the 10-letter "d e t e r m i n e d" is not.
        # NOTE(review): as written, the two words are separated by the same
        # single space as their letters - confirm the fixture's spacing is
        # what the implementation keys on.
        source = "H a s d e t e r m i n e d its curriculum"
        expected = "H a s determined its curriculum"
        assert collapse_letter_spacing(source) == expected

    def test_preserves_normal_text(self) -> None:
        """Ordinary prose passes through untouched."""
        ordinary = "normal text stays unchanged"
        assert collapse_letter_spacing(ordinary) == ordinary

    def test_preserves_short_sequences(self) -> None:
        """A run of three single letters is expected to be left alone."""
        # "a b c" contains only 2 separating spaces; the test treats 3 as
        # the minimum needed before collapsing.
        short_run = "a b c should stay"
        assert collapse_letter_spacing(short_run) == short_run

    def test_collapses_minimum_length(self) -> None:
        """A run of four single letters is expected to be joined."""
        # "a b c d" contains 3 separating spaces, i.e. exactly the minimum.
        collapsed = collapse_letter_spacing("word a b c d word")
        assert collapsed == "word abcd word"

    def test_handles_mixed_case(self) -> None:
        """Upper- and lower-case letters in one spaced word are joined as-is."""
        collapsed = collapse_letter_spacing("P r o f e s s i o n a l Licensure")
        assert collapsed == "Professional Licensure"

    def test_real_world_example(self) -> None:
        """Sentence modelled on an OCR artifact from a Professional Licensure PDF."""
        source = (
            "Has determined its curriculum meets the educational r e q u i r e m e n t s"
        )
        expected = "Has determined its curriculum meets the educational requirements"
        assert collapse_letter_spacing(source) == expected

    def test_preserves_markdown_structure(self) -> None:
        """Heading marker and paragraph text survive while spaced words are joined."""
        collapsed = collapse_letter_spacing(
            "# H e a d i n g\n\nParagraph with s p a c e d words."
        )
        for fragment in ("# Heading", "spaced"):
            assert fragment in collapsed

    def test_handles_empty_string(self) -> None:
        """Empty input yields empty output."""
        empty = ""
        assert collapse_letter_spacing(empty) == empty

    def test_handles_only_spaces(self) -> None:
        """Whitespace-only input is returned as-is."""
        blank = " "
        assert collapse_letter_spacing(blank) == blank
class TestFixExcessiveWhitespace:
    """Tests for whitespace cleanup."""

    def test_removes_multiple_spaces(self) -> None:
        """A run of several spaces between words collapses to a single space."""
        # The input must contain a genuine multi-space run; with single-spaced
        # input this assertion would pass even if nothing were collapsed.
        text = "hello    world"
        assert fix_excessive_whitespace(text) == "hello world"

    def test_preserves_double_newlines(self) -> None:
        """A paragraph break (two newlines) is left untouched."""
        text = "para1\n\npara2"
        assert fix_excessive_whitespace(text) == "para1\n\npara2"

    def test_reduces_triple_newlines_to_double(self) -> None:
        """Three consecutive newlines are reduced to a two-newline break."""
        text = "para1\n\n\npara2"
        assert fix_excessive_whitespace(text) == "para1\n\npara2"

    def test_removes_trailing_whitespace_on_lines(self) -> None:
        """Spaces before a line break are stripped; the line breaks remain."""
        text = "line1 \nline2 \n"
        result = fix_excessive_whitespace(text)
        assert result == "line1\nline2\n"
class TestNormalizeQuotes:
    """Check that typographic quotes are replaced by ASCII quote characters."""

    def test_normalizes_curly_double_quotes(self):
        """Curly double quotes (U+201C / U+201D) become ASCII double quotes."""
        normalized = normalize_quotes("\u201cHello\u201d")
        assert normalized == '"Hello"'
        # Code point 34 is the ASCII double quote.
        assert ord(normalized[0]) == 34

    def test_normalizes_curly_single_quotes(self):
        """Curly single quotes (U+2018 / U+2019) become ASCII apostrophes."""
        normalized = normalize_quotes("\u2018Hello\u2019")
        assert normalized == "'Hello'"
        # Code point 39 is the ASCII apostrophe.
        assert ord(normalized[0]) == 39

    def test_handles_mixed_quotes(self):
        """Curly double quotes wrapping a curly apostrophe are all converted."""
        smart_quoted = "\u201cShe\u2019s here\u201d"
        normalized = normalize_quotes(smart_quoted)
        assert normalized == "\"She's here\""
        # Index 0 is the opening double quote, index 4 the apostrophe in "She's".
        opening, apostrophe = normalized[0], normalized[4]
        assert ord(opening) == 34
        assert ord(apostrophe) == 39

    def test_handles_guillemets(self):
        """Guillemets are converted to ASCII double quotes."""
        normalized = normalize_quotes("«Hello»")
        assert normalized == '"Hello"'
        # Code point 34 is the ASCII double quote.
        assert ord(normalized[0]) == 34
class TestValidateUrls:
    """Check what validate_urls reports as broken for various inputs."""

    def test_finds_valid_urls(self):
        """A well-formed http URL produces no broken entries."""
        reported = validate_urls("Visit http://example.com for more")
        assert len(reported) == 0

    def test_finds_broken_urls(self):
        """A URL with a scheme but no domain is reported as broken."""
        # "http:///" has no netloc, so at least one entry is expected back.
        reported = validate_urls("Visit http:/// for info")
        assert len(reported) >= 1

    def test_handles_https_urls(self):
        """A well-formed https URL with a path produces no broken entries."""
        reported = validate_urls("Secure site: https://example.com/path")
        assert len(reported) == 0

    def test_finds_multiple_urls(self):
        """Two well-formed URLs in one string produce no broken entries."""
        reported = validate_urls("Visit http://site1.com and http://site2.com")
        assert len(reported) == 0
class TestFixUrlFormatting:
    """Check how fix_url_formatting rewrites markdown link targets."""

    def test_adds_protocol_to_markdown_links(self):
        """A link target without a scheme gets an http:// prefix."""
        fixed = fix_url_formatting("[Example](example.com)")
        assert "[Example](http://example.com)" in fixed

    def test_preserves_existing_protocols(self):
        """A link that already carries https:// is returned unchanged."""
        already_qualified = "[Example](https://example.com)"
        assert fix_url_formatting(already_qualified) == already_qualified

    def test_handles_multiple_links(self):
        """In a mixed string, the bare link is prefixed and the https link kept."""
        fixed = fix_url_formatting("[Site1](site1.com) and [Site2](https://site2.com)")
        for expected_url in ("http://site1.com", "https://site2.com"):
            assert expected_url in fixed
class TestCleanupMarkdown:
    """Integration tests for safe cleanup pipeline."""

    def test_applies_safe_fixes_only(self) -> None:
        """Safe fixes (quotes, whitespace) are applied; risky ones are skipped."""
        # Built from explicit pieces so the fixture's exact characters do not
        # depend on how this file is indented. The run of spaces after
        # "Multiple" is deliberate: without it the whitespace assertion below
        # would pass even if no whitespace cleanup ran.
        text = (
            "de-\n"
            "veloping code with \u201csmart quotes\u201d\n"
            "Multiple    spaces and \n\n\n triple newlines.\n"
            "Smith, J. Bibliography entry."
        )
        result = cleanup_markdown(text, log_warnings=False)

        # Check SAFE fixes were applied
        assert '"smart quotes"' in result  # Quotes normalized (SAFE)
        assert "Multiple spaces" in result  # Whitespace fixed (SAFE)

        # Check RISKY fixes were SKIPPED (LLM will handle these)
        assert "de-\nveloping" in result  # Hyphenation NOT fixed (would break on footnotes)
        assert "Smith, J." in result  # Bibliography NOT indented (could misformat)

    def test_handles_empty_string(self) -> None:
        """Empty input yields empty output."""
        result = cleanup_markdown("", log_warnings=False)
        assert result == ""

    def test_handles_already_clean_text(self) -> None:
        """Text with nothing to fix is returned unchanged."""
        text = "Clean text with proper formatting."
        result = cleanup_markdown(text, log_warnings=False)
        assert result == text

    def test_preserves_markdown_structure(self) -> None:
        """Header, link and list markers are still present after cleanup."""
        text = (
            "# Header\n"
            "Paragraph with [link](http://example.com).\n"
            "- List item 1\n"
            "- List item 2"
        )
        result = cleanup_markdown(text, log_warnings=False)
        assert "# Header" in result
        assert "[link]" in result
        assert "- List item" in result
class TestRealWorldExample:
    """Test with actual PDF conversion artifacts."""

    def test_academic_paper_safe_cleanup(self) -> None:
        """Line-end hyphenation in an academic-paper excerpt is left in place."""
        # Based on the real example from result.md
        # Test that we DON'T break things with aggressive hyphenation fixes
        text = (
            "How to Scale a Code in the Human Dimen-\n"
            "sion\n"
            "Matthew J. Turk\n"
            "Abstract: As scientists' needs for computa-\n"
            "tional techniques grow.\n"
            "## 1 Why 'Community?'\n"
            "Astrophysics is dominated by vertically-\n"
            "integrated, small-\n"
            "population research."
        )
        result = cleanup_markdown(text, log_warnings=False)

        # Hyphenation should be PRESERVED (not "fixed" - LLM will handle it)
        # This prevents breaking across footnotes/columns
        assert "Dimen-" in result  # Hyphenation preserved
        assert "computa-" in result  # Hyphenation preserved

    def test_url_and_quote_cleanup(self) -> None:
        """Smart quotes are normalized and the bare URL is still present."""
        text = 'Visit http://yt-project.org/ for \u201cmore info\u201d.'
        result = cleanup_markdown(text, log_warnings=False)

        # Quotes should be normalized (SAFE operation)
        assert '"more info"' in result
        # The URL half of this test: the already-valid link must survive.
        # Matched without the trailing slash so the check does not hinge on it.
        assert "http://yt-project.org" in result

    def test_letter_spacing_cleanup(self) -> None:
        """Letter-spaced text from OCR should be collapsed."""
        text = "The P r o f e s s i o n a l Licensure program has r e q u i r e m e n t s."
        result = cleanup_markdown(text, log_warnings=False)

        # Letter-spacing should be collapsed (SAFE operation)
        assert "Professional" in result
        assert "requirements" in result

        # Original spaced text should be gone - for both spaced words
        assert "P r o f e s s i o n a l" not in result
        assert "r e q u i r e m e n t s" not in result