1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python3
"""Integration test for chained analysis pipeline.
Tests the chained analysis on a sample PDF to verify:
1. All agents run correctly
2. Manifest is assembled properly
3. Two-column layout is detected correctly
Usage:
docker exec equalify-reflow-api-gateway python scripts/test_chained_integration.py
"""
import asyncio
import sys
from pathlib import Path
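# Put the directory above this script's directory on sys.path so the
# `src.*` imports inside main() resolve (the `src` package itself is not added).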
sys.path.insert(0, str(Path(__file__).parent.parent))
def _find_test_pdf(candidates):
    """Return the first path in *candidates* that exists, or None if none do."""
    return next((path for path in candidates if path.exists()), None)


def _shorten(text, limit=50):
    """Return *text* cut to *limit* characters, adding "..." only when cut."""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


async def main() -> int:
    """Run the chained analysis on a sample PDF and print a report.

    Looks for a sample PDF (paths relative to the current working directory
    first, then absolute ``/app`` paths), converts it with ``PDFConverter``,
    runs ``analyze_document`` on the first three pages in parallel mode, and
    prints the manifest, page features, heading tree and LLM usage.

    Returns:
        0 once the full report has been printed; 1 if none of the candidate
        PDFs exists or the conversion produced no pages.

    Exceptions raised by the converter, the analysis, or the heading-tree
    validation are not caught here and propagate to the caller.
    """
    print("=" * 60)
    print("Chained Analysis Integration Test")
    print("=" * 60)

    # Find a test PDF
    pdf_paths = [
        Path("project-docs/pdfs/04_usenix_security_paper.pdf"),
        Path("project-docs/pdfs/07_attention_transformer_paper.pdf"),
        Path("/app/project-docs/pdfs/04_usenix_security_paper.pdf"),
        Path("/app/project-docs/pdfs/07_attention_transformer_paper.pdf"),
    ]
    pdf_path = _find_test_pdf(pdf_paths)
    if not pdf_path:
        print("ERROR: No test PDF found!")
        print("Tried:", [str(p) for p in pdf_paths])
        return 1

    # Project imports are deferred until a PDF has been located, so the
    # "no test PDF" diagnostic above does not depend on the pipeline modules
    # being importable. They rely on the sys.path setup done at module level.
    from src.agents.chained_analysis import analyze_document
    from src.services.pdf_converter import PDFConverter
    from src.shared.models.remediation import HeadingTree

    print(f"\nTest PDF: {pdf_path}")
    print("-" * 60)

    # Read PDF
    pdf_content = pdf_path.read_bytes()
    print(f"PDF size: {len(pdf_content):,} bytes")

    # Convert PDF to page images
    print("\nConverting PDF with Docling...")
    converter = PDFConverter()
    result = await converter.convert_with_page_images(pdf_content)
    print(f"Total pages: {result.total_pages}")
    print(f"Has page images: {result.has_page_images}")

    # Limit to first 3 pages for faster testing
    pages = result.pages[:3]
    if not pages:
        # Nothing to analyse: report failure instead of running the
        # pipeline on an empty page list and claiming success.
        print("ERROR: PDF conversion produced no pages!")
        return 1
    print(f"Testing with first {len(pages)} pages")

    # Run chained analysis
    print("\n" + "=" * 60)
    print("Running Chained Analysis (parallel mode)")
    print("=" * 60)
    job_id = "test-chained-001"
    manifest, observations, usage = await analyze_document(
        pages=pages,
        job_id=job_id,
        parallel=True,
    )

    # Print results
    print("\n" + "-" * 60)
    print("RESULTS")
    print("-" * 60)
    print(f"\nDocument Title: {manifest.document_title}")
    print(f"Document Type: {manifest.document_type}")
    print(f"Total Pages: {manifest.total_pages}")
    print(f"Analysis Confidence: {manifest.analysis_confidence:.2f}")
    print(f"Analysis Model: {manifest.analysis_model}")
    print(f"\nRequired Agents: {manifest.required_agents}")
    print(f"Skip Agents: {manifest.skip_agents}")
    print(f"\nInitial Observations: {len(observations)}")

    # Check page features
    print("\nPage Features:")
    for pf in manifest.page_features:
        print(f" Page {pf.page_num}: layout={pf.layout_type}, "
              f"images={pf.image_count}, tables={pf.table_count}, "
              f"code={pf.has_code_blocks}, math={pf.has_math}")

    # Parse heading tree (stored on the manifest as a JSON string)
    heading_tree = HeadingTree.model_validate_json(manifest.heading_tree_json)
    print("\nHeading Tree:")
    print(f" Layout Type: {heading_tree.layout_type}")
    print(f" Title: {heading_tree.document_title}")
    print(f" Total Sections: {len(heading_tree.sections)}")
    if heading_tree.sections:
        print(" First 5 headings:")
        for h in heading_tree.sections[:5]:
            # Ellipsis is shown only when the title was actually truncated.
            print(f" H{h.level}: {_shorten(h.title)} (page {h.page})")

    print("\nLLM Usage:")
    print(f" Input tokens: {usage.input_tokens:,}")
    print(f" Output tokens: {usage.output_tokens:,}")
    print(f" Total tokens: {usage.total_tokens:,}")
    # Cost is reported in cents by the usage object; show it in dollars.
    print(f" Estimated cost: ${usage.estimated_cost_cents/100:.4f}")

    print("\nAnalysis Notes:")
    print(f" {manifest.analysis_notes}")

    print("\n" + "=" * 60)
    print("TEST PASSED!")
    print("=" * 60)
    return 0
if __name__ == "__main__":
    # Drive the async entry point to completion and hand its return value
    # to the shell as the process exit status.
    sys.exit(asyncio.run(main()))