📦 EqualifyEverything / equalify-reflow

📄 pipeline_viewer.py · 725 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
"""Pipeline endpoint for versioned step-by-step PDF processing."""

import asyncio
import json
import logging
import time
from collections.abc import AsyncGenerator
from typing import Any, Literal

import redis.asyncio as aioredis
from fastapi import APIRouter, File, Form, HTTPException, Query, Request, UploadFile
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from ..config import settings
from ..dependencies import _get_redis_pool
from ..services.feedback_client import feedback_client
from ..services.pdf_classifier import classify_pdf, enrich_classification
from ..services.pdf_extractor import PDFExtractionError, extract_pdf_text
from ..services.pii_analyzer import get_pii_analyzer
from ..services.pipeline_viewer import PipelineViewerService
from ..services.pipeline_viewer_models import PipelineViewerResult, StepResult
from ..services.session_store import PipelineSession, session_store

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Every route declared in this module is mounted under /api/v1/pipeline.
router = APIRouter(prefix="/api/v1/pipeline", tags=["Pipeline"])

# Seconds _buffer_reader waits for a new event before yielding a heartbeat.
_HEARTBEAT_INTERVAL_SECONDS = 10
# Heartbeat frame. In the SSE wire format a line starting with ":" is a
# comment, so this produces traffic without dispatching an event.
_SSE_HEARTBEAT = ": heartbeat\n\n"

# How long to wait for a human PII decision before aborting the pipeline.
# The surrounding pipeline timeout still applies, but we bound the wait so a
# forgotten tab doesn't hold a docling worker indefinitely.
_PII_DECISION_TIMEOUT_SECONDS = 600  # 10 minutes


def _sse_event(event_type: str, data: Any, event_id: int | None = None) -> str:
    """Format a server-sent event string with optional ``id:`` field."""
    parts: list[str] = []
    if event_id is not None:
        parts.append(f"id: {event_id}")
    parts.append(f"event: {event_type}")
    parts.append(f"data: {json.dumps(data, default=str)}")
    return "\n".join(parts) + "\n\n"


def _slim_init_payload(result: PipelineViewerResult) -> dict[str, Any]:
    """Return the ``init`` event payload for *result* without image bytes.

    The result is dumped to a plain dict, ``page_images`` is replaced with an
    empty mapping, and every figure entry is copied with its ``image_base64``
    blanked.  The images themselves are delivered separately through
    ``page_image`` / ``figure_image`` events, which keeps ``init`` small.
    """
    payload = result.model_dump()
    payload["page_images"] = {}
    # Copy each figure dict so only the image field is overwritten.
    slim_figures = [dict(figure, image_base64="") for figure in payload["figures"]]
    payload["figures"] = slim_figures
    return payload


# ---------------------------------------------------------------------------
# Shared SSE buffer reader — used by both POST and reconnect endpoints
# ---------------------------------------------------------------------------

async def _buffer_reader(
    session: PipelineSession, cursor: int = 0,
) -> AsyncGenerator[str, None]:
    """Read SSE events from a session's buffer, yielding heartbeats while waiting.

    The pipeline runs in a separate background task and appends events to the
    session's ``event_buffer``.  This generator streams those events to the
    client.  If the client disconnects, this generator is cancelled but the
    pipeline task continues running.

    Args:
        session: Session whose ``event_buffer``, ``new_event`` and ``status``
            are read.
        cursor: Index of the first buffered event to yield.  Negative values
            are treated as ``0``.

    Yields:
        Buffered SSE frames in order, interleaved with heartbeat comment
        frames whenever no event arrives within
        ``_HEARTBEAT_INTERVAL_SECONDS``.  The generator ends once the session
        status is ``"completed"`` or ``"error"`` and the buffer is drained.
    """
    # A negative index would read from the END of the list and replay tail
    # events before the real stream; start from the beginning instead.
    cursor = max(cursor, 0)

    while True:
        # Clear BEFORE draining.  Every ``yield`` below can suspend this
        # generator while the pipeline task keeps appending; clearing after
        # the drain would wipe the wake-up for those events and stall them
        # until the next event or heartbeat timeout.
        session.new_event.clear()

        # Re-read the buffer length on every iteration so events appended
        # while we were suspended in ``yield`` are delivered in this pass.
        while cursor < len(session.event_buffer):
            yield session.event_buffer[cursor]
            cursor += 1

        # There is no suspension point between the final length check above
        # and this status check, so a terminal status here guarantees the
        # buffer (including the closing ``done`` event) has been fully sent.
        if session.status in ("completed", "error"):
            return

        # Wait for new events (push) or heartbeat timeout
        try:
            await asyncio.wait_for(
                session.new_event.wait(), timeout=_HEARTBEAT_INTERVAL_SECONDS,
            )
        except TimeoutError:
            yield _SSE_HEARTBEAT


# Response headers attached to both SSE responses in this module (the initial
# stream and the reconnect stream).
# NOTE(review): "X-Accel-Buffering: no" is presumably there so an nginx-style
# reverse proxy forwards events immediately instead of buffering them —
# confirm against the deployment.
_SSE_HEADERS = {
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "X-Accel-Buffering": "no",
}


# ---------------------------------------------------------------------------
# Background pipeline runner — decoupled from SSE connection
# ---------------------------------------------------------------------------

async def _run_pipeline(
    session: PipelineSession,
    content: bytes,
    filename: str,
    images_scale: float,
    do_table_structure: bool,
    ocr_lang_list: list[str],
    doc_upload_task: asyncio.Task,
    skip_pii_scan: bool = False,
) -> None:
    """Execute the full processing pipeline, writing events to the session buffer.

    Runs as a background ``asyncio.Task`` — independent of any SSE connection.
    If the client disconnects, this task continues running.  When the client
    reconnects, all buffered events are replayed.

    The run is bounded by ``settings.pipeline_timeout_seconds``.  On timeout
    or an unexpected exception, an ``error`` event and a ``done`` event are
    buffered and the session status becomes ``"error"``.  On cancellation the
    status becomes ``"error"``, waiting readers are woken, and the
    ``CancelledError`` is re-raised.

    The ``eq-pdf:metrics:jobs_in_processing`` Redis counter is incremented
    for the duration of the run on a best-effort basis.  It is decremented
    afterwards only when the increment succeeded, so a Redis failure at start
    cannot push the counter below its true value.

    Args:
        session: Session that receives buffered events and the final status.
        content: Raw bytes of the uploaded PDF.
        filename: Original filename of the upload.
        images_scale: Passed through to the extraction steps.
        do_table_structure: Passed through to the extraction steps.
        ocr_lang_list: OCR language codes used if an OCR re-extraction runs.
        doc_upload_task: Already-started upload task, awaited by the steps.
        skip_pii_scan: When true, the PII scan gate is bypassed.
    """

    def emit(event_type: str, data: Any) -> None:
        """Write an SSE event to the session buffer and notify readers."""
        eid = session.event_counter
        # Serialise first: if that raises, the counter must not advance, or
        # event ids would stop matching buffer indices (reconnect relies on
        # ``last_event_id + 1`` being a valid buffer position).
        sse_text = _sse_event(event_type, data, event_id=eid)
        session.event_buffer.append(sse_text)
        session.event_counter = eid + 1
        session.new_event.set()

    redis_client = aioredis.Redis(connection_pool=_get_redis_pool())
    counter_incremented = False

    try:
        # Increment jobs_in_processing so docling GPU auto-scales.  Done
        # inside the outer ``try`` so a cancellation during this await still
        # reaches the handler below and marks the session terminal.
        try:
            await redis_client.incr("eq-pdf:metrics:jobs_in_processing")
            counter_incremented = True
        except Exception:
            logger.warning(
                "Failed to increment jobs_in_processing counter", exc_info=True,
            )

        async with asyncio.timeout(settings.pipeline_timeout_seconds):
            await _pipeline_steps(
                session, content, filename, images_scale,
                do_table_structure, ocr_lang_list, doc_upload_task, emit,
                skip_pii_scan=skip_pii_scan,
            )
    except asyncio.CancelledError:
        logger.warning("Pipeline cancelled for session %s", session.session_id)
        session.status = "error"
        # Wake readers so they observe the terminal status and stop.
        session.new_event.set()
        raise
    except TimeoutError:
        logger.error("Pipeline timed out for session %s", session.session_id)
        emit("error", {"step_name": "pipeline", "message": "Pipeline timed out"})
        session.status = "error"
        emit("done", {"total_steps": 0, "total_elapsed_ms": settings.pipeline_timeout_seconds * 1000})
    except Exception as e:
        logger.error("Pipeline crashed for session %s: %s", session.session_id, e, exc_info=True)
        emit("error", {"step_name": "pipeline", "message": str(e)})
        session.status = "error"
        emit("done", {"total_steps": 0, "total_elapsed_ms": 0})
    finally:
        # Only undo an increment that actually happened.
        if counter_incremented:
            try:
                await redis_client.decr("eq-pdf:metrics:jobs_in_processing")
            except Exception:
                logger.warning(
                    "Failed to decrement jobs_in_processing counter", exc_info=True,
                )


async def _pipeline_steps(
    session: PipelineSession,
    content: bytes,
    filename: str,
    images_scale: float,
    do_table_structure: bool,
    ocr_lang_list: list[str],
    doc_upload_task: asyncio.Task,
    emit: Any,
    *,
    skip_pii_scan: bool = False,
) -> None:
    """Inner pipeline logic — extracted for clean exception handling.

    Runs the stages below in order, buffering SSE events through ``emit``:

    * Step 0 — pre-flight classification.  Classification errors end the run
      with ``init`` + ``done`` and status ``"completed"``.
    * Step 0.5 — PII scan, unless ``skip_pii_scan``.  When findings exist the
      run blocks on ``session.pii_decision_event`` for at most
      ``_PII_DECISION_TIMEOUT_SECONDS``; a timeout or a ``"denied"`` decision
      ends the run with ``pii_denied`` set in the ``done`` event.  A failed
      scan records the error and the run continues without a gate.
    * Step 1 — docling extraction (a failure here is fatal: ``error`` +
      ``done``, status ``"error"``), classification enrichment, and an OCR
      re-extraction when the document is flagged as scanned.
    * ``init`` event, followed by one ``page_image`` / ``figure_image`` event
      per image.
    * Steps 2–5 — structure analysis, heading reconciliation, heading levels,
      page content corrections, code block tagging, cross-page fixes and
      cleanup.  A failure in any of these emits an ``error`` event and the
      run carries on.
    * The document upload task is awaited and the final ``done`` event is
      emitted with totals.

    Args:
        session: Supplies ``result``, ``session_id`` and the PII decision
            state; receives the final ``result`` and ``status``.
        content: Raw bytes of the uploaded PDF.
        filename: Original filename, forwarded to the extraction steps.
        images_scale: Forwarded to the extraction steps.
        do_table_structure: Forwarded to the extraction steps.
        ocr_lang_list: Language codes for the OCR re-extraction.
        doc_upload_task: Task whose result becomes ``document_ref``.
        emit: Callable ``(event_type, data)`` that buffers one SSE event.
        skip_pii_scan: When true, a ``pii_scan`` event marked ``skipped`` is
            emitted and the scan is not run.
    """
    # Monotonic clock: elapsed-time maths must not be affected by wall-clock
    # adjustments while the pipeline is running.
    pipeline_start = time.monotonic()
    total_steps = 1  # docling always runs

    service = PipelineViewerService()
    result = session.result

    # Step 0: Pre-flight PDF classification
    classification = classify_pdf(content)

    if classification.has_errors:
        error_msg = "; ".join(classification.error_messages)
        result.warnings = classification.warning_messages
        result.steps.append(
            StepResult(
                name="classification",
                display_name="PDF Classification",
                version_after="v0",
                elapsed_ms=classification.elapsed_ms,
                error=error_msg,
                metadata={
                    "document_type": classification.document_type.value,
                    "findings": [f.model_dump() for f in classification.findings],
                    "pdf_metadata": classification.metadata.model_dump(),
                },
            )
        )
        emit("init", _slim_init_payload(result))
        session.result = result
        # Status is set before the final emit so a reader that wakes on
        # "done" already sees the terminal status.
        session.status = "completed"
        emit("done", {"total_steps": 0, "total_elapsed_ms": classification.elapsed_ms})
        return

    result.warnings = classification.warning_messages

    # Step 0.5: PII scan — gate heavy processing on a human decision when
    # Presidio flags potential PII. Uses a cheap text-only docling pass
    # (no OCR/tables/images) so we don't pay for full extraction before the
    # user has approved. If findings exist, the pipeline blocks on
    # session.pii_decision_event until the decision endpoint sets it.
    if skip_pii_scan:
        emit("pii_scan", {
            "findings": [],
            "finding_count": 0,
            "elapsed_ms": 0,
            "error": None,
            "awaiting_decision": False,
            "skipped": True,
        })
    else:
        emit("processing", {"step_name": "pii_scan", "display_name": "PII Scan"})
        pii_start = time.monotonic()
        pii_findings: list[dict[str, Any]] = []
        pii_error: str | None = None
        try:
            text_for_pii = await extract_pdf_text(content)
            analyzer = get_pii_analyzer()
            # The analyzer call is synchronous; run it off the event loop.
            raw_findings = await asyncio.to_thread(analyzer.analyze_text, text_for_pii)
            pii_findings = [f.model_dump() for f in raw_findings]
        except PDFExtractionError as e:
            logger.warning("PII pre-scan text extraction failed: %s", e)
            pii_error = f"Text extraction for PII scan failed: {e}"
        except Exception as e:
            logger.error("PII scan failed: %s", e, exc_info=True)
            pii_error = f"PII scan error: {e}"

        pii_elapsed_ms = int((time.monotonic() - pii_start) * 1000)
        result.steps.append(
            StepResult(
                name="pii_scan",
                display_name="PII Scan",
                version_after="v0",
                elapsed_ms=pii_elapsed_ms,
                error=pii_error,
                metadata={"findings": pii_findings, "finding_count": len(pii_findings)},
            )
        )

        emit("pii_scan", {
            "findings": pii_findings,
            "finding_count": len(pii_findings),
            "elapsed_ms": pii_elapsed_ms,
            "error": pii_error,
            "awaiting_decision": bool(pii_findings),
        })

        if pii_findings:
            logger.info(
                "PII scan surfaced %d findings for session %s; awaiting user decision",
                len(pii_findings),
                session.session_id,
            )
            try:
                await asyncio.wait_for(
                    session.pii_decision_event.wait(),
                    timeout=_PII_DECISION_TIMEOUT_SECONDS,
                )
            except TimeoutError:
                # No answer in time is treated the same as a denial.
                emit("pii_decision", {"decision": "denied", "reason": "timeout"})
                session.status = "completed"
                emit("done", {
                    "total_steps": 0,
                    "total_elapsed_ms": int((time.monotonic() - pipeline_start) * 1000),
                    "pii_denied": True,
                })
                return

            # An event that was set without a recorded decision counts as denied.
            decision = session.pii_decision or "denied"
            emit("pii_decision", {"decision": decision})
            if decision == "denied":
                logger.info("Session %s denied after PII review", session.session_id)
                session.status = "completed"
                emit("done", {
                    "total_steps": 0,
                    "total_elapsed_ms": int((time.monotonic() - pipeline_start) * 1000),
                    "pii_denied": True,
                })
                return

    # Step 1: Docling extraction
    emit("processing", {"step_name": "docling", "display_name": "Extraction"})

    def _on_cold_start() -> None:
        emit("status", {
            "message": "GPU service is starting up. This may take a few minutes...",
            "type": "cold_start",
        })

    try:
        await service._step_docling(
            result, content, filename, images_scale, do_table_structure,
            on_cold_start=_on_cold_start,
        )
    except Exception as e:
        # Extraction is the one step nothing else can proceed without.
        logger.error("Docling extraction failed: %s", e, exc_info=True)
        emit("error", {"step_name": "docling", "message": str(e)})
        session.result = result
        session.status = "error"
        emit("done", {"total_steps": 0, "total_elapsed_ms": 0})
        return

    # Enrich classification with post-extraction signals
    enrich_classification(
        classification,
        total_pages=result.total_pages,
        total_chars=result.stats.get("total_chars", 0),
        figure_count=result.stats.get("figure_count", 0),
        layout_hints=result.stats.get("layout_hints", {}),
    )
    result.warnings = classification.warning_messages
    result.stats["classification"] = {
        "document_type": classification.document_type.value,
        "findings_count": len(classification.findings),
        "elapsed_ms": classification.elapsed_ms,
    }

    # OCR re-run for scanned documents
    if result.stats.get("is_likely_scanned", False):
        # Relative import, matching the module-level import of the same
        # module, so it resolves whatever the top-level package is called.
        from ..services.pdf_classifier import FINDING_SCAN_PRODUCER, FINDING_SCANNED

        scanned_codes = {FINDING_SCANNED, FINDING_SCAN_PRODUCER}
        finding_codes = {f.code for f in classification.findings}
        if finding_codes & scanned_codes:
            emit("processing", {
                "step_name": "docling_ocr",
                "display_name": "OCR Re-extraction",
            })
            try:
                await service._step_docling_ocr(
                    result, content, filename, images_scale, do_table_structure,
                    languages=ocr_lang_list,
                )
                # Update scanned finding to reflect that OCR was applied
                for finding in classification.findings:
                    if finding.code in scanned_codes:
                        finding.message = (
                            "Document appears to be scanned. OCR has been "
                            "applied, which may increase processing time and introduce "
                            "character recognition errors."
                        )
                enrich_classification(
                    classification,
                    total_pages=result.total_pages,
                    total_chars=result.stats.get("total_chars", 0),
                    figure_count=result.stats.get("figure_count", 0),
                    layout_hints=result.stats.get("layout_hints", {}),
                )
                result.warnings = classification.warning_messages
            except Exception as e:
                logger.error("OCR re-extraction failed: %s", e, exc_info=True)
                emit("error", {"step_name": "docling_ocr", "message": str(e)})
                # Continue with the non-OCR extraction already in v0

    # Send slim init (metadata + markdown, no binary images)
    emit("init", _slim_init_payload(result))

    # Stream page images individually (~200-500KB each, not 5-20MB at once)
    for page_key, page_b64 in result.page_images.items():
        emit("page_image", {"page": page_key, "image_base64": page_b64})

    # Stream figure images individually
    for fig in result.figures:
        if fig.image_base64:
            emit("figure_image", {"ref_id": fig.ref_id, "image_base64": fig.image_base64})

    # Step 2: Structure analysis
    # ``structure`` stays None if this step fails, which disables every later
    # step that is gated on it.
    structure = None
    emit("processing", {"step_name": "structure", "display_name": "Structure Analysis"})
    try:
        structure = await service._step_structure(result)
        total_steps += 1
        step = result.steps[-1]
        emit("step", {
            "step": step.model_dump(),
            "new_versions": {},
            "new_page_markdowns": {},
        })
    except Exception as e:
        logger.error("Structure analysis failed: %s", e, exc_info=True)
        emit("error", {"step_name": "structure", "message": str(e)})

    # Step 2b: Heading reconciliation
    if structure is not None:
        emit("processing", {"step_name": "heading_reconciliation", "display_name": "Heading Reconciliation"})
        try:
            structure = await service._step_heading_reconciliation(result, structure)
            total_steps += 1
            step = result.steps[-1]
            emit("step", {
                "step": step.model_dump(),
                "new_versions": {},
                "new_page_markdowns": {},
            })
        except Exception as e:
            logger.error("Heading reconciliation failed: %s", e, exc_info=True)
            emit("error", {"step_name": "heading_reconciliation", "message": str(e)})

    # Step 2c: Deterministic heading level fix
    if structure is not None:
        emit("processing", {"step_name": "heading_levels", "display_name": "Heading Levels"})
        try:
            await service._step_heading_levels(result, structure)
            total_steps += 1
            step = result.steps[-1]
            emit("step", {
                "step": step.model_dump(),
                "new_versions": {"v0": result.versions.get("v0", "")},
                "new_page_markdowns": {"v0": result.page_markdowns.get("v0", {})},
            })
        except Exception as e:
            logger.error("Heading level fix failed: %s", e, exc_info=True)
            emit("error", {"step_name": "heading_levels", "message": str(e)})

    # Step 3: Page content corrections (with programmatic hints)
    section_map = None
    if structure is not None:
        emit("processing", {"step_name": "page_content", "display_name": "Page Content Corrections"})
        try:
            section_map = await asyncio.to_thread(
                service._build_section_map, result, structure
            )
            page_hints = await asyncio.to_thread(
                service._generate_page_hints, result, structure, section_map
            )
            await service._step_page_content(result, structure, page_hints=page_hints)
            total_steps += 1
            step = result.steps[-1]
            emit("step", {
                "step": step.model_dump(),
                "new_versions": {"v1": result.versions.get("v1", "")},
                "new_page_markdowns": {"v1": result.page_markdowns.get("v1", {})},
            })
        except Exception as e:
            logger.error("Page content corrections failed: %s", e, exc_info=True)
            emit("error", {"step_name": "page_content", "message": str(e)})

    # Step 3b: Deterministic code block language tagging
    if structure is not None:
        emit("processing", {"step_name": "code_blocks", "display_name": "Code Block Languages"})
        try:
            await service._step_code_blocks(result, structure)
            total_steps += 1
            step = result.steps[-1]
            # Code blocks edit v1 (or v0) in-place — send updated version
            source_ver = "v1" if "v1" in result.page_markdowns else "v0"
            emit("step", {
                "step": step.model_dump(),
                "new_versions": {source_ver: result.versions.get(source_ver, "")},
                "new_page_markdowns": {source_ver: result.page_markdowns.get(source_ver, {})},
            })
        except Exception as e:
            logger.error("Code block language tagging failed: %s", e, exc_info=True)
            emit("error", {"step_name": "code_blocks", "message": str(e)})

    # Step 4: Cross-page fixes (boundaries + footnotes)
    if structure is not None:
        emit("processing", {"step_name": "boundaries", "display_name": "Cross-Page Fixes"})
        try:
            # Step 3 may have failed before building the map; build it now.
            if section_map is None:
                section_map = await asyncio.to_thread(
                    service._build_section_map, result, structure
                )
            await service._step_boundaries(result, structure, section_map=section_map)
            total_steps += 1
            step = result.steps[-1]
            emit("step", {
                "step": step.model_dump(),
                "new_versions": {"v2": result.versions.get("v2", "")},
                "new_page_markdowns": {},
            })
        except Exception as e:
            logger.error("Cross-page fixes failed: %s", e, exc_info=True)
            emit("error", {"step_name": "boundaries", "message": str(e)})

    # Step 5: Cleanup (only runs when a "v2" version exists)
    if "v2" in result.versions:
        emit("processing", {"step_name": "cleanup", "display_name": "Final Cleanup"})
        try:
            await service._step_cleanup(result)
            total_steps += 1

            step = result.steps[-1]
            emit("step", {
                "step": step.model_dump(),
                "new_versions": {"v3": result.versions.get("v3", "")},
                "new_page_markdowns": {},
            })
        except Exception as e:
            logger.error("Cleanup failed: %s", e, exc_info=True)
            emit("error", {"step_name": "cleanup", "message": str(e)})

    # Measured before awaiting the upload so upload latency is not counted.
    total_elapsed_ms = int((time.monotonic() - pipeline_start) * 1000)

    # Await document upload (should be done by now — ran concurrently)
    document_ref: str | None = None
    try:
        document_ref = await doc_upload_task
    except Exception:
        logger.warning("Document upload task failed", exc_info=True)

    # Finalize session with results for reconnect
    session.result = result
    session.status = "completed"

    emit("done", {
        "total_steps": total_steps,
        "total_elapsed_ms": total_elapsed_ms,
        "total_input_tokens": sum(s.input_tokens for s in result.steps),
        "total_output_tokens": sum(s.output_tokens for s in result.steps),
        "total_cost_cents": sum(s.cost_cents for s in result.steps),
        "session_id": session.session_id,
        "document_ref": document_ref,
    })


# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------

@router.post("/process")
async def process_pdf(
    file: UploadFile = File(...),
    images_scale: float = Form(default=2.0),
    do_table_structure: bool = Form(default=True),
    enable_structure: bool = Form(default=False),
    enable_page_content: bool = Form(default=False),
    enable_boundaries: bool = Form(default=False),
    ocr_languages: str = Form(
        default="eng",
        description="Comma-separated Tesseract OCR language codes (e.g. 'eng,deu')",
    ),
) -> dict[str, Any]:
    """Process a PDF through the versioned pipeline viewer.

    Synchronous dev endpoint: upload PDF, wait for extraction, get JSON back.
    Returns versioned markdown snapshots for each pipeline step.

    Args:
        file: Uploaded file; its name must end in ``.pdf`` (case-insensitive).
        images_scale: Image scale, clamped to the range 1.0–3.0.
        do_table_structure: Forwarded to the pipeline service.
        enable_structure: Forwarded to the pipeline service.
        enable_page_content: Forwarded to the pipeline service.
        enable_boundaries: Forwarded to the pipeline service.
        ocr_languages: Comma-separated OCR language codes.  Blank entries are
            ignored; if nothing remains, ``"eng"`` is used.

    Returns:
        The pipeline result dumped to a plain dict.

    Raises:
        HTTPException: 400 when the file is not named ``*.pdf`` or is empty.
    """
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Empty file")

    images_scale = max(1.0, min(3.0, images_scale))

    # Drop blank codes produced by inputs such as "eng," or "eng, ,deu" so an
    # empty string is never handed to OCR; fall back to the form default when
    # nothing usable was supplied.
    stripped_codes = (code.strip() for code in ocr_languages.split(","))
    ocr_lang_list = [code for code in stripped_codes if code] or ["eng"]

    logger.info(
        "Pipeline Viewer: processing %s (%d bytes)", file.filename, len(content),
    )

    service = PipelineViewerService()
    result = await service.process(
        file_content=content,
        filename=file.filename,
        images_scale=images_scale,
        do_table_structure=do_table_structure,
        enable_structure=enable_structure,
        enable_page_content=enable_page_content,
        enable_boundaries=enable_boundaries,
        ocr_languages=ocr_lang_list,
    )

    return result.model_dump()


@router.post("/process/stream")
async def process_pdf_stream(
    request: Request,
    file: UploadFile = File(...),
    images_scale: float = Form(default=2.0),
    do_table_structure: bool = Form(default=True),
    skip_pii_scan: bool = Form(
        default=False,
        description="Skip the pre-extraction PII scan (faster, but no gate).",
    ),
    ocr_languages: str = Form(
        default="eng",
        description="Comma-separated Tesseract OCR language codes (e.g. 'eng,deu')",
    ),
) -> StreamingResponse:
    """Stream pipeline processing results via SSE.

    The pipeline runs in a background task.  This endpoint streams events
    from the session buffer as they are produced.  If the client disconnects,
    the pipeline continues running.  Reconnect via the session stream endpoint.

    SSE event types:
        session — Session ID for reconnection (plus user info when an
            authenticated identity is present on the request).
        pii_scan — Result of the pre-extraction PII scan, or a ``skipped``
            marker when ``skip_pii_scan`` is set.
        pii_decision — The decision applied after a PII review (or timeout).
        status — Informational notice, e.g. GPU service cold start.
        init — After Docling extraction. Metadata + markdown (no binary images).
        page_image — Individual page image (one per page, streamed after init).
        figure_image — Individual figure image (one per figure, streamed after init).
        processing — Before each step starts.
        step — After each subsequent step completes (incremental data).
        error — If a step fails (non-fatal).
        done — Stream complete.

    Raises:
        HTTPException: 400 when the file is not named ``*.pdf`` or is empty.
    """

    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Empty file")

    images_scale = max(1.0, min(3.0, images_scale))
    filename = file.filename

    # Drop blank codes produced by inputs such as "eng," or "eng, ,deu" so an
    # empty string is never handed to OCR; fall back to the form default when
    # nothing usable was supplied.
    stripped_codes = (code.strip() for code in ocr_languages.split(","))
    ocr_lang_list = [code for code in stripped_codes if code] or ["eng"]

    logger.info(
        "Pipeline Viewer (stream): processing %s (%d bytes)", filename, len(content),
    )

    # Fire-and-forget document upload to feedback service (zero added latency)
    # NOTE(review): this upload starts before the PII scan / decision gate
    # runs in the pipeline task, so the document is sent to the feedback
    # service even if the user later denies processing — confirm that is
    # intended.
    doc_upload_task = asyncio.create_task(
        feedback_client.upload_document(content, filename)
    )

    # Create session and emit session event (id=0) before spawning task.
    # Stamp identity if SessionAuthMiddleware populated it — both fields stay
    # None when AUTH_MODE=none, preserving today's anonymous session shape.
    # isinstance-check to avoid a MagicMock attribute satisfying the truthy
    # path in mock-based tests.
    from ..auth.base import Identity as _AuthIdentity

    raw_identity = getattr(request.state, "identity", None)
    identity = raw_identity if isinstance(raw_identity, _AuthIdentity) else None
    session = session_store.create_for_stream(
        filename,
        identity_sub=identity.sub if identity is not None else None,
        provider_id=identity.provider_id if identity is not None else None,
    )
    session_event_payload: dict[str, Any] = {"session_id": session.session_id}
    if identity is not None:
        session_event_payload["user"] = {
            "sub": identity.sub,
            "name": identity.name,
            "email": identity.email,
            "provider_id": identity.provider_id,
        }
    session.event_buffer.append(
        _sse_event("session", session_event_payload, event_id=0)
    )
    # The pipeline's own events continue numbering from 1.
    session.event_counter = 1
    session.new_event.set()

    # Spawn pipeline in background — runs independently of SSE connection.
    # The task reference is kept on the session.
    session.pipeline_task = asyncio.create_task(
        _run_pipeline(
            session, content, filename, images_scale,
            do_table_structure, ocr_lang_list, doc_upload_task,
            skip_pii_scan=skip_pii_scan,
        )
    )

    return StreamingResponse(
        _buffer_reader(session, cursor=0),
        media_type="text/event-stream",
        headers=_SSE_HEADERS,
    )


class PIIDecisionInput(BaseModel):
    """Request body accepted by the PII decision endpoint."""

    # The reviewer's verdict; only these two literal values are permitted.
    decision: Literal["approved", "denied"]


@router.post("/sessions/{session_id}/pii-decision")
async def submit_pii_decision(session_id: str, payload: PIIDecisionInput) -> dict[str, str]:
    """Record the user's PII review decision and release the pipeline.

    The pipeline task is awaiting ``session.pii_decision_event``; setting
    the decision here unblocks it. Approved → docling + agents continue.
    Denied → the pipeline emits a done event with ``pii_denied`` and stops.

    Args:
        session_id: Identifier of the session to decide on.
        payload: Body carrying the ``"approved"`` / ``"denied"`` decision.

    Returns:
        ``{"status": "accepted", ...}`` when the decision was recorded, or
        ``{"status": "already_decided", ...}`` with the stored decision when
        one had been submitted before.

    Raises:
        HTTPException: 404 when the session is unknown or expired; 409 when
            the pipeline has already ended without a decision being recorded
            (for example after the decision wait timed out).
    """
    session = session_store.get(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found or expired")

    # The first decision wins; repeats just report what was stored.
    if session.pii_decision_event.is_set():
        return {"status": "already_decided", "decision": session.pii_decision or ""}

    # Nothing is waiting on the event once the pipeline has ended.  Recording
    # a decision now would report "accepted" for a run that was already
    # closed out, and would leave a decision on the session that was never
    # actually applied.
    if session.status in ("completed", "error"):
        raise HTTPException(
            status_code=409,
            detail="Session is no longer awaiting a PII decision",
        )

    # Store the decision before setting the event so the woken pipeline task
    # reads the final value.
    session.pii_decision = payload.decision
    session.pii_decision_event.set()
    return {"status": "accepted", "decision": payload.decision}


@router.get("/sessions/{session_id}/stream")
async def reconnect_stream(
    session_id: str,
    last_event_id: int = Query(default=-1),
) -> StreamingResponse:
    """Reconnect to a processing session and replay buffered SSE events.

    The client sends the last event ID it received.  This endpoint replays
    all events after that ID.  If the session is still processing, it waits
    for new events with heartbeats until completion.

    Args:
        session_id: Identifier of the session to reattach to.
        last_event_id: ID of the last event the client received.  ``-1``
            (the default) or any smaller value replays from the first event.

    Raises:
        HTTPException: 404 when the session is unknown or expired.
    """
    session = session_store.get(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found or expired")

    # Event ids equal buffer indices, so the next event sits at id + 1.
    # Clamp at 0: a value below -1 would yield a negative index, which reads
    # from the end of the buffer and replays tail events out of order.
    cursor = max(last_event_id + 1, 0)

    return StreamingResponse(
        _buffer_reader(session, cursor=cursor),
        media_type="text/event-stream",
        headers=_SSE_HEADERS,
    )