📦 EqualifyEverything / equalify-reflow

📄 minimal_pipeline.py · 97 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""Dev-only minimal pipeline endpoint for incremental PDF processing.

SECURITY: Only available when ENVIRONMENT=dev
"""

import logging
from typing import Any

from fastapi import APIRouter, File, Form, HTTPException, UploadFile

from ..config import settings
from ..services.minimal_pipeline import MinimalPipelineService

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Router holding the dev-only minimal-pipeline routes; every route declared on
# it is served under the /api/dev/minimal prefix.
router = APIRouter(prefix="/api/dev/minimal", tags=["Development - Minimal Pipeline"])


def _require_dev_mode() -> None:
    """Abort the request unless the app is running in the dev environment.

    Raises:
        HTTPException: 404 "Not found" whenever ``settings.environment`` is
            anything other than ``"dev"``, so the route answers as if it
            did not exist.
    """
    if settings.environment == "dev":
        return
    raise HTTPException(status_code=404, detail="Not found")


@router.post("/process")
async def process_pdf(
    file: UploadFile = File(...),
    images_scale: float = Form(default=2.0),
    do_table_structure: bool = Form(default=True),
) -> dict[str, Any]:
    """Run an uploaded PDF through the minimal (Docling-only) pipeline.

    The handler awaits the full extraction and returns everything in a single
    JSON response; this typically takes 10-30s depending on document size.

    Args:
        file: Uploaded PDF. Its filename must end in ``.pdf``
            (case-insensitive).
        images_scale: Scale factor for page images. Values outside 1.0-3.0
            are clamped into that range rather than rejected.
        do_table_structure: Run Docling table structure recognition.

    Returns:
        A dict with ``filename``, ``total_pages``, ``pages`` (page number,
        markdown, base64 image), ``figures`` (ref id, caption, page number,
        base64 image), ``full_markdown``, ``steps_run`` (name, description,
        elapsed milliseconds) and ``stats``.

    Raises:
        HTTPException: 404 outside the dev environment; 400 when the filename
            is missing or lacks a ``.pdf`` extension, or the upload is empty.
    """
    _require_dev_mode()

    # Reject uploads with no name or a non-.pdf extension before reading them.
    upload_name = file.filename
    if not (upload_name and upload_name.lower().endswith(".pdf")):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Empty file")

    # Pull an out-of-range scale back into [1.0, 3.0]: cap the top, then the bottom.
    capped_scale = min(3.0, images_scale)
    images_scale = max(1.0, capped_scale)

    logger.info(f"Minimal pipeline: processing {file.filename} ({len(content)} bytes)")

    pipeline = MinimalPipelineService()
    result = await pipeline.process(
        file_content=content,
        filename=file.filename,
        images_scale=images_scale,
        do_table_structure=do_table_structure,
    )

    # Assemble the payload key by key, in the order the response exposes them.
    payload: dict[str, Any] = {
        "filename": result.filename,
        "total_pages": result.total_pages,
    }
    payload["pages"] = [
        {
            "page_number": page.page_number,
            "markdown": page.markdown,
            "image_base64": page.image_base64,
        }
        for page in result.pages
    ]
    payload["figures"] = [
        {
            "ref_id": figure.ref_id,
            "caption": figure.caption,
            "page_number": figure.page_number,
            "image_base64": figure.image_base64,
        }
        for figure in result.figures
    ]
    payload["full_markdown"] = result.full_markdown
    payload["steps_run"] = [
        {
            "name": step.name,
            "description": step.description,
            "elapsed_ms": step.elapsed_ms,
        }
        for step in result.steps_run
    ]
    payload["stats"] = result.stats
    return payload