📦 EqualifyEverything / equalify-reflow

📄 debug_bundle.py · 159 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""Debug bundle models for pipeline observability.

These models support the debug bundle feature which allows developers
to download a complete artifact package for any job, containing:
- Original PDF and page images
- All agent prompts and responses
- Intermediate outputs at each phase
- Final results and observations

Usage:
    Submit a job with generate_debug_bundle=true, then download via:
    GET /api/documents/{job_id}/debug-bundle
"""

from __future__ import annotations

from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any

from pydantic import BaseModel, Field

# No type-checking-only imports are currently declared; this guard is an
# empty placeholder.
if TYPE_CHECKING:
    pass


class DebugImageReference(BaseModel):
    """Pointer to an image file stored separately inside the debug bundle.

    Defined as a self-contained model so that this module does not need to
    import from agents.models (which would create a circular import). It
    mirrors ImageReference and is used specifically for serialization.

    Attributes:
        ref_type: Kind of image, e.g. "page" for a full page or "element"
            for a cropped element. Held as a plain string; no fixed set of
            values is enforced by this model.
        identifier: Unique identifier for the image (e.g. "page_1",
            "fig:1", "table:2").
        path: Location of the image relative to the bundle root
            (e.g. "images/page_001.png").
        media_type: MIME type of the image; defaults to "image/png".
        size_bytes: Original size of the image data in bytes; defaults to 0.
    """

    # Required fields: the ellipsis marks them as having no default.
    ref_type: str = Field(
        ..., description="Type of image: 'page' for full page, 'element' for cropped element"
    )
    identifier: str = Field(
        ..., description="Unique identifier (e.g., 'page_1', 'fig:1', 'table:2')"
    )
    path: str = Field(
        ..., description="Relative path in debug bundle (e.g., 'images/page_001.png')"
    )

    # Optional fields: the first positional argument to Field is the default.
    media_type: str = Field("image/png", description="MIME type of the image")
    size_bytes: int = Field(0, description="Original size of the image data in bytes")


class DebugArtifact(BaseModel):
    """Single artifact from an agent execution.

    Captures the complete input/output of one agent call for debugging.

    Attributes:
        agent_name: Name of the agent (e.g., "layout", "extraction", "figures")
        phase: Pipeline phase ("analyze", "extract", "refine", "assemble")
        timestamp: When this agent was executed. Defaults to the current
            time as a naive datetime expressed in UTC.
        input_summary: JSON description of input (manifest, page refs, etc.)
        prompt: Full rendered prompt sent to LLM
        response_raw: Raw LLM response text
        output_parsed: Parsed/validated output as JSON string
        metadata: Execution metadata (tokens, cost, duration, model, etc.)
        image_references: References to images stored separately in bundle
    """

    agent_name: str = Field(..., description="Agent identifier")
    phase: str = Field(..., description="Pipeline phase")
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and strip tzinfo so the default stays a naive UTC datetime,
    # exactly as before (serialized timestamps keep their offset-free shape).
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    input_summary: str = Field(default="", description="JSON description of input")
    prompt: str = Field(default="", description="Full prompt sent to LLM")
    response_raw: str = Field(default="", description="Raw LLM response")
    output_parsed: str = Field(default="", description="Parsed output as JSON")
    # default_factory gives every instance its own dict / list.
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Execution metadata (tokens, cost, model, duration)"
    )
    image_references: list[DebugImageReference] = Field(
        default_factory=list,
        description="References to images stored separately in debug bundle",
    )


class DebugPhaseSummary(BaseModel):
    """Roll-up of what happened during one pipeline phase.

    Only ``phase`` is required; every other field carries a default.

    Attributes:
        phase: Phase name.
        started_at: When the phase started (defaults to None).
        completed_at: When the phase completed (defaults to None).
        duration_seconds: Total phase duration (defaults to 0.0).
        agents_run: Names of the agents executed in this phase
            (defaults to an empty list).
        total_tokens: Combined token usage (defaults to 0).
        total_cost_cents: Combined cost (defaults to 0.0).
        success: Whether the phase completed successfully
            (defaults to True).
        error: Error message if the phase failed (defaults to None).
    """

    phase: str

    # Timing
    started_at: datetime | None = Field(default=None)
    completed_at: datetime | None = Field(default=None)
    duration_seconds: float = Field(default=0.0)

    # Agents involved and their aggregate usage
    agents_run: list[str] = Field(default_factory=list)
    total_tokens: int = Field(default=0)
    total_cost_cents: float = Field(default=0.0)

    # Outcome
    success: bool = Field(default=True)
    error: str | None = Field(default=None)


class DebugBundleManifest(BaseModel):
    """Manifest for a complete debug bundle.

    This is the top-level metadata file included in the debug bundle zip.
    Only ``job_id`` is required; all other fields have defaults.

    Attributes:
        job_id: Job identifier
        created_at: When bundle was generated. Defaults to the current time
            as a naive datetime expressed in UTC.
        document_name: Original filename
        total_pages: Number of pages in document
        status: Final job status (defaults to "unknown")
        total_duration_seconds: End-to-end processing time
        total_tokens: Combined token usage across all agents
        total_cost_cents: Combined cost across all agents
        phases: Summary of each pipeline phase
        agents_executed: Total count of agent executions
        artifacts_count: Number of artifact files in bundle
    """

    job_id: str
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and strip tzinfo so the default stays a naive UTC datetime,
    # exactly as before (serialized timestamps keep their offset-free shape).
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    document_name: str = ""
    total_pages: int = 0
    status: str = "unknown"
    total_duration_seconds: float = 0.0
    total_tokens: int = 0
    total_cost_cents: float = 0.0
    # default_factory gives every manifest its own list.
    phases: list[DebugPhaseSummary] = Field(default_factory=list)
    agents_executed: int = 0
    artifacts_count: int = 0


# Public API of this module: the names exported by a star-import.
__all__ = [
    "DebugArtifact",
    "DebugImageReference",
    "DebugPhaseSummary",
    "DebugBundleManifest",
]