📦 EqualifyEverything / equalify-reflow

📄 pii.py · 105 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""PII detection models for Presidio integration."""


from pydantic import BaseModel, ConfigDict, Field, model_validator


class PIIFinding(BaseModel):
    """Represents a single PII entity detected in document text.

    Used by Microsoft Presidio analyzer to flag sensitive information
    that requires manual review before processing.

    Attributes:
        entity_type: Type of PII detected (e.g., "PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER")
        start: Character position where entity starts in text (non-negative)
        end: Character position where entity ends in text (non-negative,
            never less than ``start``). The examples here use
            ``end - start == len(text)``, i.e. an exclusive end offset --
            presumably matching Presidio's ``text[start:end]`` convention;
            confirm against the producer of these findings.
        score: Confidence score from 0.0 to 1.0
        text: The actual detected text snippet (non-empty)

    Raises:
        pydantic.ValidationError: If any field constraint is violated, or if
            ``end`` is less than ``start``.

    Example:
        >>> finding = PIIFinding(
        ...     entity_type="EMAIL_ADDRESS",
        ...     start=45,
        ...     end=64,
        ...     score=0.95,
        ...     text="student@example.com"
        ... )
    """
    entity_type: str = Field(
        ...,
        description="PII entity type from Presidio",
        min_length=1
    )
    start: int = Field(
        ...,
        ge=0,
        description="Start character position"
    )
    end: int = Field(
        ...,
        ge=0,
        description="End character position"
    )
    score: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Detection confidence score"
    )
    text: str = Field(
        ...,
        min_length=1,
        description="Detected text snippet"
    )

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "entity_type": "PERSON",
                "start": 120,
                "end": 132,
                "score": 0.85,
                "text": "John Student"
            }
        }
    )

    @model_validator(mode="after")
    def check_span_order(self) -> "PIIFinding":
        """Reject spans whose end offset precedes their start offset.

        Only the ordering is checked. The span length is deliberately not
        compared with ``len(text)``, since nothing in this model guarantees
        that ``text`` is the untruncated slice.
        """
        if self.end < self.start:
            raise ValueError(
                f"end ({self.end}) must be greater than or equal to "
                f"start ({self.start})"
            )
        return self


class PIIResult(BaseModel):
    """Outcome of one PII scan over a document.

    Bundles the identifier of the scan job with the findings that scan
    produced and a count of those findings.

    Attributes:
        job_id: Job identifier. Must be a UUID-shaped string written with
            lowercase hexadecimal digits (8-4-4-4-12 groups).
        findings: Detected PII entities; defaults to an empty list.
        total_findings: Non-negative count of findings. Required, and not
            cross-checked against ``len(findings)`` by this model.
    """

    job_id: str = Field(
        ...,
        description="UUID format job identifier",
        pattern=r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
    )
    findings: list[PIIFinding] = Field(
        description="List of PII entities detected",
        default_factory=list,
    )
    total_findings: int = Field(
        ...,
        description="Total count of findings",
        ge=0,
    )

    # Sample payload surfaced in the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "job_id": "550e8400-e29b-41d4-a716-446655440000",
                "findings": [],
                "total_findings": 0,
            },
        },
    )