📦 EqualifyEverything / equalify-reflow

📄 pii_service.py · 287 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
"""PII detection service orchestration."""

import asyncio
import logging
from datetime import UTC, datetime, timedelta
from typing import TYPE_CHECKING

from ..shared.constants.queues import APPROVAL_QUEUE
from ..shared.constants.statuses import STATUS_AWAITING_APPROVAL, STATUS_FAILED, STATUS_PII_SCANNING, STATUS_PROCESSING
from ..shared.models.pii import PIIFinding
from ..shared.models.queue import ApprovalQueuePayload, PIIQueuePayload
from ..utils.retry_helpers import retry_with_backoff
from ..utils.token_generator import generate_secure_token
from .job_service import JobService
from .pdf_extractor import PDFExtractionError, extract_pdf_text
from .pii_analyzer import get_pii_analyzer
from .queue_service import QueueService
from .storage_service import StorageService

if TYPE_CHECKING:
    from .s3_url_service import S3URLService

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Configuration
# Hours added to "now" to compute an approval's expiry; also passed as the
# TTL (in hours) of the approval-token-to-job mapping.
APPROVAL_TIMEOUT_HOURS = 4
# Number of retries for PDF text extraction; the extraction call passes
# MAX_RETRY_ATTEMPTS + 1 as its max_attempts.
MAX_RETRY_ATTEMPTS = 1


class PIIDetectionService:
    """Orchestrates PII detection workflow.

    Coordinates PDF download, text extraction, PII scanning,
    and routing to approval or direct processing.
    """

    def __init__(
        self,
        storage_service: StorageService,
        queue_service: QueueService,
        job_service: JobService,
        s3_url_service: "S3URLService | None" = None,
    ):
        """Initialize PII detection service.

        Args:
            storage_service: S3 storage operations
            queue_service: Redis queue operations
            job_service: Job status management
            s3_url_service: S3 URL generation service (required for processing trigger)
        """
        self.storage = storage_service
        self.queue = queue_service
        self.jobs = job_service
        self.s3_url_service = s3_url_service
        self.pii_analyzer = get_pii_analyzer()
        # Strong references to in-flight background processing tasks. The event
        # loop only keeps weak references to tasks, so a task that nothing else
        # references may be garbage-collected before it finishes.
        self._background_tasks: set[asyncio.Task[object]] = set()

    async def process_pii_job(self, job: PIIQueuePayload, retry_count: int = 0) -> None:
        """Process a single PII detection job.

        Main orchestration method that:
        1. Downloads PDF from S3
        2. Extracts text (with retry on transient errors)
        3. Runs PII analysis
        4. Routes based on findings (with retry on transient errors)

        Status updates, text extraction and routing are wrapped with
        retry_with_backoff. Any failure is recorded on the job as
        STATUS_FAILED rather than re-raised.

        Args:
            job: PII queue payload with job details
            retry_count: Current retry attempt (0-indexed, deprecated - kept for compatibility)

        Raises:
            Exception: If recording the FAILED status itself fails after retries
        """
        logger.info(f"Processing PII job {job.job_id}")

        try:
            # Update status to scanning (with retry for Redis failures)
            await retry_with_backoff(
                lambda: self.jobs.update_job_status(job.job_id, STATUS_PII_SCANNING),
                max_attempts=3,
                operation_name=f"Update job {job.job_id} status to PII_SCANNING"
            )

            # Step 1: Download PDF from S3 (StorageService handles retries internally)
            pdf_content = await self.storage.download_temp_file(job.s3_key)
            logger.info(f"Downloaded PDF for job {job.job_id}: {len(pdf_content)} bytes")

            # Step 2: Extract text content (with retry on transient extraction errors)
            text_content: str = await retry_with_backoff(
                lambda: extract_pdf_text(pdf_content),
                max_attempts=MAX_RETRY_ATTEMPTS + 1,  # Maintain existing retry count for PDF extraction
                operation_name=f"Extract text from PDF for job {job.job_id}"
            )
            logger.info(f"Extracted {len(text_content)} characters from job {job.job_id}")

            # Step 3: Run PII analysis (synchronous, no network calls - no retry needed)
            pii_findings = self.pii_analyzer.analyze_text(text_content)
            logger.info(f"Found {len(pii_findings)} PII entities in job {job.job_id}")

            # Step 4: Route based on findings (with retry for queue/status operations)
            if pii_findings:
                await self._queue_for_approval_with_retry(job, pii_findings)
            else:
                await self._queue_for_processing_with_retry(job)

        except PDFExtractionError as e:
            # PDF extraction failed after retries - permanent failure
            logger.error(f"PDF extraction failed for job {job.job_id} after retries: {e}")
            await self._mark_failed(job, f"PDF extraction failed: {str(e)}")

        except Exception as e:
            # Unexpected error (after any retries)
            logger.error(f"PII processing failed for job {job.job_id}: {e}", exc_info=True)
            await self._mark_failed(job, f"PII scan error: {str(e)}")

    async def _mark_failed(self, job: PIIQueuePayload, error_msg: str) -> None:
        """Record STATUS_FAILED on the job, retrying the status update.

        Args:
            job: PII queue payload identifying the job
            error_msg: Error text stored alongside the failed status
        """
        await retry_with_backoff(
            lambda: self.jobs.update_job_status(
                job.job_id,
                STATUS_FAILED,
                error=error_msg
            ),
            max_attempts=3,
            operation_name=f"Update job {job.job_id} to FAILED"
        )

    async def _queue_for_approval(
        self,
        job: PIIQueuePayload,
        findings: list[PIIFinding],
        *,
        approval_token: str | None = None,
        expires_at: datetime | None = None,
    ) -> None:
        """Queue job for manual approval with PII details.

        Performs, in order: enqueue on APPROVAL_QUEUE, job status update to
        STATUS_AWAITING_APPROVAL, timeout tracking registration, and storage
        of the token-to-job mapping.

        NOTE: This method does not include retry logic internally.
        Use _queue_for_approval_with_retry() for automatic retries on transient failures.

        Args:
            job: Original PII queue payload
            findings: Detected PII entities
            approval_token: Pre-generated approval token. When omitted a new
                secure token is generated. Callers that retry this method
                should pass the same token on every attempt so that all
                stored records agree.
            expires_at: Pre-computed approval expiry. When omitted it is
                now (UTC) + APPROVAL_TIMEOUT_HOURS.
        """
        logger.info(f"Queueing job {job.job_id} for approval with {len(findings)} PII findings")

        # Generate secure approval token and expiry unless the caller supplied them
        if approval_token is None:
            approval_token = generate_secure_token()
        if expires_at is None:
            expires_at = datetime.now(UTC) + timedelta(hours=APPROVAL_TIMEOUT_HOURS)

        # Create approval queue payload
        approval_payload = ApprovalQueuePayload(
            job_id=job.job_id,
            s3_key=job.s3_key,
            pii_findings=findings,
            approval_token=approval_token,
            expires_at=expires_at
        )

        # Push to approval queue
        await self.queue.enqueue(APPROVAL_QUEUE, approval_payload)

        # Update job status with PII findings
        await self.jobs.update_job_status(
            job.job_id,
            STATUS_AWAITING_APPROVAL,
            pii_findings=[f.model_dump() for f in findings],
            approval_token=approval_token,
            approval_expires_at=expires_at.isoformat()
        )

        # Add to timeout tracking so timeout worker can find it
        await self.queue.add_to_timeout_tracking(job.job_id, expires_at)

        # Store token-to-job mapping for O(1) lookup (expires with approval)
        await self.jobs.store_approval_token_mapping(
            approval_token,
            job.job_id,
            ttl_hours=APPROVAL_TIMEOUT_HOURS
        )

        logger.info(f"Job {job.job_id} queued for approval, token: {approval_token[:8]}...")

    async def _queue_for_approval_with_retry(self, job: PIIQueuePayload, findings: list[PIIFinding]) -> None:
        """Queue job for approval with retry logic for transient failures.

        Wraps _queue_for_approval with retry_with_backoff. The approval token
        and expiry are generated once here and reused on every attempt, so a
        retry triggered by a failure after the enqueue step does not leave an
        approval payload in the queue whose token was never mapped to the job.

        NOTE: a retry still re-runs every step, so the approval queue may
        receive the same payload more than once.

        Args:
            job: Original PII queue payload
            findings: Detected PII entities
        """
        approval_token = generate_secure_token()
        expires_at = datetime.now(UTC) + timedelta(hours=APPROVAL_TIMEOUT_HOURS)

        await retry_with_backoff(
            lambda: self._queue_for_approval(
                job,
                findings,
                approval_token=approval_token,
                expires_at=expires_at,
            ),
            max_attempts=3,
            operation_name=f"Queue job {job.job_id} for approval"
        )

    async def _queue_for_processing(self, job: PIIQueuePayload) -> None:
        """Trigger document processing directly after PII scan passes.

        Instead of queuing to an orphan queue, this method triggers the
        DocumentProcessingService directly via asyncio.create_task, similar
        to how skip_pii_scan works in the API layer.

        The created task is held in self._background_tasks until it finishes
        so it cannot be garbage-collected mid-execution.

        NOTE: This method does not include retry logic internally.
        Use _queue_for_processing_with_retry() for automatic retries on transient failures.

        Args:
            job: Original PII queue payload
        """
        logger.info(f"Starting processing for job {job.job_id} (no PII detected)")

        # Update job status to processing
        await self.jobs.update_job_status(job.job_id, STATUS_PROCESSING)

        # Get job data for filename and review_mode
        job_data = await self.jobs.get_job(job.job_id)
        if not job_data:
            logger.error(f"Job {job.job_id} not found in Redis, cannot process")
            await self.jobs.update_job_status(
                job.job_id,
                STATUS_FAILED,
                error="Job metadata not found after PII scan"
            )
            return

        filename = job_data.get("original_filename", "document.pdf")
        review_mode = job_data.get("review_mode", "auto")
        max_rounds = int(job_data.get("max_rounds", "1"))
        ocr_languages_raw = job_data.get("ocr_languages", "")
        # Comma-separated string -> list; empty/missing value -> None
        ocr_languages = ocr_languages_raw.split(",") if ocr_languages_raw else None

        # Import here to avoid circular imports
        from .document_processing_service import DocumentProcessingService

        if self.s3_url_service is None:
            logger.error(f"S3URLService not configured for job {job.job_id}")
            await self.jobs.update_job_status(
                job.job_id,
                STATUS_FAILED,
                error="S3URLService not configured"
            )
            return

        # Create processing service
        processing_service = DocumentProcessingService(
            redis_client=self.queue.redis,  # Get redis client from queue service
            storage_service=self.storage,
            s3_url_service=self.s3_url_service,
        )

        # Trigger processing in background (non-blocking). Keep a strong
        # reference to the task; the done-callback drops it again and logs
        # any exception the task raised.
        task = asyncio.create_task(
            processing_service.process_document(
                job_id=job.job_id,
                s3_key=job.s3_key,
                filename=filename,
                review_mode=review_mode,
                max_rounds=max_rounds,
                ocr_languages=ocr_languages,
            ),
            name=f"process-document-{job.job_id}",
        )
        self._background_tasks.add(task)
        task.add_done_callback(self._on_background_task_done)

        logger.info(f"Job {job.job_id} processing started in background (max_rounds={max_rounds})")

    def _on_background_task_done(self, task: "asyncio.Task[object]") -> None:
        """Release a finished background task and log how it ended.

        Args:
            task: The completed (or cancelled) background processing task
        """
        self._background_tasks.discard(task)

        # Task.exception() raises CancelledError on a cancelled task, so check first
        if task.cancelled():
            logger.warning(f"Background task {task.get_name()} was cancelled")
            return

        exc = task.exception()
        if exc is not None:
            logger.error(
                f"Background task {task.get_name()} failed: {exc}",
                exc_info=exc,
            )

    async def _queue_for_processing_with_retry(self, job: PIIQueuePayload) -> None:
        """Start processing for a job with retry logic for transient failures.

        Wraps _queue_for_processing with retry_with_backoff, which covers:
        - the job status update to STATUS_PROCESSING
        - the job metadata lookup

        The background processing task is created as the last step of
        _queue_for_processing, after those calls have succeeded.

        Args:
            job: Original PII queue payload
        """
        await retry_with_backoff(
            lambda: self._queue_for_processing(job),
            max_attempts=3,
            operation_name=f"Queue job {job.job_id} for processing"
        )