📦 EqualifyEverything / equalify-reflow

📄 test_invalid_pdfs.py · 323 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
"""Edge case tests for invalid and malformed PDF handling."""

from io import BytesIO

import pytest
from fastapi import HTTPException, UploadFile
from src.config import settings
from src.services.storage_service import StorageService

from tests.e2e.edge_cases.helpers import (
    create_corrupted_pdf,
    create_empty_pdf,
    create_encrypted_pdf,
    create_non_pdf_file,
    create_truncated_pdf,
)


@pytest.fixture
def mock_s3_client(mocker):
    """Provide a ``MagicMock`` that stands in for the S3 client."""
    return mocker.MagicMock()


@pytest.fixture
def storage_service(mock_s3_client):
    """Build a ``StorageService`` around the mocked S3 client.

    Bucket names are taken from the project ``settings`` object.
    """
    service_kwargs = {
        "s3_client": mock_s3_client,
        "temp_bucket": settings.s3_temp_bucket,
        "results_bucket": settings.s3_results_bucket,
    }
    return StorageService(**service_kwargs)


class TestInvalidPdfHandling:
    """Tests for handling various invalid PDF formats.

    Every test wraps raw bytes in a mocked ``UploadFile`` and hands it to
    ``storage_service.store_document``. Several tests deliberately accept
    either outcome (stored, or rejected with an ``HTTPException``) because
    they document current behavior rather than enforce content validation;
    tighten them once validation is implemented.
    """

    # Minimal PDF-looking payload shared by the filename-oriented tests.
    _SAMPLE_PDF = b"%PDF-1.4\n" + b"%Test content line\n" * 10 + b"%%EOF"

    @staticmethod
    def _make_upload(mocker, filename, content, content_type="application/pdf"):
        """Return a mocked ``UploadFile`` backed by an in-memory buffer.

        Args:
            mocker: The pytest-mock ``mocker`` fixture.
            filename: Value exposed as ``upload_file.filename``.
            content: Raw bytes exposed through ``upload_file.file``.
            content_type: Value exposed as ``upload_file.content_type``.
        """
        upload_file = mocker.Mock(spec=UploadFile)
        upload_file.filename = filename
        upload_file.file = BytesIO(content)
        upload_file.content_type = content_type
        return upload_file

    @staticmethod
    def _install_s3_mock(storage_service, mocker):
        """Replace ``storage_service.s3_client`` with a fresh mock.

        The mock's ``upload_fileobj`` returns ``None``. The new client is
        returned so callers can inspect it if needed.
        """
        s3_client = mocker.MagicMock()
        s3_client.upload_fileobj.return_value = None
        storage_service.s3_client = s3_client
        return s3_client

    @pytest.mark.asyncio
    async def test_corrupted_pdf_rejected(self, storage_service, mocker):
        """Test that a corrupted PDF is stored or rejected with 400/415."""
        upload_file = self._make_upload(
            mocker, "corrupted.pdf", create_corrupted_pdf()
        )
        # Mock S3 upload (to test if validation happens before upload)
        self._install_s3_mock(storage_service, mocker)

        # Note: Current implementation may accept any file with .pdf extension
        # This test validates the behavior - update expectation if validation is added
        try:
            job_id, _ = await storage_service.store_document(upload_file)
            # If accepted, verify it was uploaded
            assert job_id is not None
        except HTTPException as exc:
            # If rejected, verify proper error code
            assert exc.status_code in [400, 415]

    @pytest.mark.asyncio
    async def test_encrypted_pdf_handling(self, storage_service, mocker):
        """Test handling of encrypted/password-protected PDF."""
        upload_file = self._make_upload(
            mocker, "encrypted.pdf", create_encrypted_pdf()
        )
        self._install_s3_mock(storage_service, mocker)

        # Encrypted PDFs may be accepted but fail during processing
        try:
            job_id, _ = await storage_service.store_document(upload_file)
            # If accepted, it should fail later during processing
            assert job_id is not None
        except HTTPException as exc:
            # If rejected early, the error must name the cause
            assert exc.status_code in [400, 415]
            detail = exc.detail.lower()
            assert "encrypted" in detail or "password" in detail

    @pytest.mark.asyncio
    async def test_empty_file_rejected(self, storage_service, mocker):
        """Test that empty (0 byte) file is rejected."""
        upload_file = self._make_upload(mocker, "empty.pdf", create_empty_pdf())

        # Empty file should be rejected; no S3 mock is installed here, the
        # fixture's own mocked client stays in place.
        with pytest.raises(HTTPException) as exc:
            await storage_service.store_document(upload_file)

        assert exc.value.status_code in [400, 413]

    @pytest.mark.asyncio
    async def test_truncated_pdf_handling(self, storage_service, mocker):
        """Test handling of truncated/incomplete PDF."""
        upload_file = self._make_upload(
            mocker, "truncated.pdf", create_truncated_pdf()
        )
        self._install_s3_mock(storage_service, mocker)

        # Truncated PDF might be accepted but fail during processing
        try:
            job_id, _ = await storage_service.store_document(upload_file)
            assert job_id is not None
        except HTTPException:
            # Deliberate: rejection with any HTTP status is also acceptable
            pass

    @pytest.mark.asyncio
    async def test_non_pdf_file_rejected(self, storage_service, mocker):
        """Test a non-PDF payload with .pdf extension: stored or 400/415."""
        upload_file = self._make_upload(
            mocker, "notapdf.pdf", create_non_pdf_file()
        )
        self._install_s3_mock(storage_service, mocker)

        # Current implementation may accept based on extension/content-type
        # This test documents the behavior
        try:
            job_id, _ = await storage_service.store_document(upload_file)
            assert job_id is not None
        except HTTPException as exc:
            assert exc.status_code in [400, 415]

    @pytest.mark.asyncio
    async def test_wrong_content_type_rejected(self, storage_service, mocker):
        """Test that file with wrong content-type is rejected."""
        # PDF-looking payload, but declared as text/plain
        upload_file = self._make_upload(
            mocker,
            "test.pdf",
            b"%PDF-1.4\n%Test content\n%%EOF",
            content_type="text/plain",  # Wrong content type
        )

        # Should be rejected based on content-type
        with pytest.raises(HTTPException) as exc:
            await storage_service.store_document(upload_file)

        assert exc.value.status_code == 400
        assert "PDF files" in exc.value.detail

    @pytest.mark.asyncio
    async def test_image_file_with_pdf_extension(self, storage_service, mocker):
        """Test an image payload with .pdf extension: stored or 400/415."""
        # PNG file header
        png_content = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"
        upload_file = self._make_upload(mocker, "image.pdf", png_content)
        self._install_s3_mock(storage_service, mocker)

        # May be accepted based on extension but will fail in processing
        try:
            job_id, _ = await storage_service.store_document(upload_file)
            assert job_id is not None
        except HTTPException as exc:
            assert exc.status_code in [400, 415]

    @pytest.mark.asyncio
    async def test_pdf_without_extension(self, storage_service, mocker):
        """Test valid PDF without .pdf extension."""
        upload_file = self._make_upload(
            mocker, "document", self._SAMPLE_PDF  # No extension
        )
        self._install_s3_mock(storage_service, mocker)

        # Should be accepted if content-type is correct
        job_id, s3_key = await storage_service.store_document(upload_file)
        assert job_id is not None
        # Extension should be added
        assert s3_key.endswith(".pdf")

    @pytest.mark.asyncio
    async def test_pdf_with_null_bytes(self, storage_service, mocker):
        """Test PDF containing null bytes."""
        # Padded to 142 bytes; the original author cites a 100-byte minimum
        # (presumably enforced by StorageService - not visible in this file).
        pdf_content = (
            b"%PDF-1.4\n\x00\x00\x00Content with nulls\x00\x00"
            + b"padding" * 15
            + b"%%EOF"
        )
        upload_file = self._make_upload(mocker, "nullbytes.pdf", pdf_content)
        self._install_s3_mock(storage_service, mocker)

        # Should accept (null bytes are valid in binary PDF)
        job_id, _ = await storage_service.store_document(upload_file)
        assert job_id is not None

    @pytest.mark.asyncio
    async def test_pdf_with_special_characters_in_filename(self, storage_service, mocker):
        """Test PDF with special characters in filename."""
        upload_file = self._make_upload(
            mocker, "test file!@#$%^&*().pdf", self._SAMPLE_PDF
        )
        self._install_s3_mock(storage_service, mocker)

        # Should accept; only acceptance is asserted, not how the name is stored
        job_id, _ = await storage_service.store_document(upload_file)
        assert job_id is not None

    @pytest.mark.asyncio
    async def test_pdf_with_unicode_filename(self, storage_service, mocker):
        """Test PDF with unicode characters in filename."""
        upload_file = self._make_upload(
            mocker, "文档测试.pdf", self._SAMPLE_PDF  # Chinese characters
        )
        self._install_s3_mock(storage_service, mocker)

        # Should accept unicode filenames
        job_id, _ = await storage_service.store_document(upload_file)
        assert job_id is not None

    @pytest.mark.asyncio
    async def test_pdf_with_very_long_filename(self, storage_service, mocker):
        """Test PDF with extremely long filename."""
        # 300 "a" characters plus the ".pdf" suffix (304 characters total)
        long_name = "a" * 300 + ".pdf"
        upload_file = self._make_upload(mocker, long_name, self._SAMPLE_PDF)
        self._install_s3_mock(storage_service, mocker)

        # Should handle long filenames (may truncate)
        try:
            job_id, _ = await storage_service.store_document(upload_file)
            assert job_id is not None
        except HTTPException as exc:
            # May reject if filename is too long
            assert exc.status_code == 400

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        ("idx", "version"),
        [
            (0, b"1.0"),  # Very old
            (1, b"1.4"),  # Common
            (2, b"1.7"),  # Modern
            (3, b"2.0"),  # Latest
        ],
        ids=["pdf-1.0", "pdf-1.4", "pdf-1.7", "pdf-2.0"],
    )
    async def test_multiple_pdf_versions(self, storage_service, mocker, idx, version):
        """Test that each PDF header version is accepted.

        Parametrized so every version is reported separately instead of the
        first failure hiding the remaining versions.
        """
        pdf_content = b"%PDF-" + version + b"\n" + b"%Test line\n" * 10 + b"%%EOF"
        upload_file = self._make_upload(mocker, f"version_{idx}.pdf", pdf_content)
        self._install_s3_mock(storage_service, mocker)

        # All versions should be accepted
        job_id, _ = await storage_service.store_document(upload_file)
        assert job_id is not None