📦 EqualifyEverything / equalify-reflow

📄 test_pii_false_positives.py · 292 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""Integration tests for PII false positive validation."""


import pytest
from src.services.pii_analyzer import ENTITY_TYPES, PIIAnalyzer


class TestPIIFalsePositiveReduction:
    """Integration tests verifying false positive reduction for course materials."""

    @pytest.fixture
    def pii_analyzer(self):
        """Build a genuine (unmocked) PIIAnalyzer for the integration tests."""
        # The threshold is passed explicitly rather than relying on a default;
        # 0.85 is noted as the value used by the production configuration.
        threshold = 0.85
        return PIIAnalyzer(confidence_threshold=threshold)

    @pytest.mark.integration
    def test_resume_false_positive_rate(self, pii_analyzer):
        """Test that Dylan Isaac resume has <10% false positive rate.

        Previously detected 25 entities with 52% false positive rate (13/25).
        After changes: Should detect only actual PII (email, phone if present).

        A finding counts as a false positive when its entity type is anything
        other than EMAIL_ADDRESS or PHONE_NUMBER, the only two kinds of real
        PII present in the sample text.
        """
        # Sample resume content similar to Dylan Isaac's resume
        # Note: Using realistic phone format (not 555) to match production confidence scores
        resume_text = """
        Dylan Isaac
        Software Engineer

        San Francisco Bay Area
        dylan.isaac@example.com
        (415) 867-5309

        EXPERIENCE

        Senior Software Engineer at TechCorp
        September 2019 - Present
        - Led development of microservices architecture
        - Collaborated with product managers on feature roadmap
        - Mentored junior engineers

        Software Engineer at StartupCo
        June 2017 - August 2019
        - Built RESTful APIs using Python and FastAPI
        - Implemented CI/CD pipelines
        - Worked with AWS services

        EDUCATION

        Bachelor of Science in Computer Science
        University of California, Berkeley
        Graduated 2017

        SKILLS

        Python, JavaScript, TypeScript, React, FastAPI, Docker, AWS
        """

        findings = pii_analyzer.analyze_text(resume_text)

        # Should only detect EMAIL_ADDRESS and PHONE_NUMBER (2 entities)
        # Previously detected 25 entities (13 false positives: 7 PERSON, 4 DATE_TIME, 2 LOCATION)
        assert len(findings) <= 2, f"Too many PII findings: {len(findings)}"

        # Verify only expected entity types
        allowed_types = {"EMAIL_ADDRESS", "PHONE_NUMBER"}
        unexpected = {f.entity_type for f in findings} - allowed_types
        assert not unexpected, f"Unexpected entity types: {unexpected}"

        # Verify false positive rate <10%.
        # The rate is derived from findings whose type is outside the allowed
        # set, not from the surplus over an expected count: the surplus can
        # never be positive once the count bound above has passed, which made
        # the previous form of this check impossible to fail.
        false_positives = [f for f in findings if f.entity_type not in allowed_types]
        false_positive_rate = len(false_positives) / len(findings) if findings else 0.0
        assert false_positive_rate < 0.1, f"False positive rate {false_positive_rate:.2%} exceeds 10%"

    @pytest.mark.integration
    def test_syllabus_false_positive_rate(self, pii_analyzer):
        """Test that course syllabus has <5% false positive rate.

        The only real PII in the sample is the professor's email address, so
        any finding that is not an EMAIL_ADDRESS containing "@uic.edu" is
        counted as a false positive.
        """
        syllabus_text = """
        CS 101: Introduction to Computer Science
        University of Illinois Chicago
        Fall 2024

        INSTRUCTOR INFORMATION

        Professor Jane Smith
        Office: Engineering Building, Room 305
        Email: jsmith@uic.edu
        Office Hours: Tuesdays 2-4 PM

        COURSE DESCRIPTION

        This course introduces fundamental concepts of computer science including
        programming, algorithms, and data structures. Students will learn Python
        programming and apply computational thinking to problem solving.

        SCHEDULE

        Week 1 (Aug 26): Introduction to Programming
        Week 2 (Sep 2): Variables and Data Types
        Week 3 (Sep 9): Control Flow
        Week 4 (Sep 16): Functions

        GRADING

        Assignments: 40%
        Midterm: 25%
        Final Project: 25%
        Participation: 10%

        TEXTBOOK

        "Introduction to Python Programming" by John Doe
        Publisher: Tech Press, 2023
        ISBN: 978-1-234567-89-0
        """

        findings = pii_analyzer.analyze_text(syllabus_text)

        # Should only detect professor's email (1 entity)
        # Dates, names, locations should NOT be detected
        assert len(findings) <= 1, f"Too many PII findings: {len(findings)}"

        # Verify only EMAIL_ADDRESS detected
        if findings:
            entity_types = {f.entity_type for f in findings}
            assert entity_types == {"EMAIL_ADDRESS"}, f"Unexpected entity types: {entity_types}"

            # Verify it's the professor's email
            assert "@uic.edu" in findings[0].text.lower(), (
                f"Detected email is not the professor's address: {findings[0].text}"
            )

        # False positive rate should be 0% (only expected PII detected).
        # Count findings that are not the professor's email instead of the
        # surplus over an expected count; the surplus can never be positive
        # once the count bound above has passed, so the previous form of this
        # check could not fail.
        false_positives = [
            f
            for f in findings
            if f.entity_type != "EMAIL_ADDRESS" or "@uic.edu" not in f.text.lower()
        ]
        false_positive_rate = len(false_positives) / max(1, len(findings))
        assert false_positive_rate < 0.05, f"False positive rate {false_positive_rate:.2%} exceeds 5%"

    @pytest.mark.integration
    def test_technical_document_no_false_positives(self, pii_analyzer):
        """Check that API-style technical documentation yields zero findings."""
        technical_text = """
        API Documentation

        The DocumentProcessor class handles PDF conversion. Initialize with:

        processor = DocumentProcessor(
            api_key="sk-1234567890",
            endpoint="https://api.example.com"
        )

        Methods:
        - process(file_path: str) -> Document
        - convert_to_html(doc: Document) -> str

        Error Codes:
        400 - Bad Request
        401 - Unauthorized
        500 - Internal Server Error

        Release Notes v2.3.0 (2024-01-15):
        - Added support for large files
        - Fixed memory leak in parser
        - Updated dependencies
        """

        findings = pii_analyzer.analyze_text(technical_text)

        # The sample contains no real PII: the API key, URL, date and version
        # number must all be left unflagged, so any finding at all is a failure.
        assert not findings, f"False positives detected: {[f.entity_type for f in findings]}"

    @pytest.mark.integration
    def test_actual_pii_detected_with_realistic_data(self, pii_analyzer):
        """Check that genuine PII is still reported for realistic (non-555) data."""
        # Phone numbers avoid the 555 prefix, which the original note says is
        # needed to match production behavior.
        pii_text = """
        Student Information Form

        Please provide your contact information:

        Email: student@example.com
        Phone: (312) 867-5309
        SSN: 123-45-6789

        Emergency Contact:
        Name: Jane Doe
        Phone: (312) 867-5308
        """

        findings = pii_analyzer.analyze_text(pii_text)

        # At least one finding is required. Only the email is asserted by
        # type; phone and SSN detection are not checked here.
        assert findings, f"Failed to detect any PII: {len(findings)} findings"
        assert any(hit.entity_type == "EMAIL_ADDRESS" for hit in findings), "Failed to detect email"

        # Every reported finding has to clear the 0.85 confidence bar.
        for hit in findings:
            assert hit.score >= 0.85, (
                f"Low confidence finding: {hit.entity_type} ({hit.score}). "
                "Realistic PII should score >= 0.85"
            )

    @pytest.mark.integration
    def test_entity_types_configuration(self, pii_analyzer):
        """Check that ENTITY_TYPES holds exactly the pattern-based detectors."""
        # The pii_analyzer fixture is requested but not used in this test;
        # only the module-level ENTITY_TYPES constant is inspected.
        configured = set(ENTITY_TYPES)

        pattern_based = {
            "EMAIL_ADDRESS",
            "PHONE_NUMBER",
            "US_SSN",
            "CREDIT_CARD",
            "IBAN_CODE",
            "US_DRIVER_LICENSE",
        }
        assert configured == pattern_based, f"Unexpected entity types: {configured}"

        # The NER-based types must be absent from the configuration.
        ner_based = {"PERSON", "DATE_TIME", "LOCATION"}
        assert configured.isdisjoint(ner_based), "NER-based entity types should be disabled"

    @pytest.mark.integration
    def test_company_names_not_detected(self, pii_analyzer):
        """Check that company names never come back as PERSON entities."""
        company_text = """
        I worked at Microsoft, Google, Apple, and Amazon.
        Previously worked at TechCorp and StartupCo.
        Collaborated with OpenAI and Anthropic teams.
        """

        # Only PERSON findings are inspected; other entity types are not
        # asserted on by this test.
        person_findings = [
            hit
            for hit in pii_analyzer.analyze_text(company_text)
            if hit.entity_type == "PERSON"
        ]
        assert not person_findings, f"Company names incorrectly detected as PERSON: {person_findings}"

    @pytest.mark.integration
    def test_job_titles_not_detected(self, pii_analyzer):
        """Check that job titles never come back as PERSON entities."""
        title_text = """
        Position: Senior Software Engineer
        Reporting to: Engineering Manager
        Team: Product Development
        Role: Technical Lead
        """

        # Only PERSON findings are inspected; other entity types are not
        # asserted on by this test.
        person_findings = [
            hit
            for hit in pii_analyzer.analyze_text(title_text)
            if hit.entity_type == "PERSON"
        ]
        assert not person_findings, f"Job titles incorrectly detected as PERSON: {person_findings}"

    @pytest.mark.integration
    def test_dates_not_detected(self, pii_analyzer):
        """Check that assorted date formats never come back as DATE_TIME entities."""
        date_text = """
        Assignment due: January 15, 2024
        Exam date: 2024-03-20
        Office hours: Mondays 2-4 PM
        Semester: Fall 2024
        Published: September 2019
        Updated: 2023-11-30
        """

        # Only DATE_TIME findings are inspected; other entity types are not
        # asserted on by this test.
        date_findings = [
            hit
            for hit in pii_analyzer.analyze_text(date_text)
            if hit.entity_type == "DATE_TIME"
        ]
        assert not date_findings, f"Dates incorrectly detected as PII: {date_findings}"

    @pytest.mark.integration
    def test_locations_not_detected(self, pii_analyzer):
        """Check that institutional locations never come back as LOCATION entities."""
        location_text = """
        University of Illinois Chicago
        Engineering Building, Room 305
        Chicago, Illinois
        San Francisco Bay Area
        Office location: Downtown Campus
        """

        # Only LOCATION findings are inspected; other entity types are not
        # asserted on by this test.
        location_findings = [
            hit
            for hit in pii_analyzer.analyze_text(location_text)
            if hit.entity_type == "LOCATION"
        ]
        assert not location_findings, f"Locations incorrectly detected as PII: {location_findings}"