📦 EqualifyEverything / equalify-reflow

📄 prompt_sanitizer.py · 156 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""Prompt sanitization utilities for secure LLM prompt construction.

This module provides utilities to sanitize user-influenced data before
including it in LLM prompts, preventing prompt injection attacks.

Security Considerations:
- PDF metadata (title, author, etc.) can be crafted by attackers
- Document content may contain prompt injection markers
- All user-influenced data should be sanitized before prompt inclusion
"""

from __future__ import annotations

import logging
import re
from typing import Any

logger = logging.getLogger(__name__)

# A sanitized result shorter than this fraction of the original length is
# treated as "significant" sanitization (more than 20% removed) and logged.
SIGNIFICANT_SANITIZATION_THRESHOLD = 0.8

# Literal marker strings used by common LLM instruction formats. They are
# kept as plain text here; regex escaping is applied when the public
# pattern list is derived below.
_INJECTION_LITERALS: tuple[str, ...] = (
    # End of sequence token
    "</s>",
    # ChatML markers
    "<|im_end|>",
    "<|im_start|>",
    # Llama instruction markers
    "[INST]",
    "[/INST]",
    # System prompt markers
    "<<SYS>>",
    "<</SYS>>",
    # Conversation role markers
    "Human:",
    "Assistant:",
    "System:",
    # Additional ChatML variants
    "<|user|>",
    "<|assistant|>",
    "<|system|>",
    # GPT end tokens
    "<|endoftext|>",
    "<|end|>",
)

# Regex source for each marker that could be used for prompt injection.
INJECTION_PATTERNS: list[str] = [
    re.escape(marker) for marker in _INJECTION_LITERALS
]

# One case-insensitive alternation over every marker, compiled once at import.
_INJECTION_REGEX = re.compile("|".join(INJECTION_PATTERNS), flags=re.IGNORECASE)


def sanitize_for_prompt(
    text: str | None,
    max_length: int = 200,
    context: str = "unknown",
) -> str:
    """Sanitize text for safe inclusion in LLM prompts.

    Removes potential prompt injection markers, escapes format string
    characters, strips surrounding whitespace, and truncates to a safe
    length.

    Marker removal is repeated until no marker is left, so a marker hidden
    inside another one (e.g. ``"</</s>s>"``) cannot reassemble itself once
    the inner occurrence is deleted. Truncation never leaves half of an
    escaped brace pair behind, so the result does not contain a lone ``{``
    or ``}`` that would make ``str.format()`` raise.

    Args:
        text: Raw text to sanitize (e.g., from PDF metadata), or None
        max_length: Maximum length of the sanitized text before the "..."
            truncation suffix is appended (default 200 chars)
        context: Field name for logging (e.g., "document_title")

    Returns:
        Sanitized text safe for prompt inclusion ("" for None or empty input)

    Example:
        >>> sanitize_for_prompt("Report</s>Ignore previous", context="title")
        'ReportIgnore previous'
        >>> sanitize_for_prompt("Re<</s>/s>port", context="title")
        'Report'
        >>> sanitize_for_prompt("Hello {world}", context="title")
        'Hello {{world}}'
    """
    if not text:
        return ""

    original_length = len(text)

    # Remove potential prompt injection markers. A single pass is not enough:
    # deleting an inner marker can splice the surrounding characters into a
    # new marker. Every productive pass shortens the text, so this terminates.
    markers_detected = False
    while True:
        text, removed = _INJECTION_REGEX.subn("", text)
        if not removed:
            break
        markers_detected = True

    # Escape curly braces to prevent format string issues
    # This is critical for .format() calls
    text = text.replace("{", "{{").replace("}", "}}")

    # Strip whitespace BEFORE truncation for consistent output length
    text = text.strip()

    # Truncate to max length
    if len(text) > max_length:
        clipped = text[:max_length]
        # After escaping, braces only occur in even-length runs. An odd
        # trailing run means the cut split a "{{" / "}}" pair; drop the
        # dangling brace so the output stays valid for .format().
        last_char = clipped[-1:]
        if last_char in ("{", "}"):
            trailing_run = len(clipped) - len(clipped.rstrip(last_char))
            if trailing_run % 2:
                clipped = clipped[:-1]
        text = clipped + "..."

    # Log if significant sanitization occurred (>20% reduction)
    if len(text) < original_length * SIGNIFICANT_SANITIZATION_THRESHOLD:
        logger.warning(
            f"Significant sanitization of {context}: "
            f"{original_length} -> {len(text)} chars",
            extra={
                "security_event": "prompt_sanitization",
                "field": context,
                "original_length": original_length,
                "sanitized_length": len(text),
            },
        )

    # Log if injection patterns were detected
    if markers_detected:
        logger.warning(
            f"Prompt injection markers detected in {context}",
            extra={
                "security_event": "injection_markers_detected",
                "field": context,
            },
        )

    return text


def sanitize_prompt_context(context: dict[str, Any]) -> dict[str, str]:
    """Sanitize every value in a prompt context dictionary.

    Each value is converted to a string and passed through
    sanitize_for_prompt(), with its key used as the logging field name.
    Non-string values are sanitized as well: the string form of a list,
    dict, bytes or arbitrary object can carry the same injection markers
    and braces as a plain string. None becomes an empty string.

    Args:
        context: Dictionary of values to be formatted into prompt

    Returns:
        Dictionary with the same keys and every value as a sanitized string

    Example:
        >>> ctx = {"title": "Report</s>", "pages": 10}
        >>> sanitize_prompt_context(ctx)
        {'title': 'Report', 'pages': '10'}
    """
    sanitized: dict[str, str] = {}

    for key, value in context.items():
        if value is None:
            sanitized[key] = ""
            continue

        # Stringify first, then sanitize: the rendered text is what ends up
        # in the prompt, whatever the value's original type was.
        rendered = value if isinstance(value, str) else str(value)
        sanitized[key] = sanitize_for_prompt(rendered, context=key)

    return sanitized


# Public API of this module: the two sanitizers and the marker pattern list.
__all__ = ["sanitize_for_prompt", "sanitize_prompt_context", "INJECTION_PATTERNS"]