Python · Advanced

Output Guardrails for LLM Responses

Validate LLM outputs and block any response that contains PII, exceeds a length limit, or echoes prompt-injection phrases.

Python — press ⌘/Ctrl + Shift + C to copy

import re
import json
from dataclasses import dataclass
from typing import Optional

@dataclass
class GuardrailResult:
    """Outcome of screening a single LLM response.

    Instances are produced by ``check_guardrails``; the fields below are
    plain data with no validation of their own.
    """

    # True when no guardrail check reported a violation.
    passed: bool

    # Text to hand back to the caller: ``check_guardrails`` stores the
    # original response here on success and "[BLOCKED]" on failure.
    content: str

    # Human-readable message for each check that fired (empty on success).
    violations: list[str]

def check_guardrails(response: str) -> GuardrailResult:
    """Screen an LLM response and block it if any guardrail check fires.

    Checks run in this order, and every failing check adds one message:

    1. PII patterns: US SSN (``ddd-dd-dddd``), 16-digit card numbers
       (contiguous, or in groups of four separated by a space or hyphen),
       and email addresses.
    2. Length: a response longer than 10000 characters is a violation.
    3. Prompt-injection phrases, matched as case-insensitive substrings.

    Hedging language ("I'm not sure", ...) is not evaluated; it never
    influenced the result.

    Args:
        response: Raw text produced by the model.

    Returns:
        A ``GuardrailResult`` whose ``passed`` is True only when no check
        fired. ``content`` is the unchanged response on success and the
        literal ``"[BLOCKED]"`` otherwise; ``violations`` holds the
        messages in the order the checks ran.
    """
    violations: list[str] = []

    # 1. Check for PII patterns
    pii_patterns = [
        (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN detected'),
        # Card numbers are usually written in groups of four, so allow an
        # optional space or hyphen between groups as well as 16 bare digits.
        (r'\b(?:\d{4}[ -]?){3}\d{4}\b', 'Credit card number detected'),
        # Inside a character class '|' is a literal pipe, not alternation,
        # so the TLD class must be [A-Za-z] rather than [A-Z|a-z].
        (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', 'Email detected'),
    ]
    violations.extend(
        msg for pattern, msg in pii_patterns if re.search(pattern, response)
    )

    # 2. Check response length
    if len(response) > 10000:
        violations.append("Response exceeds max length")

    # 3. Check for prompt injection attempts
    injection_markers = [
        "ignore previous instructions",
        "system prompt",
        "you are now",
    ]
    lower = response.lower()
    violations.extend(
        f"Potential injection: '{marker}'"
        for marker in injection_markers
        if marker in lower
    )

    passed = not violations
    return GuardrailResult(
        passed=passed,
        content=response if passed else "[BLOCKED]",
        violations=violations,
    )

# Usage: the sample text contains an email address, so the email check is
# expected to report a violation and the response is blocked.
result = check_guardrails(
    "The answer is 42. Contact john@email.com for details."
)
print(
    f"Passed: {result.passed}",
    f"Violations: {result.violations}",
    sep="\n",
)