python · advanced
Output Guardrails for LLM Responses
Validate LLM outputs before returning them: flag PII, overlong responses, and prompt-injection phrases, and block any response that fails a check.
python — Press ⌘/Ctrl + Shift + C to copy
import re
import json
from dataclasses import dataclass
from typing import Optional
@dataclass
class GuardrailResult:
    """Outcome of running the output guardrail checks on one response."""

    # True when no check reported a violation.
    passed: bool
    # Text that is safe to hand to the caller.
    content: str
    # One human-readable message per failed check.
    violations: list[str]
def check_guardrails(
    response: str,
    max_length: int = 10000,
) -> GuardrailResult:
    """Screen an LLM response for PII, excessive length, and injection phrases.

    Args:
        response: Raw text produced by the model.
        max_length: Number of characters above which the response is
            flagged. Defaults to 10000.

    Returns:
        A GuardrailResult. ``passed`` is True only when no check fired;
        ``content`` is the unmodified response when passed and the literal
        "[BLOCKED]" otherwise; ``violations`` holds one message per failed
        check, in the order the checks run.
    """
    violations: list[str] = []

    # 1. Check for PII patterns
    pii_patterns = [
        (r'\b\d{3}-\d{2}-\d{4}\b', 'SSN detected'),
        # 16 digits, contiguous or grouped in fours by spaces or hyphens.
        (r'\b\d{4}(?:[ -]?\d{4}){3}\b', 'Credit card number detected'),
        # The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also
        # accept a literal "|" because "|" is not alternation inside [...].
        (
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'Email detected',
        ),
    ]
    violations.extend(
        msg for pattern, msg in pii_patterns if re.search(pattern, response)
    )

    # 2. Check response length
    if len(response) > max_length:
        violations.append("Response exceeds max length")

    # 3. Check for prompt injection attempts (case-insensitive substring match)
    injection_markers = [
        "ignore previous instructions",
        "system prompt",
        "you are now",
    ]
    lower = response.lower()
    violations.extend(
        f"Potential injection: '{marker}'"
        for marker in injection_markers
        if marker in lower
    )

    # NOTE: no hedging/hallucination heuristic is applied here; a hedge-phrase
    # scan that never influenced the result has been dropped as dead code.

    passed = not violations
    return GuardrailResult(
        passed=passed,
        # Never echo a response that failed any check.
        content=response if passed else "[BLOCKED]",
        violations=violations,
    )
# Usage: run the guardrails on a sample response that contains an email address.
result = check_guardrails(
    response="The answer is 42. Contact john@email.com for details."
)
print("Passed:", result.passed)
print(f"Violations: {result.violations}")
Use Cases
- Safety filtering
- PII protection
- Injection prevention
Tags
Related Snippets
Similar patterns you can reuse in the same workflow.
typescript · beginner
Content Moderation with OpenAI
Check user input for harmful content using the OpenAI Moderation API before processing.
Best for: user input safety
#openai #moderation
typescript · advanced
AI Guardrails & Safety Pattern
Implement input/output guardrails for LLM applications with content filtering and response validation.
Best for: user-facing chatbots
#guardrails #safety
python · advanced
NeMo Guardrails for Safe LLM
Apply NVIDIA NeMo Guardrails to enforce topic boundaries and prevent prompt injection in LLM apps.
Best for: LLM safety
#nemo #guardrails
python · advanced
Text-to-SQL with Validation Safety
Convert natural language to SQL with an LLM and validate queries before execution for safety.
Best for: safe NL2SQL
#nl2sql #sql