Constitutional AI and Safety
Learn to build safe and aligned LLM applications using Constitutional AI principles and practical safety techniques.
What You'll Learn: Constitutional AI (CAI) is Anthropic's approach to AI safety in which a model critiques and revises its own outputs against a set of written principles (a "constitution"). We'll explore these concepts and build practical safety systems.
Understanding Constitutional AI
Core Principles
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
class SafetyPrinciple(Enum):
"""Constitutional AI principles"""
HARMLESSNESS = "harmlessness"
HELPFULNESS = "helpfulness"
HONESTY = "honesty"
PRIVACY = "privacy"
FAIRNESS = "fairness"
@dataclass
class Constitution:
"""A set of principles for AI behavior"""
principles: Dict[SafetyPrinciple, List[str]]
def get_critique_prompt(self, principle: SafetyPrinciple) -> str:
"""Get critique prompt for a principle"""
prompts = {
SafetyPrinciple.HARMLESSNESS: """
Review the AI's response. Does it:
- Avoid harmful, dangerous, or illegal content?
- Refuse to help with harmful requests?
- Provide appropriate warnings when necessary?
If the response violates these, explain how to revise it to be harmless.
""",
SafetyPrinciple.HELPFULNESS: """
Review the AI's response. Does it:
- Actually answer the user's question?
- Provide useful, actionable information?
- Give appropriate detail without being excessive?
If not, suggest how to make it more helpful.
""",
SafetyPrinciple.HONESTY: """
Review the AI's response. Does it:
- Provide accurate information?
- Acknowledge uncertainty when appropriate?
- Avoid making up facts or hallucinating?
If there are issues, suggest corrections.
""",
SafetyPrinciple.PRIVACY: """
Review the AI's response. Does it:
- Protect user privacy?
- Avoid requesting unnecessary personal information?
- Handle sensitive data appropriately?
Suggest privacy improvements if needed.
""",
SafetyPrinciple.FAIRNESS: """
Review the AI's response. Does it:
- Treat all groups fairly?
- Avoid stereotypes and biases?
- Provide balanced perspectives?
Identify any fairness issues and suggest improvements.
"""
}
return prompts[principle]
class ConstitutionalAI:
"""Implement Constitutional AI approach"""
def __init__(self, model_client, constitution: Constitution):
"""
Initialize Constitutional AI system
Args:
model_client: LLM client for generation
constitution: Set of principles to enforce
"""
self.client = model_client
self.constitution = constitution
def generate_response(self, prompt: str) -> str:
"""Generate initial response"""
# This would call your LLM
# For demo, we'll return a placeholder
return "Initial AI response to: " + prompt
def critique_response(
self,
prompt: str,
response: str,
principle: SafetyPrinciple
) -> str:
"""
Critique response according to a principle
Returns critique and suggestions for improvement
"""
critique_prompt = f"""User request: {prompt}
AI response: {response}
{self.constitution.get_critique_prompt(principle)}
Provide your critique:"""
# Call LLM for critique
critique = self.generate_response(critique_prompt)
return critique
def revise_response(
self,
prompt: str,
response: str,
critique: str
) -> str:
"""Revise response based on critique"""
revision_prompt = f"""Original request: {prompt}
Original response: {response}
Critique: {critique}
Please revise the response to address the critique while maintaining helpfulness:"""
# Call LLM for revision
revised = self.generate_response(revision_prompt)
return revised
def constitutional_generation(
self,
prompt: str,
num_iterations: int = 2,
principles: Optional[List[SafetyPrinciple]] = None
) -> Tuple[str, List[Dict]]:
"""
Generate response using Constitutional AI
Process:
1. Generate initial response
2. Critique against each principle
3. Revise based on critiques
4. Repeat for num_iterations
Returns:
Final response and iteration history
"""
if principles is None:
principles = list(SafetyPrinciple)
# Initial generation
response = self.generate_response(prompt)
history = [{
"iteration": 0,
"response": response,
"critiques": {}
}]
# Iterative refinement
for iteration in range(num_iterations):
iteration_critiques = {}
# Critique against each principle
for principle in principles:
critique = self.critique_response(prompt, response, principle)
iteration_critiques[principle.value] = critique
# Combine critiques
combined_critique = "\n\n".join([
f"{principle}: {critique}"
for principle, critique in iteration_critiques.items()
])
# Revise
response = self.revise_response(prompt, response, combined_critique)
history.append({
"iteration": iteration + 1,
"response": response,
"critiques": iteration_critiques
})
return response, history
# Example constitution
example_constitution = Constitution(
principles={
SafetyPrinciple.HARMLESSNESS: [
"Avoid harmful, dangerous, or illegal content",
"Refuse to help with harmful requests",
"Provide warnings when appropriate"
],
SafetyPrinciple.HELPFULNESS: [
"Answer the user's actual question",
"Provide useful, actionable information",
"Give appropriate detail"
],
SafetyPrinciple.HONESTY: [
"Provide accurate information",
"Acknowledge uncertainty",
"Avoid hallucination"
]
}
)
# Example usage
def demonstrate_constitutional_ai():
"""Demonstrate Constitutional AI process"""
# Initialize (would use real LLM client)
cai = ConstitutionalAI(None, example_constitution)
prompt = "How do I make a website?"
# Generate with constitutional approach
final_response, history = cai.constitutional_generation(
prompt,
num_iterations=2,
principles=[SafetyPrinciple.HELPFULNESS, SafetyPrinciple.HONESTY]
)
print("Constitutional AI Generation Process:")
for entry in history:
print(f"\nIteration {entry['iteration']}:")
print(f"Response: {entry['response'][:100]}...")
if entry['critiques']:
print("Critiques:")
for principle, critique in entry['critiques'].items():
print(f" {principle}: {critique[:80]}...")
demonstrate_constitutional_ai()
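In the demo above, generate_response is a placeholder. Below is a minimal sketch of wiring in a real model, assuming the official anthropic Python SDK (pip install anthropic) and an ANTHROPIC_API_KEY in the environment; the model name is illustrative, and any chat-completion client can be swapped in the same way.

import anthropic

class AnthropicConstitutionalAI(ConstitutionalAI):
    """ConstitutionalAI variant that calls a real LLM instead of the placeholder."""

    def generate_response(self, prompt: str) -> str:
        # self.client is an anthropic.Anthropic() instance passed to __init__
        message = self.client.messages.create(
            model="claude-sonnet-4-20250514",  # illustrative; substitute your model
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        return message.content[0].text

# Usage (requires an API key):
# cai = AnthropicConstitutionalAI(anthropic.Anthropic(), example_constitution)
# final, history = cai.constitutional_generation("How do I make a website?", num_iterations=1)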
Implementing Safety Guardrails
Safety First: Always implement multiple layers of safety checks for production LLM applications to prevent harmful outputs and protect users.
Input and Output Filtering
import re
from typing import List, Dict, Tuple
class SafetyFilter:
"""Multi-layer safety filtering system"""
def __init__(self):
# Harmful patterns (examples - expand for production)
self.harmful_patterns = [
r'\b(how to (make|build|create) (a )?bomb)\b',
r'\b(illegal (drug|weapon))\b',
            r'\b(hack|exploit|vulnerability)\b',  # deliberately broad for this demo; will also flag legitimate security questions
# Add more patterns
]
# PII patterns
self.pii_patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
'credit_card': r'\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b'
}
# Toxic categories
self.toxic_categories = [
'hate_speech',
'violence',
'sexual_content',
'self_harm',
'harassment'
]
def check_harmful_input(self, text: str) -> Tuple[bool, List[str]]:
"""
Check if input contains harmful patterns
Returns:
(is_safe, violations)
"""
violations = []
for pattern in self.harmful_patterns:
if re.search(pattern, text, re.IGNORECASE):
violations.append(f"Harmful pattern detected: {pattern}")
return len(violations) == 0, violations
def detect_pii(self, text: str) -> Dict[str, List[str]]:
"""Detect personally identifiable information"""
detected = {}
for pii_type, pattern in self.pii_patterns.items():
matches = re.findall(pattern, text)
if matches:
detected[pii_type] = matches
return detected
def redact_pii(self, text: str) -> str:
"""Redact PII from text"""
redacted = text
for pii_type, pattern in self.pii_patterns.items():
redacted = re.sub(pattern, f"[REDACTED_{pii_type.upper()}]", redacted)
return redacted
def check_toxicity(self, text: str) -> Dict[str, float]:
"""
Check text toxicity using a toxicity classifier
In production, use models like:
- Perspective API
- Detoxify
- Custom trained models
"""
# Placeholder - use real toxicity detection
scores = {
'toxicity': 0.1,
'severe_toxicity': 0.05,
'obscene': 0.08,
'threat': 0.02,
'insult': 0.15,
'identity_hate': 0.03
}
return scores
def is_safe(
self,
text: str,
check_harmful: bool = True,
check_pii: bool = True,
check_toxicity: bool = True,
toxicity_threshold: float = 0.7
) -> Tuple[bool, Dict]:
"""
Comprehensive safety check
Returns:
(is_safe, details)
"""
details = {
'harmful': {'safe': True, 'violations': []},
'pii': {'detected': {}},
'toxicity': {'scores': {}, 'above_threshold': False}
}
# Check harmful patterns
if check_harmful:
is_safe, violations = self.check_harmful_input(text)
details['harmful'] = {'safe': is_safe, 'violations': violations}
if not is_safe:
return False, details
# Check PII
if check_pii:
pii_detected = self.detect_pii(text)
details['pii']['detected'] = pii_detected
# Check toxicity
if check_toxicity:
toxicity_scores = self.check_toxicity(text)
details['toxicity']['scores'] = toxicity_scores
max_toxicity = max(toxicity_scores.values())
details['toxicity']['above_threshold'] = max_toxicity > toxicity_threshold
if max_toxicity > toxicity_threshold:
return False, details
return True, details
class SafetyGuardrails:
"""Complete safety guardrail system"""
def __init__(self, llm_client):
self.client = llm_client
self.filter = SafetyFilter()
def safe_generate(
self,
prompt: str,
max_retries: int = 2,
**generation_kwargs
) -> Dict:
"""
Generate with safety checks
Process:
1. Check input safety
2. Generate response
3. Check output safety
4. Retry if unsafe
"""
# Check input
input_safe, input_details = self.filter.is_safe(prompt)
if not input_safe:
return {
'success': False,
'error': 'Unsafe input detected',
'details': input_details,
'response': None
}
# Redact PII from input
if input_details['pii']['detected']:
prompt = self.filter.redact_pii(prompt)
# Generate
for attempt in range(max_retries + 1):
# Call LLM (placeholder)
response = f"Generated response for: {prompt}"
# Check output safety
output_safe, output_details = self.filter.is_safe(response)
if output_safe:
# Redact any PII in output
if output_details['pii']['detected']:
response = self.filter.redact_pii(response)
return {
'success': True,
'response': response,
'input_details': input_details,
'output_details': output_details,
'attempts': attempt + 1
}
# If unsafe and retries left, add safety instruction
if attempt < max_retries:
prompt = (
f"{prompt}\n\nIMPORTANT: Provide a safe, helpful response "
"without harmful, toxic, or inappropriate content."
)
return {
'success': False,
'error': 'Could not generate safe response after retries',
'details': output_details,
'response': None
}
# Example usage
def demonstrate_safety():
"""Demonstrate safety system"""
guardrails = SafetyGuardrails(None)
# Test safe input
print("Testing safe input:")
result = guardrails.safe_generate("How do I bake a cake?")
print(f"Success: {result['success']}")
print(f"Response: {result.get('response', result.get('error'))}")
# Test unsafe input
print("\nTesting unsafe input:")
result = guardrails.safe_generate("How to make a bomb")
print(f"Success: {result['success']}")
print(f"Error: {result.get('error')}")
# Test PII
print("\nTesting PII detection:")
text = "My email is user@example.com and phone is 555-123-4567"
safe, details = SafetyFilter().is_safe(text)
print(f"PII detected: {details['pii']['detected']}")
redacted = SafetyFilter().redact_pii(text)
print(f"Redacted: {redacted}")
demonstrate_safety()
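The check_toxicity method above returns fixed placeholder scores. One way to back it with a real classifier is the open-source Detoxify library mentioned in its docstring. A minimal sketch, assuming pip install detoxify (the returned label names, e.g. identity_attack, may differ slightly from the placeholder's keys):

from typing import Dict

from detoxify import Detoxify

class DetoxifySafetyFilter(SafetyFilter):
    """SafetyFilter with a real toxicity model instead of placeholder scores."""

    def __init__(self):
        super().__init__()
        self._toxicity_model = Detoxify('original')  # downloads model weights on first use

    def check_toxicity(self, text: str) -> Dict[str, float]:
        # predict() returns a dict of label -> score (numpy floats); cast for clean logging
        scores = self._toxicity_model.predict(text)
        return {label: float(score) for label, score in scores.items()}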
Content Moderation System
Layered Defense: Implement multiple layers of content moderation, including pre-filtering, post-filtering, and user feedback loops (a feedback-loop sketch follows the moderation example below).
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional
@dataclass
class ModerationResult:
"""Result of content moderation"""
is_safe: bool
confidence: float
categories_violated: List[str]
severity: str # 'low', 'medium', 'high'
recommended_action: str
timestamp: datetime
class ContentModerator:
"""Advanced content moderation system"""
def __init__(self):
self.filter = SafetyFilter()
self.moderation_history = []
def moderate_content(
self,
content: str,
context: Optional[Dict] = None
) -> ModerationResult:
"""
Comprehensive content moderation
Args:
content: Text to moderate
context: Additional context (user history, etc.)
Returns:
ModerationResult with decision and metadata
"""
violations = []
max_severity = 'low'
# Layer 1: Pattern matching
harmful_safe, harmful_violations = self.filter.check_harmful_input(content)
if not harmful_safe:
violations.extend(['harmful_content'])
max_severity = 'high'
# Layer 2: PII detection
pii_detected = self.filter.detect_pii(content)
if pii_detected:
violations.append('pii_exposure')
max_severity = max(max_severity, 'medium', key=lambda x: ['low', 'medium', 'high'].index(x))
# Layer 3: Toxicity
toxicity_scores = self.filter.check_toxicity(content)
max_toxicity = max(toxicity_scores.values())
if max_toxicity > 0.8:
violations.append('high_toxicity')
max_severity = 'high'
elif max_toxicity > 0.5:
violations.append('moderate_toxicity')
max_severity = max(max_severity, 'medium', key=lambda x: ['low', 'medium', 'high'].index(x))
# Determine action
is_safe = len(violations) == 0
if not is_safe:
if max_severity == 'high':
action = 'block'
elif max_severity == 'medium':
action = 'warn'
else:
action = 'flag'
else:
action = 'allow'
result = ModerationResult(
is_safe=is_safe,
confidence=0.9, # Placeholder
categories_violated=violations,
severity=max_severity,
recommended_action=action,
timestamp=datetime.now()
)
# Log
self.moderation_history.append({
'content': content[:100],
'result': result
})
return result
def get_moderation_stats(self) -> Dict:
"""Get moderation statistics"""
total = len(self.moderation_history)
if total == 0:
return {}
blocked = sum(
1 for entry in self.moderation_history
if entry['result'].recommended_action == 'block'
)
return {
'total_moderated': total,
'blocked': blocked,
'block_rate': blocked / total,
'common_violations': self._get_common_violations()
}
def _get_common_violations(self) -> Dict[str, int]:
"""Get most common violation types"""
from collections import Counter
all_violations = []
for entry in self.moderation_history:
all_violations.extend(entry['result'].categories_violated)
return dict(Counter(all_violations).most_common(5))
# Example usage
def demonstrate_moderation():
"""Demonstrate content moderation"""
moderator = ContentModerator()
test_cases = [
"This is a safe message",
"My email is test@example.com",
"I hate everyone", # Toxic
"How to hack a website" # Harmful
]
print("Content Moderation Results:\n")
for content in test_cases:
result = moderator.moderate_content(content)
print(f"Content: {content}")
print(f" Safe: {result.is_safe}")
print(f" Action: {result.recommended_action}")
print(f" Severity: {result.severity}")
if result.categories_violated:
print(f" Violations: {', '.join(result.categories_violated)}")
print()
# Statistics
stats = moderator.get_moderation_stats()
print("Moderation Statistics:")
print(f" Total: {stats['total_moderated']}")
print(f" Blocked: {stats['blocked']}")
print(f" Block rate: {stats['block_rate']*100:.1f}%")
demonstrate_moderation()
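The "Layered Defense" note above also calls for user feedback loops, which ContentModerator does not yet provide. A minimal sketch, using a hypothetical report_content entry point: user reports are re-moderated and queued for human review, so disagreements between users and the automated filters can drive tuning.

class UserFeedbackLoop:
    """Illustrative user-report layer on top of ContentModerator."""

    def __init__(self, moderator: ContentModerator):
        self.moderator = moderator
        self.reports: List[Dict] = []

    def report_content(self, content: str, user_reason: str) -> ModerationResult:
        """Re-run moderation on reported content and queue it for human review."""
        result = self.moderator.moderate_content(content)
        self.reports.append({
            'content': content[:100],
            'user_reason': user_reason,
            'automated_result': result,
            # if automation said 'allow' but a user reported it, a human should look
            'needs_human_review': result.is_safe,
            'timestamp': datetime.now()
        })
        return result

feedback_loop = UserFeedbackLoop(ContentModerator())
feedback_loop.report_content("A message a user found offensive", "offensive")
print(f"Queued {len(feedback_loop.reports)} report(s) for human review")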
RLHF and Alignment
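Reinforcement Learning from Human Feedback (RLHF) aligns a model by collecting human preferences over model outputs, training a reward model on those preferences, and then optimizing the policy against that reward model. The first step is systematic feedback collection: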
from datetime import datetime
from typing import Dict, List

class RLHFFeedbackCollector:
"""Collect human feedback for RLHF"""
def __init__(self):
self.feedback_data = []
def collect_comparison_feedback(
self,
prompt: str,
response_a: str,
response_b: str,
preference: str, # 'a', 'b', or 'equal'
reasons: List[str]
):
"""
Collect comparison feedback
Used for training reward models in RLHF
"""
self.feedback_data.append({
'type': 'comparison',
'prompt': prompt,
'response_a': response_a,
'response_b': response_b,
'preference': preference,
'reasons': reasons,
'timestamp': datetime.now()
})
def collect_rating_feedback(
self,
prompt: str,
response: str,
rating: int, # 1-5
aspects: Dict[str, int] # helpfulness, harmlessness, honesty
):
"""Collect rating feedback"""
self.feedback_data.append({
'type': 'rating',
'prompt': prompt,
'response': response,
'rating': rating,
'aspects': aspects,
'timestamp': datetime.now()
})
def export_for_training(self) -> List[Dict]:
"""Export feedback data for model training"""
return self.feedback_data
# Example
collector = RLHFFeedbackCollector()
# Comparison feedback
collector.collect_comparison_feedback(
prompt="Explain quantum computing",
response_a="Quantum computing uses quantum bits...",
response_b="Quantum computers are very fast...",
preference='a',
reasons=['more accurate', 'better explanation']
)
# Rating feedback
collector.collect_rating_feedback(
prompt="How to learn Python?",
response="Start with Python tutorials...",
rating=4,
aspects={
'helpfulness': 5,
'harmlessness': 5,
'honesty': 4
}
)
print(f"Collected {len(collector.feedback_data)} feedback entries")
Production Safety System
class ProductionSafetySystem:
"""Complete production-ready safety system"""
def __init__(self, llm_client):
self.client = llm_client
self.moderator = ContentModerator()
self.guardrails = SafetyGuardrails(llm_client)
self.feedback_collector = RLHFFeedbackCollector()
def safe_completion(
self,
prompt: str,
user_id: Optional[str] = None,
session_id: Optional[str] = None
) -> Dict:
"""
Generate completion with full safety pipeline
Pipeline:
1. Input moderation
2. Content generation with guardrails
3. Output moderation
4. PII redaction
5. Logging and monitoring
"""
# Moderate input
input_moderation = self.moderator.moderate_content(prompt)
if not input_moderation.is_safe:
return {
'success': False,
'error': 'Input violates content policy',
'details': input_moderation
}
# Generate with guardrails
generation_result = self.guardrails.safe_generate(prompt)
if not generation_result['success']:
return generation_result
response = generation_result['response']
# Moderate output
output_moderation = self.moderator.moderate_content(response)
if not output_moderation.is_safe:
return {
'success': False,
'error': 'Generated content violates policy',
'details': output_moderation
}
# Log for monitoring
self._log_interaction(
prompt=prompt,
response=response,
user_id=user_id,
session_id=session_id,
input_moderation=input_moderation,
output_moderation=output_moderation
)
return {
'success': True,
'response': response,
'safety_checks': {
'input': input_moderation,
'output': output_moderation
}
}
def _log_interaction(self, **kwargs):
"""Log interaction for monitoring"""
# In production: log to database/monitoring system
pass
# Example
system = ProductionSafetySystem(None)
result = system.safe_completion("How do I learn programming?")
if result['success']:
print(f"Response: {result['response']}")
else:
print(f"Error: {result['error']}")
Summary
In this lesson, you learned:
- Constitutional AI: Using AI to critique and improve its own outputs
- Safety guardrails: Multi-layer input and output filtering
- Content moderation: Comprehensive moderation systems
- PII protection: Detecting and redacting sensitive information
- Production safety: Building complete safety systems for deployment
Safety and alignment are critical for responsible AI deployment. These techniques help ensure LLMs behave helpfully, harmlessly, and honestly.