Cost Optimization for LLM Applications
Master strategies to minimize LLM inference costs while maintaining quality and performance in production applications.
What You'll Learn: LLM costs can quickly spiral in production. We'll explore proven cost-optimization strategies, including smart model selection, batching, caching, and intelligent routing between models.
Understanding LLM Costs
Cost Breakdown
from dataclasses import dataclass
from typing import Dict, List, Optional
from datetime import datetime, timedelta
import pandas as pd
@dataclass
class ModelPricing:
"""Pricing information for different models"""
name: str
input_price_per_1k: float # USD per 1K input tokens
output_price_per_1k: float # USD per 1K output tokens
context_window: int
avg_tokens_per_second: float
# Illustrative pricing (as of 2024; always check providers' latest price lists)
MODELS = {
"gpt-4-turbo": ModelPricing(
name="GPT-4 Turbo",
input_price_per_1k=0.01,
output_price_per_1k=0.03,
context_window=128000,
avg_tokens_per_second=40
),
"gpt-3.5-turbo": ModelPricing(
name="GPT-3.5 Turbo",
input_price_per_1k=0.0005,
output_price_per_1k=0.0015,
context_window=16000,
avg_tokens_per_second=90
),
"claude-3-opus": ModelPricing(
name="Claude 3 Opus",
input_price_per_1k=0.015,
output_price_per_1k=0.075,
context_window=200000,
avg_tokens_per_second=35
),
"claude-3-sonnet": ModelPricing(
name="Claude 3 Sonnet",
input_price_per_1k=0.003,
output_price_per_1k=0.015,
context_window=200000,
avg_tokens_per_second=60
),
"claude-3-haiku": ModelPricing(
name="Claude 3 Haiku",
input_price_per_1k=0.00025,
output_price_per_1k=0.00125,
context_window=200000,
avg_tokens_per_second=100
),
"llama-2-70b": ModelPricing(
name="Llama 2 70B (self-hosted)",
        input_price_per_1k=0.0,  # No per-token API fee; hardware and hosting costs apply
output_price_per_1k=0.0,
context_window=4096,
avg_tokens_per_second=25 # Depends on hardware
)
}
class CostCalculator:
"""Calculate and analyze LLM costs"""
def __init__(self, model_pricing: Dict[str, ModelPricing]):
self.models = model_pricing
def calculate_request_cost(
self,
model_name: str,
input_tokens: int,
output_tokens: int
) -> float:
"""Calculate cost for a single request"""
model = self.models[model_name]
input_cost = (input_tokens / 1000) * model.input_price_per_1k
output_cost = (output_tokens / 1000) * model.output_price_per_1k
return input_cost + output_cost
def estimate_monthly_cost(
self,
model_name: str,
requests_per_day: int,
avg_input_tokens: int,
avg_output_tokens: int,
days: int = 30
) -> Dict[str, float]:
"""Estimate monthly costs"""
cost_per_request = self.calculate_request_cost(
model_name,
avg_input_tokens,
avg_output_tokens
)
total_requests = requests_per_day * days
total_cost = cost_per_request * total_requests
return {
"cost_per_request": cost_per_request,
"daily_cost": cost_per_request * requests_per_day,
"monthly_cost": total_cost,
"total_requests": total_requests
}
def compare_models(
self,
requests_per_day: int,
avg_input_tokens: int,
avg_output_tokens: int
) -> pd.DataFrame:
"""Compare costs across models"""
results = []
for model_name, model in self.models.items():
costs = self.estimate_monthly_cost(
model_name,
requests_per_day,
avg_input_tokens,
avg_output_tokens
)
results.append({
"Model": model.name,
"Cost per Request": f"${costs['cost_per_request']:.6f}",
"Daily Cost": f"${costs['daily_cost']:.2f}",
"Monthly Cost": f"${costs['monthly_cost']:.2f}",
"Tokens/sec": model.avg_tokens_per_second
})
return pd.DataFrame(results)
    def find_optimal_model(
        self,
        budget: float,
        requests_per_day: int,
        avg_input_tokens: int,
        avg_output_tokens: int
    ) -> List[Dict]:
        """Find models that fit within a monthly budget, cheapest first"""
affordable_models = []
for model_name in self.models.keys():
costs = self.estimate_monthly_cost(
model_name,
requests_per_day,
avg_input_tokens,
avg_output_tokens
)
if costs["monthly_cost"] <= budget:
affordable_models.append({
"model": model_name,
"monthly_cost": costs["monthly_cost"]
})
return sorted(affordable_models, key=lambda x: x["monthly_cost"])
# Example usage
calculator = CostCalculator(MODELS)
# Calculate single request cost
cost = calculator.calculate_request_cost(
"gpt-4-turbo",
input_tokens=500,
output_tokens=200
)
print(f"Single request cost: ${cost:.4f}")
# Estimate monthly costs
monthly = calculator.estimate_monthly_cost(
"gpt-3.5-turbo",
requests_per_day=10000,
avg_input_tokens=500,
avg_output_tokens=200
)
print(f"\nMonthly cost estimate: ${monthly['monthly_cost']:.2f}")
# Compare models
comparison = calculator.compare_models(
requests_per_day=10000,
avg_input_tokens=500,
avg_output_tokens=200
)
print("\nModel Comparison:")
print(comparison)
# Find models within budget
affordable = calculator.find_optimal_model(
budget=1000, # $1000/month
requests_per_day=10000,
avg_input_tokens=500,
avg_output_tokens=200
)
print(f"\nModels within $1000/month budget:")
for model in affordable:
print(f" {model['model']}: ${model['monthly_cost']:.2f}/month")
Model Selection Strategies
Smart Model Selection: Use smaller, cheaper models wherever they meet the quality bar, and reserve expensive models for genuinely complex tasks. With the pricing above, routing 80% of 10,000 daily requests (500 input / 200 output tokens each) from GPT-4 Turbo (about $0.011 per request) to Claude 3 Haiku (about $0.000375 per request) drops the monthly bill from roughly $3,300 to $750, a reduction of about 77%.
Task-Based Model Routing
from typing import Callable, Any
import re
class ModelRouter:
"""Route requests to appropriate models based on task complexity"""
def __init__(self, cost_calculator: CostCalculator):
self.cost_calculator = cost_calculator
self.routing_rules = []
self.model_stats = {}
def add_rule(
self,
name: str,
condition: Callable[[str], bool],
model: str,
priority: int = 0
):
"""Add routing rule"""
self.routing_rules.append({
"name": name,
"condition": condition,
"model": model,
"priority": priority
})
# Sort by priority
self.routing_rules.sort(key=lambda x: x["priority"], reverse=True)
    def route(self, prompt: str, metadata: Optional[Dict[str, Any]] = None) -> str:
"""Determine which model to use"""
metadata = metadata or {}
# Check rules in priority order
for rule in self.routing_rules:
if rule["condition"](prompt):
model = rule["model"]
# Track usage
if model not in self.model_stats:
self.model_stats[model] = {"count": 0, "rule": rule["name"]}
self.model_stats[model]["count"] += 1
return model
        # Default to a cheap general-purpose model, and track it too
        default_model = "gpt-3.5-turbo"
        if default_model not in self.model_stats:
            self.model_stats[default_model] = {"count": 0, "rule": "default"}
        self.model_stats[default_model]["count"] += 1
        return default_model
def get_stats(self) -> Dict:
"""Get routing statistics"""
total = sum(stats["count"] for stats in self.model_stats.values())
return {
"total_requests": total,
"model_distribution": self.model_stats
}
# Create router with rules
router = ModelRouter(calculator)
# Rule 1: Simple queries -> cheapest model
router.add_rule(
name="simple_query",
condition=lambda p: len(p.split()) < 20,
model="claude-3-haiku",
priority=1
)
# Rule 2: Complex reasoning -> best model
router.add_rule(
name="complex_reasoning",
condition=lambda p: any(word in p.lower() for word in [
"analyze", "reasoning", "complex", "explain in detail"
]),
model="gpt-4-turbo",
priority=3
)
# Rule 3: Code generation -> good code model
router.add_rule(
name="code_generation",
condition=lambda p: any(word in p.lower() for word in [
"code", "function", "class", "implement", "python", "javascript"
]),
model="claude-3-sonnet",
priority=2
)
# Rule 4: Long context -> model with large context window
router.add_rule(
name="long_context",
condition=lambda p: len(p.split()) > 1000,
model="claude-3-opus",
priority=2
)
# Test routing
test_prompts = [
"What is the capital of France?",
"Implement a binary search tree in Python with insert, delete, and search methods",
"Analyze the geopolitical implications of climate change on global trade patterns",
"Hi there!"
]
print("Model Routing Examples:")
for prompt in test_prompts:
model = router.route(prompt)
print(f"\nPrompt: {prompt[:60]}...")
print(f"Routed to: {model}")
# Get routing stats
stats = router.get_stats()
print(f"\nRouting Statistics:")
print(f"Total requests: {stats['total_requests']}")
for model, info in stats['model_distribution'].items():
print(f" {model}: {info['count']} ({info['rule']})")
Adaptive Model Selection
class AdaptiveModelSelector:
"""Dynamically select models based on quality feedback"""
def __init__(self):
self.model_performance = {} # Track quality scores
self.model_costs = {} # Track actual costs
self.request_history = []
def select_model(
self,
task_type: str,
quality_threshold: float = 0.7,
budget_limit: Optional[float] = None
) -> str:
"""Select model based on historical performance"""
# Get candidate models for task type
candidates = self._get_candidate_models(task_type)
# Filter by quality threshold
qualified = [
model for model in candidates
if self._get_avg_quality(model, task_type) >= quality_threshold
]
if not qualified:
# If no model meets quality, use best available
qualified = candidates
# Sort by cost efficiency (quality / cost)
ranked = sorted(
qualified,
key=lambda m: self._get_cost_efficiency(m, task_type),
reverse=True
)
# Apply budget constraint
if budget_limit:
ranked = [
m for m in ranked
if self._get_avg_cost(m, task_type) <= budget_limit
]
return ranked[0] if ranked else "gpt-3.5-turbo"
def record_result(
self,
model: str,
task_type: str,
quality_score: float,
cost: float,
tokens_used: int
):
"""Record model performance"""
key = f"{model}:{task_type}"
if key not in self.model_performance:
self.model_performance[key] = []
self.model_costs[key] = []
self.model_performance[key].append(quality_score)
self.model_costs[key].append(cost)
self.request_history.append({
"model": model,
"task_type": task_type,
"quality": quality_score,
"cost": cost,
"tokens": tokens_used,
"timestamp": datetime.now()
})
def _get_candidate_models(self, task_type: str) -> List[str]:
"""Get candidate models for task"""
# Simple rule-based candidates
if task_type == "simple":
return ["claude-3-haiku", "gpt-3.5-turbo"]
elif task_type == "complex":
return ["gpt-4-turbo", "claude-3-opus", "claude-3-sonnet"]
elif task_type == "code":
return ["gpt-4-turbo", "claude-3-sonnet"]
else:
return ["gpt-3.5-turbo", "claude-3-sonnet"]
def _get_avg_quality(self, model: str, task_type: str) -> float:
"""Get average quality score for model on task type"""
key = f"{model}:{task_type}"
if key not in self.model_performance:
return 0.5 # Default assumption
scores = self.model_performance[key]
return sum(scores) / len(scores)
def _get_avg_cost(self, model: str, task_type: str) -> float:
"""Get average cost for model on task type"""
key = f"{model}:{task_type}"
if key not in self.model_costs:
return float('inf')
costs = self.model_costs[key]
return sum(costs) / len(costs)
def _get_cost_efficiency(self, model: str, task_type: str) -> float:
"""Calculate cost efficiency (quality per dollar)"""
quality = self._get_avg_quality(model, task_type)
cost = self._get_avg_cost(model, task_type)
if cost == 0:
return quality
return quality / cost
def get_recommendations(self) -> Dict:
"""Get model recommendations based on history"""
recommendations = {}
for task_type in ["simple", "complex", "code"]:
candidates = self._get_candidate_models(task_type)
ranked = sorted(
candidates,
key=lambda m: self._get_cost_efficiency(m, task_type),
reverse=True
)
recommendations[task_type] = {
"best_model": ranked[0] if ranked else None,
"efficiency": self._get_cost_efficiency(ranked[0], task_type) if ranked else 0
}
return recommendations
# Example usage
selector = AdaptiveModelSelector()
# Simulate some requests and record results
selector.record_result("gpt-3.5-turbo", "simple", quality_score=0.9, cost=0.001, tokens_used=100)
selector.record_result("claude-3-haiku", "simple", quality_score=0.85, cost=0.0005, tokens_used=100)
selector.record_result("gpt-4-turbo", "complex", quality_score=0.95, cost=0.02, tokens_used=500)
selector.record_result("claude-3-sonnet", "complex", quality_score=0.92, cost=0.008, tokens_used=500)
# Select model for new request
recommended = selector.select_model("simple", quality_threshold=0.8)
print(f"Recommended model for simple task: {recommended}")
recommendations = selector.get_recommendations()
print("\nModel Recommendations:")
for task, rec in recommendations.items():
print(f"{task}: {rec['best_model']} (efficiency: {rec['efficiency']:.2f})")
Batching Optimization
Batching: Process multiple requests together to amortize overhead and reduce per-request costs. This is especially effective for self-hosted models, where throughput and GPU utilization dominate cost, and for providers that offer discounted asynchronous batch endpoints.
Intelligent Batching
import asyncio
from collections import defaultdict
import time
class IntelligentBatcher:
"""Batch requests intelligently to minimize costs"""
def __init__(
self,
max_batch_size: int = 10,
max_wait_time: float = 1.0,
cost_calculator: CostCalculator = None
):
self.max_batch_size = max_batch_size
self.max_wait_time = max_wait_time
self.cost_calculator = cost_calculator
self.queues = defaultdict(list) # Separate queues per model
self.processing = False
async def add_request(
self,
prompt: str,
model: str,
**kwargs
) -> str:
"""Add request to batch queue"""
future = asyncio.Future()
self.queues[model].append({
"prompt": prompt,
"kwargs": kwargs,
"future": future,
"timestamp": time.time()
})
# Start processing if not already running
if not self.processing:
asyncio.create_task(self._process_queues())
return await future
async def _process_queues(self):
"""Process all queues"""
self.processing = True
while any(self.queues.values()):
# Process each model's queue
for model, queue in list(self.queues.items()):
if queue:
await self._process_batch(model, queue)
await asyncio.sleep(0.1)
self.processing = False
async def _process_batch(self, model: str, queue: List[Dict]):
"""Process a batch for a specific model"""
# Determine batch size
batch_size = min(self.max_batch_size, len(queue))
# Check if we should wait for more requests
oldest_request_age = time.time() - queue[0]["timestamp"]
if batch_size < self.max_batch_size and oldest_request_age < self.max_wait_time:
return # Wait for more requests
# Extract batch
batch = queue[:batch_size]
self.queues[model] = queue[batch_size:]
# Process batch (simulated)
prompts = [req["prompt"] for req in batch]
# Here you would call the actual model API with batching
# For demonstration, we'll simulate
await asyncio.sleep(0.5) # Simulate API call
# Calculate cost savings from batching
individual_costs = sum(
self.cost_calculator.calculate_request_cost(
model,
len(req["prompt"].split()),
50 # Estimated output tokens
)
for req in batch
)
        # Simulate a 20% overhead saving from batching; real savings depend
        # on the provider's batch pricing or your deployment's GPU utilization
        batch_cost = individual_costs * 0.8
savings = individual_costs - batch_cost
print(f"Processed batch of {len(batch)} for {model}")
print(f" Individual cost: ${individual_costs:.4f}")
print(f" Batch cost: ${batch_cost:.4f}")
print(f" Savings: ${savings:.4f} ({savings/individual_costs*100:.1f}%)")
# Return results to futures
for i, req in enumerate(batch):
response = f"Response for: {req['prompt'][:50]}..."
req["future"].set_result(response)
# Example usage with batching
async def test_batching():
calculator = CostCalculator(MODELS)
batcher = IntelligentBatcher(
max_batch_size=5,
max_wait_time=2.0,
cost_calculator=calculator
)
# Send multiple requests
tasks = []
for i in range(10):
task = batcher.add_request(
f"Question {i}: What is the meaning of life?",
model="gpt-3.5-turbo"
)
tasks.append(task)
# Small delay to simulate requests arriving over time
await asyncio.sleep(0.3)
# Wait for all responses
responses = await asyncio.gather(*tasks)
print(f"\nReceived {len(responses)} responses")
# asyncio.run(test_batching())
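The batcher above amortizes overhead within your own service. Some providers also discount asynchronous batch processing; OpenAI's Batch API, for example, accepts a JSONL file of requests and returns results within a completion window at reduced per-token prices. A sketch of preparing such a file (the shapes follow OpenAI's documented format at the time of writing; verify against current docs):

import json

def build_batch_file(prompts: List[str], model: str, path: str = "batch_input.jsonl") -> str:
    """Write one JSONL line per request in the Batch API's expected shape."""
    with open(path, "w") as f:
        for i, prompt in enumerate(prompts):
            line = {
                "custom_id": f"request-{i}",  # Used to match results back later
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 200
                }
            }
            f.write(json.dumps(line) + "\n")
    return path

build_batch_file(["What is AI?", "Summarize attention in one sentence."],
                 model="gpt-3.5-turbo")
# Upload the file and create the batch with the OpenAI SDK:
#   file = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")
#   client.batches.create(input_file_id=file.id,
#                         endpoint="/v1/chat/completions",
#                         completion_window="24h")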
Caching for Cost Reduction
import hashlib

class CostOptimizedCache:
    """Cache with cost tracking"""
    def __init__(self, redis_cache=None):
        self.cache = {}
        self.redis_cache = redis_cache
        self.hits = 0
        self.cost_saved = 0.0
        self.cost_calculator = CostCalculator(MODELS)
def get(
self,
prompt: str,
model: str,
expected_tokens: int = 100
) -> Optional[str]:
"""Get from cache and track savings"""
key = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()
        if key in self.cache:
            # Calculate the cost an API call would have incurred
            saved_cost = self.cost_calculator.calculate_request_cost(
                model,
                len(prompt.split()),
                expected_tokens
            )
            self.hits += 1
            self.cost_saved += saved_cost
            print(f"Cache HIT - Saved ${saved_cost:.4f}")
            return self.cache[key]
return None
def set(self, prompt: str, model: str, response: str):
"""Store in cache"""
key = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()
self.cache[key] = response
    def get_savings_report(self) -> Dict:
        """Get cost savings report"""
        return {
            "total_saved": self.cost_saved,
            "cache_size": len(self.cache),
            "cache_hits": self.hits,
            "avg_saved_per_hit": self.cost_saved / max(self.hits, 1)
        }
# Example
cache = CostOptimizedCache()
# First request - miss
response = cache.get("What is AI?", "gpt-4-turbo")
if not response:
response = "AI is artificial intelligence..." # Simulated API call
cache.set("What is AI?", "gpt-4-turbo", response)
# Second request - hit
response = cache.get("What is AI?", "gpt-4-turbo")
# Get report
report = cache.get_savings_report()
print(f"\nCost Savings Report:")
print(f"Total saved: ${report['total_saved']:.4f}")
print(f"Cache entries: {report['cache_size']}")
Monitoring and Optimization
class CostMonitor:
"""Monitor and analyze costs"""
def __init__(self):
self.requests = []
self.daily_costs = defaultdict(float)
def log_request(
self,
model: str,
input_tokens: int,
output_tokens: int,
cost: float,
cached: bool = False
):
"""Log request for analysis"""
self.requests.append({
"timestamp": datetime.now(),
"model": model,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"cost": cost,
"cached": cached
})
# Update daily cost
date = datetime.now().date()
self.daily_costs[date] += cost
def get_cost_analysis(self, days: int = 30) -> Dict:
"""Analyze costs over period"""
cutoff = datetime.now() - timedelta(days=days)
recent_requests = [
r for r in self.requests
if r["timestamp"] > cutoff
]
total_cost = sum(r["cost"] for r in recent_requests)
cached_requests = sum(1 for r in recent_requests if r["cached"])
total_requests = len(recent_requests)
# Cost by model
model_costs = defaultdict(float)
for r in recent_requests:
model_costs[r["model"]] += r["cost"]
return {
"period_days": days,
"total_cost": total_cost,
"total_requests": total_requests,
"avg_cost_per_request": total_cost / max(total_requests, 1),
"cache_hit_rate": cached_requests / max(total_requests, 1),
"daily_avg_cost": total_cost / days,
"model_breakdown": dict(model_costs),
"projected_monthly": total_cost / days * 30
}
def get_optimization_suggestions(self) -> List[str]:
"""Get cost optimization suggestions"""
analysis = self.get_cost_analysis()
suggestions = []
# Check cache hit rate
if analysis["cache_hit_rate"] < 0.3:
suggestions.append(
f"Cache hit rate is only {analysis['cache_hit_rate']*100:.1f}%. "
"Consider implementing semantic caching."
)
        # Check model distribution (guard against an empty period)
        model_costs = analysis["model_breakdown"]
        total = analysis["total_cost"]
        for model, cost in model_costs.items():
            percentage = cost / total * 100 if total else 0.0
if percentage > 50 and "gpt-4" in model:
suggestions.append(
f"{model} accounts for {percentage:.1f}% of costs. "
"Consider routing simple queries to cheaper models."
)
# Check projected monthly cost
if analysis["projected_monthly"] > 1000:
suggestions.append(
f"Projected monthly cost is ${analysis['projected_monthly']:.2f}. "
"Consider implementing request batching and aggressive caching."
)
return suggestions
# Example usage
monitor = CostMonitor()
# Simulate some requests
monitor.log_request("gpt-4-turbo", 500, 200, 0.011, cached=False)
monitor.log_request("gpt-3.5-turbo", 300, 150, 0.00045, cached=False)
monitor.log_request("gpt-4-turbo", 500, 200, 0.0, cached=True)
# Get analysis
analysis = monitor.get_cost_analysis(days=7)
print("Cost Analysis (7 days):")
print(f"Total cost: ${analysis['total_cost']:.4f}")
print(f"Average per request: ${analysis['avg_cost_per_request']:.4f}")
print(f"Cache hit rate: {analysis['cache_hit_rate']*100:.1f}%")
print(f"Projected monthly: ${analysis['projected_monthly']:.2f}")
# Get suggestions
suggestions = monitor.get_optimization_suggestions()
print("\nOptimization Suggestions:")
for i, suggestion in enumerate(suggestions, 1):
print(f"{i}. {suggestion}")
Summary
In this lesson, you learned:
- Cost analysis: Understanding and calculating LLM costs
- Model routing: Intelligently selecting models based on task complexity
- Batching: Reducing per-request costs through efficient batching
- Caching strategies: Maximizing cache hit rates for cost savings
- Monitoring: Tracking costs and identifying optimization opportunities
Applied together, these techniques can often cut LLM inference costs by well over half (in the routing scenario modeled above, by roughly 77%) while maintaining quality, making production applications economically viable.