Model Quantization Techniques
Learn how to reduce model size and increase inference speed through advanced quantization techniques while maintaining model quality.
What You'll Learn: Quantization reduces model precision from FP16/FP32 to INT8/INT4, dramatically reducing memory usage and increasing inference speed. We'll explore state-of-the-art techniques used in production.
Understanding Quantization
Precision Formats
Quantization converts high-precision weights to lower precision:
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
# Demonstrate precision differences
def show_precision_comparison():
"""Compare different precision formats"""
# Original FP32 value
fp32_value = np.float32(3.14159265359)
# FP16 (half precision)
fp16_value = np.float16(fp32_value)
# INT8 quantization (symmetric)
def quantize_int8(value, scale):
return np.clip(np.round(value / scale), -128, 127).astype(np.int8)
    # The scale is normally shared across a whole tensor (max / 127); for a
    # single value this makes the reconstruction nearly exact, so real rounding
    # error only appears when many values share one scale.
    scale = fp32_value / 127
int8_value = quantize_int8(fp32_value, scale)
dequantized = int8_value * scale
print(f"FP32: {fp32_value}")
print(f"FP16: {fp16_value} (error: {abs(fp32_value - fp16_value)})")
print(f"INT8: {int8_value} -> {dequantized} (error: {abs(fp32_value - dequantized)})")
print(f"\nMemory savings:")
print(f"FP16 vs FP32: {(1 - 16/32) * 100:.1f}%")
print(f"INT8 vs FP32: {(1 - 8/32) * 100:.1f}%")
print(f"INT4 vs FP32: {(1 - 4/32) * 100:.1f}%")
show_precision_comparison()
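At LLM scale those percentages translate into many gigabytes. Below is a quick back-of-envelope sketch for the weights alone (activations and KV cache come on top), assuming a hypothetical 7B-parameter model:

def weight_memory_gb(num_params: float, bits: int) -> float:
    """Weights-only footprint in GiB: parameters * bits / 8 bytes."""
    return num_params * bits / 8 / 1024**3

for bits in (32, 16, 8, 4):
    print(f"7B parameters at {bits}-bit: {weight_memory_gb(7e9, bits):.1f} GiB")
# FP16 needs roughly 13 GiB for the weights; INT4 cuts that to about 3.3 GiB.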
Quantization Methods
# Symmetric vs Asymmetric Quantization
class QuantizationDemo:
@staticmethod
def symmetric_quantization(tensor, bits=8):
"""Symmetric quantization: zero point is 0"""
qmin = -(2 ** (bits - 1))
qmax = 2 ** (bits - 1) - 1
scale = tensor.abs().max() / qmax
quantized = torch.clamp(torch.round(tensor / scale), qmin, qmax)
return quantized, scale
@staticmethod
def asymmetric_quantization(tensor, bits=8):
"""Asymmetric quantization: uses zero point"""
qmin = 0
qmax = 2 ** bits - 1
min_val = tensor.min()
max_val = tensor.max()
scale = (max_val - min_val) / (qmax - qmin)
zero_point = qmin - torch.round(min_val / scale)
quantized = torch.clamp(
torch.round(tensor / scale + zero_point),
qmin, qmax
)
return quantized, scale, zero_point
@staticmethod
def compare_methods():
"""Compare quantization methods"""
        # Create a shifted sample tensor (mostly positive values), which wastes
        # half of the symmetric quantization range
tensor = torch.randn(1000) * 2 + 5
# Symmetric
quant_sym, scale_sym = QuantizationDemo.symmetric_quantization(tensor)
dequant_sym = quant_sym * scale_sym
error_sym = torch.mean((tensor - dequant_sym) ** 2)
# Asymmetric
quant_asym, scale_asym, zp = QuantizationDemo.asymmetric_quantization(tensor)
dequant_asym = (quant_asym - zp) * scale_asym
error_asym = torch.mean((tensor - dequant_asym) ** 2)
print(f"Symmetric MSE: {error_sym:.6f}")
print(f"Asymmetric MSE: {error_asym:.6f}")
print(f"Asymmetric is {error_sym/error_asym:.2f}x better for asymmetric data")
QuantizationDemo.compare_methods()
GPTQ Quantization
GPTQ (post-training quantization for generative pre-trained Transformers) is one of the most popular methods for 4-bit quantization, offering excellent quality-size tradeoffs. Despite occasionally being described as gradient-based, it does not use gradients: it quantizes each layer's weights using approximate second-order (Hessian) information computed from a small calibration set.
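Before reaching for the library, it helps to see the core idea in isolation. The sketch below is a deliberately simplified, single-block version of the GPTQ update (toy shapes, no grouping or act-order, not the library code): weights are quantized one input column at a time and the rounding error is spread over the not-yet-quantized columns using the Cholesky factor of the inverse calibration Hessian, which is why it typically beats plain round-to-nearest on correlated activations.

import torch

torch.manual_seed(0)

def rtn(w, scale, qmax=7):
    """Plain round-to-nearest with a fixed per-output-channel scale."""
    return torch.clamp(torch.round(w / scale), -qmax - 1, qmax) * scale

def gptq_like(W, X, damp=0.01, qmax=7):
    """Toy GPTQ-style pass. W: (out_features, in_features), X: (n, in_features)."""
    W = W.clone()
    Q = torch.zeros_like(W)
    scale = W.abs().amax(dim=1, keepdim=True) / qmax     # one scale per output channel
    H = X.T @ X                                          # proxy Hessian from calibration data
    H += damp * H.diag().mean() * torch.eye(H.shape[0])  # damping for numerical stability
    # upper Cholesky factor of H^-1, as in the reference implementation
    Hinv = torch.linalg.cholesky(
        torch.cholesky_inverse(torch.linalg.cholesky(H)), upper=True
    )
    for i in range(W.shape[1]):
        q = rtn(W[:, i], scale.squeeze(1), qmax)
        Q[:, i] = q
        err = (W[:, i] - q) / Hinv[i, i]
        # push the rounding error onto the columns not yet quantized
        W[:, i:] -= err.unsqueeze(1) * Hinv[i, i:].unsqueeze(0)
    return Q

X = torch.randn(512, 64) @ torch.randn(64, 64)   # correlated calibration activations
W = torch.randn(32, 64)
scale = W.abs().amax(dim=1, keepdim=True) / 7
ref = X @ W.T
print("RTN output error:       ", torch.norm(ref - X @ rtn(W, scale).T).item())
print("GPTQ-style output error:", torch.norm(ref - X @ gptq_like(W, X).T).item())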
GPTQ Implementation
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch
class GPTQQuantizer:
def __init__(self, model_name: str):
self.model_name = model_name
self.device = "cuda" if torch.cuda.is_available() else "cpu"
def quantize_model(
self,
bits: int = 4,
group_size: int = 128,
dataset: str = "c4",
use_exllama: bool = True
):
"""
Quantize model using GPTQ
Args:
bits: Number of bits (4 or 8)
group_size: Size of quantization groups
dataset: Calibration dataset
use_exllama: Use ExLlama kernels for faster inference
"""
print(f"Quantizing {self.model_name} to {bits}-bit...")
# Configure GPTQ
gptq_config = GPTQConfig(
bits=bits,
group_size=group_size,
dataset=dataset,
use_exllama=use_exllama,
            desc_act=False  # Disable act-order (activation ordering); faster, slightly lower quality
)
# Load and quantize model
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map="auto",
quantization_config=gptq_config,
torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
# Get model size
param_count = sum(p.numel() for p in model.parameters())
print(f"Quantized model parameters: {param_count:,}")
return model, tokenizer
def save_quantized_model(self, model, tokenizer, output_dir: str):
"""Save quantized model"""
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Saved quantized model to {output_dir}")
def benchmark_model(self, model, tokenizer, prompt: str):
"""Benchmark quantized model"""
import time
inputs = tokenizer(prompt, return_tensors="pt").to(self.device)
# Warmup
with torch.no_grad():
_ = model.generate(**inputs, max_new_tokens=50)
# Benchmark
start = time.time()
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=100,
do_sample=False
)
end = time.time()
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        new_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
        tokens_per_second = new_tokens / (end - start)
return {
"text": generated_text,
"tokens_per_second": tokens_per_second,
"latency": end - start
}
# Example usage
quantizer = GPTQQuantizer("meta-llama/Llama-2-7b-hf")
# Quantize to 4-bit
model_4bit, tokenizer = quantizer.quantize_model(bits=4, group_size=128)
# Benchmark
results = quantizer.benchmark_model(
model_4bit,
tokenizer,
"Explain quantum computing in simple terms:"
)
print(f"\nGenerated: {results['text'][:100]}...")
print(f"Speed: {results['tokens_per_second']:.2f} tokens/sec")
Advanced GPTQ Configuration
class AdvancedGPTQConfig:
@staticmethod
def create_optimized_config(use_case: str):
"""Create optimized GPTQ config for different use cases"""
configs = {
"quality": GPTQConfig(
bits=4,
group_size=128, # Smaller groups = better quality
dataset="c4",
use_exllama=True,
                desc_act=True, # act-order: quantize columns by decreasing activation size (better quality, slower)
damp_percent=0.01
),
"speed": GPTQConfig(
bits=4,
                group_size=256, # Larger groups = less overhead, slightly lower quality
dataset="c4",
use_exllama=True,
                exllama_config={"version": 2}, # request the faster ExLlamaV2 kernels
desc_act=False
),
"memory": GPTQConfig(
bits=3, # Ultra-low precision
group_size=128,
dataset="c4",
                use_exllama=False  # ExLlama kernels only support 4-bit
)
}
return configs.get(use_case, configs["quality"])
@staticmethod
def compare_configurations(model_name: str):
"""Compare different GPTQ configurations"""
results = {}
for use_case in ["quality", "speed", "memory"]:
config = AdvancedGPTQConfig.create_optimized_config(use_case)
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
quantization_config=config,
torch_dtype=torch.float16
)
# Calculate metrics
param_size = sum(
p.element_size() * p.numel() for p in model.parameters()
) / 1024**3 # GB
results[use_case] = {
"config": config.to_dict(),
"size_gb": param_size
}
del model
torch.cuda.empty_cache()
return results
AWQ Quantization
AWQ (Activation-aware Weight Quantization) protects important weights based on activation magnitudes, often outperforming GPTQ in quality.
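The key observation is that a small fraction of weight channels, the ones paired with large activations, dominate the output error. AWQ therefore searches for a per-channel scaling factor s = mean(|activation|)^alpha, multiplies the corresponding weight rows by s before rounding, and folds 1/s into the preceding activations so the full-precision output is unchanged. Below is a toy sketch of that search on a made-up linear layer (not the production implementation, which quantizes per group and fuses the scales into the previous layer):

import torch

torch.manual_seed(0)
X = torch.randn(1024, 64)
X[:, :4] *= 20                      # a few "salient" input channels
W = torch.randn(64, 64) * 0.02      # layer weights: (in_features, out_features)

def rtn(w, bits=4):
    """Round-to-nearest with one scale per output channel."""
    qmax = 2 ** (bits - 1) - 1
    step = w.abs().amax(dim=0, keepdim=True) / qmax
    return torch.clamp(torch.round(w / step), -qmax - 1, qmax) * step

ref = X @ W
act_mean = X.abs().mean(dim=0)

best_alpha, best_err = 0.0, torch.norm(ref - X @ rtn(W)).item()   # alpha = 0 is plain RTN
for alpha in [a / 10 for a in range(1, 11)]:
    s = act_mean ** alpha
    s = s / s.mean()
    # scale weight rows up, fold the inverse scale into the activations
    err = torch.norm(ref - (X / s) @ rtn(W * s.unsqueeze(1))).item()
    if err < best_err:
        best_alpha, best_err = alpha, err

print(f"RTN output error:       {torch.norm(ref - X @ rtn(W)).item():.4f}")
print(f"AWQ-style output error: {best_err:.4f} (alpha = {best_alpha})")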
AWQ Implementation
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
class AWQQuantizer:
def __init__(self, model_name: str):
self.model_name = model_name
def quantize_model(
self,
bits: int = 4,
group_size: int = 128,
zero_point: bool = True
):
"""Quantize model using AWQ"""
print(f"Loading model for AWQ quantization...")
# Load model
model = AutoAWQForCausalLM.from_pretrained(
self.model_name,
device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
# Quantization config
quant_config = {
"zero_point": zero_point,
"q_group_size": group_size,
"w_bit": bits,
"version": "GEMM"
}
# Prepare calibration data
calib_data = self.prepare_calibration_data(tokenizer)
# Quantize
print("Quantizing model...")
model.quantize(
tokenizer,
quant_config=quant_config,
calib_data=calib_data
)
return model, tokenizer
def prepare_calibration_data(self, tokenizer, num_samples: int = 128):
"""Prepare calibration dataset"""
from datasets import load_dataset
# Load calibration dataset
dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
# Prepare samples
samples = []
for i, sample in enumerate(dataset):
if i >= num_samples:
break
            # AutoAWQ's quantize() accepts a list of raw text strings as calib_data
            samples.append(sample["text"])
return samples
def compare_awq_gptq(self, prompt: str):
"""Compare AWQ vs GPTQ quantization"""
import time
results = {}
# AWQ quantization
print("Testing AWQ...")
model_awq, tokenizer = self.quantize_model(bits=4)
start = time.time()
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs_awq = model_awq.generate(**inputs, max_new_tokens=100)
awq_time = time.time() - start
results["awq"] = {
"text": tokenizer.decode(outputs_awq[0], skip_special_tokens=True),
"time": awq_time,
"memory_gb": torch.cuda.max_memory_allocated() / 1024**3
}
del model_awq
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()  # don't let the AWQ peak skew the GPTQ measurement
# GPTQ quantization
print("Testing GPTQ...")
gptq_config = GPTQConfig(bits=4, group_size=128, dataset="c4")
model_gptq = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map="auto",
quantization_config=gptq_config
)
start = time.time()
outputs_gptq = model_gptq.generate(**inputs, max_new_tokens=100)
gptq_time = time.time() - start
results["gptq"] = {
"text": tokenizer.decode(outputs_gptq[0], skip_special_tokens=True),
"time": gptq_time,
"memory_gb": torch.cuda.max_memory_allocated() / 1024**3
}
return results
# Example usage
awq_quantizer = AWQQuantizer("meta-llama/Llama-2-7b-hf")
comparison = awq_quantizer.compare_awq_gptq(
"Explain the theory of relativity:"
)
for method, metrics in comparison.items():
print(f"\n{method.upper()}:")
print(f"Time: {metrics['time']:.2f}s")
print(f"Memory: {metrics['memory_gb']:.2f} GB")
print(f"Output: {metrics['text'][:100]}...")
GGUF Format for CPU Inference
GGUF (GPT-Generated Unified Format) enables efficient CPU inference with llama.cpp, perfect for edge deployment and local inference.
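For many popular models you do not need to convert anything yourself: pre-quantized GGUF files are published on the Hugging Face Hub and can be downloaded directly. A minimal sketch (the repository and file names below are placeholders, not verified releases):

from huggingface_hub import hf_hub_download

# Download a single pre-quantized GGUF file instead of converting locally.
# repo_id and filename are placeholders; substitute a real GGUF repository.
model_path = hf_hub_download(
    repo_id="your-org/your-model-GGUF",
    filename="your-model.Q4_K_M.gguf",
)
print(f"Downloaded to {model_path}")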
GGUF Conversion and Usage
import subprocess
import os
from pathlib import Path
class GGUFConverter:
def __init__(self, llama_cpp_path: str):
self.llama_cpp_path = Path(llama_cpp_path)
def convert_to_gguf(
self,
model_path: str,
output_path: str,
quantization_type: str = "Q4_K_M"
):
"""
Convert model to GGUF format
Quantization types:
- Q4_0: Original 4-bit quantization
- Q4_K_M: Medium quality 4-bit
- Q5_K_M: Medium quality 5-bit
- Q8_0: 8-bit quantization
"""
# Convert to GGUF format
        # Older llama.cpp trees ship convert.py; newer ones name it convert_hf_to_gguf.py
        convert_script = self.llama_cpp_path / "convert.py"
print(f"Converting to GGUF format...")
subprocess.run([
"python",
str(convert_script),
str(model_path),
"--outfile", output_path,
"--outtype", "f16"
], check=True)
# Quantize
print(f"Quantizing to {quantization_type}...")
        quantize_bin = self.llama_cpp_path / "quantize"  # "llama-quantize" in newer builds
quantized_output = output_path.replace(".gguf", f"-{quantization_type}.gguf")
subprocess.run([
str(quantize_bin),
output_path,
quantized_output,
quantization_type
], check=True)
# Get file sizes
original_size = os.path.getsize(output_path) / 1024**3
quantized_size = os.path.getsize(quantized_output) / 1024**3
print(f"\nOriginal: {original_size:.2f} GB")
print(f"Quantized: {quantized_size:.2f} GB")
print(f"Reduction: {(1 - quantized_size/original_size)*100:.1f}%")
return quantized_output
def run_inference(
self,
model_path: str,
prompt: str,
n_threads: int = 8,
n_gpu_layers: int = 0
):
"""Run inference with GGUF model"""
        llama_bin = self.llama_cpp_path / "main"  # "llama-cli" in newer builds
cmd = [
str(llama_bin),
"-m", model_path,
"-p", prompt,
"-n", "100",
"-t", str(n_threads),
"-ngl", str(n_gpu_layers), # GPU layers for hybrid inference
"--temp", "0.7",
"--top-k", "40",
"--top-p", "0.9"
]
result = subprocess.run(
cmd,
capture_output=True,
text=True
)
return result.stdout
# Usage with llama-cpp-python
from llama_cpp import Llama
class GGUFInference:
def __init__(self, model_path: str, n_gpu_layers: int = 0):
"""Initialize GGUF model for inference"""
self.llm = Llama(
model_path=model_path,
n_gpu_layers=n_gpu_layers, # Number of layers to offload to GPU
n_ctx=2048, # Context window
n_threads=8, # CPU threads
n_batch=512, # Batch size
verbose=False
)
def generate(
self,
prompt: str,
max_tokens: int = 100,
temperature: float = 0.7,
top_p: float = 0.9
):
"""Generate text"""
output = self.llm(
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
echo=False # Don't echo prompt
)
return output["choices"][0]["text"]
def stream_generate(self, prompt: str, max_tokens: int = 100):
"""Stream generation"""
for token in self.llm(
prompt,
max_tokens=max_tokens,
stream=True
):
text = token["choices"][0]["text"]
yield text
def benchmark(self, prompt: str, num_runs: int = 5):
"""Benchmark inference speed"""
import time
times = []
token_counts = []
for _ in range(num_runs):
start = time.time()
output = self.generate(prompt, max_tokens=100)
elapsed = time.time() - start
times.append(elapsed)
# Rough token count
token_counts.append(len(output.split()))
avg_time = sum(times) / len(times)
avg_tokens = sum(token_counts) / len(token_counts)
tokens_per_second = avg_tokens / avg_time
return {
"avg_latency": avg_time,
"tokens_per_second": tokens_per_second,
"total_runs": num_runs
}
# Example usage
gguf_model = GGUFInference(
model_path="./models/llama-2-7b-Q4_K_M.gguf",
n_gpu_layers=0 # CPU only
)
# Generate
response = gguf_model.generate("Explain machine learning:")
print(response)
# Stream
print("\nStreaming generation:")
for token in gguf_model.stream_generate("Once upon a time"):
print(token, end="", flush=True)
# Benchmark
metrics = gguf_model.benchmark("Write a short poem about AI:")
print(f"\n\nBenchmark results:")
print(f"Latency: {metrics['avg_latency']:.2f}s")
print(f"Speed: {metrics['tokens_per_second']:.2f} tokens/sec")
Production Quantization Pipeline
Best Practices: Always validate quantized models on your specific tasks. Quality can vary significantly depending on the model, quantization method, and use case.
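One lightweight way to run that validation is to compare perplexity on a held-out sample of your own data before and after quantization. A minimal sketch (the text list and the model variables in the comments are placeholders for your task data and checkpoints):

import math
import torch

def perplexity(model, tokenizer, texts, max_length=512):
    """Corpus perplexity from the model's own next-token loss."""
    model.eval()
    total_nll, total_tokens = 0.0, 0
    for text in texts:
        enc = tokenizer(text, return_tensors="pt",
                        truncation=True, max_length=max_length).to(model.device)
        with torch.no_grad():
            # labels=input_ids gives the mean next-token cross-entropy
            loss = model(**enc, labels=enc["input_ids"]).loss
        n = enc["input_ids"].numel()
        total_nll += loss.item() * n
        total_tokens += n
    return math.exp(total_nll / total_tokens)

# texts = [...]  # held-out samples from your task
# print("FP16:", perplexity(fp16_model, tokenizer, texts))
# print("INT4:", perplexity(int4_model, tokenizer, texts))

The pipeline below automates this kind of comparison across quantization methods.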
import torch
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

class ProductionQuantizationPipeline:
def __init__(self, model_name: str, output_dir: str):
self.model_name = model_name
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
def quantize_all_methods(self):
"""Quantize model using all methods for comparison"""
results = {}
# GPTQ 4-bit
print("Quantizing with GPTQ 4-bit...")
gptq_4bit = self.quantize_gptq(bits=4)
results["gptq_4bit"] = self.evaluate_model(gptq_4bit)
# AWQ 4-bit
print("Quantizing with AWQ 4-bit...")
awq_4bit = self.quantize_awq(bits=4)
results["awq_4bit"] = self.evaluate_model(awq_4bit)
# 8-bit quantization (bitsandbytes)
print("Quantizing with 8-bit...")
int8_model = self.quantize_8bit()
results["int8"] = self.evaluate_model(int8_model)
# Generate comparison report
self.generate_report(results)
return results
def quantize_gptq(self, bits: int = 4):
"""GPTQ quantization"""
gptq_config = GPTQConfig(
bits=bits,
group_size=128,
dataset="c4"
)
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map="auto",
quantization_config=gptq_config
)
# Save
save_path = self.output_dir / f"gptq_{bits}bit"
model.save_pretrained(save_path)
return model
def quantize_awq(self, bits: int = 4):
"""AWQ quantization"""
from awq import AutoAWQForCausalLM
model = AutoAWQForCausalLM.from_pretrained(self.model_name)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
quant_config = {
"zero_point": True,
"q_group_size": 128,
"w_bit": bits,
}
# Prepare calibration data
from datasets import load_dataset
dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
        calib_data = []
        for i, sample in enumerate(dataset):
            if i >= 128:
                break
            # AutoAWQ's quantize() accepts a list of raw text strings as calib_data
            calib_data.append(sample["text"])
model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_data)
# Save
save_path = self.output_dir / f"awq_{bits}bit"
model.save_quantized(save_path)
return model
def quantize_8bit(self):
"""8-bit quantization with bitsandbytes"""
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False
)
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map="auto",
quantization_config=quantization_config
)
return model
def evaluate_model(self, model):
"""Evaluate quantized model"""
import time
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
# Test prompts
prompts = [
"Explain quantum mechanics:",
"Write a Python function to sort a list:",
"What is the capital of France?"
]
metrics = {
"latencies": [],
"outputs": [],
"memory_gb": 0
}
for prompt in prompts:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
start = time.time()
with torch.no_grad():
outputs = model.generate(**inputs, max_new_tokens=50)
latency = time.time() - start
metrics["latencies"].append(latency)
metrics["outputs"].append(
tokenizer.decode(outputs[0], skip_special_tokens=True)
)
# Calculate memory
if torch.cuda.is_available():
metrics["memory_gb"] = torch.cuda.max_memory_allocated() / 1024**3
metrics["avg_latency"] = sum(metrics["latencies"]) / len(metrics["latencies"])
return metrics
def generate_report(self, results):
"""Generate comparison report"""
print("\n" + "="*60)
print("QUANTIZATION COMPARISON REPORT")
print("="*60)
for method, metrics in results.items():
print(f"\n{method.upper()}:")
print(f" Average Latency: {metrics['avg_latency']:.3f}s")
print(f" Memory Usage: {metrics['memory_gb']:.2f} GB")
print(f" Sample Output: {metrics['outputs'][0][:100]}...")
# Save to file
report_path = self.output_dir / "quantization_report.txt"
with open(report_path, "w") as f:
f.write("Quantization Comparison Report\n")
f.write("="*60 + "\n\n")
for method, metrics in results.items():
f.write(f"{method.upper()}:\n")
f.write(f" Average Latency: {metrics['avg_latency']:.3f}s\n")
f.write(f" Memory Usage: {metrics['memory_gb']:.2f} GB\n")
f.write(f" Outputs:\n")
for i, output in enumerate(metrics['outputs']):
f.write(f" {i+1}. {output}\n")
f.write("\n")
print(f"\nReport saved to {report_path}")
# Example usage
pipeline = ProductionQuantizationPipeline(
model_name="meta-llama/Llama-2-7b-hf",
output_dir="./quantized_models"
)
results = pipeline.quantize_all_methods()
Summary
In this lesson, you learned:
- Quantization fundamentals: Different precision formats and quantization methods
- GPTQ: Second-order post-training quantization with excellent quality-size tradeoffs
- AWQ: Activation-aware quantization for superior quality preservation
- GGUF: CPU-optimized format for edge deployment
- Production pipelines: Comparing and validating quantization methods
Quantization is essential for deploying large language models efficiently in production, reducing both memory requirements and inference costs while maintaining acceptable quality.