Chapter 08: Need for Speed I — Device (CPU, GPU, torch.compile)
Moving from a CPU to a GPU can speed up matrix operations by 100–1000×. Understanding why requires knowing the difference between compute-bound and memory-bound operations, and how the GPU’s thousands of simple cores are organised to exploit data parallelism.
A CPU has a handful of powerful cores optimised for low-latency, branching, sequential code. A GPU has thousands of tiny cores that execute the same instruction on thousands of data elements simultaneously (a SIMD-style execution model NVIDIA calls SIMT). Matrix multiplication (A @ B) maps perfectly onto this model: every output element is an independent dot product.
torch.compile (introduced in PyTorch 2.0) applies a compilation step that fuses multiple Python-level operations into a single optimised kernel, eliminating the overhead of launching many small GPU kernels. For transformer models, compile can give a 1.5–3× additional speedup on top of moving to GPU.
In this chapter we profile CPU vs GPU execution, measure memory bandwidth, and show how to use torch.compile and torch.cuda.amp for peak throughput.
1. Device Setup
import torch
import time

# Prefer CUDA whenever the driver reports a usable GPU; otherwise fall
# back to CPU-only execution.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if device == "cuda":
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {props.total_memory / 1e9:.1f} GB")
    print(f"Compute capability: {torch.cuda.get_device_capability(0)}")
else:
    import platform
    print(f"CPU: {platform.processor()}")
    print("No GPU detected — timings will be CPU only.")
2. Moving Tensors to GPU
import torch.nn as nn

# Tensors are born on the CPU unless a device is given explicitly.
x_cpu = torch.randn(1024, 1024)
print(f"x_cpu device: {x_cpu.device}")

# .to(device) returns a copy on the target device (no-op if already there).
x_gpu = x_cpu.to(device)
print(f"x_gpu device: {x_gpu.device}")

# All operands of an op must live on the same device — mixing raises:
# x_cpu + x_gpu ← RuntimeError

# numpy (and plotting libraries) only understand host memory, so copy back.
x_back = x_gpu.cpu().numpy()
print(f"Back on numpy: {x_back.shape}")

# Calling .to(device) on a module moves every parameter and buffer.
model = nn.Linear(512, 512)
model = model.to(device)
print(f"Model weight device: {next(model.parameters()).device}")
3. Profiling Matrix Multiplication
def benchmark_matmul(size: int, device: str, n_runs: int = 50) -> float:
    """Return the average wall-clock time, in milliseconds, of one
    (size, size) @ (size, size) matrix multiplication on `device`."""
    lhs = torch.randn(size, size, device=device)
    rhs = torch.randn(size, size, device=device)
    on_gpu = device == "cuda"

    # Discard a few runs: the first calls pay one-off costs (kernel
    # selection / launch setup) that would skew the average.
    for _ in range(5):
        lhs @ rhs
    if on_gpu:
        # GPU launches are asynchronous — drain the queue before timing.
        torch.cuda.synchronize()

    start = time.perf_counter()
    for _ in range(n_runs):
        product = lhs @ rhs
    if on_gpu:
        torch.cuda.synchronize()  # wait for all queued kernels to finish
    return (time.perf_counter() - start) * 1000 / n_runs
sizes = [128, 256, 512, 1024, 2048, 4096]
print(f"{'Size':>6} {'CPU (ms)':>10} {'GPU (ms)':>10} {'Speedup':>8}")
print("-" * 42)
for s in sizes:
    cpu_ms = benchmark_matmul(s, "cpu", n_runs=10)
    if device == "cuda":
        gpu_ms = benchmark_matmul(s, device, n_runs=50)
        speedup = cpu_ms / gpu_ms
    else:
        # No GPU: report CPU numbers in both columns, speedup of 1.
        gpu_ms, speedup = cpu_ms, 1.0
    print(f"{s:6d} {cpu_ms:10.2f} {gpu_ms:10.2f} {speedup:7.1f}×")
4. GPU Memory Management
if device == "cuda":
    torch.cuda.empty_cache()

    # Allocate a 4096×4096 float32 tensor and watch the allocator counters.
    x = torch.randn(4096, 4096, device="cuda")
    print(f"Allocated: {torch.cuda.memory_allocated() / 1e6:.1f} MB")
    print(f"Reserved : {torch.cuda.memory_reserved() / 1e6:.1f} MB")

    # Dropping the Python reference returns the block to PyTorch's caching
    # allocator; empty_cache() hands the reserved memory back to the driver.
    del x
    torch.cuda.empty_cache()
    print(f"After del: {torch.cuda.memory_allocated() / 1e6:.1f} MB")

    # Full breakdown of the allocator state.
    print(torch.cuda.memory_summary(abbreviated=True))
else:
    print("Skipping GPU memory profiling (no CUDA device)")
5. torch.compile
import torch
class SmallTransformer(nn.Module):
    """Token embedding → pre-norm Transformer encoder → vocab projection.

    Input is a (batch, seq) tensor of token ids; output is a
    (batch, seq, vocab) tensor of logits.
    """

    def __init__(self, d_model=256, n_heads=4, n_layers=4, vocab=1000):
        super().__init__()
        self.emb = nn.Embedding(vocab, d_model)
        layer = nn.TransformerEncoderLayer(
            d_model,
            n_heads,
            dim_feedforward=4 * d_model,
            dropout=0.0,
            batch_first=True,
            norm_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, n_layers)
        self.head = nn.Linear(d_model, vocab)

    def forward(self, x):
        hidden = self.transformer(self.emb(x))
        return self.head(hidden)
# Eager baseline plus a compiled wrapper around the very same weights.
model_eager = SmallTransformer().to(device).eval()
model_compiled = torch.compile(model_eager)  # first forward triggers ~30s of compilation
x = torch.randint(0, 1000, (8, 128), device=device)
# Benchmark eager vs compiled
def time_model(m, x, n=30):
    """Return the average forward-pass latency of `m` on `x`, in milliseconds.

    Args:
        m: callable model; invoked as m(x).
        x: input tensor — synchronisation is keyed off x's device.
        n: number of timed iterations to average over.

    Fix: the original consulted the module-level `device` name; we now
    synchronise based on where `x` actually lives, so the function times
    correctly regardless of globals.
    """
    on_gpu = x.is_cuda
    # Warm up: absorb one-off costs (compilation, kernel launch setup).
    for _ in range(5):
        m(x)
    if on_gpu:
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(n):
        m(x)
    if on_gpu:
        # GPU execution is async — wait before reading the clock.
        torch.cuda.synchronize()
    return (time.perf_counter() - t0) * 1000 / n
# Inference-only comparison: no autograd bookkeeping in either run.
with torch.no_grad():
    eager_ms = time_model(model_eager, x)
    compiled_ms = time_model(model_compiled, x)
print(f"Eager : {eager_ms:.2f} ms")
print(f"Compiled : {compiled_ms:.2f} ms")
print(f"Speedup : {eager_ms / compiled_ms:.2f}×")
6. Pinned Memory and DataLoader Optimisation
from torch.utils.data import DataLoader, TensorDataset

# Synthetic corpus: 100k sequences of 64 token ids with integer targets.
N = 100_000
X = torch.randint(0, 1000, (N, 64))
Y = torch.randint(0, 1000, (N,))
dataset = TensorDataset(X, Y)

# pin_memory=True places batches in page-locked host RAM so the GPU can
# pull them via DMA; num_workers > 0 prepares batches in background
# worker processes, overlapping loading with compute.
loader_slow = DataLoader(dataset, batch_size=256, pin_memory=False, num_workers=0)
loader_fast = DataLoader(dataset, batch_size=256, pin_memory=True, num_workers=2)
def time_loader(loader, label, target=None):
    """Time one full pass over `loader`, copying each batch to a device.

    Args:
        loader: DataLoader yielding (x, y) batches.
        label: name printed next to the measured time.
        target: destination device; defaults to the module-level `device`
            (evaluated lazily, so callers that pass `target` never touch
            the global).

    Returns:
        Elapsed wall-clock time in milliseconds.

    Fix: the original stopped the clock without synchronising, but
    non_blocking=True makes CUDA copies asynchronous — the measurement
    excluded most of the transfer time. We now synchronise first.
    """
    dest = device if target is None else target
    t0 = time.perf_counter()
    for xb, yb in loader:
        xb = xb.to(dest, non_blocking=True)
        yb = yb.to(dest, non_blocking=True)
    if str(dest).startswith("cuda") and torch.cuda.is_available():
        # Wait for the async host→device copies to actually complete.
        torch.cuda.synchronize()
    elapsed = (time.perf_counter() - t0) * 1000
    print(f"{label}: {elapsed:.0f} ms")
    return elapsed
# Compare the naive loader against the pinned, multi-worker one.
for lbl, ldr in (("Standard loader", loader_slow), ("Pinned+async loader", loader_fast)):
    time_loader(ldr, lbl)
7. Profiling with torch.profiler
from torch.profiler import profile, record_function, ProfilerActivity

# Profile one full training step: forward, backward, optimiser update.
model_eager = model_eager.train()
optim = torch.optim.AdamW(model_eager.parameters(), lr=1e-3)

activities = [ProfilerActivity.CPU]
if device == "cuda":
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities, record_shapes=True) as prof:
    with record_function("forward"):
        out = model_eager(x)
        loss = out.sum()
    with record_function("backward"):
        loss.backward()
    with record_function("optimizer"):
        optim.step()
        optim.zero_grad()

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
8. Summary
| Technique | Typical gain |
|---|---|
| Move to GPU | 100–1000× for matmul |
| torch.compile | 1.5–3× over eager mode |
| Pinned memory | Removes CPU↔GPU copy bottleneck |
| num_workers > 0 | Overlaps data loading with GPU compute |
| non_blocking=True | Async GPU transfers |
Chapter 9 covers precision: using float16 / bfloat16 instead of float32 halves memory and often doubles throughput with near-zero accuracy cost.