Performance Profiling¶
Identifying bottlenecks in training and inference.
Identifying Bottlenecks¶
```python
import torch.profiler

# Profile a few training steps on both CPU and GPU
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    record_shapes=True,
    profile_memory=True,
) as prof:
    # Run a few training steps
    for i, batch in enumerate(train_loader):
        if i >= 10:  # Profile 10 batches
            break
        outputs = model(batch["images"])
        loss = criterion(outputs, batch["labels"])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Print the ops that consumed the most GPU time
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Export for visualization, then open in chrome://tracing
prof.export_chrome_trace("trace.json")
```
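For longer runs, profiling only a steady-state window often gives a cleaner picture than the first few batches, which include warmup. A minimal sketch using `torch.profiler.schedule`; the `wait`/`warmup`/`active` values and the `./tb_logs` output directory are illustrative choices, not project settings:

```python
import torch.profiler

# Skip one batch, warm up for one, then record three active steps
# per cycle; repeat the cycle twice. Values are illustrative.
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./tb_logs"),
) as prof:
    for i, batch in enumerate(train_loader):
        if i >= 12:
            break
        outputs = model(batch["images"])
        loss = criterion(outputs, batch["labels"])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        prof.step()  # Advance the profiler schedule each step
```

The resulting traces can be opened with TensorBoard's profiler plugin instead of chrome://tracing.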
Data Loading Profiling¶
```python
import time

# Measure how long the DataLoader takes to produce each batch.
# The clock must be captured *around* the iterator, not inside the
# loop body, otherwise the measurement is always ~0.
loader = datamodule.train_dataloader()
times = []

start = time.time()
for i, batch in enumerate(loader):
    end = time.time()
    times.append(end - start)  # Time spent waiting for this batch
    if i >= 100:
        break
    start = time.time()  # Just iterate, don't process

print(f"Average batch load time: {sum(times)/len(times):.4f}s")
print(f"Max batch load time: {max(times):.4f}s")
```
If loading is slow, increase `num_workers` and keep workers alive between epochs:

```python
data = ImageDataModule(
    num_workers=8,
    persistent_workers=True,
    pin_memory=True,
)
```
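To confirm that data loading, rather than the model, is the bottleneck, it can help to compare time spent waiting on the loader against time spent in the training step. A rough sketch, assuming the `model`, `criterion`, and `optimizer` from the profiling example above and a CUDA device (`torch.cuda.synchronize()` keeps the GPU timings honest; batches are not moved to the device here, mirroring the examples above):

```python
import time
import torch

loader = data.train_dataloader()
load_time, compute_time = 0.0, 0.0

start = time.time()
for i, batch in enumerate(loader):
    load_time += time.time() - start  # Waiting on the DataLoader

    t0 = time.time()
    outputs = model(batch["images"])
    loss = criterion(outputs, batch["labels"])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    torch.cuda.synchronize()  # Wait for queued GPU work to finish
    compute_time += time.time() - t0

    if i >= 50:
        break
    start = time.time()

print(f"Data loading: {load_time:.2f}s, compute: {compute_time:.2f}s")
```

If the data-loading share stays high even with more workers, the per-item transforms in the dataset are the next place to look.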
Memory Profiling¶
```python
import torch

# Track every CUDA allocation and free from this point on
torch.cuda.memory._record_memory_history()

# Run training
trainer.fit(model, datamodule=data)

# Capture an in-memory snapshot and dump the history to disk
snapshot = torch.cuda.memory._snapshot()
torch.cuda.memory._dump_snapshot("memory_snap.pickle")

# Analyze with https://pytorch.org/memory_viz
```
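For a quick check that doesn't require a full snapshot, PyTorch's public peak-allocation counters are often enough. A small sketch; what you run between the reset and the readout is up to you:

```python
import torch

# Reset the peak counter before the run you want to measure
torch.cuda.reset_peak_memory_stats()

# ... run the steps to measure, e.g. one epoch ...

peak = torch.cuda.max_memory_allocated() / 1024**2
current = torch.cuda.memory_allocated() / 1024**2
print(f"Peak GPU memory: {peak:.1f} MiB (currently allocated: {current:.1f} MiB)")
```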
Related Issues¶
- Slow Training - Performance optimization tips
- OOM Errors - Memory issues
- Device Errors - Hardware problems