Hugging Face Accelerate Code Guide
I’ve created a comprehensive code guide for Hugging Face Accelerate that covers everything from basic setup to advanced features like DeepSpeed integration.
Table of Contents
- Installation and Setup
- Basic Concepts
- Simple Training Loop
- Advanced Features
- Multi-GPU Training
- Mixed Precision Training
- Gradient Accumulation
- DeepSpeed Integration
- Troubleshooting
- Performance Tips
- Complete Example: BERT Fine-tuning
Installation and Setup
Installation
pip install accelerate
Configuration
Run the configuration wizard to set up your training environment:
accelerate config
Or create a config file programmatically:
from accelerate.utils import write_basic_config

write_basic_config(mixed_precision="fp16")  # or "bf16", "no"
Basic Concepts
The Accelerator Object
The Accelerator is the main class that handles device placement, gradient synchronization, and other distributed training concerns.
from accelerate import Accelerator
# Initialize accelerator
accelerator = Accelerator()

# Key properties
device = accelerator.device
is_main_process = accelerator.is_main_process
num_processes = accelerator.num_processes
Device Placement
Accelerate automatically handles device placement:
# Manual device placement (old way)
model = model.to(device)
batch = {k: v.to(device) for k, v in batch.items()}

# Accelerate way (automatic)
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
# No need to move batches to the device - Accelerate handles it via the prepared dataloader
Simple Training Loop
Basic Example
import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW

def train_model():
    # Initialize accelerator
    accelerator = Accelerator()

    # Load model and tokenizer (a classification head is needed so outputs.loss exists)
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Create optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Create dataloader (your tokenized dataset with labels goes here)
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

    # Prepare everything with accelerator
    model, optimizer, train_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader
    )

    # Training loop
    model.train()
    for epoch in range(3):
        for batch in train_dataloader:
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss

            # Backward pass
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()

            # Print loss (only on main process)
            if accelerator.is_main_process:
                print(f"Loss: {loss.item():.4f}")

if __name__ == "__main__":
    train_model()
Running the Training
# Single GPU
python train.py
# Multiple GPUs
accelerate launch --num_processes=2 train.py
# With config file
accelerate launch --config_file config.yaml train.py
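If you are working in a notebook rather than launching from a terminal, Accelerate also provides notebook_launcher to spawn a training function across processes. A minimal sketch, assuming the train_model function defined above:

from accelerate import notebook_launcher

# Spawn train_model on 2 processes (e.g. 2 GPUs) directly from a notebook
notebook_launcher(train_model, args=(), num_processes=2)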
Advanced Features
Logging and Tracking
from accelerate import Accelerator
from accelerate.logging import get_logger
# Initialize with logging
accelerator = Accelerator(log_with="tensorboard", project_dir="./logs")

# Get logger
logger = get_logger(__name__)

# Start tracking
accelerator.init_trackers("my_experiment")

# Log metrics
accelerator.log({"train_loss": loss.item(), "epoch": epoch})

# End tracking
accelerator.end_training()
Saving and Loading Models
# Save model
"path/to/save")
accelerator.save_model(model,
# Or save state dict
"model.pt")
accelerator.save(model.state_dict(),
# Load model
"model.pt")
accelerator.load_state(
# Save complete training state
"checkpoint_dir")
accelerator.save_state(
# Load complete training state
"checkpoint_dir") accelerator.load_state(
Evaluation Loop
def evaluate_model(model, eval_dataloader, accelerator):
    model.eval()
    total_loss = 0
    total_samples = 0

    with torch.no_grad():
        for batch in eval_dataloader:
            outputs = model(**batch)
            loss = outputs.loss

            # Gather losses from all processes
            gathered_loss = accelerator.gather(loss)
            total_loss += gathered_loss.sum().item()
            total_samples += gathered_loss.shape[0]

    avg_loss = total_loss / total_samples
    return avg_loss
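For metrics that depend on per-sample predictions, gather_for_metrics is often preferable to gather, since it drops the duplicate samples that the distributed sampler adds to pad the final batch. A minimal sketch of a distributed accuracy computation, assuming each batch carries a labels key:

import torch

def evaluate_accuracy(model, eval_dataloader, accelerator):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            outputs = model(**batch)
            preds = outputs.logits.argmax(dim=-1)
            # Drops samples that were duplicated to pad the last batch across processes
            preds, labels = accelerator.gather_for_metrics((preds, batch["labels"]))
            correct += (preds == labels).sum().item()
            total += labels.numel()
    return correct / total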
Multi-GPU Training
Data Parallel Training
from accelerate import Accelerator
def train_multi_gpu():
    accelerator = Accelerator()

    # Model will be replicated across GPUs
    model = MyModel()
    optimizer = torch.optim.Adam(model.parameters())

    # Prepare for multi-GPU
    model, optimizer, train_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader
    )

    # Training loop remains the same
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss

        # Accelerate handles gradient synchronization
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
Launch Commands
# Launch on 4 GPUs
accelerate launch --num_processes=4 --multi_gpu train.py
# Launch with specific GPUs
CUDA_VISIBLE_DEVICES=0,1,3 accelerate launch --num_processes=3 train.py
# Launch on multiple nodes (run the command on every node, each with its own --machine_rank)
accelerate launch --num_processes=8 --num_machines=2 --machine_rank=0 --main_process_ip=192.168.1.1 train.py
Mixed Precision Training
Automatic Mixed Precision
# Enable mixed precision in config or during initialization
accelerator = Accelerator(mixed_precision="fp16")  # or "bf16"

# Training loop remains exactly the same
for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss

    # Accelerate handles loss scaling automatically
    accelerator.backward(loss)
    optimizer.step()
    optimizer.zero_grad()
Manual Mixed Precision Control
# Access the scaler if needed
if accelerator.mixed_precision == "fp16":
    scaler = accelerator.scaler

    # Manual scaling (usually not needed)
    scaled_loss = scaler.scale(loss)
    scaled_loss.backward()
    scaler.step(optimizer)
    scaler.update()
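For running a forward pass under the configured precision in code the accelerator does not manage directly (for example, a custom inference helper), Accelerate exposes an autocast context manager. A minimal sketch, assuming model and batch come from the prepared objects above:

# Run this block under the mixed-precision settings configured on the accelerator
with accelerator.autocast():
    outputs = model(**batch)
    loss = outputs.loss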
Gradient Accumulation
Basic Gradient Accumulation
accelerator = Accelerator(gradient_accumulation_steps=4)

for batch in train_dataloader:
    # Use the accumulate context manager
    with accelerator.accumulate(model):
        outputs = model(**batch)
        loss = outputs.loss

        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
Dynamic Gradient Accumulation
def train_with_dynamic_accumulation():
    accumulation_steps = 2

    for i, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss / accumulation_steps  # Scale loss

        accelerator.backward(loss)

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
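When using the accumulate() context manager shown earlier, accelerator.sync_gradients reports whether the current iteration actually synchronized gradients and performed an optimizer update, which is useful for counting real optimization steps or logging only at update boundaries. A minimal sketch under those assumptions:

completed_steps = 0
for batch in train_dataloader:
    with accelerator.accumulate(model):
        outputs = model(**batch)
        accelerator.backward(outputs.loss)
        optimizer.step()
        optimizer.zero_grad()

    # True only on iterations where gradients were synchronized and the optimizer stepped
    if accelerator.sync_gradients:
        completed_steps += 1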
DeepSpeed Integration
DeepSpeed Configuration
Create a DeepSpeed config file (ds_config.json):
{
"train_batch_size": 32,
"gradient_accumulation_steps": 1,
"optimizer": {
"type": "Adam",
"params": {
"lr": 5e-5
}
},
"fp16": {
"enabled": true
},
"zero_optimization": {
"stage": 2
}
}
Using DeepSpeed
from accelerate import Accelerator
from accelerate import DeepSpeedPlugin

# Initialize with a DeepSpeed config file
ds_plugin = DeepSpeedPlugin(hf_ds_config="ds_config.json")
accelerator = Accelerator(deepspeed_plugin=ds_plugin)

# Or configure the plugin programmatically
ds_plugin = DeepSpeedPlugin(
    gradient_accumulation_steps=4,
    zero_stage=2,
    offload_optimizer_device="cpu",
)
accelerator = Accelerator(deepspeed_plugin=ds_plugin)

# Training code remains the same
model, optimizer = accelerator.prepare(model, optimizer)
Launch with DeepSpeed
# The file passed to --config_file is an Accelerate config (created with `accelerate config`)
# that enables DeepSpeed; it is not the DeepSpeed JSON itself
accelerate launch --config_file ds_config.yaml train.py
Troubleshooting
Common Issues and Solutions
Memory Issues
# Clear the CUDA cache periodically (each process manages its own GPU)
torch.cuda.empty_cache()
# Use gradient checkpointing
model.gradient_checkpointing_enable()
# Reduce batch size or increase gradient accumulation
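Accelerate also provides free_memory, which releases the accelerator's internal references to prepared objects and empties the CUDA cache; this can help when running several training configurations in one process. A minimal sketch, assuming the prepared objects from earlier:

# Drop the accelerator's references to prepared objects and empty the CUDA cache
accelerator.free_memory()
# Then delete your own references so the tensors can actually be freed
del model, optimizer, train_dataloader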
Synchronization Issues
# Wait for all processes
accelerator.wait_for_everyone()
# Gather data from all processes
all_losses = accelerator.gather(loss)

# Reduce across processes
avg_loss = accelerator.reduce(loss, reduction="mean")
Debugging
# Enable Accelerate's operational debug mode from the launcher or the environment
# accelerate launch --debug train.py
# (or set the ACCELERATE_DEBUG_MODE="1" environment variable)

# Check if running in distributed mode
if accelerator.distributed_type != "NO":
    print(f"Running on {accelerator.num_processes} processes")

# Print only on the main process
accelerator.print("This will only print once")
Performance Tips
- Use appropriate batch sizes: Larger batch sizes generally improve GPU utilization
- Enable mixed precision: Use fp16 or bf16 for faster training
- Gradient accumulation: Simulate larger batch sizes without memory issues
- DataLoader optimization: Use num_workers and pin_memory=True
- Compile models: Use torch.compile() for PyTorch 2.0+
# Optimized setup
accelerator = Accelerator(
    mixed_precision="bf16",
    gradient_accumulation_steps=4
)

# Compile model (PyTorch 2.0+)
model = torch.compile(model)

# Optimized DataLoader
train_dataloader = DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,
    pin_memory=True,
    shuffle=True
)
Complete Example: BERT Fine-tuning
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from accelerate import Accelerator
from datasets import load_dataset
def main():
    # Initialize
    accelerator = Accelerator(mixed_precision="fp16")

    # Load data
    dataset = load_dataset("imdb")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, padding="max_length")

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    # Keep only tensor columns and rename "label" to the "labels" key the model expects
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    train_dataset = tokenized_datasets["train"].with_format("torch")

    # Model and optimizer
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # DataLoader
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

    # Prepare everything
    model, optimizer, train_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader
    )

    # Training loop
    num_epochs = 3
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss

            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

            if step % 100 == 0 and accelerator.is_main_process:
                print(f"Epoch {epoch}, Step {step}, Loss: {loss.item():.4f}")

        if accelerator.is_main_process:
            avg_loss = total_loss / len(train_dataloader)
            print(f"Epoch {epoch} completed. Average loss: {avg_loss:.4f}")

    # Save model
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained("./fine_tuned_bert")

    if accelerator.is_main_process:
        tokenizer.save_pretrained("./fine_tuned_bert")

if __name__ == "__main__":
    main()
This guide covers the essential aspects of using Hugging Face Accelerate for distributed training. The library abstracts away much of the complexity while providing fine-grained control when needed.