DINOv2: Comprehensive Implementation Guide

DINOv2 is a state-of-the-art self-supervised vision model developed by Meta AI Research that builds upon the original DINO (Self-Distillation with No Labels) framework. This guide will walk you through understanding, implementing, and leveraging DINOv2 for various computer vision tasks.
Introduction to DINOv2
DINOv2 is a self-supervised learning method for vision that produces high-quality visual features without requiring labeled data. It extends the original DINO architecture with several improvements:
- Training on LVD-142M, a large, curated dataset of 142 million images
- Enhanced teacher-student architecture
- Improved augmentation strategy
- Multi-scale feature learning
- Support for various Vision Transformer (ViT) backbones
The result is a versatile foundation model that can be adapted to numerous vision tasks with minimal fine-tuning.
Installation and Setup
To use DINOv2, you’ll need to install the official implementation:
# Install PyTorch first if not already installed
# pip install torch torchvision
# Install DINOv2
pip install git+https://github.com/facebookresearch/dinov2
Alternatively, you can clone the repository and install it locally:
git clone https://github.com/facebookresearch/dinov2.git
cd dinov2
pip install -e .
Dependencies
DINOv2 requires:
- Python 3.9+ (the version targeted by the official repository)
- PyTorch 2.0+ (the version targeted by the official repository)
- torchvision
- CUDA (for GPU acceleration)
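Before loading any models, it is worth confirming that PyTorch can see your GPU. A quick sanity check (plain PyTorch calls, nothing DINOv2-specific):
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")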
Loading Pre-trained Models
DINOv2 provides several pre-trained models with different sizes and capabilities:
import torch

# The official checkpoints are published as torch.hub entry points in the
# facebookresearch/dinov2 repository. Available backbones:
# dinov2_vits14 (small), dinov2_vitb14 (base), dinov2_vitl14 (large), dinov2_vitg14 (giant)
model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # Set to evaluation mode
You can also use the Hugging Face Transformers library for easier integration:
from transformers import AutoImageProcessor, AutoModel
# Available checkpoints: facebook/dinov2-small, facebook/dinov2-base, facebook/dinov2-large, facebook/dinov2-giant
model_name = "facebook/dinov2-base"
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
Feature Extraction
One of DINOv2’s key strengths is its ability to extract powerful visual features:
import torch
from PIL import Image
import torchvision.transforms as T
from transformers import AutoImageProcessor, AutoModel
# Load model
model_name = "facebook/dinov2-base"
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model.eval()
# Load and preprocess image
image = Image.open("path/to/your/image.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt").to(model.device)
# Extract features
with torch.no_grad():
    outputs = model(**inputs)
# Get CLS token features (useful for classification tasks)
cls_features = outputs.last_hidden_state[:, 0]
# Get patch features (useful for dense prediction tasks like segmentation)
patch_features = outputs.last_hidden_state[:, 1:]
print(f"CLS features shape: {cls_features.shape}")
print(f"Patch features shape: {patch_features.shape}")Fine-tuning for Downstream Tasks
DINOv2 can be fine-tuned for specific vision tasks:
import torch
import torch.nn as nn
from transformers import AutoModel
# Load pre-trained DINOv2 model
backbone = AutoModel.from_pretrained("facebook/dinov2-base")
# Create a custom classification head
class ClassificationHead(nn.Module):
    def __init__(self, backbone, num_classes=1000):
        super().__init__()
        self.backbone = backbone
        self.classifier = nn.Linear(backbone.config.hidden_size, num_classes)

    def forward(self, x):
        outputs = self.backbone(x)
        cls_token = outputs.last_hidden_state[:, 0]
        return self.classifier(cls_token)
# Create the complete model
model = ClassificationHead(backbone, num_classes=100) # For 100 classes
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
# Training loop example
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        images = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
Image Classification Example
Here’s a complete example for image classification using DINOv2:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms
from transformers import AutoImageProcessor, AutoModel
# Define the dataset and transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# Load your dataset (adjust the path)
train_dataset = ImageFolder(root="path/to/train", transform=transform)
val_dataset = ImageFolder(root="path/to/val", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
# Create the model
class DINOv2Classifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.dinov2 = AutoModel.from_pretrained("facebook/dinov2-base")
        self.classifier = nn.Linear(768, num_classes)  # 768 is the hidden size for the base model

    def forward(self, x):
        # Extract features
        with torch.set_grad_enabled(self.training):
            features = self.dinov2(x).last_hidden_state[:, 0]  # Get CLS token
        # Classify
        logits = self.classifier(features)
        return logits
# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DINOv2Classifier(num_classes=len(train_dataset.classes))
model = model.to(device)
# Define optimizer and loss function
optimizer = torch.optim.AdamW([
    {'params': model.classifier.parameters(), 'lr': 1e-3},
    {'params': model.dinov2.parameters(), 'lr': 1e-5}
])
criterion = nn.CrossEntropyLoss()
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    train_accuracy = 100 * correct / total

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    val_accuracy = 100 * correct / total

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_accuracy:.2f}%")
    print(f"Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_accuracy:.2f}%")
Semantic Segmentation Example
DINOv2 is particularly powerful for segmentation tasks:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
class DINOv2Segmenter(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Load DINOv2 backbone
        self.backbone = AutoModel.from_pretrained("facebook/dinov2-base")
        # Define segmentation head
        hidden_dim = self.backbone.config.hidden_size
        self.segmentation_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden_dim, num_classes, kernel_size=1)
        )
        # Image size and patch size for reshaping
        self.image_size = 224
        self.patch_size = 14  # For ViT-Base
        self.num_patches = (self.image_size // self.patch_size) ** 2

    def forward(self, x):
        # Get patch features
        outputs = self.backbone(x)
        patch_features = outputs.last_hidden_state[:, 1:]  # Remove CLS token
        # Reshape to 2D spatial layout
        B = x.shape[0]
        H = W = self.image_size // self.patch_size
        patch_features = patch_features.reshape(B, H, W, -1).permute(0, 3, 1, 2)
        # Apply segmentation head
        segmentation_logits = self.segmentation_head(patch_features)
        # Upsample to original image size
        segmentation_logits = F.interpolate(
            segmentation_logits,
            size=(self.image_size, self.image_size),
            mode='bilinear',
            align_corners=False
        )
        return segmentation_logits
# Create model and move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DINOv2Segmenter(num_classes=21) # 21 classes for Pascal VOC
model = model.to(device)
# Define optimizer and loss function
optimizer = torch.optim.AdamW([
    {'params': model.segmentation_head.parameters(), 'lr': 1e-3},
    {'params': model.backbone.parameters(), 'lr': 1e-5}
])
criterion = nn.CrossEntropyLoss(ignore_index=255) # 255 is typically the ignore index
# Rest of the training code would be similar to the classification example
Object Detection Example
Here’s how to use DINOv2 features for object detection with a simple detection head:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
class DINOv2Detector(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Load DINOv2 backbone
        self.backbone = AutoModel.from_pretrained("facebook/dinov2-base")
        hidden_dim = self.backbone.config.hidden_size
        # Detection heads
        self.box_predictor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, 4)  # (x1, y1, x2, y2)
        )
        self.class_predictor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, num_classes + 1)  # +1 for background
        )
        # Image size and patch size for feature map creation
        self.image_size = 224
        self.patch_size = 14  # For ViT-Base

    def forward(self, x):
        # Get features
        outputs = self.backbone(x)
        features = outputs.last_hidden_state[:, 1:]  # Remove CLS token
        # Reshape to 2D spatial layout
        B = x.shape[0]
        H = W = self.image_size // self.patch_size
        features = features.reshape(B, H, W, -1)
        # Flatten for prediction heads
        features_flat = features.reshape(B, -1, features.shape[-1])
        # Predict boxes and classes
        boxes = self.box_predictor(features_flat)
        classes = self.class_predictor(features_flat)
        return {'boxes': boxes, 'classes': classes, 'features_map': features}
# Create model and move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DINOv2Detector(num_classes=80) # 80 classes for COCO
model = model.to(device)
# Training would require a more complex detection pipeline with NMS, etc.
Advanced Usage and Customization
Custom Vision Transformer Configurations
You can customize the DINOv2 model architecture:
from dinov2.configs import load_and_merge_config
from dinov2.models import build_model_from_cfg
# Note: the exact helper names, config names, and config fields depend on the
# version of the dinov2 repository you have installed; check dinov2/configs and
# dinov2/models in your checkout.
# Load a training config and modify it
cfg = load_and_merge_config("train/vitl16_short")
# Modify configuration
cfg.student.drop_path_rate = 0.2  # Change stochastic depth rate
cfg.student.num_register_tokens = 4  # Number of register tokens (if supported by your version)
# Build model from modified config
model = build_model_from_cfg(cfg)
Extracting Intermediate Features
For some applications, you might want to extract features from intermediate layers:
import torch
from transformers import AutoModel
from torch.utils.hooks import RemovableHandle
class FeatureExtractor:
    def __init__(self, model, layers=None):
        self.model = model
        self.features = {}
        self.hooks = []
        # Default to extracting from the last block if no layers specified
        self.layers = layers if layers is not None else [11]  # Base model has 12 blocks (0-11)
        # Register forward hooks on the requested transformer blocks
        for idx in self.layers:
            hook = self.model.encoder.layer[idx].register_forward_hook(
                # Hugging Face transformer layers return a tuple; the first element is the hidden states
                lambda module, input, output, idx=idx: self.features.update({f"layer_{idx}": output[0]})
            )
            self.hooks.append(hook)

    def __call__(self, x):
        self.features.clear()
        with torch.no_grad():
            outputs = self.model(x)
        return self.features

    def remove_hooks(self):
        for hook in self.hooks:
            hook.remove()
# Usage
model = AutoModel.from_pretrained("facebook/dinov2-base")
extractor = FeatureExtractor(model, layers=[3, 7, 11])
# Extract features (input_image should be a preprocessed pixel_values tensor,
# e.g. produced by AutoImageProcessor as in the feature extraction example)
features = extractor(input_image)
layer_3_features = features["layer_3"]
layer_7_features = features["layer_7"]
layer_11_features = features["layer_11"]
# Clean up
extractor.remove_hooks()
Performance Benchmarks
DINOv2 achieves excellent results across various vision tasks. Here are typical performance metrics:
- ImageNet-1K Classification (linear evaluation, top-1 accuracy):
- DINOv2-Small: ~81.1%
- DINOv2-Base: ~84.5%
- DINOv2-Large: ~86.3%
- DINOv2-Giant: ~86.5%
- Semantic Segmentation (ADE20K) (mIoU):
- DINOv2-Small: ~47.5%
- DINOv2-Base: ~50.2%
- DINOv2-Large: ~52.5%
- DINOv2-Giant: ~53.8%
- Object Detection (COCO) (AP):
- DINOv2-Small: ~48.5%
- DINOv2-Base: ~51.3%
- DINOv2-Large: ~53.2%
- DINOv2-Giant: ~54.5%
Troubleshooting
Common Issues and Solutions
- Out of Memory Errors
- Reduce batch size
- Use gradient accumulation (sketched just below)
- Use a smaller model variant (Small or Base)
- Use mixed precision training
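If a smaller batch size is still too large, gradient accumulation keeps per-step memory low while simulating a larger effective batch. A minimal sketch, assuming the model, train_loader, optimizer, criterion, and device from the classification example (accumulation_steps is an illustrative setting you would tune):
accumulation_steps = 4  # effective batch size = batch_size * accumulation_steps

optimizer.zero_grad()
for step, (inputs, targets) in enumerate(train_loader):
    inputs, targets = inputs.to(device), targets.to(device)
    outputs = model(inputs)
    # Average the loss over the accumulated steps so gradient magnitudes stay comparable
    loss = criterion(outputs, targets) / accumulation_steps
    loss.backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()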
# Example of mixed precision training
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
for inputs, targets in train_loader:
    inputs, targets = inputs.to(device), targets.to(device)
    optimizer.zero_grad()
    # Use autocast for mixed precision
    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, targets)
    # Scale loss and backprop
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
- Slow Inference
- Use batch processing
- Use model.eval() and torch.no_grad()
- Consider model distillation or quantization
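Putting these tips together, here is a minimal batched-inference sketch (it assumes pixel_values is a preprocessed batch tensor and that model and device come from the earlier Hugging Face loading example; the autocast line assumes a CUDA GPU and can be dropped on CPU):
import torch

model.eval()
with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
    outputs = model(pixel_values.to(device))
    cls_features = outputs.last_hidden_state[:, 0]  # one feature vector per image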
- Poor Performance on Downstream Tasks
- Ensure proper data preprocessing
- Adjust learning rates (lower for backbone, higher for heads)
- Use appropriate augmentations (see the sketch after this list)
- Consider using a larger variant of DINOv2
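To illustrate the augmentation point above, here is a minimal training-time transform pipeline built from standard torchvision augmentations (the specific choices are illustrative defaults, not the DINOv2 training recipe; keep the deterministic resize/normalize transform for validation):
import torchvision.transforms as transforms

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.6, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])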
Debugging Tips
- Visualize model attention maps to understand what the model focuses on:
import matplotlib.pyplot as plt
import torch.nn.functional as F
from PIL import Image
import torchvision.transforms as T
def get_attention_map(model, img_tensor):
    model.eval()
    with torch.no_grad():
        outputs = model(img_tensor.unsqueeze(0), output_attentions=True)
    # Get attention weights from the last layer
    att_mat = outputs.attentions[-1]
    # Average attention across heads
    att_mat = att_mat.mean(dim=1)
    # Attention from the CLS token to the patch tokens; 224 / 14 = 16 patches per side
    cls_att_map = att_mat[0, 0, 1:].reshape(16, 16)
    return cls_att_map.cpu().numpy()
# Load and preprocess image
image = Image.open("path/to/image.jpg").convert("RGB")
transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
img_tensor = transform(image).to(device)
# Get attention map
from transformers import AutoModel
model = AutoModel.from_pretrained("facebook/dinov2-base", output_attentions=True)
model.to(device)
attention_map = get_attention_map(model, img_tensor)
# Visualize
plt.figure(figsize=(10, 10))
plt.imshow(image.resize((224, 224)))
# Stretch the 16x16 attention map over the 224x224 image
plt.imshow(attention_map, alpha=0.5, cmap='jet', extent=(0, 224, 224, 0))
plt.axis('off')
plt.colorbar()
plt.savefig('attention_map.png')
plt.close()
This guide should help you get started with DINOv2 and explore its capabilities for various computer vision tasks. As a self-supervised vision foundation model, DINOv2 provides a strong starting point for numerous applications with minimal labeled data requirements.