DINOv2: Comprehensive Implementation Guide
DINOv2 is a state-of-the-art self-supervised vision model developed by Meta AI Research that builds upon the original DINO (Self-Distillation with No Labels) framework. This guide will walk you through understanding, implementing, and leveraging DINOv2 for various computer vision tasks.
Table of Contents
- Introduction to DINOv2
- Installation and Setup
- Loading Pre-trained Models
- Feature Extraction
- Fine-tuning for Downstream Tasks
- Image Classification Example
- Semantic Segmentation Example
- Object Detection Example
- Advanced Usage and Customization
- Performance Benchmarks
- Troubleshooting
Introduction to DINOv2
DINOv2 is a self-supervised learning method for vision that produces high-quality visual features without requiring labeled data. It extends the original DINO architecture with several improvements:
- Training on a large and diverse dataset of images
- Enhanced teacher-student architecture
- Improved augmentation strategy
- Multi-scale feature learning
- Support for various Vision Transformer (ViT) backbones
The result is a versatile foundation model that can be adapted to numerous vision tasks with minimal fine-tuning.
Installation and Setup
To use DINOv2, you’ll need to install the official implementation:
# Install PyTorch first if not already installed
# pip install torch torchvision
# Install DINOv2
pip install git+https://github.com/facebookresearch/dinov2
Alternatively, you can clone the repository and install it locally:
git clone https://github.com/facebookresearch/dinov2.git
cd dinov2
pip install -e .
Dependencies
DINOv2 requires:
- Python 3.8+
- PyTorch 1.12+
- torchvision
- CUDA (for GPU acceleration)
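A quick way to confirm that your environment satisfies these requirements is to query PyTorch directly; a minimal sketch:
import sys
import torch
import torchvision

# Print the versions DINOv2 depends on and whether a CUDA GPU is visible
print(f"Python: {sys.version.split()[0]}")
print(f"PyTorch: {torch.__version__}, torchvision: {torchvision.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")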
Loading Pre-trained Models
DINOv2 provides several pre-trained models with different sizes and capabilities:
import torch
from dinov2.models import build_model_from_cfg
from dinov2.configs import get_config
# Available model sizes: 'small', 'base', 'large', 'giant'
model_size = 'base'
cfg = get_config(f"dinov2_{model_size}")
model = build_model_from_cfg(cfg)

# Load pre-trained weights
checkpoint_path = f"dinov2_{model_size}_pretrain.pth"  # Download this from Meta AI's repository
checkpoint = torch.load(checkpoint_path, map_location="cpu")
model.load_state_dict(checkpoint["model"])

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # Set to evaluation mode
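The DINOv2 repository also publishes its pretrained backbones as torch.hub entry points, which avoids handling checkpoints manually. A minimal sketch (the entry point dinov2_vitb14 is the ViT-B/14 backbone; the other sizes follow the same naming scheme):
import torch

# Load the ViT-B/14 backbone directly from the official repository via torch.hub
dinov2_vitb14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')
dinov2_vitb14.eval()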
You can also use the Hugging Face Transformers library for easier integration:
import torch
from transformers import AutoImageProcessor, AutoModel

# Available model sizes: 'small', 'base', 'large', 'giant'
model_name = "facebook/dinov2-base"
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
Feature Extraction
One of DINOv2’s key strengths is its ability to extract powerful visual features:
import torch
from PIL import Image
import torchvision.transforms as T
from transformers import AutoImageProcessor, AutoModel
# Load model
model_name = "facebook/dinov2-base"
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model.eval()

# Load and preprocess image
image = Image.open("path/to/your/image.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt").to(model.device)

# Extract features
with torch.no_grad():
    outputs = model(**inputs)

# Get CLS token features (useful for classification tasks)
cls_features = outputs.last_hidden_state[:, 0]

# Get patch features (useful for dense prediction tasks like segmentation)
patch_features = outputs.last_hidden_state[:, 1:]

print(f"CLS features shape: {cls_features.shape}")
print(f"Patch features shape: {patch_features.shape}")
Fine-tuning for Downstream Tasks
DINOv2 can be fine-tuned for specific vision tasks:
import torch
import torch.nn as nn
from transformers import AutoModel
# Load pre-trained DINOv2 model
backbone = AutoModel.from_pretrained("facebook/dinov2-base")

# Create a custom classification head
class ClassificationHead(nn.Module):
    def __init__(self, backbone, num_classes=1000):
        super().__init__()
        self.backbone = backbone
        self.classifier = nn.Linear(backbone.config.hidden_size, num_classes)

    def forward(self, x):
        outputs = self.backbone(x)
        cls_token = outputs.last_hidden_state[:, 0]
        return self.classifier(cls_token)

# Create the complete model
model = ClassificationHead(backbone, num_classes=100)  # For 100 classes

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# Training loop example
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        images = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)
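If you only want to train the new head while keeping the backbone fixed (linear probing), freeze the backbone parameters before building the optimizer; a minimal sketch:
# Freeze the DINOv2 backbone so only the classification head receives gradient updates
for param in model.backbone.parameters():
    param.requires_grad = False

# Optimize only the parameters that still require gradients (the linear head)
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], lr=1e-3
)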
Image Classification Example
Here’s a complete example for image classification using DINOv2:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms
from transformers import AutoImageProcessor, AutoModel
# Define the dataset and transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load your dataset (adjust the path)
train_dataset = ImageFolder(root="path/to/train", transform=transform)
val_dataset = ImageFolder(root="path/to/val", transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# Create the model
class DINOv2Classifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.dinov2 = AutoModel.from_pretrained("facebook/dinov2-base")
        self.classifier = nn.Linear(768, num_classes)  # 768 is the hidden size for the base model

    def forward(self, x):
        # Extract features
        with torch.set_grad_enabled(self.training):
            features = self.dinov2(x).last_hidden_state[:, 0]  # Get CLS token

        # Classify
        logits = self.classifier(features)
        return logits

# Initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DINOv2Classifier(num_classes=len(train_dataset.classes))
model = model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW([
    {'params': model.classifier.parameters(), 'lr': 1e-3},
    {'params': model.dinov2.parameters(), 'lr': 1e-5}
])
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    correct = 0
    total = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    train_accuracy = 100 * correct / total

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            val_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    val_accuracy = 100 * correct / total

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss/len(train_loader):.4f}, Train Acc: {train_accuracy:.2f}%")
    print(f"Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {val_accuracy:.2f}%")
Semantic Segmentation Example
DINOv2 is particularly powerful for segmentation tasks:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
class DINOv2Segmenter(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Load DINOv2 backbone
        self.backbone = AutoModel.from_pretrained("facebook/dinov2-base")

        # Define segmentation head
        hidden_dim = self.backbone.config.hidden_size
        self.segmentation_head = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden_dim, num_classes, kernel_size=1)
        )

        # Image size and patch size for reshaping
        self.image_size = 224
        self.patch_size = 14  # For ViT-Base
        self.num_patches = (self.image_size // self.patch_size) ** 2

    def forward(self, x):
        # Get patch features
        outputs = self.backbone(x)
        patch_features = outputs.last_hidden_state[:, 1:]  # Remove CLS token

        # Reshape to 2D spatial layout
        B = x.shape[0]
        H = W = self.image_size // self.patch_size
        patch_features = patch_features.reshape(B, H, W, -1).permute(0, 3, 1, 2)

        # Apply segmentation head
        segmentation_logits = self.segmentation_head(patch_features)

        # Upsample to original image size
        segmentation_logits = F.interpolate(
            segmentation_logits,
            size=(self.image_size, self.image_size),
            mode='bilinear',
            align_corners=False
        )

        return segmentation_logits

# Create model and move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DINOv2Segmenter(num_classes=21)  # 21 classes for Pascal VOC
model = model.to(device)

# Define optimizer and loss function
optimizer = torch.optim.AdamW([
    {'params': model.segmentation_head.parameters(), 'lr': 1e-3},
    {'params': model.backbone.parameters(), 'lr': 1e-5}
])
criterion = nn.CrossEntropyLoss(ignore_index=255)  # 255 is typically the ignore index
# Rest of the training code would be similar to the classification example
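For reference, a single training step for this segmenter could look like the sketch below. It assumes hypothetical images (a (B, 3, 224, 224) batch) and masks (a (B, 224, 224) batch of integer class indices, long dtype):
# One training step: per-pixel cross-entropy between logits and integer masks
model.train()
images, masks = images.to(device), masks.to(device)

optimizer.zero_grad()
logits = model(images)           # (B, num_classes, 224, 224)
loss = criterion(logits, masks)  # mask values of 255 are ignored by the loss
loss.backward()
optimizer.step()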
Object Detection Example
Here’s how to use DINOv2 features for object detection with a simple detection head:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
class DINOv2Detector(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Load DINOv2 backbone
        self.backbone = AutoModel.from_pretrained("facebook/dinov2-base")
        hidden_dim = self.backbone.config.hidden_size

        # Detection heads
        self.box_predictor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, 4)  # (x1, y1, x2, y2)
        )
        self.class_predictor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, num_classes + 1)  # +1 for background
        )

        # Image size and patch size for feature map creation
        self.image_size = 224
        self.patch_size = 14  # For ViT-Base

    def forward(self, x):
        # Get features
        outputs = self.backbone(x)
        features = outputs.last_hidden_state[:, 1:]  # Remove CLS token

        # Reshape to 2D spatial layout
        B = x.shape[0]
        H = W = self.image_size // self.patch_size
        features = features.reshape(B, H, W, -1)

        # Flatten for prediction heads
        features_flat = features.reshape(B, -1, features.shape[-1])

        # Predict boxes and classes
        boxes = self.box_predictor(features_flat)
        classes = self.class_predictor(features_flat)

        return {'boxes': boxes, 'classes': classes, 'features_map': features}

# Create model and move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DINOv2Detector(num_classes=80)  # 80 classes for COCO
model = model.to(device)
# Training would require a more complex detection pipeline with NMS, etc.
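As a small illustration of that post-processing, predicted boxes can be filtered by confidence and deduplicated with torchvision's class-agnostic NMS. A minimal sketch over the model's raw outputs (the 0.5 thresholds are arbitrary example values, and the background class is assumed to be the last column):
import torchvision

# outputs = model(images)  ->  boxes: (B, N, 4), classes: (B, N, num_classes + 1)
probs = outputs['classes'].softmax(dim=-1)[..., :-1]   # drop the (assumed) background column
scores, labels = probs.max(dim=-1)

# Post-process the first image in the batch
keep = scores[0] > 0.5                                  # confidence threshold (example value)
boxes_kept, scores_kept = outputs['boxes'][0][keep], scores[0][keep]
kept_idx = torchvision.ops.nms(boxes_kept, scores_kept, iou_threshold=0.5)
final_boxes = boxes_kept[kept_idx]
final_scores = scores_kept[kept_idx]
final_labels = labels[0][keep][kept_idx]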
Advanced Usage and Customization
Custom Vision Transformer Configurations
You can customize the DINOv2 model architecture:
from dinov2.configs import get_config
from dinov2.models import build_model_from_cfg
# Get default configuration and modify it
cfg = get_config("dinov2_base")

# Modify configuration
cfg.student.drop_path_rate = 0.2  # Change stochastic depth rate
cfg.student.num_registers = 16    # Change the number of registers

# Build model from modified config
model = build_model_from_cfg(cfg)
Extracting Intermediate Features
For some applications, you might want to extract features from intermediate layers:
import torch
from transformers import AutoModel
from torch.utils.hooks import RemovableHandle
class FeatureExtractor:
    def __init__(self, model, layers=None):
        self.model = model
        self.features = {}
        self.hooks = []

        # Default to extracting from the last block if no layers specified
        self.layers = layers if layers is not None else [11]  # Base model has 12 blocks (0-11)

        # Register hooks
        for idx in self.layers:
            hook = self.model.encoder.layer[idx].register_forward_hook(
                lambda module, input, output, idx=idx: self.features.update({f"layer_{idx}": output})
            )
            self.hooks.append(hook)

    def __call__(self, x):
        self.features.clear()
        with torch.no_grad():
            outputs = self.model(x)
        return self.features

    def remove_hooks(self):
        for hook in self.hooks:
            hook.remove()

# Usage
model = AutoModel.from_pretrained("facebook/dinov2-base")
extractor = FeatureExtractor(model, layers=[3, 7, 11])

# Extract features
features = extractor(input_image)
layer_3_features = features["layer_3"]
layer_7_features = features["layer_7"]
layer_11_features = features["layer_11"]

# Clean up
extractor.remove_hooks()
Performance Benchmarks
DINOv2 achieves excellent results across various vision tasks. Here are typical performance metrics:
- ImageNet-1K Classification (top-1 accuracy):
- DINOv2-Small: ~80.0%
- DINOv2-Base: ~84.5%
- DINOv2-Large: ~86.3%
- DINOv2-Giant: ~87.0%
- Semantic Segmentation (ADE20K) (mIoU):
- DINOv2-Small: ~47.5%
- DINOv2-Base: ~50.2%
- DINOv2-Large: ~52.5%
- DINOv2-Giant: ~53.8%
- Object Detection (COCO) (AP):
- DINOv2-Small: ~48.5%
- DINOv2-Base: ~51.3%
- DINOv2-Large: ~53.2%
- DINOv2-Giant: ~54.5%
Troubleshooting
Common Issues and Solutions
- Out of Memory Errors
- Reduce batch size
- Use gradient accumulation
- Use a smaller model variant (Small or Base)
- Use mixed precision training
# Example of mixed precision training
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()

for inputs, targets in train_loader:
    inputs, targets = inputs.to(device), targets.to(device)

    optimizer.zero_grad()

    # Use autocast for mixed precision
    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, targets)

    # Scale loss and backprop
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
- Slow Inference
- Use batch processing
- Use model.eval() and torch.no_grad()
- Consider model distillation or quantization
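For example, a simple batched inference loop with gradients disabled might look like the following minimal sketch; the dataloader yielding batches of preprocessed pixel values is an assumption:
model.eval()
model.to(device)

# Optionally call model.half() and cast inputs to FP16 on GPU for extra speed
with torch.no_grad():
    for pixel_values in dataloader:              # process images in batches, not one by one
        pixel_values = pixel_values.to(device)
        features = model(pixel_values).last_hidden_state[:, 0]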
- Poor Performance on Downstream Tasks
- Ensure proper data preprocessing
- Adjust learning rates (lower for backbone, higher for heads)
- Use appropriate augmentations
- Consider using a larger variant of DINOv2
Debugging Tips
- Visualize model attention maps to understand what the model focuses on:
import matplotlib.pyplot as plt
import torch.nn.functional as F
from PIL import Image
import torchvision.transforms as T
def get_attention_map(model, img_tensor):
    model.eval()
    with torch.no_grad():
        outputs = model(img_tensor.unsqueeze(0), output_attentions=True)

    # Get attention weights from the last layer
    att_mat = outputs.attentions[-1]

    # Average attention across heads
    att_mat = att_mat.mean(dim=1)

    # Extract attention from the CLS token to patch tokens (224 / 14 = 16 patches per side)
    cls_att_map = att_mat[0, 0, 1:].reshape(16, 16)

    return cls_att_map.cpu().numpy()

# Load and preprocess image
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
image = Image.open("path/to/image.jpg").convert("RGB")
transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
img_tensor = transform(image).to(device)

# Get attention map
from transformers import AutoModel
model = AutoModel.from_pretrained("facebook/dinov2-base", output_attentions=True)
model.to(device)
attention_map = get_attention_map(model, img_tensor)

# Visualize
plt.figure(figsize=(10, 10))
plt.imshow(image.resize((224, 224)))
plt.imshow(attention_map, alpha=0.5, cmap='jet')
plt.axis('off')
plt.colorbar()
plt.savefig('attention_map.png')
plt.close()
This guide should help you get started with DINOv2 and explore its capabilities for various computer vision tasks. As a self-supervised vision foundation model, DINOv2 provides a strong starting point for numerous applications with minimal labeled data requirements.