Object Detection Inference¶
This guide covers how to use trained object detection models for inference and prediction.
Loading a Detection Model¶
import autotimm as at  # recommended alias
from autotimm import ObjectDetector, MetricConfig, TransformConfig

# Define metrics
metrics = [
    MetricConfig(
        name="mAP",
        backend="torchmetrics",
        metric_class="MeanAveragePrecision",
        params={"box_format": "xyxy"},
        stages=["val"],
    ),
]

# Load model with TransformConfig for preprocessing
model = ObjectDetector.load_from_checkpoint(
    "path/to/checkpoint.ckpt",
    backbone="resnet50",
    num_classes=80,
    compile_model=False,  # skip compilation for inference
    metrics=metrics,  # not saved in checkpoint
    # Not saved in checkpoint; enables preprocess(). The size is passed
    # explicitly so preprocess() matches the 640x640 input this guide uses.
    transform_config=TransformConfig(image_size=640),
)
model.eval()
Single Image Detection (Recommended)¶
Use the built-in preprocess() method for correct model-specific normalization:
import torch
from PIL import Image
# Load image
image = Image.open("image.jpg").convert("RGB")
# Preprocess using model's native normalization
input_tensor = model.preprocess(image) # Returns (1, 3, 640, 640)
# Detect objects
with torch.inference_mode():
detections = model.predict_step(input_tensor, batch_idx=0)
Single Image Detection (Manual)¶
If you need manual control over transforms:
import torch
from PIL import Image
from torchvision import transforms
# Prepare transform
transform = transforms.Compose([
transforms.Resize((640, 640)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# Load and transform image
image = Image.open("image.jpg").convert("RGB")
input_tensor = transform(image).unsqueeze(0) # (1, 3, 640, 640)
# Detect objects
model.eval()
with torch.inference_mode():
detections = model.predict_step(input_tensor, batch_idx=0)
# detections is a dict with:
# - "boxes": Tensor of shape (N, 4) in [x1, y1, x2, y2] format
# - "scores": Tensor of shape (N,) with confidence scores
# - "labels": Tensor of shape (N,) with class indices
boxes = detections["boxes"]
scores = detections["scores"]
labels = detections["labels"]
print(f"Found {len(boxes)} objects:")
for box, score, label in zip(boxes, scores, labels):
print(f" Class {label.item()}: {score.item():.2%} confidence at {box.tolist()}")
Tip: Use model.get_data_config() to get the correct normalization values for manual transforms:
config = model.get_data_config()
print(f"Mean: {config['mean']}")
print(f"Std: {config['std']}")
print(f"Input size: {config['input_size']}")
Visualize Detections¶
import torch
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
def visualize_detections(
    image_path,
    boxes,
    labels,
    scores,
    class_names=None,
    threshold=0.5,
    figsize=(12, 8),
    output_path='detections.jpg',
    input_size=None,
):
    """Draw detections on an image, save the figure, and display it.

    Args:
        image_path: Path of the image the detections belong to.
        boxes: Tensor of shape (N, 4) in [x1, y1, x2, y2] format.
        labels: Tensor of shape (N,) with class indices.
        scores: Tensor of shape (N,) with confidence scores.
        class_names: Optional sequence mapping a class index to a display
            name. Indices it does not cover are shown as "Class <index>".
        threshold: Minimum score a detection needs in order to be drawn.
        figsize: Matplotlib figure size in inches.
        output_path: File the annotated figure is written to.
        input_size: Size the image was resized to before detection, as an int
            (square) or a (width, height) pair. When given, boxes are scaled
            from that coordinate space to the original image; when None they
            are drawn unchanged.
    """
    # Load image
    image = Image.open(image_path).convert("RGB")
    fig, ax = plt.subplots(1, figsize=figsize)
    ax.imshow(image)

    # Filter by confidence threshold
    keep = scores >= threshold
    boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

    # Boxes come back in model-input coordinates; map them onto the original
    # image, otherwise they are misplaced on anything that was resized.
    if input_size is None:
        scale_x = scale_y = 1.0
    else:
        if isinstance(input_size, int):
            in_w = in_h = input_size
        else:
            in_w, in_h = input_size
        scale_x = image.width / in_w
        scale_y = image.height / in_h

    # Draw boxes (converted to plain Python numbers once, up front)
    for box, label, score in zip(boxes.tolist(), labels.tolist(), scores.tolist()):
        x1, y1, x2, y2 = box
        x1, x2 = x1 * scale_x, x2 * scale_x
        y1, y2 = y1 * scale_y, y2 * scale_y

        # Create rectangle
        rect = patches.Rectangle(
            (x1, y1), x2 - x1, y2 - y1,
            linewidth=2, edgecolor='red', facecolor='none'
        )
        ax.add_patch(rect)

        # Add label, falling back to the index when no name is available
        class_idx = int(label)
        if class_names and 0 <= class_idx < len(class_names):
            class_name = class_names[class_idx]
        else:
            class_name = f"Class {class_idx}"
        ax.text(
            x1, y1 - 5,
            f"{class_name}: {score:.2f}",
            bbox=dict(facecolor='red', alpha=0.5),
            fontsize=10, color='white'
        )

    ax.axis('off')
    plt.tight_layout()
    fig.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)


# Usage
visualize_detections(
    "image.jpg",
    boxes=detections["boxes"],
    labels=detections["labels"],
    scores=detections["scores"],
    class_names=["person", "bicycle", "car", ...],  # COCO classes
    threshold=0.3,
    input_size=640,  # detections were computed on the 640x640 model input
)
Batch Detection¶
import torch
from torch.utils.data import DataLoader
from autotimm import DetectionDataModule

# Prepare data
data = DetectionDataModule(
    data_dir="./test_images",
    image_size=640,
    batch_size=8,
)
data.setup("test")

# Batch inference
model.eval()
all_detections = []
with torch.inference_mode():
    for batch_idx, batch in enumerate(data.test_dataloader()):
        images = batch["image"]
        batch_detections = model.predict_step(images, batch_idx=batch_idx)
        all_detections.append(batch_detections)

# Process results
for i, dets in enumerate(all_detections):
    print(f"Batch {i}: Found {len(dets['boxes'])} objects")
Complete Detection Pipeline¶
Production-ready detection pipeline with TransformConfig:
import torch
from PIL import Image
from autotimm import ObjectDetector, MetricConfig, TransformConfig
class DetectionPipeline:
    """End-to-end object detection pipeline.

    Loads an ``ObjectDetector`` checkpoint once, then turns images into lists
    of plain-Python detection dicts whose boxes are in original-image pixels.
    """

    def __init__(
        self,
        checkpoint_path,
        backbone,
        num_classes,
        class_names=None,
        score_threshold=0.3,
        image_size=640,
    ):
        """Load the checkpoint and place the model on GPU when one is available.

        Args:
            checkpoint_path: Checkpoint file to load.
            backbone: Backbone name passed to ``load_from_checkpoint``.
            num_classes: Number of classes passed to ``load_from_checkpoint``.
            class_names: Optional sequence mapping a class index to a name.
            score_threshold: Minimum confidence for a detection to be returned.
            image_size: Size given to ``TransformConfig`` and used to scale
                boxes back to the original image.
        """
        # Load model with TransformConfig for preprocessing
        metrics = [
            MetricConfig(
                name="mAP",
                backend="torchmetrics",
                metric_class="MeanAveragePrecision",
                params={"box_format": "xyxy"},
                stages=["val"],
            ),
        ]
        self.model = ObjectDetector.load_from_checkpoint(
            checkpoint_path,
            backbone=backbone,
            num_classes=num_classes,
            compile_model=False,  # skip compilation for inference
            metrics=metrics,  # not saved in checkpoint
            # not saved in checkpoint; enables preprocess()
            transform_config=TransformConfig(image_size=image_size),
        )
        self.model.eval()

        # Setup device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self.class_names = class_names
        self.score_threshold = score_threshold
        self.image_size = image_size

    def predict(self, image_path):
        """Detect objects in a single image.

        Args:
            image_path: Path of the image, or an already-loaded image object
                exposing ``convert()`` and ``size`` (e.g. a PIL image).

        Returns:
            A list of dicts with keys "class" (name, or the class index when
            no name is known), "class_index", "confidence" and "bbox"
            ([x1, y1, x2, y2] in original-image pixels).
        """
        # Load and preprocess using model's native normalization
        if hasattr(image_path, "convert"):
            image = image_path.convert("RGB")
        else:
            image = Image.open(image_path).convert("RGB")
        orig_width, orig_height = image.size
        input_tensor = self.model.preprocess(image).to(self.device)

        # Detect
        with torch.inference_mode():
            detections = self.model.predict_step(input_tensor, batch_idx=0)

        # Filter by threshold
        keep = detections["scores"] >= self.score_threshold
        boxes = detections["boxes"][keep].reshape(-1, 4)
        scores = detections["scores"][keep]
        labels = detections["labels"][keep]

        # Scale boxes back to original image size. Done out-of-place so the
        # tensors returned by the model are never mutated.
        scale_x = orig_width / self.image_size
        scale_y = orig_height / self.image_size
        scale = torch.tensor([scale_x, scale_y, scale_x, scale_y], device=boxes.device)
        boxes = boxes * scale

        # Format results; one transfer per tensor instead of one per element
        names = self.class_names
        results = []
        for box, score, label in zip(
            boxes.cpu().tolist(), scores.cpu().tolist(), labels.cpu().tolist()
        ):
            class_index = int(label)
            # Fall back to the index when the name list does not cover it
            if names and 0 <= class_index < len(names):
                class_name = names[class_index]
            else:
                class_name = class_index
            results.append({
                "class": class_name,
                "class_index": class_index,
                "confidence": score,
                "bbox": box,  # [x1, y1, x2, y2]
            })
        return results

    def predict_batch(self, image_paths):
        """Detect objects in multiple images, one ``predict`` call per image."""
        return [self.predict(path) for path in image_paths]
# Usage
pipeline = DetectionPipeline(
checkpoint_path="best-detector.ckpt",
backbone="resnet50",
num_classes=80,
class_names=["person", "bicycle", "car", ...], # COCO classes
score_threshold=0.3,
image_size=640,
)
# Detect objects
results = pipeline.predict("test_image.jpg")
for det in results:
print(f"{det['class']}: {det['confidence']:.2%} at {det['bbox']}")
COCO Class Names¶
For COCO datasets, use these 80 class names:
COCO_CLASSES = [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
'toothbrush',
]
# Use with pipeline
pipeline = DetectionPipeline(
checkpoint_path="best-detector.ckpt",
backbone="resnet50",
num_classes=80,
class_names=COCO_CLASSES,
)
Detection Tuning¶
Adjust Score Threshold¶
Control the confidence threshold for detections:
# Arguments DetectionPipeline always requires
common = dict(
    checkpoint_path="best-detector.ckpt",
    backbone="resnet50",
    num_classes=80,
)

# Low threshold - more detections, more false positives
pipeline = DetectionPipeline(**common, score_threshold=0.1)

# Medium threshold - balanced (recommended)
pipeline = DetectionPipeline(**common, score_threshold=0.3)

# High threshold - fewer detections, more precision
pipeline = DetectionPipeline(**common, score_threshold=0.7)
NMS Threshold¶
Adjust Non-Maximum Suppression to control duplicate detection:
# During model creation
model = ObjectDetector(
backbone="resnet50",
num_classes=80,
nms_thresh=0.5, # Default
)
# Lower = stricter NMS (fewer duplicates)
model = ObjectDetector(nms_thresh=0.3)
# Higher = more lenient NMS (may have duplicates)
model = ObjectDetector(nms_thresh=0.7)
Image Size¶
Balance between speed and accuracy:
# Arguments DetectionPipeline always requires
common = dict(
    checkpoint_path="best-detector.ckpt",
    backbone="resnet50",
    num_classes=80,
)

# Fast inference (smaller objects may be missed)
pipeline = DetectionPipeline(**common, image_size=512)

# Balanced (recommended)
pipeline = DetectionPipeline(**common, image_size=640)

# Better small object detection (slower)
pipeline = DetectionPipeline(**common, image_size=800)

# Maximum accuracy (much slower)
pipeline = DetectionPipeline(**common, image_size=1024)
Performance Tips¶
1. Batch Processing¶
Process multiple images in one forward pass:
def predict_batch_efficient(model, image_paths, transform, device, batch_size=8):
    """Run detection over many images, one stacked batch per forward pass.

    Args:
        model: Detector exposing ``eval()`` and
            ``predict_step(batch, batch_idx=...)``.
        image_paths: Sequence of image file paths.
        transform: Callable turning an RGB image into a tensor. All outputs
            must share one shape so they can be stacked.
        device: Device the stacked batch is moved to.
        batch_size: Number of images per forward pass; must be positive.

    Returns:
        A list holding one ``predict_step`` result per batch, in input order.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make range() yield nothing and silently return []
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    model.eval()
    all_results = []
    for batch_idx, start in enumerate(range(0, len(image_paths), batch_size)):
        # Load and transform batch, closing each file as soon as it is decoded
        images = []
        for path in image_paths[start:start + batch_size]:
            with Image.open(path) as img:
                images.append(transform(img.convert("RGB")))

        # Stack into batch
        batch_tensor = torch.stack(images).to(device)

        # Detect
        with torch.inference_mode():
            detections = model.predict_step(batch_tensor, batch_idx=batch_idx)
        all_results.append(detections)
    return all_results
2. GPU Inference¶
Always use GPU for detection when available:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Also move input to GPU
input_tensor = input_tensor.to(device)
3. Optimal Settings¶
Recommended settings for different scenarios:
Speed-optimized:
pipeline = DetectionPipeline(
    checkpoint_path="path/to/resnet34-checkpoint.ckpt",  # a checkpoint trained with this backbone
    backbone="resnet34",  # Smaller backbone
    num_classes=80,
    image_size=512,
    score_threshold=0.5,
)
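Balanced:
pipeline = DetectionPipeline(
    checkpoint_path="best-detector.ckpt",
    backbone="resnet50",
    num_classes=80,
    image_size=640,
    score_threshold=0.3,
)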
Accuracy-optimized:
pipeline = DetectionPipeline(
    checkpoint_path="path/to/resnet101-checkpoint.ckpt",  # a checkpoint trained with this backbone
    backbone="resnet101",  # Larger backbone
    num_classes=80,
    image_size=800,
    score_threshold=0.1,  # More detections
)
Saving Detection Results¶
Save to JSON¶
import json
results = pipeline.predict("image.jpg")
# Save to JSON
with open("detections.json", "w") as f:
json.dump(results, f, indent=2)
# Load from JSON
with open("detections.json", "r") as f:
loaded_results = json.load(f)
Save Annotated Image¶
from PIL import Image, ImageDraw, ImageFont
def save_annotated_image(image_path, detections, output_path, class_names=None):
    """Save a copy of the image with a box and label drawn per detection.

    Args:
        image_path: Path of the source image.
        detections: Iterable of dicts with "bbox" ([x1, y1, x2, y2]),
            "class" and "confidence" keys.
        output_path: File the annotated image is written to.
        class_names: Optional sequence used to turn an integer "class" value
            into a name. String "class" values are used as they are.
    """
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)

    # Try to load a font; truetype() raises OSError when the file is missing
    try:
        font = ImageFont.truetype("arial.ttf", 16)
    except OSError:
        font = ImageFont.load_default()

    for det in detections:
        bbox = det["bbox"]
        class_name = det["class"]
        confidence = det["confidence"]

        # Resolve a bare class index to its name when a lookup list is given
        if (
            class_names is not None
            and isinstance(class_name, int)
            and 0 <= class_name < len(class_names)
        ):
            class_name = class_names[class_name]

        # Draw rectangle
        draw.rectangle(bbox, outline="red", width=3)

        # Draw label above the box, clamped so it stays inside the image
        label = f"{class_name}: {confidence:.2f}"
        text_origin = (bbox[0], max(bbox[1] - 20, 0))
        text_bbox = draw.textbbox(text_origin, label, font=font)
        draw.rectangle(text_bbox, fill="red")
        draw.text(text_origin, label, fill="white", font=font)

    image.save(output_path)
# Usage
results = pipeline.predict("input.jpg")
save_annotated_image("input.jpg", results, "output.jpg", class_names=COCO_CLASSES)
Common Issues¶
For object detection inference issues, see the Troubleshooting - Export & Inference including:
- No detections
- Too many duplicate detections
- Missing small objects
- Score threshold tuning
# 2. Lower score threshold
pipeline = DetectionPipeline(score_threshold=0.1)
# 3. Use model trained on higher resolution
Slow Inference¶
Problem: Detection is too slow
Solutions:
# 1. Use smaller image size
pipeline = DetectionPipeline(image_size=512)
# 2. Use smaller backbone
model = ObjectDetector(backbone="resnet34")
# 3. Increase score threshold (fewer post-processing)
pipeline = DetectionPipeline(score_threshold=0.5)
# 4. Process in batches on GPU
See Also¶
- Classification Inference - For classification models
- Model Export - Export to TorchScript/ONNX
- Object Detection Examples - More examples