Dataset Preparation¶
This guide covers how to prepare datasets for all tasks supported by AutoTimm: image classification, object detection, semantic segmentation, and instance segmentation.
Image Classification¶
ImageFolder Structure¶
AutoTimm uses PyTorch's ImageFolder format for classification datasets. Organize your images with class names as subdirectories:
dataset/
├── train/
│ ├── class_a/
│ │ ├── image001.jpg
│ │ ├── image002.jpg
│ │ └── ...
│ ├── class_b/
│ │ ├── image001.jpg
│ │ └── ...
│ └── class_c/
│ └── ...
├── val/
│ ├── class_a/
│ │ └── ...
│ ├── class_b/
│ │ └── ...
│ └── class_c/
│ └── ...
└── test/ # Optional
├── class_a/
│ └── ...
└── ...
Using ImageDataModule¶
from autotimm import ImageDataModule
# Local ImageFolder dataset
data = ImageDataModule(
data_dir="./dataset",
image_size=224,
batch_size=32,
num_workers=4,
)
# Built-in torchvision datasets
data = ImageDataModule(
data_dir="./data",
dataset_name="CIFAR10", # Auto-downloads if not present
image_size=224,
batch_size=32,
)
Supported Image Formats¶
- JPEG (`.jpg`, `.jpeg`) - Recommended for photos
- PNG (`.png`) - Good for images with transparency or sharp edges
- WebP (`.webp`) - Smaller file size, good quality
- BMP (`.bmp`) - Uncompressed
- GIF (`.gif`) - Single frame only
Label Mapping¶
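Classes are automatically assigned numeric labels based on alphabetical folder order. For example, the folders `airplane/`, `bird/`, and `car/` are assigned the labels 0, 1, and 2, respectively.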
To get the class mapping:
data = ImageDataModule(data_dir="./dataset", image_size=224, batch_size=32)
data.setup("fit")
# Get class names
class_names = data.train_dataset.classes
print(class_names) # ['airplane', 'bird', 'car']
# Get class to index mapping
class_to_idx = data.train_dataset.class_to_idx
print(class_to_idx) # {'airplane': 0, 'bird': 1, 'car': 2}
Object Detection¶
COCO Format¶
AutoTimm expects detection datasets in COCO format, the standard for object detection.
Directory Structure¶
coco_dataset/
├── train2017/ # Training images
│ ├── 000000000001.jpg
│ ├── 000000000002.jpg
│ └── ...
├── val2017/ # Validation images
│ ├── 000000000001.jpg
│ └── ...
├── test2017/ # Test images (optional)
│ └── ...
└── annotations/
├── instances_train2017.json
├── instances_val2017.json
└── instances_test2017.json # Optional
COCO JSON Schema¶
{
"info": {
"description": "My Dataset",
"version": "1.0",
"year": 2024
},
"licenses": [],
"images": [
{
"id": 1,
"file_name": "000000000001.jpg",
"width": 640,
"height": 480
},
{
"id": 2,
"file_name": "000000000002.jpg",
"width": 800,
"height": 600
}
],
"annotations": [
{
"id": 1,
"image_id": 1,
"category_id": 1,
"bbox": [100, 150, 200, 300],
"area": 60000,
"iscrowd": 0
},
{
"id": 2,
"image_id": 1,
"category_id": 2,
"bbox": [300, 100, 150, 200],
"area": 30000,
"iscrowd": 0
}
],
"categories": [
{"id": 1, "name": "person", "supercategory": "human"},
{"id": 2, "name": "car", "supercategory": "vehicle"}
]
}
Important Notes:
- `bbox` format is `[x, y, width, height]` (top-left corner + dimensions)
- `category_id` is 1-indexed in COCO format
- `area` should be `width * height` of the bounding box
- `iscrowd`: 0 for normal annotations, 1 for crowd regions
Converting from YOLO Format¶
YOLO uses a different annotation format with one .txt file per image:
# YOLO format: class_id x_center y_center width height (normalized 0-1)
0 0.5 0.5 0.2 0.3
1 0.3 0.4 0.1 0.15
Conversion script:
import json
import os
from PIL import Image
def yolo_to_coco(images_dir, labels_dir, class_names, output_path):
    """Convert a YOLO-format dataset split to a COCO annotation file.

    Args:
        images_dir: Directory containing the images. Files ending in
            ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive) are used.
        labels_dir: Directory containing one YOLO ``.txt`` file per image,
            named after the image. Images without a label file are still
            listed, just without annotations.
        class_names: Class names ordered by YOLO class id; YOLO class ``i``
            becomes COCO category ``i + 1``.
        output_path: Path of the COCO JSON file to write. Missing parent
            directories are created.
    """
    coco = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": i + 1, "name": name} for i, name in enumerate(class_names)
        ],
    }
    annotation_id = 1
    # Case-insensitive match so "IMG_001.JPG" / "photo.jpeg" are not dropped.
    image_files = sorted(
        f
        for f in os.listdir(images_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )
    for image_id, image_file in enumerate(image_files, start=1):
        # Pixel dimensions are needed to de-normalize the YOLO coordinates.
        image_path = os.path.join(images_dir, image_file)
        with Image.open(image_path) as img:
            width, height = img.size
        coco["images"].append(
            {
                "id": image_id,
                "file_name": image_file,
                "width": width,
                "height": height,
            }
        )

        # Read YOLO labels (same base name as the image, .txt extension).
        label_file = os.path.splitext(image_file)[0] + ".txt"
        label_path = os.path.join(labels_dir, label_file)
        if not os.path.exists(label_path):
            continue
        with open(label_path, "r") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) != 5:
                    # Blank or malformed line: skip it.
                    continue
                class_id = int(parts[0])
                x_center = float(parts[1]) * width
                y_center = float(parts[2]) * height
                box_width = float(parts[3]) * width
                box_height = float(parts[4]) * height
                # Convert center-based box to COCO (x, y, width, height),
                # where (x, y) is the top-left corner.
                x = x_center - box_width / 2
                y = y_center - box_height / 2
                coco["annotations"].append(
                    {
                        "id": annotation_id,
                        "image_id": image_id,
                        "category_id": class_id + 1,  # 1-indexed
                        "bbox": [x, y, box_width, box_height],
                        "area": box_width * box_height,
                        "iscrowd": 0,
                    }
                )
                annotation_id += 1

    # Create e.g. "coco_dataset/annotations/" on first use instead of failing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(coco, f, indent=2)
    print(f"Converted {len(coco['images'])} images, {len(coco['annotations'])} annotations")
# Usage
yolo_to_coco(
images_dir="./yolo_dataset/images/train",
labels_dir="./yolo_dataset/labels/train",
class_names=["person", "car", "dog"],
output_path="./coco_dataset/annotations/instances_train2017.json",
)
Converting from Pascal VOC Format¶
Pascal VOC uses XML annotation files:
<annotation>
<filename>000001.jpg</filename>
<size>
<width>640</width>
<height>480</height>
</size>
<object>
<name>person</name>
<bndbox>
<xmin>100</xmin>
<ymin>150</ymin>
<xmax>300</xmax>
<ymax>450</ymax>
</bndbox>
</object>
</annotation>
Conversion script:
import json
import os
import xml.etree.ElementTree as ET
def voc_to_coco(images_dir, annotations_dir, class_names, output_path):
    """Convert Pascal VOC XML annotations to a COCO annotation file.

    Args:
        images_dir: Directory containing the images. Not read by the
            conversion itself (file name and size come from the XML); kept
            for interface symmetry with the other converters.
        annotations_dir: Directory containing the VOC ``.xml`` files.
        class_names: Class names to keep; the name at position ``i`` becomes
            COCO category ``i + 1``. Objects with other names are skipped.
        output_path: Path of the COCO JSON file to write. Missing parent
            directories are created.
    """
    coco = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": i + 1, "name": name} for i, name in enumerate(class_names)
        ],
    }
    class_to_id = {name: i + 1 for i, name in enumerate(class_names)}
    annotation_id = 1
    xml_files = sorted(f for f in os.listdir(annotations_dir) if f.endswith(".xml"))
    for image_id, xml_file in enumerate(xml_files, start=1):
        tree = ET.parse(os.path.join(annotations_dir, xml_file))
        root = tree.getroot()
        filename = root.find("filename").text
        size = root.find("size")
        width = int(size.find("width").text)
        height = int(size.find("height").text)
        coco["images"].append(
            {
                "id": image_id,
                "file_name": filename,
                "width": width,
                "height": height,
            }
        )
        for obj in root.findall("object"):
            class_name = obj.find("name").text
            if class_name not in class_to_id:
                # Class was not requested: drop the object.
                continue
            bbox = obj.find("bndbox")
            xmin = float(bbox.find("xmin").text)
            ymin = float(bbox.find("ymin").text)
            xmax = float(bbox.find("xmax").text)
            ymax = float(bbox.find("ymax").text)
            # VOC stores corners; COCO stores top-left corner + size.
            box_width = xmax - xmin
            box_height = ymax - ymin
            coco["annotations"].append(
                {
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": class_to_id[class_name],
                    "bbox": [xmin, ymin, box_width, box_height],
                    "area": box_width * box_height,
                    "iscrowd": 0,
                }
            )
            annotation_id += 1

    # Create e.g. "coco_dataset/annotations/" on first use instead of failing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(coco, f, indent=2)
# Usage
voc_to_coco(
images_dir="./voc_dataset/JPEGImages",
annotations_dir="./voc_dataset/Annotations",
class_names=["person", "car", "dog"],
output_path="./coco_dataset/annotations/instances_train2017.json",
)
Semantic Segmentation¶
PNG Mask Format¶
Semantic segmentation uses pixel-wise class labels stored as grayscale PNG images.
Directory Structure¶
segmentation_dataset/
├── images/
│ ├── train/
│ │ ├── image001.jpg
│ │ └── ...
│ └── val/
│ └── ...
└── masks/
├── train/
│ ├── image001.png # Same name, .png extension
│ └── ...
└── val/
└── ...
Mask Format¶
- Grayscale PNG: Pixel values represent class indices (0, 1, 2, ...)
- Ignore index: Use 255 for pixels to ignore (e.g., boundaries)
- Same dimensions: Mask must match image dimensions
import numpy as np
from PIL import Image
# Example: Create a segmentation mask
mask = np.zeros((480, 640), dtype=np.uint8)
mask[100:200, 100:300] = 1 # Class 1 region
mask[200:400, 200:500] = 2 # Class 2 region
mask[0:10, :] = 255 # Ignore boundary
# Save as PNG
Image.fromarray(mask).save("mask.png")
Cityscapes Format¶
Cityscapes uses a specific naming convention:
cityscapes/
├── leftImg8bit/
│ ├── train/
│ │ └── aachen/
│ │ ├── aachen_000000_000019_leftImg8bit.png
│ │ └── ...
│ └── val/
│ └── ...
└── gtFine/
├── train/
│ └── aachen/
│ ├── aachen_000000_000019_gtFine_labelIds.png
│ └── ...
└── val/
└── ...
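Cityscapes class mapping (19 classes, shown as train IDs; the `*_gtFine_labelIds.png` files store raw label IDs, which must be remapped to these train IDs before training):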
| ID | Class | Color |
|---|---|---|
| 0 | road | (128, 64, 128) |
| 1 | sidewalk | (244, 35, 232) |
| 2 | building | (70, 70, 70) |
| 3 | wall | (102, 102, 156) |
| ... | ... | ... |
| 255 | ignore | - |
Pascal VOC Segmentation Format¶
voc_segmentation/
├── JPEGImages/
│ ├── 2007_000027.jpg
│ └── ...
└── SegmentationClass/
├── 2007_000027.png # Colored masks
└── ...
Pascal VOC uses colored masks that need to be converted to class indices:
import numpy as np
from PIL import Image
# VOC color palette (21 classes)
# Standard Pascal VOC palette: list index == class id (21 classes).
VOC_COLORS = [
    [0, 0, 0],  # 0: background
    [128, 0, 0],  # 1: aeroplane
    [0, 128, 0],  # 2: bicycle
    [128, 128, 0],  # 3: bird
    [0, 0, 128],  # 4: boat
    [128, 0, 128],  # 5: bottle
    [0, 128, 128],  # 6: bus
    [128, 128, 128],  # 7: car
    [64, 0, 0],  # 8: cat
    [192, 0, 0],  # 9: chair
    [64, 128, 0],  # 10: cow
    [192, 128, 0],  # 11: diningtable
    [64, 0, 128],  # 12: dog
    [192, 0, 128],  # 13: horse
    [64, 128, 128],  # 14: motorbike
    [192, 128, 128],  # 15: person
    [0, 64, 0],  # 16: pottedplant
    [128, 64, 0],  # 17: sheep
    [0, 192, 0],  # 18: sofa
    [128, 192, 0],  # 19: train
    [0, 64, 128],  # 20: tvmonitor
]


def voc_color_to_class(colored_mask):
    """Convert a VOC colored mask to a mask of class indices.

    Args:
        colored_mask: A PIL image (any mode; it is expanded to RGB) or an
            array of shape (H, W, 3+) holding RGB colors.

    Returns:
        A ``uint8`` array of shape (H, W). Pixels whose color is in
        ``VOC_COLORS`` get the matching class id; every other pixel,
        including VOC's void/boundary color (224, 224, 192), is set to the
        ignore index 255.

    Raises:
        ValueError: If the input does not have a color channel axis.
    """
    # VOC ships palette-mode PNGs; expand them so each pixel is an RGB triple.
    if hasattr(colored_mask, "convert"):
        colored_mask = colored_mask.convert("RGB")
    mask = np.array(colored_mask)
    if mask.ndim != 3 or mask.shape[-1] < 3:
        raise ValueError(
            f"Expected an (H, W, 3) color mask, got array of shape {mask.shape}"
        )
    mask = mask[..., :3]  # drop an alpha channel if present

    # Start from "ignore" so unknown colors never masquerade as background.
    class_mask = np.full(mask.shape[:2], 255, dtype=np.uint8)
    for class_id, color in enumerate(VOC_COLORS):
        matches = np.all(mask == color, axis=-1)
        class_mask[matches] = class_id
    return class_mask
# Usage
colored = Image.open("colored_mask.png")
class_mask = voc_color_to_class(colored)
Image.fromarray(class_mask).save("class_mask.png")
Using SegmentationDataModule¶
from autotimm import SegmentationDataModule
data = SegmentationDataModule(
data_dir="./segmentation_dataset",
image_size=512,
batch_size=8,
num_workers=4,
)
Instance Segmentation¶
COCO Instance Format¶
Instance segmentation extends detection with per-instance masks. Masks can be stored as polygons or RLE (Run-Length Encoding).
Polygon Format¶
{
"annotations": [
{
"id": 1,
"image_id": 1,
"category_id": 1,
"bbox": [100, 150, 200, 300],
"segmentation": [[100, 150, 300, 150, 300, 450, 100, 450]],
"area": 60000,
"iscrowd": 0
}
]
}
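- `segmentation`: List of polygons, each polygon is `[x1, y1, x2, y2, ...]`
- Multiple polygons describe an object made of several disjoint parts (e.g., split by occlusion); polygons cannot encode holes, so use RLE for masks with holes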
RLE Format (for complex masks)¶
{
"annotations": [
{
"id": 1,
"image_id": 1,
"category_id": 1,
"bbox": [100, 150, 200, 300],
"segmentation": {
"counts": "encoded_string...",
"size": [480, 640]
},
"area": 60000,
"iscrowd": 0
}
]
}
Converting Masks to COCO Format¶
import json
import numpy as np
from pycocotools import mask as mask_utils
from PIL import Image
def mask_to_polygon(binary_mask):
    """Trace the outlines of a binary mask as COCO-style polygons.

    Returns a list with one flat ``[x1, y1, x2, y2, ...]`` list per contour
    found at the 0.5 level; contours with fewer than three points are
    dropped.
    """
    from skimage import measure

    # The (row, col) pairs are swapped to the (x, y) order COCO expects.
    return [
        np.flip(outline, axis=1).ravel().tolist()
        for outline in measure.find_contours(binary_mask, 0.5)
        if len(outline) >= 3
    ]
def mask_to_rle(binary_mask):
    """Encode a binary mask as a JSON-serializable COCO RLE dict."""
    # The mask is cast to uint8 and made Fortran-ordered before encoding.
    fortran_mask = np.asfortranarray(binary_mask.astype(np.uint8))
    encoded = mask_utils.encode(fortran_mask)
    # "counts" comes back as bytes; decode it so json.dump can write it.
    encoded["counts"] = encoded["counts"].decode("utf-8")
    return encoded
def create_instance_annotation(
    annotation_id, image_id, category_id, binary_mask, use_rle=False
):
    """Create a COCO-format instance annotation from a binary mask.

    Args:
        annotation_id: Value stored as the annotation ``id``.
        image_id: Value stored as ``image_id``.
        category_id: Value stored as ``category_id``.
        binary_mask: Mask of the instance; assumed to be a 2D (H, W) array
            with non-zero pixels marking the object.
        use_rle: If true, store the mask as RLE; otherwise as polygons.

    Returns:
        A dict with ``id``, ``image_id``, ``category_id``, ``bbox``
        (``[x, y, width, height]``), ``segmentation``, ``area`` (number of
        mask pixels) and ``iscrowd`` (always 0).

    Raises:
        IndexError: If the mask contains no foreground pixels.
    """
    # Get bounding box from the first/last rows and columns holding pixels.
    rows = np.any(binary_mask, axis=1)
    cols = np.any(binary_mask, axis=0)
    ymin, ymax = np.where(rows)[0][[0, -1]]
    xmin, xmax = np.where(cols)[0][[0, -1]]
    # min/max are inclusive pixel indices, so the extent is (max - min + 1);
    # without the +1 a one-pixel-wide mask would get a zero-width box.
    bbox = [
        float(xmin),
        float(ymin),
        float(xmax - xmin + 1),
        float(ymax - ymin + 1),
    ]
    area = float(binary_mask.sum())
    if use_rle:
        segmentation = mask_to_rle(binary_mask)
    else:
        segmentation = mask_to_polygon(binary_mask)
    return {
        "id": annotation_id,
        "image_id": image_id,
        "category_id": category_id,
        "bbox": bbox,
        "segmentation": segmentation,
        "area": area,
        "iscrowd": 0,
    }
Using InstanceSegmentationDataModule¶
from autotimm import InstanceSegmentationDataModule
data = InstanceSegmentationDataModule(
data_dir="./coco_instance",
image_size=640,
batch_size=4,
num_workers=4,
)
Data Validation¶
Validation Script¶
import json
import os
from PIL import Image
def validate_classification_dataset(data_dir):
    """Check an ImageFolder-style dataset and summarize its contents.

    Looks at the ``train``, ``val`` and ``test`` splits under ``data_dir``
    (``test`` may be absent). Returns ``(errors, stats)`` where ``errors``
    is a list of messages and ``stats`` holds the largest per-split class
    count and the total number of image files found.
    """
    problems = []
    summary = {"classes": 0, "images": 0}
    image_suffixes = (".jpg", ".jpeg", ".png")

    for split_name in ("train", "val", "test"):
        split_root = os.path.join(data_dir, split_name)
        if not os.path.exists(split_root):
            # Only train/val are mandatory.
            if split_name != "test":
                problems.append(f"Missing {split_name} directory")
            continue

        # Every sub-directory of a split is one class.
        class_dirs = []
        for entry in os.listdir(split_root):
            candidate = os.path.join(split_root, entry)
            if os.path.isdir(candidate):
                class_dirs.append(candidate)
        summary["classes"] = max(summary["classes"], len(class_dirs))

        for class_dir in class_dirs:
            image_names = [
                name
                for name in os.listdir(class_dir)
                if name.lower().endswith(image_suffixes)
            ]
            summary["images"] += len(image_names)

            # Only the first 10 files per class are opened, to keep it fast.
            for image_name in image_names[:10]:
                image_path = os.path.join(class_dir, image_name)
                try:
                    with Image.open(image_path) as img:
                        img.verify()
                except Exception as exc:
                    problems.append(f"Corrupt image: {image_path}: {exc}")

    return problems, summary
def validate_coco_dataset(data_dir):
    """Check a COCO-layout detection dataset and summarize its contents.

    For the ``train`` and ``val`` splits, reads
    ``annotations/instances_<split>2017.json`` and verifies that the first
    100 listed images exist in ``<split>2017/`` and that every annotation
    points at a known image and category and has a 4-element bbox.
    Returns ``(errors, stats)`` with per-split counts in ``stats``.
    """
    problems = []
    summary = {}
    annotations_root = os.path.join(data_dir, "annotations")

    for split_name in ("train", "val"):
        annotation_path = os.path.join(
            annotations_root, f"instances_{split_name}2017.json"
        )
        image_root = os.path.join(data_dir, f"{split_name}2017")

        if not os.path.exists(annotation_path):
            problems.append(f"Missing annotation file: {annotation_path}")
            continue

        with open(annotation_path) as handle:
            coco = json.load(handle)
        images = coco["images"]
        annotations = coco["annotations"]
        categories = coco["categories"]

        summary[split_name] = {
            "images": len(images),
            "annotations": len(annotations),
            "categories": len(categories),
        }

        # Spot-check that image files are on disk (first 100 only).
        for image_record in images[:100]:
            candidate = os.path.join(image_root, image_record["file_name"])
            if not os.path.exists(candidate):
                problems.append(f"Missing image: {candidate}")

        # Cross-check every annotation against the declared ids.
        known_image_ids = {record["id"] for record in images}
        known_category_ids = {record["id"] for record in categories}
        for annotation in annotations:
            if annotation["image_id"] not in known_image_ids:
                problems.append(
                    f"Annotation {annotation['id']} references invalid image_id"
                )
            if annotation["category_id"] not in known_category_ids:
                problems.append(
                    f"Annotation {annotation['id']} references invalid category_id"
                )
            if len(annotation["bbox"]) != 4:
                problems.append(f"Annotation {annotation['id']} has invalid bbox")

    return problems, summary
def validate_segmentation_dataset(images_dir, masks_dir):
    """Validate a semantic segmentation dataset (images + PNG masks).

    For the ``train`` and ``val`` splits, every ``.jpg``/``.jpeg``/``.png``
    file in ``images_dir/<split>`` must have a mask with the same base name
    and a ``.png`` extension in ``masks_dir/<split>``, with identical
    dimensions.

    Args:
        images_dir: Root directory of the images (contains ``train``/``val``).
        masks_dir: Root directory of the masks (contains ``train``/``val``).

    Returns:
        ``(errors, stats)``: a list of error messages, and a dict with the
        number of images seen (``"images"``) and the sorted list of pixel
        values found across all masks (``"classes"``).
    """
    # Imported here because the validation script's header only imports
    # json, os and PIL.
    import numpy as np

    errors = []
    stats = {"images": 0, "classes": set()}
    for split in ["train", "val"]:
        img_split = os.path.join(images_dir, split)
        mask_split = os.path.join(masks_dir, split)
        if not os.path.exists(img_split):
            errors.append(f"Missing images/{split} directory")
            continue
        # Sorted so the error report is stable between runs.
        for img_file in sorted(os.listdir(img_split)):
            if not img_file.lower().endswith((".jpg", ".jpeg", ".png")):
                continue
            stats["images"] += 1

            # Check corresponding mask exists
            mask_file = os.path.splitext(img_file)[0] + ".png"
            mask_path = os.path.join(mask_split, mask_file)
            if not os.path.exists(mask_path):
                errors.append(f"Missing mask for {img_file}")
                continue

            # Context managers close both files promptly; a large dataset
            # would otherwise leave many file handles open.
            with Image.open(os.path.join(img_split, img_file)) as img, Image.open(
                mask_path
            ) as mask:
                # Validate dimensions match
                if img.size != mask.size:
                    errors.append(
                        f"Size mismatch: {img_file} {img.size} vs mask {mask.size}"
                    )
                # Collect unique classes
                mask_array = np.array(mask)
            stats["classes"].update(np.unique(mask_array).tolist())

    stats["classes"] = sorted(stats["classes"])
    return errors, stats
# Usage examples
if __name__ == "__main__":
    # Validate classification dataset
    errors, stats = validate_classification_dataset("./my_dataset")
    print(f"Classification: {stats}")
    if errors:
        print(f"Errors: {errors[:5]}")
    # Validate COCO dataset
    errors, stats = validate_coco_dataset("./coco")
    print(f"Detection: {stats}")
    if errors:
        # Report detection problems too, instead of silently discarding them.
        print(f"Errors: {errors[:5]}")
Best Practices¶
Train/Val/Test Splits¶
Recommended split ratios:
| Dataset Size | Train | Val | Test |
|---|---|---|---|
| < 1,000 images | 70% | 15% | 15% |
| 1,000 - 10,000 | 80% | 10% | 10% |
| > 10,000 | 90% | 5% | 5% |
Splitting script:
import os
import random
import shutil
def split_dataset(source_dir, dest_dir, train_ratio=0.8, val_ratio=0.1):
    """Split a folder-per-class image collection into train/val/test sets.

    Each class directory in ``source_dir`` is shuffled with a fixed seed and
    copied into ``dest_dir/<split>/<class_name>/``. The test split receives
    whatever remains after the train and val shares are taken.

    Args:
        source_dir: Directory with one sub-directory per class.
        dest_dir: Output root; split/class directories are created as needed.
        train_ratio: Fraction of each class copied to ``train``.
        val_ratio: Fraction of each class copied to ``val``.
    """
    # A private generator keeps the split reproducible without reseeding
    # (and thereby disturbing) the caller's global ``random`` state.
    rng = random.Random(42)

    # os.listdir order is arbitrary; sort so the fixed seed always yields
    # the same split on every machine and filesystem.
    for class_name in sorted(os.listdir(source_dir)):
        class_dir = os.path.join(source_dir, class_name)
        if not os.path.isdir(class_dir):
            continue
        images = sorted(
            f
            for f in os.listdir(class_dir)
            if f.lower().endswith((".jpg", ".jpeg", ".png"))
        )
        rng.shuffle(images)

        n_train = int(len(images) * train_ratio)
        n_val = int(len(images) * val_ratio)
        splits = {
            "train": images[:n_train],
            "val": images[n_train : n_train + n_val],
            "test": images[n_train + n_val :],
        }
        for split, split_images in splits.items():
            split_dir = os.path.join(dest_dir, split, class_name)
            os.makedirs(split_dir, exist_ok=True)
            for img in split_images:
                src = os.path.join(class_dir, img)
                dst = os.path.join(split_dir, img)
                # copy2 keeps file metadata and leaves the source untouched.
                shutil.copy2(src, dst)
# Usage
split_dataset("./raw_images", "./dataset")
Handling Imbalanced Data¶
from autotimm import ImageDataModule
# Option 1: Use weighted sampling
data = ImageDataModule(
data_dir="./imbalanced_dataset",
image_size=224,
batch_size=32,
weighted_sampling=True, # Oversample minority classes
)
# Option 2: Use class weights in loss
import torch
# Calculate class weights (inverse frequency)
class_counts = [1000, 500, 100] # Example: 3 classes
total = sum(class_counts)
class_weights = torch.tensor([total / c for c in class_counts])
class_weights = class_weights / class_weights.sum() # Normalize
Data Augmentation Tips¶
| Task | Recommended Augmentations |
|---|---|
| Classification | RandomResizedCrop, HorizontalFlip, ColorJitter, RandAugment |
| Detection | RandomResizedCrop (with bbox transform), HorizontalFlip |
| Segmentation | RandomResizedCrop (with mask transform), HorizontalFlip, ColorJitter |
See Also¶
- Data Loading Overview - Data module documentation
- Image Classification Data - Classification details
- Object Detection Data - Detection details
- Segmentation Data - Segmentation details