5  Working with Image Data

In this chapter, we move from grayscale MNIST to color images with CIFAR-10: 60,000 32×32 RGB images (50,000 for training, 10,000 for testing) split evenly across 10 classes (airplanes, automobiles, birds, cats, deer, dogs, frogs, horses, ships, trucks).

5.1 Loading CIFAR-10

PyTorch:

import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Load CIFAR-10
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

print(f"Training images: {len(trainset)}")
print(f"Test images: {len(testset)}")

TensorFlow (Keras):

import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np

# Load CIFAR-10
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

# Normalize to [-1, 1]
x_train = (x_train.astype('float32') - 127.5) / 127.5
x_test = (x_test.astype('float32') - 127.5) / 127.5

classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

print(f"Training images: {x_train.shape[0]}")
print(f"Test images: {x_test.shape[0]}")
print(f"Image shape: {x_train.shape[1:]}")

5.2 Data Augmentation

Data augmentation artificially expands the training set by applying random, label-preserving transformations to each image as it is loaded. This helps prevent overfitting and improves generalization.

Common augmentations:

  • Random horizontal flip
  • Random rotation (±15°)
  • Random crop (see the sketch at the end of this section)
  • Color jitter (brightness, contrast)

PyTorch:

# Augmentation pipeline
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset_aug = torchvision.datasets.CIFAR10(root='./data', train=True, transform=train_transform)

# Visualize augmentations
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    img, label = trainset_aug[0]  # Same image, different augmentation
    img = img.permute(1, 2, 0).numpy()
    img = (img * 0.5 + 0.5)  # Denormalize
    ax.imshow(np.clip(img, 0, 1))
    ax.set_title(f'{classes[label]}')
    ax.axis('off')
plt.tight_layout()
plt.show()

TensorFlow (Keras):

# Augmentation pipeline
data_augmentation = keras.Sequential([
    keras.layers.RandomFlip("horizontal"),
    keras.layers.RandomRotation(15 / 360),  # Keras expects a fraction of a full turn, so this is ≈ ±15°
    keras.layers.RandomContrast(0.2),
])

# Visualize augmentations
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
sample_image = x_train[0:1]
for i, ax in enumerate(axes.flat):
    augmented = data_augmentation(sample_image, training=True)
    img = augmented[0].numpy()
    img = (img + 1) / 2  # Denormalize to [0,1]
    ax.imshow(np.clip(img, 0, 1))
    ax.set_title(f'{classes[y_train[0][0]]}')
    ax.axis('off')
plt.tight_layout()
plt.show()
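
Neither pipeline above includes the random crop from the list at the start of this section. A common CIFAR-10 recipe pads each image by 4 pixels and crops back to 32×32, so objects shift slightly from epoch to epoch. A minimal PyTorch sketch of that variant (the transforms are standard torchvision; the variable names and exact parameter values are just one reasonable choice):

# Augmentation pipeline with a padded random crop added
train_transform_crop = transforms.Compose([
    transforms.RandomCrop(32, padding=4),        # pad to 40x40, crop back to 32x32
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset_crop = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform_crop)

Swapping this in for train_transform in the earlier code is the only change needed.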

5.3 Building a CIFAR-10 CNN

PyTorch:

import torch.nn as nn
import torch.nn.functional as F

class CIFAR10Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 128 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

model = CIFAR10Net()
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

TensorFlow (Keras):

model = keras.Sequential([
    keras.Input(shape=(32, 32, 3)),
    data_augmentation,
    keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(10, activation='softmax')
])

model.summary()

5.4 Training with Smart Batch Sizes

PyTorch:

from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Adapt the batch size to the available hardware
batch_size = 32 if torch.cuda.is_available() else 8

trainloader = DataLoader(trainset_aug, batch_size=batch_size, shuffle=True)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop (abbreviated)
for epoch in range(3):  # Train for 3 epochs for demo
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 200 == 0:
            print(f'[{epoch+1}, {i}] loss: {running_loss/(i+1):.3f}')
    print(f'Epoch {epoch+1} complete\n')
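
With training finished, you will want a number to compare against the accuracy range quoted in the summary. A minimal evaluation sketch for the PyTorch model, using the standard pattern: switch to eval mode, disable gradients, take the argmax of the logits:

# Evaluate on the test set
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in testloader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
print(f'Test accuracy: {100 * correct / total:.1f}%')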

TensorFlow (Keras):

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

batch_size = 32 if tf.config.list_physical_devices('GPU') else 8

history = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=3,
    validation_split=0.1
)
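
The Keras side can use model.evaluate on the test split. Because the augmentation layers live inside the model but are only active in training mode, evaluation sees the original, un-augmented images:

# Evaluate on the test set
test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print(f"Test accuracy: {test_acc:.3f}")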

5.5 Summary

  • CIFAR-10 is more challenging than MNIST: 32×32 color images with far more visual variety than handwritten digits
  • Data augmentation improves generalization
  • Deeper CNNs (3+ conv layers) work better for complex images
  • Dropout prevents overfitting
  • Expect roughly 70-80% test accuracy with this architecture when trained to convergence (more than the 3 demo epochs above)

5.6 What’s Next?

Chapter 6: Transfer Learning - leverage pre-trained models for even better results!