5 Working with Image Data

In this chapter, we move from grayscale MNIST to color images with CIFAR-10: 60,000 32×32 RGB images in 10 classes (airplanes, cars, birds, cats, deer, dogs, frogs, horses, ships, trucks).

5.1 Loading CIFAR-10

PyTorch:

import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
# Load CIFAR-10
transform = transforms.Compose([
    transforms.ToTensor(),                                   # [0, 255] uint8 -> [0, 1] float tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))   # [0, 1] -> [-1, 1] per channel
])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
print(f"Training images: {len(trainset)}")
print(f"Test images: {len(testset)}")5 Working with Image Data
TensorFlow/Keras:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
# Load CIFAR-10
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
# Normalize to [-1, 1]
x_train = (x_train.astype('float32') - 127.5) / 127.5
x_test = (x_test.astype('float32') - 127.5) / 127.5
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
print(f"Training images: {x_train.shape[0]}")
print(f"Test images: {x_test.shape[0]}")
print(f"Image shape: {x_train.shape[1:]}")5.2 Data Augmentation
5.2 Data Augmentation

Data augmentation artificially expands your training set by applying random, label-preserving transformations to each image as it is loaded. This helps prevent overfitting and improves generalization.
Common augmentations:
- Random horizontal flip
- Random rotation (±15°)
- Random crop (see the sketch after this list)
- Color jitter (brightness, contrast)
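Random crop is listed above but not included in the pipelines below. If you want to add it, the usual CIFAR-10 recipe pads the image by a few pixels and then crops a random 32×32 window; here is a minimal PyTorch sketch (the padding of 4 pixels is a common convention, not a requirement):

# Pad to 40x40, then crop a random 32x32 window; combined with a flip,
# this is the classic CIFAR-10 augmentation baseline
crop_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])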
PyTorch:

# Augmentation pipeline
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
trainset_aug = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
# Visualize augmentations
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    img, label = trainset_aug[0]        # Same image, different augmentation each access
    img = img.permute(1, 2, 0).numpy()  # (C, H, W) -> (H, W, C) for matplotlib
    img = img * 0.5 + 0.5               # Denormalize back to [0, 1]
    ax.imshow(np.clip(img, 0, 1))
    ax.set_title(f'{classes[label]}')
    ax.axis('off')
plt.tight_layout()
plt.show()

TensorFlow/Keras:

# Augmentation pipeline
data_augmentation = keras.Sequential([
    keras.layers.RandomFlip("horizontal"),
    keras.layers.RandomRotation(0.1),   # up to ±10% of a full turn (~±36°)
    keras.layers.RandomContrast(0.2),
])
# Visualize augmentations
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
sample_image = x_train[0:1]  # slice keeps the batch dimension
for i, ax in enumerate(axes.flat):
    augmented = data_augmentation(sample_image, training=True)
    img = augmented[0].numpy()
    img = (img + 1) / 2  # Denormalize to [0, 1]
    ax.imshow(np.clip(img, 0, 1))
    ax.set_title(f'{classes[y_train[0][0]]}')
    ax.axis('off')
plt.tight_layout()
plt.show()
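Because these preprocessing layers live inside a Keras model, they are only active during training; at inference time they pass inputs through unchanged. A quick check, reusing sample_image from above:

# In inference mode (training=False) the augmentation layers are identity functions
passthrough = data_augmentation(sample_image, training=False)
print(np.allclose(passthrough.numpy(), sample_image))   # True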
5.3 Building a CIFAR-10 CNN

PyTorch:

import torch.nn as nn
import torch.nn.functional as F
class CIFAR10Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.fc2 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 32x32 -> 16x16
        x = self.pool(F.relu(self.conv2(x)))  # 16x16 -> 8x8
        x = self.pool(F.relu(self.conv3(x)))  # 8x8 -> 4x4
        x = x.view(-1, 128 * 4 * 4)           # flatten to 2048 features
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

model = CIFAR10Net()
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")model = keras.Sequential([
TensorFlow/Keras:

model = keras.Sequential([
    data_augmentation,
    keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=(32, 32, 3)),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(10, activation='softmax')
])
model.build((None, 32, 32, 3))
model.summary()

5.4 Training with Smart Batch Sizes
Rather than hard-coding one batch size, we pick it from the available hardware: a larger batch when a GPU is present, a smaller one on CPU so each step stays responsive.

PyTorch:

from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32 if torch.cuda.is_available() else 8
trainloader = DataLoader(trainset_aug, batch_size=batch_size, shuffle=True)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Training loop (abbreviated)
for epoch in range(3):  # Train for 3 epochs for demo
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 200 == 0:
            print(f'[{epoch+1}, {i}] loss: {running_loss/(i+1):.3f}')
    print(f'Epoch {epoch+1} complete\n')
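The loop above never touches the test set. Here is a minimal evaluation pass, using the testloader defined earlier, to see where you land relative to the accuracy estimate in the summary below:

# Measure accuracy on the held-out test set
model.eval()   # disable dropout
correct = total = 0
with torch.no_grad():
    for inputs, labels in testloader:
        inputs, labels = inputs.to(device), labels.to(device)
        preds = model(inputs).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f'Test accuracy: {100 * correct / total:.1f}%')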
TensorFlow/Keras:

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',   # labels are integer class IDs, not one-hot
    metrics=['accuracy']
)
batch_size = 32 if tf.config.list_physical_devices('GPU') else 8
history = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=3,
    validation_split=0.1
)
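On the Keras side, the same check is a single call:

# Held-out accuracy; the augmentation layers are inactive during evaluation
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print(f'Test accuracy: {100 * test_acc:.1f}%')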
5.5 Summary

- CIFAR-10 is more challenging than MNIST: 32×32 color images in 10 varied classes, versus 28×28 grayscale digits
- Data augmentation improves generalization
- Deeper CNNs (3+ conv layers) work better for complex images
- Dropout prevents overfitting
- Expect roughly 70-80% test accuracy from this architecture when trained to convergence; the 3-epoch demo above will land lower
5.6 What’s Next?
Chapter 6: Transfer Learning - leverage pre-trained models for even better results!