3  Your First Deep Neural Network

3.1 Why Start with MNIST?

MNIST (Modified National Institute of Standards and Technology) is the “Hello World” of deep learning. It contains 70,000 images of handwritten digits (0-9), each 28×28 pixels in grayscale.

Why it’s perfect for learning:

- Small enough to train quickly on a CPU
- Simple enough to understand
- Complex enough to benefit from deep learning
- A standard benchmark everyone uses

3.2 The Task

- Input: 28×28 grayscale image (784 pixels)
- Output: Digit class (0, 1, 2, …, 9)
- Type: Multi-class classification (10 classes)
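
Concretely, the network will map a flattened 784-value image to a vector of 10 class scores, and the predicted digit is the index of the largest score. A minimal sketch of that shape contract, using random tensors as stand-ins for a real image and real network outputs:

import torch

fake_image = torch.rand(1, 28, 28)     # stand-in for one grayscale image
flattened = fake_image.view(1, -1)     # shape (1, 784): the network's input
scores = torch.rand(1, 10)             # stand-in for the network's 10 outputs
print(flattened.shape)                 # torch.Size([1, 784])
print(scores.argmax(dim=1).item())     # index of the largest score = predicted digit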

3.3 Building the Network

We’ll create a fully connected (dense) neural network with:

- Input layer: 784 neurons (28×28 pixels flattened)
- Hidden layer 1: 128 neurons + ReLU
- Hidden layer 2: 64 neurons + ReLU
- Output layer: 10 neurons (one per digit) + Softmax

PyTorch:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# 1. Define the neural network architecture
class DigitClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28 * 28, 128)  # 784 → 128
        self.fc2 = nn.Linear(128, 64)        # 128 → 64
        self.fc3 = nn.Linear(64, 10)         # 64 → 10 classes

    def forward(self, x):
        x = self.flatten(x)          # Flatten 28x28 to 784
        x = F.relu(self.fc1(x))      # Hidden layer 1 + ReLU
        x = F.relu(self.fc2(x))      # Hidden layer 2 + ReLU
        x = self.fc3(x)              # Output layer (logits)
        return x

# Create model
model = DigitClassifier()
print(model)
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}")
TensorFlow/Keras:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# 1. Define the neural network architecture
model = keras.Sequential([
    layers.Flatten(input_shape=(28, 28)),     # Flatten 28x28 to 784
    layers.Dense(128, activation='relu'),     # Hidden layer 1 + ReLU
    layers.Dense(64, activation='relu'),      # Hidden layer 2 + ReLU
    layers.Dense(10, activation='softmax')    # Output layer + Softmax
])

# Display model architecture
model.build(input_shape=(None, 28, 28))
model.summary()
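
Both versions use the same layer sizes, so they have the same number of trainable parameters. As a sanity check, you can work the count out by hand: each dense layer contributes a weight matrix plus one bias per output neuron.

layer_sizes = [784, 128, 64, 10]
total = 0
for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]):
    params = n_in * n_out + n_out      # weights + biases for this layer
    print(f"{n_in:>4} -> {n_out:<3}: {params:,} parameters")
    total += params
print(f"Total: {total:,}")             # 109,386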

3.4 Loading and Exploring the Data

PyTorch:

# 2. Load MNIST dataset
transform = transforms.Compose([
    transforms.ToTensor(),                                    # Convert to tensor
    transforms.Normalize((0.1307,), (0.3081,))               # Normalize (mean, std)
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# Visualize some examples
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    image, label = train_dataset[i]
    ax.imshow(image.squeeze(), cmap='gray')
    ax.set_title(f'Label: {label}')
    ax.axis('off')
plt.tight_layout()
plt.show()
TensorFlow/Keras:

# 2. Load MNIST dataset
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Normalize pixel values to 0-1 range
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

print(f"Training samples: {x_train.shape[0]}")
print(f"Test samples: {x_test.shape[0]}")
print(f"Image shape: {x_train.shape[1:]}")

# Visualize some examples
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    ax.imshow(x_train[i], cmap='gray')
    ax.set_title(f'Label: {y_train[i]}')
    ax.axis('off')
plt.tight_layout()
plt.show()
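
The PyTorch transform above normalizes with mean 0.1307 and standard deviation 0.3081; those constants are simply the statistics of the MNIST training pixels after scaling to the 0-1 range. With the Keras arrays already loaded, you can verify them in one line:

# The Normalize((0.1307,), (0.3081,)) constants are just the dataset statistics
print(f"Pixel mean: {x_train.mean():.4f}, std: {x_train.std():.4f}")   # ≈ 0.1307, 0.3081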

3.5 Smart Batch Size Selection

PyTorch:

# Hardware-aware batch size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"💻 Using device: {device}")

if torch.cuda.is_available():
    batch_size = 64
    print("💡 GPU detected! Using batch_size=64")
    print(f"   If you get OOM errors, reduce to 32 or 16")
else:
    batch_size = 16
    print("💡 CPU mode. Using batch_size=16")
    print(f"   Training will take ~5 minutes. Consider using Google Colab for faster training!")

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"\nBatches per epoch: {len(train_loader)}")
TensorFlow/Keras:

# Hardware-aware batch size
gpus = tf.config.list_physical_devices('GPU')
print(f"💻 Using device: {'GPU' if gpus else 'CPU'}")

if gpus:
    batch_size = 64
    print("💡 GPU detected! Using batch_size=64")
    print(f"   If you get OOM errors, reduce to 32 or 16")
else:
    batch_size = 16
    print("💡 CPU mode. Using batch_size=16")
    print(f"   Training will take ~5 minutes. Consider using Google Colab for faster training!")

print(f"\nBatches per epoch: {len(x_train) // batch_size}")

3.6 Training the Model

PyTorch:

# 3. Setup training
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
train_losses = []
train_accuracies = []

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        # Forward pass
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Track metrics
        running_loss += loss.item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

        if batch_idx % 200 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, '
                  f'Loss: {loss.item():.4f}')

    # Epoch statistics
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100. * correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)

    print(f'\nEpoch {epoch+1} Summary: Loss = {epoch_loss:.4f}, Accuracy = {epoch_acc:.2f}%\n')

print("✅ Training complete!")
TensorFlow/Keras:

# 3. Compile and train
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=5,
    validation_split=0.1,
    verbose=1
)

print("\n✅ Training complete!")

3.7 Evaluation and Testing

PyTorch:

# 4. Evaluate on test set
model.eval()
test_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        test_loss += criterion(output, target).item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

test_loss /= len(test_loader)
test_accuracy = 100. * correct / total

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.2f}%')
TensorFlow/Keras:

# 4. Evaluate on test set
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)

print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
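
A single accuracy number hides which digits the model finds hard. One way to dig a little deeper is per-class accuracy; here is a short sketch using the Keras model and the test arrays loaded earlier:

pred_classes = model.predict(x_test, verbose=0).argmax(axis=1)
for digit in range(10):
    mask = (y_test == digit)
    digit_acc = (pred_classes[mask] == digit).mean()
    print(f"Digit {digit}: {digit_acc * 100:.1f}% correct over {mask.sum()} samples")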

3.8 Visualizing Results

PyTorch:

# Plot training curves
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.plot(train_losses)
ax1.set_title('Training Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.grid(True, alpha=0.3)

ax2.plot(train_accuracies)
ax2.set_title('Training Accuracy')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy (%)')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
TensorFlow/Keras:

# Plot training curves
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.plot(history.history['loss'], label='Training')
ax1.plot(history.history['val_loss'], label='Validation')
ax1.set_title('Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

ax2.plot(history.history['accuracy'], label='Training')
ax2.plot(history.history['val_accuracy'], label='Validation')
ax2.set_title('Accuracy')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

3.9 Making Predictions

PyTorch:

# Make predictions on test samples
model.eval()
with torch.no_grad():
    # Get first batch from test set
    test_images, test_labels = next(iter(test_loader))
    test_images = test_images.to(device)

    # Make predictions
    outputs = model(test_images)
    probabilities = F.softmax(outputs, dim=1)
    predictions = outputs.argmax(dim=1)

# Visualize predictions
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    image = test_images[i].cpu().squeeze()
    true_label = test_labels[i].item()
    pred_label = predictions[i].item()
    confidence = probabilities[i][pred_label].item()

    ax.imshow(image, cmap='gray')
    color = 'green' if pred_label == true_label else 'red'
    ax.set_title(f'True: {true_label}, Pred: {pred_label}\nConf: {confidence:.2f}', color=color)
    ax.axis('off')

plt.tight_layout()
plt.show()
TensorFlow/Keras:

# Make predictions on test samples
predictions = model.predict(x_test[:10])
predicted_classes = predictions.argmax(axis=1)

# Visualize predictions
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for i, ax in enumerate(axes.flat):
    ax.imshow(x_test[i], cmap='gray')
    true_label = y_test[i]
    pred_label = predicted_classes[i]
    confidence = predictions[i][pred_label]

    color = 'green' if pred_label == true_label else 'red'
    ax.set_title(f'True: {true_label}, Pred: {pred_label}\nConf: {confidence:.2f}', color=color)
    ax.axis('off')

plt.tight_layout()
plt.show()
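
A common stumbling block when predicting on a single image is the batch dimension: the model expects a batch of images even if that batch has size one. A minimal sketch with the Keras model and test set from above:

single_image = x_test[0]                   # shape (28, 28)
batch_of_one = single_image[None, ...]     # shape (1, 28, 28): add a batch dimension
probs = model.predict(batch_of_one, verbose=0)[0]
print(f"Predicted digit: {probs.argmax()}, confidence: {probs.max():.2f}")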

3.10 Saving and Loading Models

PyTorch:

# Save the model
torch.save(model.state_dict(), 'mnist_model.pth')
print("✅ Model saved to mnist_model.pth")

# Load the model
loaded_model = DigitClassifier()
loaded_model.load_state_dict(torch.load('mnist_model.pth'))
loaded_model = loaded_model.to(device)
loaded_model.eval()
print("✅ Model loaded successfully")
TensorFlow/Keras:

# Save the model
model.save('mnist_model.h5')
print("✅ Model saved to mnist_model.h5")

# Load the model
loaded_model = keras.models.load_model('mnist_model.h5')
print("✅ Model loaded successfully")

3.11 Summary

What we built:

- A deep neural network with three fully connected layers and 109,386 trainable parameters
- Trained on 60,000 images in ~5 minutes (CPU) or ~30 seconds (GPU)
- Achieved ~97-98% accuracy on the test set

Key concepts:

- Flattening images into 1D vectors for dense layers
- Using ReLU activation in hidden layers
- Using softmax for multi-class classification
- Smart batch size selection based on hardware
- Tracking training metrics
- Saving/loading models for later use

3.12 What’s Next?

In Chapter 4, we’ll learn about Convolutional Neural Networks (CNNs)—specialized architectures for image data that preserve spatial structure and achieve even better results!

🎯 Challenge

Try modifying the network:

1. Add another hidden layer with 32 neurons
2. Change the learning rate to 0.0001 or 0.01
3. Train for 10 epochs instead of 5

What happens to accuracy?
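
If you want a starting point for the first modification, here is one possible PyTorch sketch with an extra 32-neuron hidden layer (the Keras version would simply add another Dense(32, activation='relu') layer before the output):

class DigitClassifierV2(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)     # the new 32-neuron hidden layer
        self.fc4 = nn.Linear(32, 10)

    def forward(self, x):
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)               # logits, as before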