6 Transfer Learning & Pre-trained Models
6.1 What is Transfer Learning?
Instead of training from scratch, we use models pre-trained on massive datasets (ImageNet: over 1.2 million images across 1,000 classes) and adapt them to our task.
Why it works:
- Early layers learn universal features (edges, textures)
- We only need to retrain the final layers for our specific task
- Much faster and better results with less data
When to use:
- ✅ Small dataset (< 10k images)
- ✅ Similar task to ImageNet (object recognition)
- ✅ Limited compute resources
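To see why retraining only the final layer is cheap, here is a minimal PyTorch sketch (using resnet18 purely as an illustration) that compares the size of the reusable backbone with the small classification head:

from torchvision import models
# Minimal sketch: almost all parameters live in the reusable convolutional backbone,
# while the final classifier ("fc") is tiny and cheap to replace and retrain.
model = models.resnet18(pretrained=True)  # ImageNet weights
backbone_params = sum(p.numel() for name, p in model.named_parameters() if not name.startswith("fc"))
head_params = sum(p.numel() for name, p in model.named_parameters() if name.startswith("fc"))
print(f"Backbone (reusable features): {backbone_params:,} parameters")
print(f"Classifier head (task-specific): {head_params:,} parameters")
print([name for name, _ in model.named_children()])  # conv1 ... layer4, avgpool, fc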
6.2 Popular Pre-trained Models
| Model | Parameters | Top-1 Accuracy | Speed |
|---|---|---|---|
| ResNet50 | 25M | 76% | Fast |
| VGG16 | 138M | 71% | Moderate |
| EfficientNet-B0 | 5M | 77% | Fast |
| MobileNetV2 | 3.5M | 72% | Very Fast |
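All of these architectures are available as one-liners in torchvision; a quick sketch that roughly reproduces the parameter counts from the table (a recent torchvision version is assumed for EfficientNet and MobileNetV2):

from torchvision import models
# Sketch: load each architecture from the table and count its parameters.
# Downloads ImageNet weights on first use; EfficientNet requires torchvision >= 0.11.
architectures = {
    "ResNet50": models.resnet50,
    "VGG16": models.vgg16,
    "EfficientNet-B0": models.efficientnet_b0,
    "MobileNetV2": models.mobilenet_v2,
}
for name, builder in architectures.items():
    net = builder(pretrained=True)
    print(f"{name}: {sum(p.numel() for p in net.parameters()) / 1e6:.1f}M parameters")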
6.3 Transfer Learning Strategies
1. Feature Extraction: Freeze pre-trained weights, train only the new classifier
2. Fine-tuning: Unfreeze some layers and train with a low learning rate
6.4 Example: Cat vs Dog Classifier
The example is shown twice: first with PyTorch (ResNet18), then with Keras (ResNet50).
import torch
import torch.nn as nn
from torchvision import models, transforms
import torch.optim as optim
# Load pre-trained ResNet18
model = models.resnet18(pretrained=True)
# Freeze all layers
for param in model.parameters():
    param.requires_grad = False
# Replace final layer for binary classification
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 2)  # 2 classes: cat, dog
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"Frozen parameters: {sum(p.numel() for p in model.parameters() if not p.requires_grad):,}")
# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)  # Only train fc layer
print(f"\n✅ Model ready for training!")
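With the frozen backbone and new head in place, training is an ordinary PyTorch loop. A minimal sketch, assuming a train_loader DataLoader of (image, label) batches (one is built in section 6.5):

# Training-loop sketch for the feature-extraction setup above.
# Assumes `train_loader` yields batches of preprocessed images and integer labels.
model.train()
for epoch in range(5):
    running_loss, correct, total = 0.0, 0, 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)               # forward pass through frozen backbone + new fc
        loss = criterion(outputs, labels)
        loss.backward()                       # gradients reach only the unfrozen fc layer
        optimizer.step()
        running_loss += loss.item() * images.size(0)
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        total += labels.size(0)
    print(f"Epoch {epoch + 1}: loss={running_loss / total:.4f}, accuracy={correct / total:.2%}")

Only the fc layer (about a thousand parameters here) is updated, so each epoch is fast even without a GPU.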
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
# Load pre-trained ResNet50 (without top classifier)
base_model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3)
)
# Freeze base model
base_model.trainable = False
# Build classifier on top
model = keras.Sequential([
    base_model,
    keras.layers.GlobalAveragePooling2D(),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(2, activation='softmax')  # 2 classes: cat, dog
])
# Compile
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
print(f"Trainable parameters: {sum([tf.size(w).numpy() for w in model.trainable_weights]):,}")
print(f"\n✅ Model ready for training!")6.5 Data Preprocessing for Pre-trained Models
6.5 Data Preprocessing for Pre-trained Models
Pre-trained models expect specific input sizes and normalization:
# ResNet expects 224x224 images, ImageNet normalization
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet stats
                         std=[0.229, 0.224, 0.225])
])
# For training: add augmentation
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
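These transforms plug directly into an ImageFolder dataset and DataLoader; a sketch, assuming a hypothetical data/train and data/val layout with one subfolder per class:

from torch.utils.data import DataLoader
from torchvision import datasets
# Hypothetical folder layout: data/train/cat, data/train/dog, data/val/cat, data/val/dog.
train_dataset = datasets.ImageFolder("data/train", transform=train_transform)
val_dataset = datasets.ImageFolder("data/val", transform=preprocess)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)
print(f"Classes: {train_dataset.classes}")  # labels are assigned from subfolder names

This train_loader is what the PyTorch training loop in section 6.4 expects. The Keras equivalent of the same preprocessing follows.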
from tensorflow.keras.applications.resnet50 import preprocess_input
# ResNet expects 224x224 images
def preprocess_image(image, label):
    image = tf.image.resize(image, [224, 224])
    image = preprocess_input(image)  # ImageNet normalization
    return image, label
# For training: add augmentation
data_augmentation = keras.Sequential([
    keras.layers.RandomFlip("horizontal"),
    keras.layers.RandomRotation(0.1),
    keras.layers.RandomZoom(0.1),
])
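A sketch of a possible tf.data input pipeline wiring these pieces together (the data/train and data/val directories, the batch size, and a recent TensorFlow version providing tf.keras.utils.image_dataset_from_directory are assumptions for illustration):

# Hypothetical pipeline: load images from folders, resize/normalize, augment only the training split.
train_ds = tf.keras.utils.image_dataset_from_directory("data/train", image_size=(256, 256), batch_size=32)
val_ds = tf.keras.utils.image_dataset_from_directory("data/val", image_size=(256, 256), batch_size=32)
train_ds = (
    train_ds
    .map(preprocess_image)                                       # resize to 224x224 + ImageNet normalization
    .map(lambda x, y: (data_augmentation(x, training=True), y))  # augmentation on training images only
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = val_ds.map(preprocess_image).prefetch(tf.data.AUTOTUNE)

These are the train_ds / val_ds datasets used by model.fit in section 6.4.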
6.6 Fine-Tuning
After training the classifier, optionally unfreeze some layers for fine-tuning:
# Unfreeze last residual block
for name, param in model.named_parameters():
    if "layer4" in name or "fc" in name:  # Last block + classifier
        param.requires_grad = True
    else:
        param.requires_grad = False
# Use smaller learning rate for fine-tuning
optimizer = optim.Adam(model.parameters(), lr=0.0001)
print(f"Fine-tuning parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
# Unfreeze last layers
base_model.trainable = True
# Freeze all except last 20 layers
for layer in base_model.layers[:-20]:
    layer.trainable = False
# Recompile with lower learning rate
model.compile(
    optimizer=keras.optimizers.Adam(1e-5),  # Lower LR for fine-tuning
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
print(f"Fine-tuning {sum(1 for layer in base_model.layers if layer.trainable)} layers")
6.7 When to Use Which Model?
| Scenario | Recommended Model | Why |
|---|---|---|
| Limited GPU | MobileNetV2, EfficientNet-B0 | Small, fast |
| High accuracy | ResNet50, EfficientNet-B3 | Best accuracy |
| Very small dataset | ResNet18, MobileNetV2 | Less prone to overfit |
| Real-time inference | MobileNetV2 | Fastest |
6.8 Summary
- Transfer learning leverages pre-trained models (ImageNet)
- Freeze early layers, train only classifier (feature extraction)
- Optionally fine-tune last layers with low learning rate
- Achieves better results with less data and faster training
- ResNet, EfficientNet, MobileNet are popular choices
6.9 What’s Next?
Chapter 7: Recurrent Neural Networks for sequential data (text, time series)!