14 Advanced Topics & Modern Architectures
This chapter introduces cutting-edge deep learning architectures that power modern AI: Transformers, GANs, Autoencoders, and more. Each section includes hands-on implementations.
14.1 Attention Mechanisms
Attention allows models to focus on the most relevant parts of the input. It revolutionized NLP and is now used in computer vision too.
14.1.1 Self-Attention Intuition
When reading “The animal didn’t cross the street because it was too tired”, attention helps the model understand that “it” refers to “animal”, not “street”.
Attention computes three things for every token:
1. Query: What am I looking for?
2. Key: What do I have?
3. Value: What information should I return?
Each token’s query is scored against every key, the scores are softmaxed into weights, and those weights take a weighted average of the values: Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V, where d_k is the per-head dimension.
14.1.2 Implementing Attention
PyTorch implementation:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads=8):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        assert self.head_dim * heads == embed_size, "Embed size must be divisible by heads"
        self.queries = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.values = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x):
        N, seq_length, _ = x.shape
        # Linear projections
        queries = self.queries(x)  # (N, seq_len, embed_size)
        keys = self.keys(x)
        values = self.values(x)
        # Split into multiple heads
        queries = queries.reshape(N, seq_length, self.heads, self.head_dim)
        keys = keys.reshape(N, seq_length, self.heads, self.head_dim)
        values = values.reshape(N, seq_length, self.heads, self.head_dim)
        # Scaled dot-product attention (scale by sqrt of the per-head dimension)
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        attention = torch.softmax(energy / (self.head_dim ** 0.5), dim=3)
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values])
        out = out.reshape(N, seq_length, self.embed_size)
        return self.fc_out(out)

# Example usage
attention = SelfAttention(embed_size=256, heads=8)
x = torch.randn(2, 10, 256)  # (batch, seq_len, embed_size)
output = attention(x)
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
The TensorFlow/Keras equivalent:
import tensorflow as tf
from tensorflow import keras

class SelfAttention(keras.layers.Layer):
    def __init__(self, embed_size, heads=8):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        self.query_dense = keras.layers.Dense(embed_size)
        self.key_dense = keras.layers.Dense(embed_size)
        self.value_dense = keras.layers.Dense(embed_size)
        self.combine_heads = keras.layers.Dense(embed_size)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.heads, self.head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, x):
        batch_size = tf.shape(x)[0]
        query = self.query_dense(x)
        key = self.key_dense(x)
        value = self.value_dense(x)
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)
        # Scaled dot-product attention
        score = tf.matmul(query, key, transpose_b=True)
        score = score / tf.math.sqrt(tf.cast(self.head_dim, tf.float32))
        attention_weights = tf.nn.softmax(score, axis=-1)
        attention_output = tf.matmul(attention_weights, value)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention_output, (batch_size, -1, self.embed_size))
        return self.combine_heads(concat_attention)

# Example usage
attention = SelfAttention(embed_size=256, heads=8)
x = tf.random.normal([2, 10, 256])
output = attention(x)
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
14.2 Transformers
Transformers use self-attention to process sequences in parallel (unlike RNNs, which process tokens one at a time). They power GPT, BERT, and most modern NLP systems.
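Self-attention by itself has no notion of token order, so Transformers add positional encodings to the embeddings before the attention layers (the "Embedding + Positional Encoding" step in the diagram below). Here is a minimal sketch of the sinusoidal encoding from the original Transformer paper; the function name and shapes are our own illustration.
import math
import torch

def sinusoidal_positional_encoding(seq_len, embed_size):
    # Even indices get sin, odd indices get cos, with geometrically growing wavelengths
    position = torch.arange(seq_len).unsqueeze(1).float()    # (seq_len, 1)
    div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size))
    pe = torch.zeros(seq_len, embed_size)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe                                                 # (seq_len, embed_size)

pe = sinusoidal_positional_encoding(seq_len=10, embed_size=256)
x = torch.randn(2, 10, 256)   # token embeddings (batch, seq_len, embed_size)
x = x + pe                    # broadcasts over the batch dimension
print(x.shape)                # torch.Size([2, 10, 256])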
14.2.1 Transformer Architecture
Input → Embedding + Positional Encoding
↓
Multi-Head Self-Attention
↓
Feed-Forward Network
↓
Output
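Putting the pieces together, the sketch below stacks the PyTorch SelfAttention layer from 14.1.2 with a feed-forward network, adding the residual connections and layer normalization that the diagram above omits. The class name and the ff_hidden size are our own choices, not a standard API.
import torch
import torch.nn as nn

class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_size=256, heads=8, ff_hidden=1024, dropout=0.1):
        super().__init__()
        self.attention = SelfAttention(embed_size, heads)  # the PyTorch layer from 14.1.2
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, ff_hidden),
            nn.ReLU(),
            nn.Linear(ff_hidden, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Attention sub-layer: residual connection followed by layer norm
        x = self.norm1(x + self.dropout(self.attention(x)))
        # Feed-forward sub-layer: residual connection followed by layer norm
        x = self.norm2(x + self.dropout(self.feed_forward(x)))
        return x

block = TransformerEncoderBlock()
x = torch.randn(2, 10, 256)
print(block(x).shape)  # torch.Size([2, 10, 256]): same shape in, same shape out
A full encoder simply stacks several of these blocks; GPT-style decoders add a causal mask so each position can only attend to earlier positions.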
14.2.2 Hands-On: Text Classification with Transformers
We'll use Hugging Face, the easiest way to work with transformers.
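For a quick first test, the pipeline API wraps tokenization, the model, and post-processing in a single call (by default it downloads a small sentiment model fine-tuned on SST-2); this is just a convenience sketch before the explicit workflow below.
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
print(classifier("This movie was absolutely fantastic! I loved every minute."))
# e.g. [{'label': 'POSITIVE', 'score': 0.9998}]
For more control over tokenization and the model head, load the tokenizer and model explicitly: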
# Install: pip install transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Load pre-trained BERT
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Note: the classification head is newly initialized, so predictions are essentially random until the model is fine-tuned (see 14.2.3)
# Example inference
text = "This movie was absolutely fantastic! I loved every minute."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.softmax(outputs.logits, dim=1)
sentiment = "Positive" if predictions[0][1] > 0.5 else "Negative"
confidence = predictions.max().item()
print(f"Text: {text}")
print(f"Sentiment: {sentiment}")
print(f"Confidence: {confidence:.2f}")
print("\n✅ Transformer model loaded and ready!")
The TensorFlow equivalent:
# Install: pip install transformers
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import tensorflow as tf
# Load pre-trained BERT
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Example inference
text = "This movie was absolutely fantastic! I loved every minute."
inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)
outputs = model(**inputs)
predictions = tf.nn.softmax(outputs.logits, axis=1)
sentiment = "Positive" if predictions[0][1] > 0.5 else "Negative"
confidence = tf.reduce_max(predictions).numpy()
print(f"Text: {text}")
print(f"Sentiment: {sentiment}")
print(f"Confidence: {confidence:.2f}")
print("\n✅ Transformer model loaded and ready!")
14.2.3 Fine-tuning Transformers
from torch.utils.data import DataLoader, TensorDataset
# Prepare data (example)
texts = ["I love this!", "Terrible experience", "Amazing product"]
labels = [1, 0, 1] # 1=positive, 0=negative
# Tokenize
encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
# Create dataset
dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'], torch.tensor(labels))
dataloader = DataLoader(dataset, batch_size=2)
# Training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Training loop (abbreviated)
model.train()
for batch in dataloader:
    input_ids, attention_mask, labels = batch
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Loss: {loss.item():.4f}")
print("✅ Fine-tuning complete!")
The same fine-tuning with TensorFlow/Keras:
# Prepare data
texts = ["I love this!", "Terrible experience", "Amazing product"]
labels = [1, 0, 1]
# Tokenize
encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="tf")
# Compile model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # the model outputs raw logits
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
# Train
history = model.fit(
dict(encoded),  # convert the tokenizer's BatchEncoding to a plain dict for Keras
tf.constant(labels),
epochs=3,
batch_size=2
)
print("✅ Fine-tuning complete!")
14.3 Vision Transformers (ViT)
Transformers for images! Divide image into patches and treat them like “words”.
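To make the patch idea concrete, the sketch below chops a 224x224 image into 16x16 patches with plain tensor operations (the numbers match the 'patch16-224' checkpoint used next; variable names are our own).
import torch

# (224 / 16) * (224 / 16) = 14 * 14 = 196 patches, each flattened to 16*16*3 = 768 values
image = torch.randn(1, 3, 224, 224)                    # (batch, channels, height, width)
patches = image.unfold(2, 16, 16).unfold(3, 16, 16)    # (1, 3, 14, 14, 16, 16)
patches = patches.permute(0, 2, 3, 1, 4, 5).reshape(1, 196, 16 * 16 * 3)
print(patches.shape)  # torch.Size([1, 196, 768]): a "sentence" of 196 patch tokens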
from transformers import ViTForImageClassification, ViTFeatureExtractor
from PIL import Image
import requests
# Load pre-trained ViT
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
# Example (using a sample image URL)
# url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# image = Image.open(requests.get(url, stream=True).raw)
# inputs = feature_extractor(images=image, return_tensors="pt")
# outputs = model(**inputs)
# predicted_class = outputs.logits.argmax(-1).item()
print("✅ Vision Transformer loaded!")
print("ViT treats images as sequences of patches")
The TensorFlow version:
from transformers import TFViTForImageClassification, ViTFeatureExtractor
# Load pre-trained ViT
model = TFViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
# Example usage (similar to PyTorch)
print("✅ Vision Transformer loaded!")
print("ViT divides images into 16x16 patches")
14.4 Autoencoders
Autoencoders learn compressed representations of data. Useful for:
- Dimensionality reduction
- Denoising
- Anomaly detection (sketched below)
- Generative modeling
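As a taste of the anomaly-detection use case, the sketch below scores samples by reconstruction error: an autoencoder trained only on normal data reconstructs normal inputs well and unusual inputs poorly. The tiny fully-connected model and the threshold rule are illustrative only (the model would need to be trained first, as in 14.4.2).
import torch
import torch.nn as nn

# Illustrative autoencoder on flattened 28x28 images (untrained here)
autoencoder = nn.Sequential(
    nn.Linear(784, 64), nn.ReLU(),
    nn.Linear(64, 784), nn.Sigmoid(),
)

def anomaly_scores(model, x):
    # Higher reconstruction error = less like the data the model was trained on
    with torch.no_grad():
        recon = model(x)
    return ((recon - x) ** 2).mean(dim=1)

x = torch.rand(5, 784)                               # pretend batch of flattened images
scores = anomaly_scores(autoencoder, x)
flagged = scores > scores.mean() + 2 * scores.std()  # simple threshold rule
print(scores)
print(flagged)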
14.4.1 Building a Convolutional Autoencoder
import torch
import torch.nn as nn
class ConvAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1),   # 28x28 → 14x14
            nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1),  # 14x14 → 7x7
            nn.ReLU(),
            nn.Conv2d(32, 64, 7)                        # 7x7 → 1x1 (latent space)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, 7),                                         # 1x1 → 7x7
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1),  # 7x7 → 14x14
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1),   # 14x14 → 28x28
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

autoencoder = ConvAutoencoder()
print(autoencoder)

# Test
x = torch.randn(1, 1, 28, 28)
reconstructed = autoencoder(x)
print(f"\nInput shape: {x.shape}")
print(f"Reconstructed shape: {reconstructed.shape}")
The Keras equivalent:
from tensorflow import keras
# Encoder
encoder = keras.Sequential([
keras.layers.Conv2D(16, 3, strides=2, padding='same', activation='relu', input_shape=(28, 28, 1)),
keras.layers.Conv2D(32, 3, strides=2, padding='same', activation='relu'),
keras.layers.Conv2D(64, 7, padding='valid', activation='relu') # Latent space
])
# Decoder
decoder = keras.Sequential([
keras.layers.Conv2DTranspose(32, 7, padding='valid', activation='relu'),
keras.layers.Conv2DTranspose(16, 3, strides=2, padding='same', activation='relu'),
keras.layers.Conv2DTranspose(1, 3, strides=2, padding='same', activation='sigmoid')
])
# Full autoencoder
autoencoder = keras.Sequential([encoder, decoder])
autoencoder.build((None, 28, 28, 1))
autoencoder.summary()
14.4.2 Training Autoencoder on MNIST
# Load MNIST (as in Chapter 3)
# from torchvision import datasets, transforms
# Train autoencoder
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)
# Training loop:
# for epoch in range(epochs):
# for data, _ in dataloader:
# output = autoencoder(data)
# loss = criterion(output, data) # Reconstruct input!
# optimizer.zero_grad()
# loss.backward()
# optimizer.step()
print("✅ Autoencoder ready for training")
print("Goal: Reconstruct input images")
In Keras, the same setup:
autoencoder.compile(optimizer='adam', loss='mse')
# Train:
# history = autoencoder.fit(x_train, x_train, epochs=10, batch_size=128)
# Note: x_train is both input AND target!
print("✅ Autoencoder ready for training")
print("Goal: Reconstruct input images")
14.5 Generative Adversarial Networks (GANs)
GANs consist of two networks:
- Generator: Creates fake images
- Discriminator: Distinguishes real from fake
They compete: Generator tries to fool Discriminator, Discriminator tries to catch fakes.
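Formally, the two networks play a minimax game over the value function
\min_G \max_D \; \mathbb{E}_{x \sim p_{\text{data}}}\big[\log D(x)\big] + \mathbb{E}_{z \sim p_z}\big[\log\big(1 - D(G(z))\big)\big]
The discriminator pushes D(x) toward 1 for real images and D(G(z)) toward 0 for fakes; the generator does the opposite. The training loop in 14.5.2 translates this directly, using the common non-saturating trick of training the generator to make the discriminator output 1 on fake images.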
14.5.1 Simple GAN for MNIST
class Generator(nn.Module):
    def __init__(self, latent_dim=100):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 28*28),
            nn.Tanh()  # Output range [-1, 1]
        )

    def forward(self, z):
        img = self.model(z)
        return img.view(-1, 1, 28, 28)

class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 1),
            nn.Sigmoid()  # Probability [0, 1]
        )

    def forward(self, img):
        img_flat = img.view(-1, 28*28)
        return self.model(img_flat)
generator = Generator()
discriminator = Discriminator()
print("Generator:")
print(generator)
print("\nDiscriminator:")
print(discriminator)
# Test
z = torch.randn(4, 100) # Random noise
fake_images = generator(z)
print(f"\nGenerated images shape: {fake_images.shape}")
The Keras versions:
def make_generator():
    model = keras.Sequential([
        keras.layers.Dense(256, activation='relu', input_shape=(100,)),
        keras.layers.Dense(512, activation='relu'),
        keras.layers.Dense(1024, activation='relu'),
        keras.layers.Dense(28*28, activation='tanh'),
        keras.layers.Reshape((28, 28, 1))
    ])
    return model

def make_discriminator():
    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28, 1)),
        keras.layers.Dense(512),
        keras.layers.LeakyReLU(0.2),
        keras.layers.Dense(256),
        keras.layers.LeakyReLU(0.2),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

generator = make_generator()
discriminator = make_discriminator()
print("Generator:")
generator.summary()
print("\nDiscriminator:")
discriminator.summary()
14.5.2 GAN Training Loop
# Optimizers
g_optimizer = torch.optim.Adam(generator.parameters(), lr=0.0002)
d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=0.0002)
criterion = nn.BCELoss()
# Training loop (conceptual)
# for epoch in range(epochs):
# for real_images, _ in dataloader:
# batch_size = real_images.size(0)
#
# # Train Discriminator
# real_labels = torch.ones(batch_size, 1)
# fake_labels = torch.zeros(batch_size, 1)
#
# d_loss_real = criterion(discriminator(real_images), real_labels)
#
# z = torch.randn(batch_size, 100)
# fake_images = generator(z)
# d_loss_fake = criterion(discriminator(fake_images.detach()), fake_labels)
#
# d_loss = d_loss_real + d_loss_fake
# d_optimizer.zero_grad()
# d_loss.backward()
# d_optimizer.step()
#
# # Train Generator
# z = torch.randn(batch_size, 100)
# fake_images = generator(z)
# g_loss = criterion(discriminator(fake_images), real_labels) # Fool discriminator!
#
# g_optimizer.zero_grad()
# g_loss.backward()
# g_optimizer.step()
print("✅ GAN training loop ready")
print("Generator learns to create realistic images")
The TensorFlow version:
# Compile
g_optimizer = keras.optimizers.Adam(0.0002)
d_optimizer = keras.optimizers.Adam(0.0002)
loss_fn = keras.losses.BinaryCrossentropy()
# Training loop (conceptual)
# for epoch in range(epochs):
# for real_images in dataset:
# # Train discriminator
# z = tf.random.normal([batch_size, 100])
# fake_images = generator(z)
#
# with tf.GradientTape() as tape:
# real_output = discriminator(real_images)
# fake_output = discriminator(fake_images)
# d_loss = loss_fn(tf.ones_like(real_output), real_output) + \
# loss_fn(tf.zeros_like(fake_output), fake_output)
#
# gradients = tape.gradient(d_loss, discriminator.trainable_variables)
# d_optimizer.apply_gradients(zip(gradients, discriminator.trainable_variables))
#
# # Train generator
# with tf.GradientTape() as tape:
# z = tf.random.normal([batch_size, 100])
# fake_images = generator(z)
# fake_output = discriminator(fake_images)
# g_loss = loss_fn(tf.ones_like(fake_output), fake_output)
#
# gradients = tape.gradient(g_loss, generator.trainable_variables)
# g_optimizer.apply_gradients(zip(gradients, generator.trainable_variables))
print("✅ GAN training loop ready")
print("Generator learns to create realistic images")
14.6 Object Detection (Brief Overview)
Object detection locates and classifies multiple objects in an image.
Popular architectures:
- YOLO (You Only Look Once): Real-time detection
- Faster R-CNN: High accuracy
- SSD (Single Shot Detector): Balance of speed and accuracy
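Running a pre-trained detector takes only a few lines. The sketch below uses torchvision's Faster R-CNN (the same model loaded in the block that follows) on a random image; weights="DEFAULT" assumes torchvision 0.13 or newer, while older versions use pretrained=True instead.
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn

model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
model.eval()

images = [torch.rand(3, 480, 640)]        # list of CxHxW tensors with values in [0, 1]
with torch.no_grad():
    predictions = model(images)           # one dict per input image

boxes = predictions[0]["boxes"]           # (num_detections, 4) as (x1, y1, x2, y2)
labels = predictions[0]["labels"]         # COCO category ids
scores = predictions[0]["scores"]         # confidences, sorted high to low
print(f"Detections above 0.8 confidence: {(scores > 0.8).sum().item()}")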
# Using torchvision's pre-trained models
from torchvision.models.detection import fasterrcnn_resnet50_fpn
model = fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()
print("✅ Faster R-CNN loaded")
print("Detects 91 object categories (COCO dataset)")
With TensorFlow:
# Using TensorFlow Hub
# import tensorflow_hub as hub
# model = hub.load("https://tfhub.dev/tensorflow/ssd_mobilenet_v2/2")
print("✅ SSD MobileNet available via TF Hub")
print("Detects 91 object categories (COCO dataset)")
14.7 Semantic Segmentation (Brief Overview)
Semantic segmentation classifies every pixel in an image.
U-Net: Popular architecture for medical imaging and segmentation.
Encoder (downsampling) → Bottleneck → Decoder (upsampling)
↓ ↑
└─────── Skip Connections ────────┘
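To show how the skip connections work in code, here is a heavily simplified U-Net sketch (two encoder levels instead of four; the class name and channel sizes are our own, not the original paper's).
import torch
import torch.nn as nn

def conv_block(in_ch, out_ch):
    # Two 3x3 convolutions, as in the original U-Net
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, 3, padding=1), nn.ReLU(),
        nn.Conv2d(out_ch, out_ch, 3, padding=1), nn.ReLU(),
    )

class TinyUNet(nn.Module):
    def __init__(self, in_channels=1, num_classes=2):
        super().__init__()
        self.enc1 = conv_block(in_channels, 16)
        self.enc2 = conv_block(16, 32)
        self.pool = nn.MaxPool2d(2)
        self.bottleneck = conv_block(32, 64)
        self.up2 = nn.ConvTranspose2d(64, 32, 2, stride=2)
        self.dec2 = conv_block(64, 32)   # 64 = 32 (upsampled) + 32 (skip)
        self.up1 = nn.ConvTranspose2d(32, 16, 2, stride=2)
        self.dec1 = conv_block(32, 16)   # 32 = 16 (upsampled) + 16 (skip)
        self.head = nn.Conv2d(16, num_classes, 1)  # per-pixel class scores

    def forward(self, x):
        e1 = self.enc1(x)              # saved for skip connection 1
        e2 = self.enc2(self.pool(e1))  # saved for skip connection 2
        b = self.bottleneck(self.pool(e2))
        d2 = self.dec2(torch.cat([self.up2(b), e2], dim=1))
        d1 = self.dec1(torch.cat([self.up1(d2), e1], dim=1))
        return self.head(d1)           # (N, num_classes, H, W)

model = TinyUNet()
x = torch.randn(1, 1, 64, 64)
print(model(x).shape)  # torch.Size([1, 2, 64, 64]): one class score per pixel
The torch.cat calls are the skip connections: decoder features are concatenated with the matching-resolution encoder features, so fine spatial detail lost during downsampling can be recovered.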
14.8 Beyond This Book
What to explore next:
- Reinforcement Learning: DeepMind's AlphaGo, game AI
- Graph Neural Networks: Social networks, molecules
- Neural Architecture Search: AutoML
- Multimodal Models: CLIP, DALL-E (text + images)
- Large Language Models: GPT, LLaMA
Resources:
- Papers: arxiv.org
- Courses: Fast.ai, Deeplearning.ai
- Hardware: tensorrigs.com
14.9 Summary
What you learned:
- ✅ Attention mechanisms and self-attention
- ✅ Transformers for text (BERT, GPT concepts)
- ✅ Vision Transformers (ViT)
- ✅ Autoencoders for compression and denoising
- ✅ GANs for generating realistic images
- ✅ Object detection and segmentation (overview)
Congratulations! You now have a solid foundation in deep learning fundamentals and modern architectures!
14.10 What’s Next?
Keep building projects, read research papers, and stay curious. The field is evolving rapidly—continue learning and experimenting!
Visit tensorrigs.com for hardware guides when you’re ready to scale up your deep learning experiments.