7  Recurrent Neural Networks (RNNs)

7.1 Why RNNs for Sequences?

CNNs excel at spatial data (images). RNNs excel at sequential data where order matters: text, time series, speech, video.

Problems with regular networks for sequences:

  • Fixed input size (sequences have variable length)
  • No memory of previous inputs
  • Can’t capture temporal dependencies

RNNs solve this: They have a “memory” that captures information from previous time steps.

7.2 RNN Intuition

Think of reading a sentence word-by-word. Each word’s meaning depends on previous words.

"The cat sat on the ___"

RNN processes:
"The" → hidden_state_1
"cat" + hidden_state_1 → hidden_state_2
"sat" + hidden_state_2 → hidden_state_3
"on" + hidden_state_3 → hidden_state_4
"the" + hidden_state_4 → hidden_state_5 → predict "mat"

7.3 RNN Types

  1. Vanilla RNN: Simple, but suffers from vanishing gradients
  2. LSTM (Long Short-Term Memory): Solves vanishing gradients, most popular
  3. GRU (Gated Recurrent Unit): Simpler than LSTM, similar performance
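
To make the comparison concrete, this short sketch instantiates all three layer types in PyTorch with the same (arbitrary, illustration-only) sizes and compares their parameter counts:

import torch.nn as nn

input_size, hidden_size = 128, 256
rnn_variants = {
    "Vanilla RNN": nn.RNN(input_size, hidden_size, batch_first=True),
    "LSTM": nn.LSTM(input_size, hidden_size, batch_first=True),
    "GRU": nn.GRU(input_size, hidden_size, batch_first=True),
}

for name, layer in rnn_variants.items():
    n_params = sum(p.numel() for p in layer.parameters())
    print(f"{name:12s}: {n_params:,} parameters")

# The LSTM has ~4x the parameters of the vanilla RNN and the GRU ~3x,
# because of the extra gate weights.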

7.4 LSTM Architecture

LSTM has gates that control information flow:

  • Forget gate: What to remove from memory
  • Input gate: What new information to add
  • Output gate: What to output
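
As a simplified sketch of how those gates interact in a single time step, the snippet below uses random placeholder values where a real LSTM would use learned weight matrices; only the gating arithmetic is the point here.

import torch

hidden_dim = 4
c_prev = torch.randn(hidden_dim)                 # previous cell state (the "memory")
candidate = torch.tanh(torch.randn(hidden_dim))  # proposed new content

forget_gate = torch.sigmoid(torch.randn(hidden_dim))  # 0 = erase, 1 = keep
input_gate  = torch.sigmoid(torch.randn(hidden_dim))  # how much new content to write
output_gate = torch.sigmoid(torch.randn(hidden_dim))  # how much memory to expose

c_new = forget_gate * c_prev + input_gate * candidate  # update the memory
h_new = output_gate * torch.tanh(c_new)                # hidden state / output

print("new cell state:  ", c_new)
print("new hidden state:", h_new)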

7.5 Example: Sentiment Analysis (IMDB)

We’ll classify movie reviews as positive or negative.

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Simple LSTM for sentiment analysis
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)  # Binary: positive/negative

    def forward(self, x):
        # x: [batch, seq_len]
        embedded = self.embedding(x)  # [batch, seq_len, embedding_dim]
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Use last hidden state
        out = self.fc(hidden[-1])  # [batch, 2]
        return out

vocab_size = 10000
model = SentimentLSTM(vocab_size=vocab_size, embedding_dim=128, hidden_dim=256)
print(model)
print(f"\nParameters: {sum(p.numel() for p in model.parameters()):,}")

The same model in Keras:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Simple LSTM for sentiment analysis
vocab_size = 10000
model = keras.Sequential([
    layers.Embedding(vocab_size, 128),  # variable-length input
    layers.LSTM(256, return_sequences=False),
    layers.Dense(2, activation='softmax')  # Binary: positive/negative
])

model.build(input_shape=(None, None))  # Variable sequence length
model.summary()

7.6 Loading IMDB Dataset

from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import Counter

# IMDB overview (simplified - the full pipeline would wrap the IMDB iterator in a DataLoader)
print("Loading IMDB dataset...")
print("This dataset contains 50,000 movie reviews")
print("25,000 for training, 25,000 for testing")
print("Task: Classify reviews as positive (1) or negative (0)")

# Example tokenization
tokenizer = get_tokenizer("basic_english")
text = "This movie was absolutely fantastic! I loved it."
tokens = tokenizer(text)
print(f"\nExample text: {text}")
print(f"Tokens: {tokens}")

In Keras, the IMDB dataset ships with the library, already tokenized into word indices:

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load IMDB
max_features = 10000
maxlen = 200

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

print(f"Training samples: {len(x_train)}")
print(f"Test samples: {len(x_test)}")
print(f"\nExample review (indices): {x_train[0][:20]}...")
print(f"Label: {'Positive' if y_train[0] == 1 else 'Negative'}")

# Pad sequences to same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)
print(f"\nPadded shape: {x_train.shape}")

7.7 Training the LSTM

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32 if torch.cuda.is_available() else 8

model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(f"💻 Device: {device}")
print(f"📦 Batch size: {batch_size}")
print("\n✅ Model ready for training!")
print("\nNote: Full training takes ~20 minutes on GPU")

The Keras setup is the compile step:

# Compile model
batch_size = 32 if tf.config.list_physical_devices('GPU') else 8

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print(f"💻 Device: {'GPU' if tf.config.list_physical_devices('GPU') else 'CPU'}")
print(f"📦 Batch size: {batch_size}")
print("\n✅ Model ready for training!")

# Train (abbreviated for demo)
# history = model.fit(x_train, y_train, batch_size=batch_size, epochs=3, validation_split=0.2)
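
Once that fit call has run, evaluation and prediction are one-liners (left commented out here, since training is skipped in this demo):

# loss, accuracy = model.evaluate(x_test, y_test, batch_size=batch_size)
# print(f"Test accuracy: {accuracy:.3f}")
#
# probs = model.predict(x_test[:5])   # softmax probabilities, shape (5, 2)
# print(probs.argmax(axis=1))         # predicted classes for 5 reviews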

7.8 Bidirectional LSTM

Process sequences in both directions for better context:

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 2)  # *2 for bidirectional

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Concatenate forward and backward hidden states
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(hidden)

bi_model = BiLSTM(vocab_size=vocab_size, embedding_dim=128, hidden_dim=256)
print(f"Bidirectional LSTM parameters: {sum(p.numel() for p in bi_model.parameters()):,}")

The Keras version wraps the LSTM in a Bidirectional layer:

bi_model = keras.Sequential([
    layers.Embedding(vocab_size, 128),
    layers.Bidirectional(layers.LSTM(256)),
    layers.Dense(2, activation='softmax')
])

bi_model.build(input_shape=(None, None))
print(f"Bidirectional LSTM")
bi_model.summary()

7.9 Time Series Example

RNNs also work great for time series forecasting:

# Example: Temperature prediction
class TemperatureLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)  # Predict single value

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        prediction = self.fc(lstm_out[:, -1, :])  # Use last time step
        return prediction

temp_model = TemperatureLSTM(input_size=1, hidden_size=64, num_layers=2)
print(temp_model)

# Example input: 10 time steps
example_input = torch.randn(4, 10, 1)  # [batch=4, seq_len=10, features=1]
output = temp_model(example_input)
print(f"\nInput shape: {example_input.shape}")
print(f"Output shape: {output.shape}")

The Keras equivalent stacks two LSTM layers:

# Temperature prediction model
temp_model = keras.Sequential([
    layers.LSTM(64, return_sequences=True, input_shape=(None, 1)),
    layers.LSTM(64),
    layers.Dense(1)  # Predict single value
])

temp_model.summary()

# Example input
example_input = tf.random.normal([4, 10, 1])  # [batch=4, seq_len=10, features=1]
output = temp_model(example_input)
print(f"\nInput shape: {example_input.shape}")
print(f"Output shape: {output.shape}")

7.10 Summary

  • RNNs process sequential data with memory of previous inputs
  • LSTMs solve the vanishing gradient problem with gates
  • Bidirectional LSTMs process sequences in both directions
  • Applications: sentiment analysis, time series, machine translation
  • A simple LSTM like the ones above typically reaches about 85-90% accuracy on IMDB sentiment analysis

7.11 What’s Next?

Chapter 8: Working with text data - tokenization, embeddings, and NLP pipelines!