7 Recurrent Neural Networks (RNNs)
7.1 Why RNNs for Sequences?
CNNs excel at spatial data (images). RNNs excel at sequential data where order matters: text, time series, speech, video.
Problems with regular networks for sequences:
- Fixed input size (sequences have variable length)
- No memory of previous inputs
- Can’t capture temporal dependencies
RNNs solve this: They have a “memory” that captures information from previous time steps.
7.2 RNN Intuition
Think of reading a sentence word-by-word. Each word’s meaning depends on previous words.
"The cat sat on the ___"
RNN processes:
"The" → hidden_state_1
"cat" + hidden_state_1 → hidden_state_2
"sat" + hidden_state_2 → hidden_state_3
"on" + hidden_state_3 → hidden_state_4
"the" + hidden_state_4 → hidden_state_5 → predict "mat"
7.3 RNN Types
1. Vanilla RNN: Simple, but suffers from vanishing gradients
2. LSTM (Long Short-Term Memory): Solves vanishing gradients, most popular
3. GRU (Gated Recurrent Unit): Simpler than LSTM, similar performance
All three are available as drop-in layers in PyTorch and Keras; the quick comparison below shows how their parameter counts differ.
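A minimal sketch of that comparison (the 128/256 sizes are arbitrary):

import torch.nn as nn

# Same input and hidden sizes for each recurrent layer type
for layer in (nn.RNN(128, 256, batch_first=True),
              nn.LSTM(128, 256, batch_first=True),
              nn.GRU(128, 256, batch_first=True)):
    n_params = sum(p.numel() for p in layer.parameters())
    print(f"{layer.__class__.__name__}: {n_params:,} parameters")

The LSTM has roughly four times the parameters of the vanilla RNN (one weight set per gate plus the candidate memory), and the GRU roughly three times.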
7.4 LSTM Architecture
LSTM has gates that control information flow (made concrete in the sketch after this list):
- Forget gate: What to remove from memory
- Input gate: What new information to add
- Output gate: What to output
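A single LSTM step written out by hand (sizes are arbitrary and biases are omitted for brevity; real code uses nn.LSTM, which fuses all of these operations):

import torch

torch.manual_seed(0)
input_size, hidden_size = 4, 3
x_t = torch.randn(1, input_size)       # current input
h_prev = torch.zeros(1, hidden_size)   # previous hidden state
c_prev = torch.zeros(1, hidden_size)   # previous cell state (the "memory")

# One weight matrix per gate, acting on [x_t, h_prev] concatenated
W_f, W_i, W_o, W_c = (torch.randn(input_size + hidden_size, hidden_size) for _ in range(4))

z = torch.cat([x_t, h_prev], dim=1)
f_t = torch.sigmoid(z @ W_f)           # forget gate: what to remove from memory
i_t = torch.sigmoid(z @ W_i)           # input gate: what new information to add
o_t = torch.sigmoid(z @ W_o)           # output gate: what to output
c_tilde = torch.tanh(z @ W_c)          # candidate memory content

c_t = f_t * c_prev + i_t * c_tilde     # updated memory: keep part of the old, add part of the new
h_t = o_t * torch.tanh(c_t)            # updated hidden state (the cell's output)
print(h_t.shape, c_t.shape)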
7.5 Example: Sentiment Analysis (IMDB)
We’ll classify movie reviews as positive or negative.
In PyTorch:

import torch
import torch.nn as nn

# Simple LSTM for sentiment analysis
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)  # Binary: positive/negative

    def forward(self, x):
        # x: [batch, seq_len]
        embedded = self.embedding(x)  # [batch, seq_len, embedding_dim]
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Use last hidden state
        out = self.fc(hidden[-1])  # [batch, 2]
        return out

vocab_size = 10000
model = SentimentLSTM(vocab_size=vocab_size, embedding_dim=128, hidden_dim=256)
print(model)
print(f"\nParameters: {sum(p.numel() for p in model.parameters()):,}")

The same model in Keras:
from tensorflow import keras
from tensorflow.keras import layers

# Simple LSTM for sentiment analysis
vocab_size = 10000
model = keras.Sequential([
    layers.Embedding(vocab_size, 128),         # Variable-length sequences of word indices
    layers.LSTM(256, return_sequences=False),  # Keep only the final hidden state
    layers.Dense(2, activation='softmax')      # Binary: positive/negative
])
model.build(input_shape=(None, None))  # Variable sequence length
model.summary()

7.6 Loading IMDB Dataset
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import Counter
# Load IMDB (simplified - normally you'd use DataLoader)
print("Loading IMDB dataset...")
print("This dataset contains 50,000 movie reviews")
print("25,000 for training, 25,000 for testing")
print("Task: Classify reviews as positive (1) or negative (0)")
# Example tokenization
tokenizer = get_tokenizer("basic_english")
text = "This movie was absolutely fantastic! I loved it."
tokens = tokenizer(text)
print(f"\nExample text: {text}")
print(f"Tokens: {tokens}")from tensorflow.keras.datasets import imdb
Keras ships a pre-tokenized version of IMDB, where each review is a list of word indices:

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Load IMDB
max_features = 10000
maxlen = 200
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(f"Training samples: {len(x_train)}")
print(f"Test samples: {len(x_test)}")
print(f"\nExample review (indices): {x_train[0][:20]}...")
print(f"Label: {'Positive' if y_train[0] == 1 else 'Negative'}")
# Pad sequences to same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)
print(f"\nPadded shape: {x_train.shape}")7.7 Training the LSTM
# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32 if torch.cuda.is_available() else 8
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(f"💻 Device: {device}")
print(f"📦 Batch size: {batch_size}")
print("\n✅ Model ready for training!")
print("\nNote: Full training takes ~20 minutes on GPU")# Compile model
# Compile model (Keras)
import tensorflow as tf

batch_size = 32 if tf.config.list_physical_devices('GPU') else 8
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
print(f"💻 Device: {'GPU' if tf.config.list_physical_devices('GPU') else 'CPU'}")
print(f"📦 Batch size: {batch_size}")
print("\n✅ Model ready for training!")
# Train (abbreviated for demo)
# history = model.fit(x_train, y_train, batch_size=batch_size, epochs=3, validation_split=0.2)

7.8 Bidirectional LSTM
Process sequences in both directions for better context:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 2)  # *2 for bidirectional

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Concatenate the final forward and backward hidden states
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(hidden)

bi_model = BiLSTM(vocab_size=vocab_size, embedding_dim=128, hidden_dim=256)
print(f"Bidirectional LSTM parameters: {sum(p.numel() for p in bi_model.parameters()):,}")

The Keras equivalent:

bi_model = keras.Sequential([
    layers.Embedding(vocab_size, 128),
    layers.Bidirectional(layers.LSTM(256)),
    layers.Dense(2, activation='softmax')
])
bi_model.build(input_shape=(None, None))
print("Bidirectional LSTM")
bi_model.summary()

7.9 Time Series Example
RNNs also work great for time series forecasting:
# Example: Temperature prediction
class TemperatureLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)  # Predict single value

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        prediction = self.fc(lstm_out[:, -1, :])  # Use last time step
        return prediction

temp_model = TemperatureLSTM(input_size=1, hidden_size=64, num_layers=2)
print(temp_model)

# Example input: 10 time steps
example_input = torch.randn(4, 10, 1)  # [batch=4, seq_len=10, features=1]
output = temp_model(example_input)
print(f"\nInput shape: {example_input.shape}")
print(f"Output shape: {output.shape}")

# Temperature prediction model (Keras)
temp_model = keras.Sequential([
    layers.Input(shape=(None, 1)),           # Variable-length sequences with 1 feature
    layers.LSTM(64, return_sequences=True),
    layers.LSTM(64),
    layers.Dense(1)                          # Predict single value
])
temp_model.summary()

# Example input
example_input = tf.random.normal([4, 10, 1])  # [batch=4, seq_len=10, features=1]
output = temp_model(example_input)
print(f"\nInput shape: {example_input.shape}")
print(f"Output shape: {output.shape}")
7.10 Summary
- RNNs process sequential data with memory of previous inputs
- LSTM solves vanishing gradient problem with gates
- Bidirectional LSTMs process sequences in both directions
- Applications: sentiment analysis, time series, machine translation
- Expect 85-90% accuracy on IMDB sentiment analysis
7.11 What’s Next?
Chapter 8: Working with text data - tokenization, embeddings, and NLP pipelines!