This lecture will guide you through the process of implementing a simple Transformer model from scratch using PyTorch. The Transformer architecture, introduced in "Attention Is All You Need" (Vaswani et al., 2017), is the backbone of modern NLP and AI systems. This hands-on session will help you understand how to build, train, and test a Transformer model for sequence-to-sequence tasks.
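A Transformer consists of encoder and decoder layers, each composed of multi-head attention, a position-wise feedforward network, and residual connections followed by layer normalization.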
The encoder processes the input sequence, while the decoder generates an output sequence using the encoder’s hidden representations.
We’ll build a basic Transformer model step by step.
Before starting, install the required Python libraries:
pip install torch torchtext numpy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
Since Transformers do not have a built-in sequence order mechanism, we use Positional Encoding to add sequence information.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Even feature columns of the table hold ``sin(pos * div_term)`` and odd
    columns hold the matching ``cos``. The table is registered as a buffer
    (not a parameter), so it is not returned by ``parameters()``.

    Args:
        d_model: Size of the feature (embedding) dimension. Both even and
            odd values are supported.
        max_len: Number of positions to precompute. Inputs to ``forward``
            are expected to be no longer than this.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even column,
        # so the cosine half must be trimmed to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Stored with a leading singleton dimension so it broadcasts over
        # the batch dimension in forward().
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor laid out as (batch, seq_len, d_model).

        Returns:
            Tensor of the same shape as ``x``.
        """
        return x + self.pe[:, :x.size(1)]
# The PositionalEncoding module above creates a sine and cosine wave-based encoding to provide word order information.


def scaled_dot_product_attention(Q, K, V, mask=None):
    """Compute ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    Args:
        Q: Query tensor; the size of its last dimension is used as ``d_k``.
        K: Key tensor; its last two dimensions are transposed before the
            product with ``Q``.
        V: Value tensor, combined using the attention weights.
        mask: Optional tensor broadcastable to the score tensor. Positions
            where ``mask == 0`` are excluded from attention.

    Returns:
        The attention-weighted combination of ``V``.
    """
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(d_k)
    if mask is not None:
        # Fill with the most negative value the score dtype can represent.
        # A hard-coded -1e9 does not fit in float16 and makes masked_fill
        # fail for half-precision inputs.
        fill_value = torch.finfo(scores.dtype).min
        scores = scores.masked_fill(mask == 0, fill_value)
    attention_weights = F.softmax(scores, dim=-1)
    return torch.matmul(attention_weights, V)
# The function above implements scaled dot-product attention: it takes the dot product of Query (Q) and Key (K), scales the scores, applies softmax, and uses the resulting weights to combine the Values (V).


class MultiHeadAttention(nn.Module):
    """Project Q, K and V, attend separately per head, and merge the heads.

    Args:
        d_model: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        # Projections are created in q, k, v, o order.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        per_head = projected.view(
            projected.size(0), -1, self.num_heads, self.d_k
        )
        return per_head.transpose(1, 2)

    def forward(self, Q, K, V, mask=None):
        """Run multi-head attention.

        Args:
            Q, K, V: Input tensors whose last dimension is ``d_model``.
            mask: Optional mask; a head dimension is inserted at index 1
                before it is handed to the attention function.
                NOTE(review): presumably (batch, seq_q, seq_k) - confirm
                against callers.

        Returns:
            Tensor with last dimension ``num_heads * d_k``.
        """
        batch_size = Q.size(0)
        queries = self._split_heads(self.W_q(Q))
        keys = self._split_heads(self.W_k(K))
        values = self._split_heads(self.W_v(V))

        head_mask = None if mask is None else mask.unsqueeze(1)
        context = scaled_dot_product_attention(queries, keys, values, head_mask)

        # Move heads back next to the feature axis and flatten them together.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.num_heads * self.d_k)
        return self.W_o(merged)
# The MultiHeadAttention module above runs multiple attention layers in parallel to capture different aspects of input sequences.


class FeedForward(nn.Module):
    """Two-layer network: Linear(d_model, d_ff) -> ReLU -> Linear(d_ff, d_model).

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the expand / activate / project sequence to ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)
# The fully connected FeedForward network above processes each word representation independently.


class TransformerEncoderLayer(nn.Module):
    """One encoder block: self-attention then feedforward, each with a
    residual connection followed by layer normalization.

    Args:
        d_model: Feature size of the block's input and output.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feedforward sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Apply the block to ``x``; ``mask`` is forwarded to the attention."""
        # Self-attention: x serves as query, key and value.
        attended = self.norm1(x + self.attention(x, x, x, mask))
        return self.norm2(attended + self.ffn(attended))
# The TransformerEncoderLayer above combines multi-head attention, feedforward networks, and layer normalization


class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder layers,
    followed by a linear projection to vocabulary size.

    Args:
        vocab_size: Number of embedding rows and size of the output layer.
        d_model: Embedding / hidden feature size.
        num_heads: Attention heads per encoder layer.
        d_ff: Feedforward hidden size per encoder layer.
        num_layers: Number of stacked encoder layers.
        max_seq_length: Number of positions the positional encoding
            precomputes.
    """

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, max_seq_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        stack = []
        for _ in range(num_layers):
            stack.append(TransformerEncoderLayer(d_model, num_heads, d_ff))
        self.encoder_layers = nn.ModuleList(stack)

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Map token ids ``x`` to per-position scores over the vocabulary."""
        # Token ids -> embeddings, then add position information.
        hidden = self.positional_encoding(self.embedding(x))

        # Each encoder layer receives the same mask.
        for block in self.encoder_layers:
            hidden = block(hidden, mask)

        # Project every position to vocab_size scores.
        return self.fc_out(hidden)


# After defining the model, training involves:
# Example of how to use the model
def main():
    """Build the encoder and run ten optimisation steps on random data.

    Each step draws a fresh random batch of inputs and targets, so the
    printed loss only shows that the training loop runs end to end.
    """
    hyperparams = {
        "vocab_size": 10000,     # Size of your vocabulary
        "d_model": 512,          # Embedding dimension
        "num_heads": 8,          # Number of attention heads
        "d_ff": 2048,            # Feedforward network dimension
        "num_layers": 6,         # Number of transformer layers
        "max_seq_length": 100,   # Maximum sequence length
    }
    vocab_size = hyperparams["vocab_size"]
    batch_shape = (32, hyperparams["max_seq_length"])  # Batch of 32 sequences

    model = TransformerEncoder(**hyperparams)

    # Dummy training setup
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)

    # Simulate training
    for epoch in range(10):
        # Inputs are drawn before targets.
        input_data = torch.randint(0, vocab_size, batch_shape)
        target_data = torch.randint(0, vocab_size, batch_shape)

        optimizer.zero_grad()
        logits = model(input_data)

        # Flatten batch and sequence axes for the loss calculation.
        flat_logits = logits.view(-1, vocab_size)
        flat_targets = target_data.view(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()