import math

import torch
import torch.nn as nn
class TransformerBlock(nn.Module):
    def __init__(self, hidden_dim, num_heads, ff_dim, dropout=0.1,
                 use_relative_pos=True):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.use_relative_pos = use_relative_pos

        # Multi-head attention projections
        self.q_linear = nn.Linear(hidden_dim, hidden_dim)
        self.k_linear = nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = nn.Linear(hidden_dim, hidden_dim)
        self.out_linear = nn.Linear(hidden_dim, hidden_dim)

        # Relative positional embeddings: one head_dim-sized vector per
        # relative distance. The table has 2 * hidden_dim - 1 entries, so it
        # covers sequences of up to hidden_dim tokens.
        if use_relative_pos:
            self.rel_pos_embed = nn.Parameter(
                torch.randn(2 * hidden_dim - 1, self.head_dim))

        # Layer normalization
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)

        # Feed-forward network
        self.ff = nn.Sequential(
            nn.Linear(hidden_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, hidden_dim)
        )
        self.dropout = nn.Dropout(dropout)
    def relative_position_to_absolute(self, x):
        """Convert relative-position logits of shape
        (batch, heads, seq_length, 2 * seq_length - 1) to absolute-position
        logits of shape (batch, heads, seq_length, seq_length) via the
        pad-and-reshape ("skew") trick."""
        batch_size, num_heads, seq_length, _ = x.size()
        # Pad one zero column so each row has 2 * seq_length entries
        col_pad = torch.zeros(batch_size, num_heads, seq_length, 1,
                              device=x.device)
        x = torch.cat([x, col_pad], dim=-1)
        # Flatten, then pad so the total length factors as
        # (seq_length + 1) * (2 * seq_length - 1)
        flat_x = x.view(batch_size, num_heads, seq_length * 2 * seq_length)
        flat_pad = torch.zeros(batch_size, num_heads, seq_length - 1,
                               device=x.device)
        flat_x_padded = torch.cat([flat_x, flat_pad], dim=-1)
        # Reshape and slice out the padded parts
        final_x = flat_x_padded.view(batch_size, num_heads, seq_length + 1,
                                     2 * seq_length - 1)
        final_x = final_x[:, :, :seq_length, seq_length - 1:]
        return final_x
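    # Shape walk-through of the skew above, assuming seq_length = 3: the input
    # logits are (B, H, 3, 5); padding one zero column gives (B, H, 3, 6);
    # flattening and appending seq_length - 1 = 2 zeros gives 20 elements per
    # head, which reshape to (B, H, 4, 5); keeping the first 3 rows and the
    # last 3 columns leaves absolute-position logits of shape (B, H, 3, 3).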
    def forward(self, x, mask=None):
        batch_size, seq_length, _ = x.size()

        # Multi-head attention: project and split into heads
        q = self.q_linear(x).view(batch_size, seq_length, self.num_heads,
                                  self.head_dim).transpose(1, 2)
        k = self.k_linear(x).view(batch_size, seq_length, self.num_heads,
                                  self.head_dim).transpose(1, 2)
        v = self.v_linear(x).view(batch_size, seq_length, self.num_heads,
                                  self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Add the relative positional term to the attention logits
        if self.use_relative_pos:
            rel_pos_bias = self.compute_relative_position_bias(q)
            scores = scores + rel_pos_bias

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attention_weights = torch.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        context = torch.matmul(attention_weights, v)

        # Reshape and apply output projection
        context = context.transpose(1, 2).contiguous().view(
            batch_size, seq_length, self.hidden_dim)
        out = self.out_linear(context)

        # Residual connection and layer normalization
        x = self.norm1(x + self.dropout(out))

        # Feed-forward network
        ff_out = self.ff(x)

        # Residual connection and layer normalization
        x = self.norm2(x + self.dropout(ff_out))
        return x, attention_weights
    def compute_relative_position_bias(self, q):
        """Compute the relative positional term added to the attention logits:
        the dot product of each query with the learned embedding of its
        relative distance to every key."""
        if not self.use_relative_pos:
            return 0
        seq_length = q.size(2)
        positions = torch.arange(seq_length, device=self.rel_pos_embed.device)
        relative_positions = positions.unsqueeze(1) - positions.unsqueeze(0)
        relative_positions += seq_length - 1  # Shift to non-negative indices
        rel_emb = self.rel_pos_embed[relative_positions]  # (L, L, head_dim)
        # (batch, heads, L, head_dim) x (L, L, head_dim) -> (batch, heads, L, L)
        bias = torch.einsum('bhid,ijd->bhij', q, rel_emb)
        # Scale like the dot-product scores
        return bias / math.sqrt(self.head_dim)
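
# Minimal usage sketch, assuming small illustrative hyperparameters: run the
# block on random input with a causal mask and check the output shapes.
if __name__ == "__main__":
    batch_size, seq_length, hidden_dim = 2, 16, 64
    block = TransformerBlock(hidden_dim=hidden_dim, num_heads=4, ff_dim=256)
    x = torch.randn(batch_size, seq_length, hidden_dim)
    # Causal mask: position i may only attend to positions j <= i
    causal_mask = torch.tril(torch.ones(seq_length, seq_length)).view(
        1, 1, seq_length, seq_length)
    out, attn = block(x, mask=causal_mask)
    print(out.shape)   # torch.Size([2, 16, 64])
    print(attn.shape)  # torch.Size([2, 4, 16, 16])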