import math

import torch
import torch.nn as nn
class TransformerBlock(nn.Module):
    def __init__(self, hidden_dim, num_heads, ff_dim, dropout=0.1,
                 use_relative_pos=True):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.use_relative_pos = use_relative_pos

        # Multi-head attention projections
        self.q_linear = nn.Linear(hidden_dim, hidden_dim)
        self.k_linear = nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = nn.Linear(hidden_dim, hidden_dim)
        self.out_linear = nn.Linear(hidden_dim, hidden_dim)

        # Relative positional embeddings: one head_dim-sized vector per
        # relative distance. The table has 2 * hidden_dim - 1 entries, so it
        # covers sequences of up to hidden_dim tokens.
        if use_relative_pos:
            self.rel_pos_embed = nn.Parameter(
                torch.randn(2 * hidden_dim - 1, self.head_dim))

        # Layer normalization
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)

        # Feed-forward network
        self.ff = nn.Sequential(
            nn.Linear(hidden_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, hidden_dim)
        )
        self.dropout = nn.Dropout(dropout)
    def relative_position_to_absolute(self, x):
        """Convert relative-position logits of shape
        (batch, heads, seq_length, 2 * seq_length - 1) to absolute-position
        logits of shape (batch, heads, seq_length, seq_length) via the
        pad-and-reshape ("skew") trick."""
        batch_size, num_heads, seq_length, _ = x.size()
        # Pad one zero column so each row has 2 * seq_length entries
        col_pad = torch.zeros(batch_size, num_heads, seq_length, 1,
                              device=x.device)
        x = torch.cat([x, col_pad], dim=-1)
        # Flatten, then pad so the total length factors as
        # (seq_length + 1) * (2 * seq_length - 1)
        flat_x = x.view(batch_size, num_heads, seq_length * 2 * seq_length)
        flat_pad = torch.zeros(batch_size, num_heads, seq_length - 1,
                               device=x.device)
        flat_x_padded = torch.cat([flat_x, flat_pad], dim=-1)
        # Reshape and slice out the padded parts
        final_x = flat_x_padded.view(batch_size, num_heads, seq_length + 1,
                                     2 * seq_length - 1)
        final_x = final_x[:, :, :seq_length, seq_length - 1:]
        return final_x
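    # Shape walk-through of the skew above, assuming seq_length = 3: the input
    # logits are (B, H, 3, 5); padding one zero column gives (B, H, 3, 6);
    # flattening and appending seq_length - 1 = 2 zeros gives 20 elements per
    # head, which reshape to (B, H, 4, 5); keeping the first 3 rows and the
    # last 3 columns leaves absolute-position logits of shape (B, H, 3, 3).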
    def forward(self, x, mask=None):
        batch_size, seq_length, _ = x.size()

        # Multi-head attention: project and split into heads
        q = self.q_linear(x).view(batch_size, seq_length, self.num_heads,
                                  self.head_dim).transpose(1, 2)
        k = self.k_linear(x).view(batch_size, seq_length, self.num_heads,
                                  self.head_dim).transpose(1, 2)
        v = self.v_linear(x).view(batch_size, seq_length, self.num_heads,
                                  self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Add the relative positional term to the attention logits
        if self.use_relative_pos:
            rel_pos_bias = self.compute_relative_position_bias(q)
            scores = scores + rel_pos_bias

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attention_weights = torch.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        context = torch.matmul(attention_weights, v)

        # Reshape and apply output projection
        context = context.transpose(1, 2).contiguous().view(
            batch_size, seq_length, self.hidden_dim)
        out = self.out_linear(context)

        # Residual connection and layer normalization
        x = self.norm1(x + self.dropout(out))

        # Feed-forward network
        ff_out = self.ff(x)

        # Residual connection and layer normalization
        x = self.norm2(x + self.dropout(ff_out))
        return x, attention_weights
    def compute_relative_position_bias(self, q):
        """Compute the relative positional term added to the attention logits:
        the dot product of each query with the learned embedding of its
        relative distance to every key."""
        if not self.use_relative_pos:
            return 0
        seq_length = q.size(2)
        positions = torch.arange(seq_length, device=self.rel_pos_embed.device)
        relative_positions = positions.unsqueeze(1) - positions.unsqueeze(0)
        relative_positions += seq_length - 1  # Shift to non-negative indices
        rel_emb = self.rel_pos_embed[relative_positions]  # (L, L, head_dim)
        # (batch, heads, L, head_dim) x (L, L, head_dim) -> (batch, heads, L, L)
        bias = torch.einsum('bhid,ijd->bhij', q, rel_emb)
        # Scale like the dot-product scores
        return bias / math.sqrt(self.head_dim)
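
# Minimal usage sketch, assuming small illustrative hyperparameters: run the
# block on random input with a causal mask and check the output shapes.
if __name__ == "__main__":
    batch_size, seq_length, hidden_dim = 2, 16, 64
    block = TransformerBlock(hidden_dim=hidden_dim, num_heads=4, ff_dim=256)
    x = torch.randn(batch_size, seq_length, hidden_dim)
    # Causal mask: position i may only attend to positions j <= i
    causal_mask = torch.tril(torch.ones(seq_length, seq_length)).view(
        1, 1, seq_length, seq_length)
    out, attn = block(x, mask=causal_mask)
    print(out.shape)   # torch.Size([2, 16, 64])
    print(attn.shape)  # torch.Size([2, 4, 16, 16])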