Multi-Head Attention
# Imports
import torch
from torch import nn
import torch.nn.functional as F
import math

# Example input: (batch_size, sequence_length, embedding_dim)
x = torch.rand(128, 32, 512)
d_model = 512
n_head = 8
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        """
        d_model: dimensionality of the input vectors
        n_head: number of attention heads
        """
        super(MultiHeadAttention, self).__init__()
        # Store the hyperparameters on the instance
        self.n_head = n_head    # number of attention heads
        self.d_model = d_model  # embedding dimension
        # Linear projections for q, k, v, plus the output projection
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_combine = nn.Linear(d_model, d_model)
        # dim=-1: softmax runs along the last dimension, turning the raw
        # attention scores of each query into a probability distribution over the keys.
        self.softmax = nn.Softmax(dim=-1)
    def forward(self, q, k, v, mask=None):
        batch, time, dimension = q.shape
        # Number of feature dimensions handled by each head
        n_d = self.d_model // self.n_head
        # Project q, k, v, then reshape to (batch, n_head, time, n_d) for per-head attention
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        q = q.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        k = k.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        v = v.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        # Scaled dot-product attention scores
        score = q @ k.transpose(2, 3) / math.sqrt(n_d)
        # Masking: positions where mask == 0 get a large negative score, so softmax pushes them to ~0
        if mask is not None:
            score = score.masked_fill(mask == 0, -10000)
        # Weighted sum of the values, then merge the heads back into one vector per position
        score = self.softmax(score) @ v
        score = score.permute(0, 2, 1, 3).contiguous().view(batch, time, dimension)
        out = self.w_combine(score)
        return out
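The forward pass above is the standard scaled dot-product attention, applied independently per head:

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right) V$$

Here $d_k$ is the per-head dimension (n_d = d_model // n_head in the code), and the optional mask sets positions where mask == 0 to -10000 before the softmax, so their attention weights become effectively zero.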
attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
out = attention(x, x, x)
print(out)
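As a quick sanity check, the output keeps the input shape (128, 32, 512). The sketch below is an assumed usage example (the mask construction is illustrative, not part of the original code): a lower-triangular causal mask of shape (1, 1, seq_len, seq_len) broadcasts against the (batch, n_head, time, time) score tensor, so each position only attends to itself and earlier positions.

# Assumed usage sketch: causal (lower-triangular) mask, broadcast over (batch, n_head, time, time)
seq_len = 32
causal_mask = torch.tril(torch.ones(seq_len, seq_len)).view(1, 1, seq_len, seq_len)
masked_out = attention(x, x, x, mask=causal_mask)
print(out.shape)         # torch.Size([128, 32, 512])
print(masked_out.shape)  # torch.Size([128, 32, 512])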