Encoder
# Import packages
import torch
from torch import nn
import torch.nn.functional as F
import math
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, dropout=0.1):
        """
        d_model: dimensionality of the input features
        hidden: dimensionality of the hidden layer
        dropout: dropout probability
        """
        super(PositionwiseFeedForward, self).__init__()
        # Define the two linear layers and the dropout layer
        self.fc1 = nn.Linear(d_model, hidden)
        self.fc2 = nn.Linear(hidden, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x
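As a quick sanity check (a minimal sketch with made-up sizes, not part of the original code), the feed-forward block keeps the last dimension at d_model, so it can be applied position-wise to any (batch, seq_len, d_model) tensor:

# Hypothetical usage sketch; the sizes are illustrative only.
ffn = PositionwiseFeedForward(d_model=512, hidden=2048, dropout=0.1)
x = torch.randn(2, 10, 512)   # (batch, seq_len, d_model)
print(ffn(x).shape)           # torch.Size([2, 10, 512]) -- same shape as the input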
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        """
        d_model: dimensionality of the input vectors
        n_head: number of attention heads
        """
        super(MultiHeadAttention, self).__init__()
        # Store the hyper-parameters on the instance
        self.n_head = n_head      # number of attention heads
        self.d_model = d_model    # dimensionality of the model
        # Linear projections for q, k, v, plus the output combination
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_combine = nn.Linear(d_model, d_model)
        # dim=-1 means the softmax is taken along the last dimension,
        # turning the raw attention scores over the key positions into a probability distribution.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        batch, time, dimension = q.shape
        # Number of feature dimensions handled by each head
        n_d = self.d_model // self.n_head
        # Project q, k, v, then reshape to (batch, n_head, time, n_d) for per-head attention
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        q = q.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        k = k.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        v = v.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        # Scaled dot-product attention scores
        score = q @ k.transpose(2, 3) / math.sqrt(n_d)
        # Masking: positions where mask == 0 receive a large negative score
        if mask is not None:
            score = score.masked_fill(mask == 0, -10000)
        # Weighted sum of the values
        score = self.softmax(score) @ v
        score = score.permute(0, 2, 1, 3).contiguous().view(batch, time, dimension)
        out = self.w_combine(score)
        return out
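A minimal usage sketch (hypothetical sizes) for the attention module; the optional mask only needs to be broadcastable to the per-head score tensor of shape (batch, n_head, time, time):

# Hypothetical usage sketch: 8 heads over a 512-dimensional model.
attn = MultiHeadAttention(d_model=512, n_head=8)
x = torch.randn(2, 10, 512)          # (batch, time, d_model)
pad_mask = torch.ones(2, 1, 1, 10)   # broadcasts over heads and query positions; 0 would mean "masked"
out = attn(x, x, x, mask=pad_mask)
print(out.shape)                     # torch.Size([2, 10, 512])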
class LayerNormal(nn.Module):
    def __init__(self, d_model, eps=1e-12):
        """
        d_model: feature dimensionality
        eps: small constant for numerical stability
        """
        super(LayerNormal, self).__init__()
        # Tensors wrapped in nn.Parameter are optimized automatically as the model trains.
        # gamma is initialized to ones, beta to zeros.
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        # Mean over the last dimension; keepdim=True keeps the number of dimensions unchanged
        mean = x.mean(-1, keepdim=True)
        # Variance over the last dimension; unbiased=False gives the population variance
        var = x.var(-1, unbiased=False, keepdim=True)
        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta
        return out
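Since gamma starts at ones and beta at zeros, this layer should behave like PyTorch's built-in nn.LayerNorm when given the same eps; a small verification sketch (not from the original code):

# Sanity-check sketch: compare against torch.nn.LayerNorm with matching eps.
x = torch.randn(2, 10, 512)
ours = LayerNormal(512)(x)
ref = nn.LayerNorm(512, eps=1e-12)(x)
print(torch.allclose(ours, ref, atol=1e-6))   # expected to print True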
class EncoderLayer(nn.Module):
    """
    This defines a single EncoderLayer.
    The full Encoder is a stack of several EncoderLayers.
    """
    def __init__(self, d_model, ffn_hidden, n_head, dropout=0.1):
        # Initialize and define the sub-modules
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, n_head)
        self.norm1 = LayerNormal(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, dropout=dropout)
        self.norm2 = LayerNormal(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Self-attention sub-layer with residual connection and layer normalization
        _x = x
        x = self.attention(x, x, x, mask)
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        # Feed-forward sub-layer with residual connection and layer normalization
        _x = x
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        return x
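A single layer preserves the input shape, so layers can be stacked freely; a hypothetical sketch:

# Hypothetical usage sketch for one encoder layer (sizes are made up).
layer = EncoderLayer(d_model=512, ffn_hidden=2048, n_head=8, dropout=0.1)
x = torch.randn(2, 10, 512)
print(layer(x).shape)   # torch.Size([2, 10, 512])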
# First, define the token embedding.
"""
Converts vocabulary indices of the input into embeddings of the specified dimension.
"""
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, d_model):
        """
        Initialize the TokenEmbedding class.
        Args:
            vocab_size (int): size of the vocabulary.
            d_model (int): dimensionality of the embedding.
        Note:
            The token with index 1 is treated as the padding token, and its embedding
            vector is initialized to all zeros. If you do not want this behaviour,
            set padding_idx to a different value, or to None.
        """
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=1)
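Because padding_idx=1, the embedding row for token index 1 is frozen at zero; a small illustrative sketch (vocabulary size and ids are made up):

# Illustrative sketch: index 1 is the padding token, so its embedding stays all zeros.
emb = TokenEmbedding(vocab_size=100, d_model=8)
ids = torch.tensor([[4, 1, 7]])
print(emb(ids)[0, 1])   # expected: a zero vector of length 8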
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len, device):
        """
        Build the positional-encoding matrix.
        """
        super(PositionalEmbedding, self).__init__()
        # Start from a matrix of zeros
        self.encoding = torch.zeros(max_len, d_model, device=device)
        # Positional encodings are fixed, so they do not need gradient updates
        self.encoding.requires_grad = False
        # pos: position indices
        pos = torch.arange(0, max_len)
        pos = pos.to(device)
        # Cast to float and add a dimension so broadcasting aligns positions with frequencies
        pos = pos.float().unsqueeze(dim=1)
        # Compute the frequencies from the sinusoidal formula
        frequencies_indices = torch.arange(0, d_model, step=2, device=device).float()
        frequencies = 1.0 / torch.pow(10000.0, frequencies_indices / d_model).unsqueeze(dim=0)
        self.encoding[:, 0::2] = torch.sin(pos * frequencies)
        self.encoding[:, 1::2] = torch.cos(pos * frequencies)

    def forward(self, x):
        # x holds token indices of shape (batch_size, seq_len); return the first seq_len rows of the table
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]
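This builds the standard sinusoidal table, PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)); as a quick check sketch (sizes made up), position 0 should alternate 0 and 1:

# Quick check sketch: row 0 is sin(0)=0 in even columns and cos(0)=1 in odd columns.
pe = PositionalEmbedding(d_model=8, max_len=16, device=torch.device("cpu"))
print(pe.encoding[0])                                   # expected: tensor([0., 1., 0., 1., 0., 1., 0., 1.])
print(pe(torch.zeros(2, 5, dtype=torch.long)).shape)    # torch.Size([5, 8])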
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super(TransformerEmbedding, self).__init__()
        # Token embedding plus fixed positional encoding, followed by dropout
        self.tok_emb = TokenEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.pos_emb = PositionalEmbedding(d_model=d_model, max_len=max_len, device=device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        return self.drop_out(tok_emb + pos_emb)
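A hypothetical sketch of the combined embedding: token ids go in, dropout(token embedding + positional encoding) comes out, with the positional rows broadcast across the batch:

# Hypothetical usage sketch; all sizes are illustrative.
emb = TransformerEmbedding(vocab_size=100, d_model=512, max_len=64, drop_prob=0.1, device=torch.device("cpu"))
ids = torch.randint(2, 100, (2, 10))   # (batch, seq_len) token indices
print(emb(ids).shape)                  # torch.Size([2, 10, 512])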
class Encoder(nn.Module):
    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layer, device, dropout=0.1):
        """
        enc_voc_size: size of the vocabulary
        max_len: maximum length of the input sequence
        d_model: dimensionality of the input features
        ffn_hidden: number of hidden units in the feed-forward network
        n_head: number of attention heads
        n_layer: number of EncoderLayers in the Encoder
        device: device to run on
        dropout: dropout probability
        """
        super(Encoder, self).__init__()
        self.embedding = TransformerEmbedding(vocab_size=enc_voc_size, d_model=d_model, max_len=max_len, drop_prob=dropout, device=device)
        # Stack of encoder layers
        self.layers = nn.ModuleList(
            [
                EncoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, dropout=dropout)
                for _ in range(n_layer)
            ]
        )

    def forward(self, x, s_mask):
        # Map token indices to embeddings, then pass through each encoder layer
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, s_mask)
        return x
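Finally, a hypothetical end-to-end sketch of the Encoder with a padding mask; the mask is shaped (batch, 1, 1, seq_len) so it broadcasts against the per-head scores of shape (batch, n_head, time, time):

# End-to-end sketch with made-up sizes; not part of the original notebook.
device = torch.device("cpu")
enc = Encoder(enc_voc_size=100, max_len=64, d_model=512, ffn_hidden=2048,
              n_head=8, n_layer=6, device=device, dropout=0.1)
src = torch.randint(2, 100, (2, 10))              # (batch, seq_len) token ids; index 1 is reserved for padding
src_mask = (src != 1).unsqueeze(1).unsqueeze(2)   # (batch, 1, 1, seq_len); 0 marks padded positions
print(enc(src, src_mask).shape)                   # torch.Size([2, 10, 512])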