Transformer
The Transformer model relies entirely on attention mechanisms and is an instance of the encoder-decoder architecture. Its most important component is the self-attention layer, which assigns an attention weight to every pair of tokens (a minimal sketch of this operation follows the architecture analysis below). In the Transformer for vision, each token represents an image patch. The overall architecture of the Transformer is shown below.
As shown in the figure, let us analyze this architecture:
- For the encoder
The encoder is a stack of n identical blocks. Each block has two sublayers (shown in blue in the figure):
- Multi-head self-attention pooling: when computing the encoder's self-attention, the queries, keys, and values all come from the output of the previous encoder layer.
- Positionwise feed-forward network: a two-layer MLP.
Each sublayer is wrapped in a residual connection (the connections are shown in white), and layer normalization is applied right after the residual addition.
Thanks to the residual connections, for every position of the input sequence the Transformer encoder outputs a vector of the same shape as the corresponding input vector.
- For the decoder
The decoder is likewise a stack of n identical blocks with residual connections and layer normalization. Besides the two sublayers above, each block has a third sublayer, the encoder-decoder attention layer (labeled "multi-head attention" in the figure): its queries come from the output of the previous decoder layer, while its keys and values come from the output of the entire encoder.
In the first sublayer of the decoder (masked multi-head attention), the queries, keys, and values all come from the output of the previous decoder layer.
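Every attention sublayer above is built on (multi-head) scaled dot-product attention. Here is a minimal, self-contained sketch of that core operation; the tensor sizes and the helper name scaled_dot_product_attention are made up for illustration and are not part of the d2l code used later.
import math
import torch
from torch.nn import functional as F

def scaled_dot_product_attention(Q, K, V):
    d = Q.shape[-1]
    # One score per (query, key) pair, scaled by sqrt(d)
    scores = Q @ K.transpose(-2, -1) / math.sqrt(d)
    # Attention weights sum to 1 over the keys for each query
    weights = F.softmax(scores, dim=-1)
    return weights @ V

Q = torch.randn(2, 4, 8)      # (batch, no. of queries, d)
K = V = torch.randn(2, 6, 8)  # (batch, no. of key-value pairs, d)
print(scaled_dot_product_attention(Q, K, V).shape)  # torch.Size([2, 4, 8])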
Below we introduce the layers that are new here:
- Positionwise Feed-Forward Networks
This layer transforms every position of the sequence with the same MLP, which is why it is called positionwise.
- Input: X of shape (batch size, number of time steps or sequence length, number of hidden units or feature dimension) is transformed by a two-layer MLP into an output tensor of shape (batch size, number of time steps, ffn_num_outputs).
class PositionWiseFFN(nn.Module):
    """Positionwise feed-forward network"""
    def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs,
                 **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)

    def forward(self, X):
        return self.dense2(self.relu(self.dense1(X)))
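A quick shape check (a sketch; the sizes are made up, and it assumes torch has been imported):
ffn = PositionWiseFFN(4, 4, 8)
# The last dimension changes from ffn_num_input=4 to ffn_num_outputs=8
print(ffn(torch.ones((2, 3, 4))).shape)  # torch.Size([2, 3, 8])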
- Residual connection & layer normalization
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization"""
    def __init__(self, normalized_shape, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(normalized_shape)  # layer normalization

    def forward(self, X, Y):
        return self.ln(self.dropout(Y) + X)
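The residual connection requires X and Y to have the same shape, and the output keeps that shape. A quick check with made-up sizes:
add_norm = AddNorm([3, 4], 0.5)
add_norm.eval()  # disable dropout for a deterministic check
print(add_norm(torch.ones((2, 3, 4)), torch.ones((2, 3, 4))).shape)  # torch.Size([2, 3, 4])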
Encoder
To build the encoder, we first implement the encoder "block". It contains two sublayers, a multi-head attention layer and a positionwise feed-forward network, each followed by a residual connection and layer normalization.
class EncoderBlock(nn.Module):
    """Transformer encoder block"""
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, use_bias=False, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = d2l.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout,
            use_bias)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(
            ffn_num_input, ffn_num_hiddens, num_hiddens)
        self.addnorm2 = AddNorm(norm_shape, dropout)

    def forward(self, X, valid_lens):
        Y = self.addnorm1(X, self.attention(X, X, X, valid_lens))
        return self.addnorm2(Y, self.ffn(Y))
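As noted earlier, the encoder block does not change the shape of its input. A quick check with made-up sizes (it uses the same d2l.MultiHeadAttention as the class above):
X = torch.ones((2, 100, 24))
valid_lens = torch.tensor([3, 2])
encoder_blk = EncoderBlock(24, 24, 24, 24, [100, 24], 24, 48, 8, 0.5)
encoder_blk.eval()
print(encoder_blk(X, valid_lens).shape)  # torch.Size([2, 100, 24])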
In the Transformer encoder implemented below, num_layers instances of the EncoderBlock class are stacked.
class TransformerEncoder(d2l.Encoder):
    """Transformer encoder"""
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, use_bias=False, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module("block" + str(i),
                EncoderBlock(key_size, query_size, value_size, num_hiddens,
                             norm_shape, ffn_num_input, ffn_num_hiddens,
                             num_heads, dropout, use_bias))

    def forward(self, X, valid_lens, *args):
        # Since the positional encoding values are between -1 and 1, the
        # embedding values are scaled by the square root of the embedding
        # dimension before being added to the positional encoding.
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))  # embedding + positional encoding
        self.attention_weights = [None] * len(self.blks)
        for i, blk in enumerate(self.blks):
            X = blk(X, valid_lens)
            self.attention_weights[i] = blk.attention.attention.attention_weights
        return X
The output of the Transformer encoder has shape (batch size, number of time steps, num_hiddens).
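A quick check of that shape (a sketch; the vocabulary size 200 and the other sizes are made up, and valid_lens is reused from the encoder-block check above):
encoder = TransformerEncoder(
    200, 24, 24, 24, 24, [100, 24], 24, 48, 8, 2, 0.5)
encoder.eval()
print(encoder(torch.ones((2, 100), dtype=torch.long), valid_lens).shape)
# torch.Size([2, 100, 24])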
Decoder
The Transformer decoder is also composed of multiple identical blocks. Each block defined in the DecoderBlock class contains three sublayers: decoder self-attention, encoder-decoder attention, and a positionwise feed-forward network. Each of these sublayers is again wrapped in a residual connection followed by layer normalization.
In the masked multi-head decoder self-attention (the first sublayer), the queries, keys, and values all come from the output of the previous decoder layer. In a sequence-to-sequence model, every position (time step) of the output sequence is known during training, whereas during prediction the output tokens are generated one at a time. Therefore, at any decoder time step only the tokens generated so far may take part in the decoder's self-attention. To preserve this autoregressive property, the masked self-attention specifies the parameter dec_valid_lens so that each query attends only to positions in the decoder up to and including its own position.
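To make the masking concrete, here is what dec_valid_lens looks like during training for a made-up batch_size=2 and num_steps=4, matching the construction inside the DecoderBlock code below:
dec_valid_lens = torch.arange(1, 4 + 1).repeat(2, 1)
print(dec_valid_lens)
# tensor([[1, 2, 3, 4],
#         [1, 2, 3, 4]])
# Each entry says: the query at position t may attend only to the first t
# positions, which preserves the autoregressive property during training.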
class DecoderBlock(nn.Module):
    """The i-th block in the decoder"""
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, i, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.i = i
        self.attention1 = d2l.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.attention2 = d2l.MultiHeadAttention(
            key_size, query_size, value_size, num_hiddens, num_heads, dropout)
        self.addnorm2 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_input, ffn_num_hiddens,
                                   num_hiddens)
        self.addnorm3 = AddNorm(norm_shape, dropout)

    def forward(self, X, state):
        enc_outputs, enc_valid_lens = state[0], state[1]
        # During training, all tokens of the output sequence are processed at
        # the same time, so state[2][self.i] is initialized as None.
        # During prediction, the output sequence is decoded one token at a
        # time, so state[2][self.i] holds the representations decoded by the
        # i-th block up to the current time step.
        if state[2][self.i] is None:
            key_values = X
        else:
            key_values = torch.cat((state[2][self.i], X), dim=1)
        state[2][self.i] = key_values
        if self.training:
            batch_size, num_steps, _ = X.shape
            # Shape of dec_valid_lens: (batch_size, num_steps),
            # where every row is [1, 2, ..., num_steps]
            dec_valid_lens = torch.arange(
                1, num_steps + 1, device=X.device).repeat(batch_size, 1)
        else:
            dec_valid_lens = None
        # Self-attention
        X2 = self.attention1(X, key_values, key_values, dec_valid_lens)
        Y = self.addnorm1(X, X2)
        # Encoder-decoder attention.
        # Shape of enc_outputs: (batch_size, num_steps, num_hiddens)
        Y2 = self.attention2(Y, enc_outputs, enc_outputs, enc_valid_lens)
        Z = self.addnorm2(Y, Y2)
        return self.addnorm3(Z, self.ffn(Z)), state
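The decoder block also preserves the shape of its input. A quick check that reuses encoder_blk, X, and valid_lens from the encoder-block check above:
decoder_blk = DecoderBlock(24, 24, 24, 24, [100, 24], 24, 48, 8, 0.5, 0)
decoder_blk.eval()
X = torch.ones((2, 100, 24))
state = [encoder_blk(X, valid_lens), valid_lens, [None]]
print(decoder_blk(X, state)[0].shape)  # torch.Size([2, 100, 24])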
With the decoder's submodules in place, the code below assembles the complete decoder. After the last decoder block, a fully connected layer computes the predictions over all vocab_size possible output tokens. Both the decoder self-attention weights and the encoder-decoder attention weights are stored for later visualization.
class TransformerDecoder(d2l.AttentionDecoder):
    """Transformer decoder"""
    def __init__(self, vocab_size, key_size, query_size, value_size,
                 num_hiddens, norm_shape, ffn_num_input, ffn_num_hiddens,
                 num_heads, num_layers, dropout, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module("block" + str(i),
                DecoderBlock(key_size, query_size, value_size, num_hiddens,
                             norm_shape, ffn_num_input, ffn_num_hiddens,
                             num_heads, dropout, i))
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        return [enc_outputs, enc_valid_lens, [None] * self.num_layers]

    def forward(self, X, state):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self._attention_weights = [[None] * len(self.blks) for _ in range(2)]
        for i, blk in enumerate(self.blks):
            X, state = blk(X, state)
            # Decoder self-attention weights
            self._attention_weights[0][
                i] = blk.attention1.attention.attention_weights
            # Encoder-decoder attention weights
            self._attention_weights[1][
                i] = blk.attention2.attention.attention_weights
        return self.dense(X), state

    @property
    def attention_weights(self):
        return self._attention_weights
- Training
num_hiddens, num_layers, dropout, batch_size, num_steps = 32, 2, 0.1, 64, 10
lr, num_epochs, device = 0.005, 200, d2l.try_gpu()
ffn_num_input, ffn_num_hiddens, num_heads = 32, 64, 4
key_size, query_size, value_size = 32, 32, 32
norm_shape = [32]
train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
# Inside the model, the embedded input has shape (batch_size, num_steps, token feature dimension)
encoder = TransformerEncoder(
    len(src_vocab), key_size, query_size, value_size, num_hiddens,
    norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
    num_layers, dropout)
decoder = TransformerDecoder(
    len(tgt_vocab), key_size, query_size, value_size, num_hiddens,
    norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
    num_layers, dropout)
net = d2l.EncoderDecoder(encoder, decoder)
d2l.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
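After training, the model can translate new sentences. The sketch below follows the usual d2l workflow and assumes the d2l helpers predict_seq2seq and bleu are available; the example sentence pairs are illustrative:
engs = ['go .', 'i lost .', "he's calm .", "i'm home ."]
fras = ['va !', "j'ai perdu .", 'il est calme .', 'je suis chez moi .']
for eng, fra in zip(engs, fras):
    # The final True asks predict_seq2seq to also return the decoder
    # attention weights collected during prediction.
    translation, dec_attention_weight_seq = d2l.predict_seq2seq(
        net, eng, src_vocab, tgt_vocab, num_steps, device, True)
    print(f'{eng} => {translation}, bleu {d2l.bleu(translation, fra, k=2):.3f}')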
Vision Transformer
When the Transformer achieved great success in NLP, researchers set out to apply it to computer vision. An immediate problem arises: a Transformer consumes a sequence of tokens, not an image. How should images be handled?
Consider an input image of height h, width w, and c channels. We can split the whole image into patches, each of height and width p and with c channels, and flatten each patch into a one-dimensional vector of length \(p^2c\). These vectors can then be treated as a token sequence. If the Transformer requires token feature vectors of dimension \(num\_hiddens\), we further apply a fully connected layer that maps each flattened patch to \(num\_hiddens\) dimensions. Each resulting \(num\_hiddens\)-dimensional vector is a token, which we call a patch. We implement this with a patch embedding layer; in practice, a single 2D convolution performs both the splitting and the linear projection, so the layer is implemented with a convolution.
In summary, the patch embedding layer splits the image into a sequence of patches, \(m = \frac{hw}{p^2}\) in total, and flattens each patch into a vector, so the patches can be treated as a token sequence. A special token <cls> and the m flattened patches are linearly projected into m + 1 vectors, which are summed with learnable positional embeddings. A stack of Transformer encoder blocks then transforms the m + 1 input vectors into m + 1 output vectors of the same length. Finally, the first output vector \(\text{Rep}_{<cls>}\) is passed through a head to produce the prediction.
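As a concrete check with the numbers used in the training example at the end of this section: with \(h = w = 96\) and \(p = 16\), there are \(m = \frac{96 \times 96}{16^2} = 36\) patches; each patch flattens to \(p^2c = 256c\) values (256 for the single-channel Fashion-MNIST images used later), and the patch embedding projects each of them to \(num\_hiddens = 512\) dimensions, so the encoder sees a sequence of \(36 + 1 = 37\) tokens of size 512 once <cls> is prepended.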
The overall architecture of ViT (Vision Transformer) is shown below.
The PatchEmbedding layer
We first implement the PatchEmbedding layer.
Inputs:
- img_size: the height and width of the image
- patch_size: the height and width of each patch
- X: input of shape (batch_size, in_channels, img_size, img_size); after the conv layer the shape becomes (batch_size, num_hiddens, img_size/patch_size, img_size/patch_size)
class PatchEmbedding(nn.Module):
    def __init__(self, img_size=96, patch_size=16, num_hiddens=512):
        super().__init__()
        def _make_tuple(x):
            if not isinstance(x, (list, tuple)):
                return (x, x)
            return x
        img_size, patch_size = _make_tuple(img_size), _make_tuple(patch_size)
        self.num_patches = (img_size[0] // patch_size[0]) * (
            img_size[1] // patch_size[1])  # total number of patches in the image
        # Splits X into patches and projects each patch to a
        # num_hiddens-dimensional feature vector
        self.conv = nn.LazyConv2d(num_hiddens, kernel_size=patch_size,
                                  stride=patch_size)

    def forward(self, X):
        # Output shape: (batch_size, no. of patches, no. of channels),
        # where no. of channels equals num_hiddens
        return self.conv(X).flatten(2).transpose(1, 2)
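A quick shape check with the default sizes (the 3-channel input is made up for illustration):
img_size, patch_size, num_hiddens, batch_size = 96, 16, 512, 4
patch_emb = PatchEmbedding(img_size, patch_size, num_hiddens)
X = torch.zeros(batch_size, 3, img_size, img_size)
# (96 / 16) ** 2 = 36 patches, each embedded into num_hiddens dimensions
print(patch_emb(X).shape)  # torch.Size([4, 36, 512])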
The MLP layer
Next we implement the MLP, which occupies the position of the positionwise feed-forward network (FFN) in the Transformer architecture.
class ViTMLP(nn.Module):
    def __init__(self, mlp_num_hiddens, mlp_num_outputs, dropout=0.5):
        super().__init__()
        self.dense1 = nn.LazyLinear(mlp_num_hiddens)
        self.gelu = nn.GELU()  # smoother than the ReLU activation
        self.dropout1 = nn.Dropout(dropout)
        self.dense2 = nn.LazyLinear(mlp_num_outputs)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout2(self.dense2(self.dropout1(self.gelu(
            self.dense1(x)))))
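A quick shape check (a sketch with made-up sizes; .eval() disables dropout):
mlp = ViTMLP(mlp_num_hiddens=48, mlp_num_outputs=24)
mlp.eval()
print(mlp(torch.ones((2, 100, 24))).shape)  # torch.Size([2, 100, 24])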
The ViT block
ViT is composed of multiple identical blocks, each containing a self-attention sublayer and an MLP sublayer. We implement it below. Unlike the Transformer encoder block, normalization is applied before the multi-head attention and the MLP (pre-normalization), which makes training more effective.
class ViTBlock(nn.Module):
    def __init__(self, num_hiddens, norm_shape, mlp_num_hiddens,
                 num_heads, dropout, use_bias=False):
        super().__init__()
        self.ln1 = nn.LayerNorm(norm_shape)  # normalizes over the last (feature) dimension of the input
        # Queries, keys, and values all have size num_hiddens; the output has
        # shape (batch_size, no. of tokens, num_hiddens)
        self.attention = d2l.MultiHeadAttention(num_hiddens, num_heads,
                                                dropout, use_bias)
        self.ln2 = nn.LayerNorm(norm_shape)
        self.mlp = ViTMLP(mlp_num_hiddens, num_hiddens, dropout)

    def forward(self, X, valid_lens=None):
        X = X + self.attention(*([self.ln1(X)] * 3), valid_lens)  # "+" is the residual connection
        return X + self.mlp(self.ln2(X))  # "+" is the residual connection
The output of a ViT block has the same shape as its input X, as the check below confirms.
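A quick check of that claim (a sketch with made-up sizes; it assumes the d2l version whose MultiHeadAttention takes (num_hiddens, num_heads, dropout, use_bias), as called above):
X = torch.ones((2, 100, 24))
vit_blk = ViTBlock(24, 24, 48, 8, 0.5)
vit_blk.eval()
print(vit_blk(X).shape)  # torch.Size([2, 100, 24])
For reference, the multi-head attention implementation (in the classic d2l API with separate key, query, and value sizes) is reproduced below.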
class MultiHeadAttention(nn.Module):
    """Multi-head attention"""
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 num_heads, dropout, bias=False, **kwargs):
        # Note: the English edition keeps only a single size argument,
        # since query_size and value_size usually equal key_size.
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = d2l.DotProductAttention(dropout)
        self.W_q = nn.Linear(query_size, num_hiddens, bias=bias)
        self.W_k = nn.Linear(key_size, num_hiddens, bias=bias)
        self.W_v = nn.Linear(value_size, num_hiddens, bias=bias)
        self.W_o = nn.Linear(num_hiddens, num_hiddens, bias=bias)

    def forward(self, queries, keys, values, valid_lens):
        # Shape of queries, keys, values:
        # (batch_size, no. of queries or key-value pairs, num_hiddens)
        # Shape of valid_lens:
        # (batch_size,) or (batch_size, no. of queries)
        # After the transformation, the shape of queries, keys, values becomes:
        # (batch_size * num_heads, no. of queries or key-value pairs,
        #  num_hiddens / num_heads)
        queries = transpose_qkv(self.W_q(queries), self.num_heads)
        keys = transpose_qkv(self.W_k(keys), self.num_heads)
        values = transpose_qkv(self.W_v(values), self.num_heads)
        if valid_lens is not None:
            # On axis 0, copy the first item (scalar or vector) num_heads
            # times, then copy the second item, and so on.
            valid_lens = torch.repeat_interleave(
                valid_lens, repeats=self.num_heads, dim=0)
        # Shape of output: (batch_size * num_heads, no. of queries,
        # num_hiddens / num_heads)
        output = self.attention(queries, keys, values, valid_lens)
        # Shape of output_concat: (batch_size, no. of queries, num_hiddens)
        output_concat = transpose_output(output, self.num_heads)
        return self.W_o(output_concat)
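The class above relies on the d2l helpers transpose_qkv and transpose_output. For completeness, here is a sketch of what they do, following the d2l implementations:
def transpose_qkv(X, num_heads):
    """Reshape for parallel computation of multiple attention heads."""
    # X: (batch_size, no. of queries or key-value pairs, num_hiddens)
    X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)
    X = X.permute(0, 2, 1, 3)
    # Output: (batch_size * num_heads, no. of queries or key-value pairs,
    #          num_hiddens / num_heads)
    return X.reshape(-1, X.shape[2], X.shape[3])

def transpose_output(X, num_heads):
    """Reverse the operation of transpose_qkv."""
    X = X.reshape(-1, num_heads, X.shape[1], X.shape[2])
    X = X.permute(0, 2, 1, 3)
    return X.reshape(X.shape[0], X.shape[1], -1)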
The ViT model
Now we assemble the components above.
The image patches, together with the <cls> token, are treated as tokens at successive time steps.
class ViT(d2l.Classifier):
    """Vision Transformer."""
    def __init__(self, img_size, patch_size, num_hiddens, mlp_num_hiddens,
                 num_heads, num_blks, emb_dropout, blk_dropout, lr=0.1,
                 use_bias=False, num_classes=10):
        super().__init__()
        self.save_hyperparameters()
        self.patch_embedding = PatchEmbedding(
            img_size, patch_size, num_hiddens)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, num_hiddens))
        num_steps = self.patch_embedding.num_patches + 1  # add the cls token
        # Positional embeddings are learnable
        self.pos_embedding = nn.Parameter(
            torch.randn(1, num_steps, num_hiddens))
        self.dropout = nn.Dropout(emb_dropout)
        self.blks = nn.Sequential()
        for i in range(num_blks):
            self.blks.add_module(f"{i}", ViTBlock(
                num_hiddens, num_hiddens, mlp_num_hiddens,
                num_heads, blk_dropout, use_bias))
        # The head normalizes over the last (feature) dimension before the
        # final linear classifier
        self.head = nn.Sequential(nn.LayerNorm(num_hiddens),
                                  nn.Linear(num_hiddens, num_classes))

    def forward(self, X):
        # Input X: (batch_size, no. of channels, img_size, img_size)
        X = self.patch_embedding(X)
        # Output X: (batch_size, no. of patches, feature dimension)
        X = torch.cat((self.cls_token.expand(X.shape[0], -1, -1), X), 1)  # prepend the cls token to all patches
        X = self.dropout(X + self.pos_embedding)
        for blk in self.blks:
            X = blk(X)
        return self.head(X[:, 0])  # only the cls representation is fed to the head to produce the prediction
Training:
img_size, patch_size = 96, 16
num_hiddens, mlp_num_hiddens, num_heads, num_blks = 512, 2048, 8, 2
emb_dropout, blk_dropout, lr = 0.1, 0.1, 0.1
model = ViT(img_size, patch_size, num_hiddens, mlp_num_hiddens, num_heads,
            num_blks, emb_dropout, blk_dropout, lr)
trainer = d2l.Trainer(max_epochs=10, num_gpus=1)
data = d2l.FashionMNIST(batch_size=128, resize=(img_size, img_size))
trainer.fit(model, data)