前言
模型中常见的可训练层包括卷积层和线性层,这里将给出计算公式并在pytorch下进行验证。
计算模型的参数:
import torch.nn as nn
def cal_params(model: nn.Module):
    """Count the parameters of ``model``, split by trainability.

    Args:
        model: Any ``nn.Module``; every tensor yielded by
            ``model.parameters()`` is counted.

    Returns:
        A ``(trainable, frozen)`` tuple of element counts, where a parameter
        is trainable when its ``requires_grad`` flag is set.
    """
    trainable = 0
    frozen = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
        else:
            frozen += param.numel()
    return trainable, frozen
卷积层
\[n_\text{param} = (k_h \times k_w \times c_\text{in} + 1) \times c_\text{out} \]其中,\(k_h,\ k_w\)表示卷积核的高和宽,\(c_\text{in},\ c_\text{out}\)表示输入和输出的通道数,\(+1\)表示偏置项。
举例:对于一个卷积层,输入通道数为3,输出通道数为64,卷积核大小为\(3\times3\),则参数量为:
\[n_\text{param} = (3 \times 3 \times 3 + 1) \times 64 = 1792 \]
测试
# 3x3 convolution mapping 3 input channels to 64 output channels.
conv = nn.Conv2d(3, 64, kernel_size=(3, 3))
print(cal_params(conv))
# (1792, 0)
线性层
线性层:\(y = Wx+b\),参数在权重\(W\)和偏置\(b\)中,参数量为:
\[n_\text{param} = (n_\text{in} + 1) \times n_\text{out} \]举例:对于一个线性层,输入维度为64,输出维度为10,则参数量为:
\[n_\text{param} = (64 + 1) \times 10 = 650 \]测试
# Linear layer mapping 64 input features to 10 output features.
linear = nn.Linear(64, 10)
print(cal_params(linear))
# (650, 0)
Transformer的参数量计算
Encoder layer
结构图中,Encoder每一层的可训练参数由Multi-head attention层、Feed forward层和两个LayerNorm层的参数组成,设特征的维度为\(d_\text{model}=d\)
Multi-head attention
共有四个线性层\(W_Q,\ W_K,\ W_V,\ W_O\),参数量为:$$n_\text{atten}=(d + 1) \times d \times 4=4d^2 + 4d$$
Feed forward
由两个线性层组成,中间隐藏层的维度为\(d_\text{ff}\),参数量为:\((d + 1) \times d_\text{ff} + (d_\text{ff} + 1) \times d\),论文中,\(d_\text{ff}=4d\),参数量为:
\[n_\text{ff}=(d + 1) \times 4d + (4d + 1) \times d=8d^2 + 5d \]LayerNorm
每个Encoder layer中有两个LayerNorm层,每个LayerNorm层有两个\(d\)维的可训练参数向量\(\gamma,\ \beta\),因此单个LayerNorm层的参数量为:\(n_\text{ln}=2d\)
整个Encoder layer的参数量为:
\[n_\text{encoder layer} = n_\text{atten} + n_\text{ff} + 2n_\text{ln} = 4d^2 + 4d + 8d^2 + 5d + 4d = 12d^2 + 13d \]举例:\(d_\text{model}=512\)时,Encoder layer的参数量为:
\[n_\text{encoder layer} = 12 \times 512^2 + 13 \times 512 = 3,152,384 \]Encoder layer的实现:
class AddAndNorm(nn.Module):
    """Residual connection around a sublayer that receives normalised input.

    ``forward`` computes ``x + dropout(sublayer(norm(x)))``.

    Args:
        d_model: Length of the learnable scale and shift vectors.
        dropout_prob: Dropout probability applied to the sublayer output.
        eps: Constant added to the standard deviation before dividing.
    """

    def __init__(self, d_model, dropout_prob, eps=1e-6):
        super().__init__()
        # Learnable per-feature scale (initialised to 1) and shift (to 0).
        self.w = nn.Parameter(torch.ones(d_model))
        self.b = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
        self.dropout = nn.Dropout(p=dropout_prob)

    def norm(self, x):
        """Normalise ``x`` over its last dimension, then scale and shift."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.w * centered / spread + self.b

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalised input and add the residual.

        Args:
            x: Input tensor.
            sublayer: Callable taking the normalised tensor and returning a
                tensor that can be added to ``x``.
        """
        normalized = self.norm(x)
        update = self.dropout(sublayer(normalized))
        return x + update
def attention(q, k, v, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        q: Query tensor; its last dimension sets the scaling factor.
        k: Key tensor, multiplied with ``q`` after swapping its last two dims.
        v: Value tensor, combined using the attention weights.
        mask: Optional tensor; positions where ``mask == 0`` are suppressed.
        dropout: Optional callable applied to the attention weights.

    Returns:
        A ``(output, weights)`` tuple: the weighted values and the attention
        weights (after dropout, when given).
    """
    scale = math.sqrt(q.size(-1))
    scores = torch.matmul(q, k.transpose(-2, -1)) / scale
    if mask is not None:
        # A large negative fill makes masked positions vanish after softmax;
        # a smaller magnitude is used for every dtype other than float32.
        fill_value = -1e9 if scores.dtype == torch.float32 else -1e4
        scores = scores.masked_fill(mask == 0, fill_value)
    weights = scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    output = torch.matmul(weights, v)
    return output, weights
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from four ``d_model x d_model`` linear layers.

    Args:
        h: Number of attention heads; must divide ``d_model``.
        d_model: Feature size of the inputs and of the output.
        dropout_prob: Dropout probability applied to the attention weights.
    """

    def __init__(self, h, d_model, dropout_prob=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        # Attention weights of the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout_prob)

    def _split_heads(self, projected, batch_size):
        """Reshape to ``(batch, -1, h, d_k)`` and swap dims 1 and 2."""
        return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Project the inputs, attend per head, and merge the heads.

        Assumes inputs shaped ``(batch, seq, d_model)`` -- TODO confirm
        against callers.
        """
        if mask is not None:
            # The same mask is applied to every one of the h attention heads.
            mask = mask.unsqueeze(1)
        batch_size = q.size(0)

        # 1) Apply the linear projections and split d_model into h heads of d_k.
        q = self._split_heads(self.w_q(q), batch_size)
        k = self._split_heads(self.w_k(k), batch_size)
        v = self._split_heads(self.w_v(v), batch_size)

        # 2) Compute attention for all heads at once.
        x, self.attn = attention(q, k, v, mask=mask, dropout=self.dropout)

        # 3) Concatenate the heads and apply the output projection.
        merged = x.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.h * self.d_k)
        return self.w_o(merged)
class FeedForward(nn.Module):
    """Two linear layers with ReLU and dropout in between.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout_prob: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout_prob=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        """Return ``w_2(dropout(relu(w_1(x))))``."""
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)
class EncoderLayer(nn.Module):
    """Encoder layer: self-attention then feed-forward, each in an AddAndNorm.

    Args:
        d_model: Feature size.
        h: Number of attention heads.
        d_ff: Hidden size of the feed-forward block.
        dropout_prob: Dropout probability passed to every sub-module.
    """

    def __init__(self, d_model, h, d_ff, dropout_prob):
        super().__init__()
        self.attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.ff = FeedForward(d_model, d_ff, dropout_prob)
        # One residual wrapper per sub-block (attention, feed-forward).
        self.sublayer = nn.ModuleList([AddAndNorm(d_model, dropout_prob) for _ in range(2)])

    def forward(self, x, mask):
        """Run self-attention with ``mask`` followed by the feed-forward block."""

        def self_attention(normed):
            # Query, key and value all come from the same tensor.
            return self.attn(normed, normed, normed, mask)

        attn_wrapper, ff_wrapper = self.sublayer
        x = attn_wrapper(x, self_attention)
        return ff_wrapper(x, self.ff)
# Configuration used for the worked example above (d_ff = 4 * d_model).
d_model, h, dropout_prob = 512, 8, 0.1
d_ff = 4 * d_model

encoder_layer = EncoderLayer(d_model=d_model, h=h, d_ff=d_ff, dropout_prob=dropout_prob)
print(cal_params(encoder_layer))
# (3152384, 0)
注意:在《Attention is All You Need》中 Add & Norm层的实现是(即Post-LN):$${\rm LayerNorm}(x + {\rm Sublayer}(x))$$但可以看到,我们的代码实现方式为(即Pre-LN):$$x+{\rm Sublayer(LayerNorm(x))}$$这在网上亦有讨论[Discussion],总的来说,后者(Pre-LN)的训练过程更加稳定,在许多实践中以后者为主。两种实现方式的可训练参数量相同,不影响本文的计算结果。
Decoder layer
对比结构,decoder 每层的可训练参数相较于encoder layer多了一个Multi-head attention层和一个LayerNorm层,参数量为:
\[n_\text{decoder layer} = n_\text{encoder layer} + n_\text{atten} + n_\text{ln} \]\[= (12d^2 + 13d) + (4d^2 + 4d) + 2d = 16d^2 + 19d \]举例:\(d_\text{model}=512\)时,Decoder layer的参数量为:
\[n_\text{decoder layer} = 16 \times 512^2 + 19 \times 512 = 4,204,032 \]测试:
class DecoderLayer(nn.Module):
    """Decoder layer: self-attention, attention over ``memory``, feed-forward.

    Each of the three sub-blocks is wrapped in its own AddAndNorm.

    Args:
        d_model: Feature size.
        h: Number of attention heads.
        d_ff: Hidden size of the feed-forward block.
        dropout_prob: Dropout probability passed to every sub-module.
    """

    def __init__(self, d_model, h, d_ff, dropout_prob):
        super().__init__()
        self.self_attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.src_attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.ff = FeedForward(d_model, d_ff, dropout_prob)
        self.sublayer = nn.ModuleList([AddAndNorm(d_model, dropout_prob) for _ in range(3)])

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sub-blocks in order and return the result."""

        def self_attention(normed):
            # Query, key and value all come from the decoder stream.
            return self.self_attn(normed, normed, normed, tgt_mask)

        def source_attention(normed):
            # Queries come from the decoder; keys and values from ``memory``.
            return self.src_attn(normed, memory, memory, src_mask)

        self_wrapper, src_wrapper, ff_wrapper = self.sublayer
        x = self_wrapper(x, self_attention)
        x = src_wrapper(x, source_attention)
        return ff_wrapper(x, self.ff)
# Configuration used for the worked example above (d_ff = 4 * d_model).
d_model, h, dropout_prob = 512, 8, 0.1
d_ff = 4 * d_model

decoder_layer = DecoderLayer(d_model=d_model, h=h, d_ff=d_ff, dropout_prob=dropout_prob)
print(cal_params(decoder_layer))
# (4204032, 0)
Embedding层和生成层
Encoder和decoder分别使用一个embedding层,设encoder处理的词表大小为\(n_\text{src vocab}\),decoder处理的词表大小为\(n_\text{tgt vocab}\),特征维度为\(d_\text{model}=d\)。Encoder的embedding层参数量为:
\[n_\text{src emb}=n_\text{src vocab} \times d \]Decoder的embedding层参数量为:
\[n_\text{tgt emb}=n_\text{tgt vocab} \times d \]对于生成层,由于是一个线性层,参数量为:\(n_\text{gen}=(d + 1) \times n_\text{tgt vocab}\)。此外位置编码和残差连接没有可训练的参数。
举例:\(d_\text{model}=512\),\(n_\text{src vocab}=n_\text{tgt vocab}=10000\)时,Embedding层和生成层的参数量分别为:$$n_\text{src emb}=n_\text{tgt emb}=512 \times 10000=5,120,000$$$$n_\text{gen}=(512 + 1) \times 10000=5,130,000$$
测试
class Embeddings(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: Size of each embedding vector.
        vocab: Number of entries in the embedding table.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(num_embeddings=vocab, embedding_dim=d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the indices in ``x`` and multiply by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale
# Configuration used for the worked example above; only d_model and
# vocal_size are consumed by the embedding layer.
d_model, h, dropout_prob = 512, 8, 0.1
d_ff = 4 * d_model
vocal_size = 10000

embeddings = Embeddings(d_model=d_model, vocab=vocal_size)
print(cal_params(embeddings))
# (5120000, 0)
设encoder和decoder的层数为\(l\),则整个transformer的参数量为:
\[n = n_\text{src emb} + n_\text{tgt emb} + n_\text{gen} + l\times (n_\text{encoder layer} + n_\text{decoder layer}) \]\[=n_\text{src vocab}\times d + n_\text{tgt vocab}\times d + (d + 1)\times n_\text{tgt vocab} + l\times (12d^2 + 13d + 16d^2 + 19d) \]\[=d\times(n_\text{src vocab}+2n_\text{tgt vocab}) + n_\text{tgt vocab} + l\times(28d^2 + 32d) \]举例:\(d_\text{model}=512,\ n_\text{src vocab}=n_\text{tgt vocab}=10000,\ l=6\)时,整个transformer的参数量为:
\[512\times30000 + 10000 + 6\times(28\times512^2 + 32\times512) = 59,508,496 \]测试
class Transformer(nn.Module):
    """Encoder-decoder model assembled from the layers defined in this file.

    Args:
        src_vocab: Size of the source embedding table.
        tgt_vocab: Size of the target embedding table and generator output.
        d_model: Feature size.
        d_ff: Hidden size of every feed-forward block.
        h: Number of attention heads.
        dropout_prob: Dropout probability passed to every sub-module.
        layers: Number of encoder layers and of decoder layers.
    """

    def __init__(self, src_vocab, tgt_vocab, d_model, d_ff, h, dropout_prob, layers):
        super().__init__()
        self.src_embed = nn.Sequential(
            Embeddings(d_model, src_vocab),
            PositionalEncoding(d_model, dropout_prob),
        )
        self.tgt_embed = nn.Sequential(
            Embeddings(d_model, tgt_vocab),
            PositionalEncoding(d_model, dropout_prob),
        )
        self.encoder = nn.ModuleList(
            [EncoderLayer(d_model, h, d_ff, dropout_prob) for _ in range(layers)]
        )
        self.decoder = nn.ModuleList(
            [DecoderLayer(d_model, h, d_ff, dropout_prob) for _ in range(layers)]
        )
        # Linear projection from d_model features to tgt_vocab outputs.
        self.generator = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the encoder output."""
        encoded = self.encode(src, src_mask)
        return self.decode(encoded, src_mask, tgt, tgt_mask)

    def encode(self, src, mask):
        """Embed ``src`` and pass it through every encoder layer in order."""
        hidden = self.src_embed(src)
        for enc_layer in self.encoder:
            hidden = enc_layer(hidden, mask)
        return hidden

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, run every decoder layer, then apply the generator."""
        hidden = self.tgt_embed(tgt)
        for dec_layer in self.decoder:
            hidden = dec_layer(hidden, memory, src_mask, tgt_mask)
        return self.generator(hidden)
# Configuration used for the worked example above (6 layers, d_ff = 4 * d_model).
d_model, h, layer, dropout_prob = 512, 8, 6, 0.1
d_ff = 4 * d_model
vocal_size = 10000

model = Transformer(
    src_vocab=vocal_size,
    tgt_vocab=vocal_size,
    d_model=d_model,
    d_ff=d_ff,
    h=h,
    dropout_prob=dropout_prob,
    layers=layer,
)
print(cal_params(model))
# (59508496, 0)
运行环境
Package Version
------------------------- -----------
torch 2.5.0
完整的代码:
import torch
import math
from torch import nn
def cal_params(model: nn.Module):
    """Count the parameters of ``model``, split by trainability.

    Args:
        model: Any ``nn.Module``; every tensor yielded by
            ``model.parameters()`` is counted.

    Returns:
        A ``(trainable, frozen)`` tuple of element counts, where a parameter
        is trainable when its ``requires_grad`` flag is set.
    """
    trainable = 0
    frozen = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
        else:
            frozen += param.numel()
    return trainable, frozen
class AddAndNorm(nn.Module):
    """Residual connection around a sublayer that receives normalised input.

    ``forward`` computes ``x + dropout(sublayer(norm(x)))``.

    Args:
        d_model: Length of the learnable scale and shift vectors.
        dropout_prob: Dropout probability applied to the sublayer output.
        eps: Constant added to the standard deviation before dividing.
    """

    def __init__(self, d_model, dropout_prob, eps=1e-6):
        super().__init__()
        # Learnable per-feature scale (initialised to 1) and shift (to 0).
        self.w = nn.Parameter(torch.ones(d_model))
        self.b = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
        self.dropout = nn.Dropout(p=dropout_prob)

    def norm(self, x):
        """Normalise ``x`` over its last dimension, then scale and shift."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.w * centered / spread + self.b

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalised input and add the residual.

        Args:
            x: Input tensor.
            sublayer: Callable taking the normalised tensor and returning a
                tensor that can be added to ``x``.
        """
        normalized = self.norm(x)
        update = self.dropout(sublayer(normalized))
        return x + update
def attention(q, k, v, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        q: Query tensor; its last dimension sets the scaling factor.
        k: Key tensor, multiplied with ``q`` after swapping its last two dims.
        v: Value tensor, combined using the attention weights.
        mask: Optional tensor; positions where ``mask == 0`` are suppressed.
        dropout: Optional callable applied to the attention weights.

    Returns:
        A ``(output, weights)`` tuple: the weighted values and the attention
        weights (after dropout, when given).
    """
    scale = math.sqrt(q.size(-1))
    scores = torch.matmul(q, k.transpose(-2, -1)) / scale
    if mask is not None:
        # A large negative fill makes masked positions vanish after softmax;
        # a smaller magnitude is used for every dtype other than float32.
        fill_value = -1e9 if scores.dtype == torch.float32 else -1e4
        scores = scores.masked_fill(mask == 0, fill_value)
    weights = scores.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    output = torch.matmul(weights, v)
    return output, weights
class MultiHeadedAttention(nn.Module):
    """Multi-head attention built from four ``d_model x d_model`` linear layers.

    Args:
        h: Number of attention heads; must divide ``d_model``.
        d_model: Feature size of the inputs and of the output.
        dropout_prob: Dropout probability applied to the attention weights.
    """

    def __init__(self, h, d_model, dropout_prob=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        # Attention weights of the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout_prob)

    def _split_heads(self, projected, batch_size):
        """Reshape to ``(batch, -1, h, d_k)`` and swap dims 1 and 2."""
        return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Project the inputs, attend per head, and merge the heads.

        Assumes inputs shaped ``(batch, seq, d_model)`` -- TODO confirm
        against callers.
        """
        if mask is not None:
            # The same mask is applied to every one of the h attention heads.
            mask = mask.unsqueeze(1)
        batch_size = q.size(0)

        # 1) Apply the linear projections and split d_model into h heads of d_k.
        q = self._split_heads(self.w_q(q), batch_size)
        k = self._split_heads(self.w_k(k), batch_size)
        v = self._split_heads(self.w_v(v), batch_size)

        # 2) Compute attention for all heads at once.
        x, self.attn = attention(q, k, v, mask=mask, dropout=self.dropout)

        # 3) Concatenate the heads and apply the output projection.
        merged = x.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.h * self.d_k)
        return self.w_o(merged)
class FeedForward(nn.Module):
    """Two linear layers with ReLU and dropout in between.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout_prob: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout_prob=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, x):
        """Return ``w_2(dropout(relu(w_1(x))))``."""
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)
class EncoderLayer(nn.Module):
    """Encoder layer: self-attention then feed-forward, each in an AddAndNorm.

    Args:
        d_model: Feature size.
        h: Number of attention heads.
        d_ff: Hidden size of the feed-forward block.
        dropout_prob: Dropout probability passed to every sub-module.
    """

    def __init__(self, d_model, h, d_ff, dropout_prob):
        super().__init__()
        self.attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.ff = FeedForward(d_model, d_ff, dropout_prob)
        # One residual wrapper per sub-block (attention, feed-forward).
        self.sublayer = nn.ModuleList([AddAndNorm(d_model, dropout_prob) for _ in range(2)])

    def forward(self, x, mask):
        """Run self-attention with ``mask`` followed by the feed-forward block."""

        def self_attention(normed):
            # Query, key and value all come from the same tensor.
            return self.attn(normed, normed, normed, mask)

        attn_wrapper, ff_wrapper = self.sublayer
        x = attn_wrapper(x, self_attention)
        return ff_wrapper(x, self.ff)
class DecoderLayer(nn.Module):
    """Decoder layer: self-attention, attention over ``memory``, feed-forward.

    Each of the three sub-blocks is wrapped in its own AddAndNorm.

    Args:
        d_model: Feature size.
        h: Number of attention heads.
        d_ff: Hidden size of the feed-forward block.
        dropout_prob: Dropout probability passed to every sub-module.
    """

    def __init__(self, d_model, h, d_ff, dropout_prob):
        super().__init__()
        self.self_attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.src_attn = MultiHeadedAttention(h, d_model, dropout_prob)
        self.ff = FeedForward(d_model, d_ff, dropout_prob)
        self.sublayer = nn.ModuleList([AddAndNorm(d_model, dropout_prob) for _ in range(3)])

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sub-blocks in order and return the result."""

        def self_attention(normed):
            # Query, key and value all come from the decoder stream.
            return self.self_attn(normed, normed, normed, tgt_mask)

        def source_attention(normed):
            # Queries come from the decoder; keys and values from ``memory``.
            return self.src_attn(normed, memory, memory, src_mask)

        self_wrapper, src_wrapper, ff_wrapper = self.sublayer
        x = self_wrapper(x, self_attention)
        x = src_wrapper(x, source_attention)
        return ff_wrapper(x, self.ff)
class Embeddings(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: Size of each embedding vector.
        vocab: Number of entries in the embedding table.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(num_embeddings=vocab, embedding_dim=d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the indices in ``x`` and multiply by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The encoding table is computed once in ``__init__`` and stored with
    ``register_buffer``, so it adds nothing to ``parameters()``.

    Args:
        d_model: Feature size of the encodings; both even and odd values work.
        dropout_prob: Dropout probability applied to the sum.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout_prob, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_prob)

        # Precompute the positional encodings.
        pe = torch.zeros(max_len, d_model)  # Shape: max_len x d_model
        position = torch.arange(0, max_len).unsqueeze(1)  # Shape: max_len x 1
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000) / d_model))
        res = position * div_term  # Shape: max_len x ceil(d_model / 2)
        pe[:, 0::2] = torch.sin(res)
        # With an odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so the cosine table is trimmed to fit; for even
        # d_model the slice keeps every column.
        pe[:, 1::2] = torch.cos(res[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # Shape: 1 x max_len x d_model
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` encodings to ``x`` and apply dropout."""
        # The buffer is created without requires_grad, so the slice needs no
        # explicit requires_grad_(False).
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)
class Transformer(nn.Module):
    """Encoder-decoder model assembled from the layers defined in this file.

    Args:
        src_vocab: Size of the source embedding table.
        tgt_vocab: Size of the target embedding table and generator output.
        d_model: Feature size.
        d_ff: Hidden size of every feed-forward block.
        h: Number of attention heads.
        dropout_prob: Dropout probability passed to every sub-module.
        layers: Number of encoder layers and of decoder layers.
    """

    def __init__(self, src_vocab, tgt_vocab, d_model, d_ff, h, dropout_prob, layers):
        super().__init__()
        self.src_embed = nn.Sequential(
            Embeddings(d_model, src_vocab),
            PositionalEncoding(d_model, dropout_prob),
        )
        self.tgt_embed = nn.Sequential(
            Embeddings(d_model, tgt_vocab),
            PositionalEncoding(d_model, dropout_prob),
        )
        self.encoder = nn.ModuleList(
            [EncoderLayer(d_model, h, d_ff, dropout_prob) for _ in range(layers)]
        )
        self.decoder = nn.ModuleList(
            [DecoderLayer(d_model, h, d_ff, dropout_prob) for _ in range(layers)]
        )
        # Linear projection from d_model features to tgt_vocab outputs.
        self.generator = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the encoder output."""
        encoded = self.encode(src, src_mask)
        return self.decode(encoded, src_mask, tgt, tgt_mask)

    def encode(self, src, mask):
        """Embed ``src`` and pass it through every encoder layer in order."""
        hidden = self.src_embed(src)
        for enc_layer in self.encoder:
            hidden = enc_layer(hidden, mask)
        return hidden

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, run every decoder layer, then apply the generator."""
        hidden = self.tgt_embed(tgt)
        for dec_layer in self.decoder:
            hidden = dec_layer(hidden, memory, src_mask, tgt_mask)
        return self.generator(hidden)
if __name__ == '__main__':
    # Build a 6-layer model with d_model = 512 and d_ff = 4 * d_model, then
    # print its (trainable, frozen) parameter counts.
    d_model, h, layer, dropout_prob = 512, 8, 6, 0.1
    d_ff = 4 * d_model
    vocal_size = 10000

    model = Transformer(
        src_vocab=vocal_size,
        tgt_vocab=vocal_size,
        d_model=d_model,
        d_ff=d_ff,
        h=h,
        dropout_prob=dropout_prob,
        layers=layer,
    )
    print(cal_params(model))