对比各个大模型的网络结构
PS:这里使用的是自己的 config,但模型结构与官方配置的原理一致。
chatglm3
ChatGLMForConditionalGeneration(
(transformer): ChatGLMModel(
(embedding): Embedding(
(word_embeddings): Embedding(65024, 4096)
)
(rotary_pos_emb): RotaryEmbedding()
(encoder): GLMTransformer(
(layers): ModuleList(
(0-1): 2 x GLMBlock(
(input_layernorm): RMSNorm()
(self_attention): SelfAttention(
(query_key_value): Linear(in_features=4096, out_features=12288, bias=False)
(core_attention): CoreAttention(
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(dense): Linear(in_features=4096, out_features=4096, bias=False)
)
(post_attention_layernorm): RMSNorm()
(mlp): MLP(
(dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
(dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
)
)
)
(final_layernorm): RMSNorm()
)
(output_layer): Linear(in_features=4096, out_features=65024, bias=False)
)
)
# 模型调试代码:
# 通过下面这段代码可以单步调试(debug)并学习 chatglm3 的代码。
from modeling_chatglm import ChatGLMForConditionalGeneration,ChatGLMConfig
import torch
def run():
    """Build a small ChatGLM3 model from a config and run one forward pass.

    The model is constructed directly from a ``ChatGLMConfig`` (no checkpoint
    is loaded here), fed a tensor of random token ids, and both the module
    tree and the forward result are printed. Intended as an entry point for
    stepping through the ChatGLM3 modeling code in a debugger.
    """
    # Only two layers are requested; any other hyper-parameters can simply be
    # copied over from the official config.json.
    cfg = ChatGLMConfig(num_layers=2, original_rope=True, use_cache=True)
    net = ChatGLMForConditionalGeneration(config=cfg)

    # Random integer ids in [0, vocab_size); shape (4, 30) is presumably
    # (batch, sequence length) -- confirm against the model's forward().
    ids_shape = (4, 30)
    token_ids = torch.randint(low=0, high=cfg.vocab_size, size=ids_shape)

    print(net)
    output = net(token_ids)
    print(output)
run()
llama2
LlamaForCausalLM(
(model): LlamaModel(
(embed_tokens): Embedding(32000, 2048)
(layers): ModuleList(
(0-1): 2 x LlamaDecoderLayer(
(self_attn): LlamaAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
(k_proj): Linear(in_features=2048, out_features=2048, bias=False)
(v_proj): Linear(in_features=2048, out_features=2048, bias=False)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(rotary_emb): LlamaRotaryEmbedding()
)
(mlp): LlamaMLP(
(gate_proj): Linear(in_features=2048, out_features=554, bias=False)
(up_proj): Linear(in_features=2048, out_features=554, bias=False)
(down_proj): Linear(in_features=554, out_features=2048, bias=False)
(act_fn): SiLUActivation()
)
(input_layernorm): LlamaRMSNorm()
(post_attention_layernorm): LlamaRMSNorm()
)
)
(norm): LlamaRMSNorm()
)
(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
)
#模型调试代码:
from transformers.models .llama import LlamaModel,LlamaConfig,LlamaForCausalLM
import torch
def run():
    """Build a small Llama model from a config and run one forward pass.

    The model is constructed directly from a ``LlamaConfig`` (no checkpoint
    is loaded here), fed a tensor of random token ids, and both the module
    tree and the forward result are printed.
    """
    # Each size below is written as "base // 2" to keep the debug model small.
    # NOTE(review): 1108 may be a typo for 11008 -- confirm; the value is kept
    # as-is because the printed structure (554) was produced with it.
    cfg = LlamaConfig(
        vocab_size=32000,
        hidden_size=4096 // 2,
        intermediate_size=1108 // 2,
        num_hidden_layers=2,
        num_attention_heads=32 // 2,
        max_position_embeddings=2048 // 2,
    )

    # Reference:
    # https://hf-mirror.com/hiyouga/Llama-2-Chinese-13b-chat/blob/main/config.json
    # The architecture listed there shows that Llama 2 still uses the
    # LlamaForCausalLM architecture.
    net = LlamaForCausalLM(config=cfg)

    # Random integer ids in [0, vocab_size); shape (4, 30) is presumably
    # (batch, sequence length) -- confirm against the model's forward().
    ids_shape = (4, 30)
    token_ids = torch.randint(low=0, high=cfg.vocab_size, size=ids_shape)

    print(net)
    output = net(token_ids)
    print(output)
run()
核心部分放在一起比较
chatglm: (input_layernorm): RMSNorm()
(self_attention): SelfAttention(
(query_key_value): Linear(in_features=4096, out_features=12288, bias=False)
(core_attention): CoreAttention(
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(dense): Linear(in_features=4096, out_features=4096, bias=False)
)
(post_attention_layernorm): RMSNorm()
(mlp): MLP(
(dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
(dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
)
llama2: (self_attn): LlamaAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
(k_proj): Linear(in_features=2048, out_features=2048, bias=False)
(v_proj): Linear(in_features=2048, out_features=2048, bias=False)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(rotary_emb): LlamaRotaryEmbedding()
)
(mlp): LlamaMLP(
(gate_proj): Linear(in_features=2048, out_features=554, bias=False)
(up_proj): Linear(in_features=2048, out_features=554, bias=False)
(down_proj): Linear(in_features=554, out_features=2048, bias=False)
(act_fn): SiLUActivation()
)
(input_layernorm): LlamaRMSNorm()
(post_attention_layernorm): LlamaRMSNorm()