This article describes how to train the llama2_7b model with NeMo.
1. Reference links
2. Create the container
docker run --gpus all --shm-size=32g -ti -e NVIDIA_VISIBLE_DEVICES=all \
--privileged --net=host -v $PWD:/home \
-w /home --name NeMo \
nvcr.io/nvidia/nemo:24.05 /bin/bash
mkdir -p /home/NeMo
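Once inside the container it is worth confirming that all eight GPUs are actually visible before going further. A minimal sanity-check sketch (it only assumes PyTorch, which ships in the NeMo container; the file name check_gpus.py is arbitrary):
# check_gpus.py: confirm that the container sees the GPUs
import torch

print("CUDA available:", torch.cuda.is_available())
print("GPU count     :", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"  cuda:{i} ->", torch.cuda.get_device_name(i))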
3. Data preprocessing
cd /home/NeMo
python /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
--input=/home/autotrain/datasets/timdettmers/openassistant-guanaco/openassistant_best_replies_train.jsonl \
--json-keys=text \
--tokenizer-library=sentencepiece \
--tokenizer-model=/home/ModelLink/llama-2-7b-hf/tokenizer.model \
--output-prefix=gpt_training_data \
--append-eod \
--workers=32
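preprocess_data_for_megatron.py expects one JSON object per line, with the training text under the key named by --json-keys (here text), and writes a Megatron indexed dataset whose prefix combines --output-prefix and the key, i.e. gpt_training_data_text_document.bin/.idx, the same prefix referenced by model.data.data_prefix in the training commands below. A minimal sketch of a valid input file (the sample sentences are made up; any text works):
# make_sample_jsonl.py: illustrate the expected input format,
# one JSON object per line with the training text under the "text" key
import json

samples = [
    {"text": "### Human: What is NeMo?### Assistant: NeMo is NVIDIA's framework for training large language models."},
    {"text": "### Human: Hello### Assistant: Hi, how can I help you today?"},
]
with open("sample_train.jsonl", "w", encoding="utf-8") as f:
    for s in samples:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")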
4. Training from scratch
python /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \
--config-path=/opt/NeMo-Framework-Launcher/launcher_scripts/conf/training/llama \
--config-name=llama2_7b \
trainer.devices=8 \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=300000 \
trainer.val_check_interval=300 \
trainer.log_every_n_steps=50 \
trainer.limit_val_batches=50 \
trainer.limit_test_batches=50 \
trainer.accumulate_grad_batches=1 \
trainer.precision=bf16 \
model.micro_batch_size=1 \
model.global_batch_size=4 \
model.tensor_model_parallel_size=4 \
model.pipeline_model_parallel_size=2 \
model.max_position_embeddings=1024 \
model.encoder_seq_length=1024 \
model.data.seq_length=1024 \
model.tokenizer.library=sentencepiece \
model.tokenizer.model=/home/ModelLink/llama-2-7b-hf/tokenizer.model \
model.data.data_prefix=[1.0,gpt_training_data_text_document] \
model.data.num_workers=0 \
model.data.splits_string=\'980,10,10\' \
exp_manager.resume_if_exists=True \
exp_manager.resume_ignore_no_checkpoint=True \
exp_manager.create_checkpoint_callback=True \
exp_manager.checkpoint_callback_params.monitor=val_loss \
exp_manager.checkpoint_callback_params.save_top_k=3 \
exp_manager.checkpoint_callback_params.mode=min \
exp_manager.checkpoint_callback_params.always_save_nemo=False \
exp_manager.explicit_log_dir="./result" \
exp_manager.wandb_logger_kwargs.name="llama2_7b" \
model.optim.name=fused_adam \
model.optim.lr=6e-4 \
model.optim.betas=[0.9,0.95] \
model.optim.weight_decay=0.1 \
model.optim.sched.name=CosineAnnealing \
model.optim.sched.warmup_steps=750 \
model.optim.sched.constant_steps=80000 \
model.optim.sched.min_lr=6e-5 \
~model.optim.bucket_cap_mb \
~model.optim.overlap_grad_sync \
~model.optim.overlap_param_sync \
~model.optim.contiguous_grad_buffer \
~model.optim.contiguous_param_buffer
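The parallelism and batch-size settings above have to be mutually consistent: with 8 GPUs split into tensor parallel 4 and pipeline parallel 2 there is only one data-parallel replica left, so the global batch of 4 is built by accumulating 4 micro-batches of size 1. A quick sketch of that standard Megatron arithmetic (plug in your own numbers when you change the topology):
# batch_math.py: sanity-check the Megatron batch / parallelism relation
devices, num_nodes = 8, 1
tp, pp = 4, 2                      # tensor / pipeline model parallel sizes
micro_batch, global_batch = 1, 4

world_size = devices * num_nodes
assert world_size % (tp * pp) == 0, "world size must be divisible by TP*PP"
dp = world_size // (tp * pp)       # data-parallel size: 1 here

assert global_batch % (micro_batch * dp) == 0, "GBS must be divisible by MBS*DP"
accum = global_batch // (micro_batch * dp)   # micro-batches accumulated per step: 4
print(f"data parallel size = {dp}, micro-batches per global batch = {accum}")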
5. Load a pretrained model and continue training
A. Model conversion
cd /opt/NeMo
python /opt/NeMo/scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
--input_name_or_path /home/ModelLink/llama-2-7b-hf/ \
--output_path llama-2-7b-hf-nemo
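The resulting checkpoint is a plain tar archive that bundles the converted weights with a model_config.yaml, which is why the commands in section 6 can simply untar it. A small sketch for inspecting it without unpacking (the file name matches --output_path above):
# inspect_nemo.py: list the contents of the converted checkpoint archive
import tarfile

with tarfile.open("llama-2-7b-hf-nemo") as tar:
    for member in tar.getmembers():
        print(f"{member.size:>14d}  {member.name}")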
B. Start training
python /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_continue_training.py \
--config-path=/opt/NeMo-Framework-Launcher/launcher_scripts/conf/training/llama \
--config-name=llama2_7b \
+restore_from_path="./llama-2-7b-hf-nemo" \
trainer.devices=8 \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=300000 \
trainer.val_check_interval=300 \
trainer.log_every_n_steps=50 \
trainer.limit_val_batches=50 \
trainer.limit_test_batches=50 \
trainer.accumulate_grad_batches=1 \
model.micro_batch_size=1 \
model.global_batch_size=4 \
model.tensor_model_parallel_size=4 \
model.pipeline_model_parallel_size=2 \
model.max_position_embeddings=512 \
model.encoder_seq_length=512 \
model.data.seq_length=512 \
model.tokenizer.library=sentencepiece \
model.tokenizer.model=/home/ModelLink/llama-2-7b-hf/tokenizer.model \
model.data.data_prefix=[1.0,gpt_training_data_text_document] \
model.data.num_workers=0 \
model.megatron_amp_O2=false \
+model.seq_len_interpolation_factor=1 \
model.data.splits_string=\'980,10,10\' \
exp_manager.resume_if_exists=True \
exp_manager.resume_ignore_no_checkpoint=True \
exp_manager.create_checkpoint_callback=True \
exp_manager.checkpoint_callback_params.monitor=val_loss \
exp_manager.checkpoint_callback_params.save_top_k=3 \
exp_manager.checkpoint_callback_params.mode=min \
exp_manager.checkpoint_callback_params.always_save_nemo=False \
exp_manager.explicit_log_dir="./result" \
exp_manager.wandb_logger_kwargs.name="llama2_7b" \
model.optim.name=fused_adam \
run.results_dir="./result" \
model.optim.lr=6e-4 \
model.optim.betas=[0.9,0.95] \
model.optim.weight_decay=0.1 \
model.optim.sched.name=CosineAnnealing \
model.optim.sched.warmup_steps=750 \
model.optim.sched.constant_steps=80000 \
model.optim.sched.min_lr=6e-5 \
~model.optim.bucket_cap_mb \
~model.optim.overlap_grad_sync \
~model.optim.overlap_param_sync \
~model.optim.contiguous_grad_buffer \
~model.optim.contiguous_param_buffer
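With exp_manager.explicit_log_dir="./result" and save_top_k=3, the best three checkpoints by val_loss accumulate under ./result as training runs. A small sketch for keeping an eye on them (the recursive pattern is an assumption about the default exp_manager layout, which normally puts them in a checkpoints/ subdirectory):
# list_ckpts.py: show checkpoints written under ./result
# NOTE: the recursive glob is an assumption about the exp_manager layout;
# adjust the pattern if your run writes elsewhere.
import glob, os

for path in sorted(glob.glob("result/**/*.ckpt", recursive=True)):
    print(f"{os.path.getsize(path)/1e9:7.1f} GB  {path}")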
C. Output
| Name | Type | Params
-----------------------------------
0 | model | GPTModel | 842 M
-----------------------------------
842 M Trainable params
0 Non-trainable params
842 M Total params
3,370.648 Total estimated model params size (MB)
Epoch 0: : 0%| | 22/300000 [00:32<123:59:27, reduced_train_loss=1.400, global_step=21.00, consumed_samples=88.00, train_step_timing in s=1.470
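Note that the parameter count in this summary is per model-parallel rank, not the whole network: with tensor parallel 4 and pipeline parallel 2, each of the 8 ranks holds roughly 1/8 of Llama-2-7B's ~6.74B parameters, which is where the 842 M figure (and the ~3,370 MB size estimate at 4 bytes per parameter) comes from. A quick check of that arithmetic:
# param_shard_math.py: why the summary reports 842 M params for a 7B model
total_params = 6.74e9          # approximate Llama-2-7B parameter count
tp, pp = 4, 2                  # tensor / pipeline model parallel sizes
per_rank = total_params / (tp * pp)
print(f"params per rank ~= {per_rank/1e6:.0f} M")        # ~842 M
print(f"at 4 bytes/param ~= {per_rank*4/1e6:.0f} MB")    # ~3370 MB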
6. Other commands [not needed for now]
mkdir -p unpacked_nemo_file
tar -xvf llama-2-7b-hf-nemo -C unpacked_nemo_file
* repartition the checkpoint to a different tensor/pipeline parallel layout (here TP=4, PP=2)
python /opt/NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py \
--model_file="./llama-2-7b-hf-nemo" \
--target_file="./output/llama-2-7b-hf-nemo_mp" \
--target_tensor_model_parallel_size 4 \
--target_pipeline_model_parallel_size 2 \
--hparams_file="/opt/NeMo-Framework-Launcher/launcher_scripts/conf/training/llama/llama2_7b.yaml"
mkdir -p unpacked_nemo_file_mp1tp1
tar -xvf ./llama-2-7b-hf-nemo -C unpacked_nemo_file_mp1tp1
python /opt/NeMo/scripts/checkpoint_converters/convert_gpt_nemo_to_mcore.py \
--input_name_or_path ./unpacked_nemo_file_mp1tp1 \
--output_path ./output.nemo --cpu-only
From: https://blog.csdn.net/m0_61864577/article/details/139454941