Megatron-DeepSpeed CUDA Multi-Node Training
- 1. Pull the pytorch:24.03-py3 image from NGC
- 2. Install nvidia-docker and create a container
- 3. Set up the Megatron-DeepSpeed environment
- 4. Install OpenMPI and the SSH service
- 5. Copy the public key to each worker
- 6. Install pdsh
- 7. Pin protobuf to 3.20.1
- 8. Prepare the dataset
- 9. Create the configuration files
- 10. Run the test
This post walks through the steps for multi-node Megatron-DeepSpeed training on NVIDIA GPUs.
1. Pull the pytorch:24.03-py3 image from NGC
docker pull nvcr.io/nvidia/pytorch:24.03-py3
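A quick way to confirm the pull succeeded (an optional check, not part of the original steps):
docker images nvcr.io/nvidia/pytorch
# the 24.03-py3 tag should appear in the list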
2. Install nvidia-docker and create a container
cd /mnt
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | tee /etc/apt/sources.list.d/nvidia-docker.list
apt-get update
apt-get install -y nvidia-docker2
nvidia-docker run -ti -e NVIDIA_VISIBLE_DEVICES=all --privileged \
--net=host -v $PWD:/home \
-w /home --rm nvcr.io/nvidia/pytorch:24.03-py3 /bin/bash
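Once inside the container, it is worth confirming that every GPU is visible before installing anything; a minimal check:
nvidia-smi
python3 -c "import torch; print(torch.cuda.device_count())"   # should match the GPU count shown by nvidia-smi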
3. Set up the Megatron-DeepSpeed environment
pip install transformers
pip install deepspeed
git clone https://github.com/microsoft/Megatron-DeepSpeed
cd Megatron-DeepSpeed
git checkout 3c5f47563f697702c1e305fa01b7563f54b747fc
python3 setup.py install
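DeepSpeed ships a ds_report utility that summarizes the detected torch/CUDA versions and op compatibility; running it here surfaces environment problems before the multi-node setup begins:
ds_report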
4. Install OpenMPI and the SSH service
apt update
apt install -y openssh-server
apt install -y openmpi-bin openmpi-doc libopenmpi-dev
rm -rf ~/.ssh/*
ssh-keygen
sed -i 's/^#\?PermitRootLogin.*/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/^#\?Port .*/Port 2223/' /etc/ssh/sshd_config
export passwd=Hello123 && printf "${passwd}\n${passwd}\n" | passwd root
cat >/usr/bin/run.sh <<EOF
#!/bin/bash
mkdir -p /run/sshd
source ~/.bashrc
/usr/sbin/sshd -D
EOF
chmod 777 /usr/bin/run.sh
nohup /usr/bin/run.sh &
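A sanity check that sshd actually came up on the non-default port (install iproute2 first if ss is missing from the image):
ss -tlnp | grep 2223   # sshd should be listening on 2223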
tee ~/.ssh/config <<-'EOF'
Host worker_1
User root
Hostname 192.168.1.100
Port 2223
IdentityFile ~/.ssh/id_rsa
Host worker_2
User root
Hostname 192.168.1.101
Port 2223
IdentityFile ~/.ssh/id_rsa
EOF
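ssh rejects client config and key files with loose permissions ("Bad owner or permissions" errors), so it is safest to tighten them right after writing the config:
chmod 600 ~/.ssh/config ~/.ssh/id_rsa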
5. Copy the public key to each worker
ssh-copy-id worker_1
ssh-copy-id worker_2
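Passwordless login should now work, which the pdsh-based launcher depends on; verify from each node:
ssh worker_1 hostname   # must return without a password prompt
ssh worker_2 hostname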
6. Install pdsh
wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/pdsh/pdsh-2.29.tar.bz2
tar -xf pdsh-2.29.tar.bz2
cd pdsh-2.29
./configure --with-ssh
make -j
make install
cp /usr/local/bin/pdsh /usr/bin/
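DeepSpeed's default multi-node launcher drives pdsh over ssh, so an end-to-end check here saves debugging later (the worker aliases resolve through the ~/.ssh/config written above):
export PDSH_RCMD_TYPE=ssh
pdsh -w worker_1,worker_2 date   # both nodes should answer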
7. Pin protobuf to 3.20.1
pip install --upgrade protobuf==3.20.1
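Despite the --upgrade flag, this pins protobuf down to 3.20.1, which avoids the runtime errors protobuf 4.x raises against older generated code; confirm the active version afterwards:
python3 -c "import google.protobuf; print(google.protobuf.__version__)"   # expect 3.20.1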
8. Prepare the dataset
cd /home/Megatron-DeepSpeed
wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
xz -d oscar-1GB.jsonl.xz
python3 tools/preprocess_data.py \
--input oscar-1GB.jsonl \
--output-prefix my-gpt2 \
--vocab-file gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod \
--workers 8
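preprocess_data.py emits an indexed binary dataset under the prefix my-gpt2_text_document, which is exactly what --data-path refers to in the training command below; check that both files exist:
ls -lh my-gpt2_text_document.bin my-gpt2_text_document.idx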
9. Create the configuration files
cd /home/Megatron-DeepSpeed
tee hostfile <<-'EOF'
worker_1 slots=1
worker_2 slots=1
EOF
tee ds_config.json <<-'EOF'
{
  "train_micro_batch_size_per_gpu": 1,
  "train_batch_size": 16,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": 1
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOF
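A note on how these numbers reconcile (my arithmetic, assuming the 2-node × 1-GPU topology from the hostfile): DeepSpeed enforces train_batch_size = train_micro_batch_size_per_gpu × gradient_accumulation_steps × data-parallel size. With the tensor-parallel size of 2 and pipeline-parallel size of 1 set below:
data_parallel_size = world_size / (TP × PP) = 2 / (2 × 1) = 1
gradient_accumulation_steps = 16 / (1 × 1) = 16   # inferred by DeepSpeed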
10. Run the test
export MAX_JOBS=8
export NCCL_DEBUG=info
export NCCL_SOCKET_IFNAME=enp5s0
export NCCL_IB_DISABLE=1
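# NCCL_SOCKET_IFNAME must name the actual NIC on your hosts (check with: ip addr); enp5s0 is specific to the author's machines
# NCCL_IB_DISABLE=1 forces NCCL onto TCP sockets; remove it if the nodes have working InfiniBand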
deepspeed --hostfile ./hostfile pretrain_gpt.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 1 \
--distributed-backend nccl \
--num-layers 2 \
--hidden-size 8 \
--num-attention-heads 2 \
--seq-length 512 \
--max-position-embeddings 512 \
--micro-batch-size 1 \
--rampup-batch-size 2 2 1_000 \
--global-batch-size 16 \
--train-samples 10_000 \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 1e-4 \
--lr-warmup-samples 5 \
--min-lr 1e-6 \
--lr-decay-style cosine \
--lr-decay-samples 12 \
--clip-grad 1.0 \
--weight-decay 1e-1 \
--fp16 \
--partition-activations \
--seed 42 \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--exit-interval 100 --log-interval 10 \
--save-interval 50 --eval-interval 100 \
--eval-iters 10 --checkpoint-activations \
--save checkpoints/gpt2_4 \
--data-path my-gpt2_text_document \
--tensorboard-dir output_dir/tensorboard \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--deepspeed \
--deepspeed_config ./ds_config.json \
--zero-stage 1 --deepspeed-activation-checkpointing
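On a successful launch, NCCL_DEBUG=info prints transport setup from every rank, a loss line appears every 10 iterations, and training stops at iteration 100 per --exit-interval. Checkpoints land in checkpoints/gpt2_4, and the logged metrics can be browsed with TensorBoard (bundled in the NGC image):
tensorboard --logdir output_dir/tensorboard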
From: https://blog.csdn.net/m0_61864577/article/details/137381731