autotrain学习-环境搭建、模型和数据集下载、训练全过程
autotrain框架只需配置简单的yaml文件,就能训练dreambooth、llm_sft、llm_dpo、llm_orpo、llm_generic、llm_reward、text_classification、text_regression、token_classification、image_object_detection、seq2seq、image_classification等模型。本文以SFT微调为例,演示如何一步步操作。为了快速测试多个不同的模型,本文没有下载模型的权值;因此,transformers的源码也需要稍做调整,使其在缺少权值文件时仍能完成模型加载。
1.参考链接
2.创建容器
# Create and start an interactive container named "autotrain" from the NVIDIA
# PyTorch 23.07 image: all GPUs exposed, 32g of shared memory, host networking,
# privileged mode, and the current host directory mounted at /home (which is
# also the container's working directory).
docker run --gpus all --shm-size=32g -ti -e NVIDIA_VISIBLE_DEVICES=all \
--privileged --net=host -v $PWD:/home \
-w /home --name autotrain \
nvcr.io/nvidia/pytorch:23.07-py3 /bin/bash
# To re-enter the container later: start it again and open a new shell in it.
docker start autotrain
docker exec -ti autotrain bash
# Working directory used by all the following steps.
mkdir -p /home/autotrain
cd /home/autotrain
3.安装autotrain
cd /home/autotrain
# Download Anaconda and install it in batch mode (-b) under ~/anaconda3 (-p),
# then hook conda into the shell and reload the shell configuration.
wget https://repo.anaconda.com/archive/Anaconda3-2023.09-0-Linux-x86_64.sh
bash Anaconda3-2023.09-0-Linux-x86_64.sh -p ~/anaconda3 -b
~/anaconda3/bin/conda init
source ~/.bashrc
# Dedicated Python 3.10 environment for autotrain.
conda create -n autotrain python=3.10
conda activate autotrain
# Point pip at the Tsinghua PyPI mirror before installing packages.
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/
pip install autotrain-advanced
# PyTorch built for CUDA 12.1, plus the nvcc compiler from the CUDA 12.1.0 channel.
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc
# NOTE(review): the reason for pinning charset-normalizer to 3.1.0 is not stated
# here; presumably it works around a dependency conflict -- confirm before changing.
pip install --force-reinstall charset-normalizer==3.1.0
pip3 install deepspeed
4.解决没有真实权值的问题(不下载真实的权值)
- vim /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/transformers/modeling_utils.py
--- modeling_utils.py 2024-06-04 03:41:34.132831250 +0000
+++ 2.py 2024-06-04 05:29:11.517216442 +0000
@@ -503,6 +503,8 @@
"""
Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
"""
+ if checkpoint_file is None:
+ return
if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
# Check format of the archive
with safe_open(checkpoint_file, framework="pt") as f:
@@ -3302,11 +3304,13 @@
f" {pretrained_model_name_or_path}."
)
else:
- raise EnvironmentError(
- f"Error no file named {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
- f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME + '.index'} or {FLAX_WEIGHTS_NAME} found in directory"
- f" {pretrained_model_name_or_path}."
- )
+ #raise EnvironmentError(
+ # f"Error no file named {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
+ # f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME + '.index'} or {FLAX_WEIGHTS_NAME} found in directory"
+ # f" {pretrained_model_name_or_path}."
+ #)
+ archive_file=None
+ is_local = True
elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
archive_file = pretrained_model_name_or_path
is_local = True
@@ -3548,13 +3552,13 @@
)
from_pt = not (from_tf | from_flax)
# load pt weights early so that we know which dtype to init the model under
if from_pt:
if not is_sharded and state_dict is None:
# Time to load the checkpoint
state_dict = load_state_dict(resolved_archive_file)
+ else:
+ state_dict={}
# set dtype to instantiate the model under:
# 1. If torch_dtype is not None, we use that dtype
# 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first
@@ -3595,6 +3599,8 @@
if is_sharded:
loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"]
else:
+ if state_dict is None:
+ state_dict={}
loaded_state_dict_keys = list(state_dict.keys())
- 不访问hf官网 vim /root/anaconda3/envs/autotrain/lib/python3.10/site-packages/autotrain/trainers/dreambooth/utils.py +12
VALID_IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]
try:
+ raise
XL_MODELS = [
5.下载SFT微调数据集
cat > download_dataset.py <<-'EOF'
from huggingface_hub import snapshot_download
from pathlib import Path
import os
import glob
import json
def download_dataset(repo_id):
    """Download a Hugging Face dataset snapshot into ./datasets/<repo_id>.

    The target directory (relative to the current working directory) is
    created if it does not exist. When it already contains at least one
    non-hidden entry the download is skipped, so the script can be re-run
    safely.

    Args:
        repo_id: Dataset repository id, e.g.
            "timdettmers/openassistant-guanaco".

    Returns:
        None.
    """
    dataset_dir = Path.cwd().joinpath("datasets", repo_id)
    dataset_dir.mkdir(parents=True, exist_ok=True)
    # glob("*") does not match dotfiles, so a directory holding only hidden
    # entries (e.g. .gitattributes) is still treated as not downloaded.
    if glob.glob(os.path.join(dataset_dir, "*")):
        return
    snapshot_download(repo_id=repo_id,
                      local_dir=dataset_dir,
                      resume_download=True,
                      repo_type="dataset",
                      token="")
download_dataset("timdettmers/openassistant-guanaco")
EOF
python download_dataset.py
6.下载opt-125m模型(忽略权值文件)
cat > download_model.py <<-'EOF'
from huggingface_hub import snapshot_download
from pathlib import Path
import os
import glob
import json
import tqdm
def download_model(repo_id):
    """Download a Hugging Face model snapshot, without weights, into ./models/<repo_id>.

    Only configuration/tokenizer files are fetched: weight files matching
    *.h5, *.ot, *.msgpack, *.safetensors and *.bin are excluded. The target
    directory (relative to the current working directory) is created if
    needed, and the download is skipped when it already holds any *.json
    file.

    Args:
        repo_id: Model repository id, e.g. "facebook/opt-125m".

    Returns:
        None.
    """
    model_dir = Path.cwd().joinpath("models", repo_id)
    model_dir.mkdir(parents=True, exist_ok=True)
    # An existing *.json file (such as config.json) marks a finished download.
    if glob.glob(os.path.join(model_dir, "*.json")):
        return
    snapshot_download(repo_id=repo_id,
                      # Alternative whitelist approach:
                      # allow_patterns=["*.json", "tokenizer*","README.md"],
                      ignore_patterns=["*.h5", "*.ot", "*.msgpack","*.safetensors","*.bin"],
                      local_dir=model_dir,
                      resume_download=True,
                      token="")
download_model("facebook/opt-125m")
EOF
python download_model.py
7.下载后的目录结构
.
+--- datasets
| +--- timdettmers
| | +--- openassistant-guanaco
| | | +--- .gitattributes
| | | +--- openassistant_best_replies_eval.jsonl
| | | +--- openassistant_best_replies_train.jsonl
| | | +--- README.md
+--- download_dataset.py
+--- download_model.py
+--- models
| +--- facebook
| | +--- opt-125m
| | | +--- .gitattributes
| | | +--- config.json
| | | +--- generation_config.json
| | | +--- LICENSE.md
| | | +--- merges.txt
| | | +--- README.md
| | | +--- special_tokens_map.json
| | | +--- tokenizer_config.json
| | | +--- vocab.json
8.SFT训练
A.生成配置文件(使用之前下载好的模型和数据集)
cat > config.yaml <<-'EOF'
# AutoTrain SFT job using the locally downloaded model and dataset.
task: llm-sft
base_model: ./models/facebook/opt-125m/
project_name: autotrain-opt-125m
log: tensorboard
backend: local

# Dataset location and column layout.
data:
  path: ./datasets/timdettmers/openassistant-guanaco
  train_split: train
  valid_split: null
  chat_template: null
  column_mapping:
    text_column: text

# Training hyper-parameters.
params:
  block_size: 1024
  model_max_length: 2048
  max_prompt_length: 512
  epochs: 3
  batch_size: 2
  lr: 3e-5
  padding: right
  optimizer: adamw_torch
  scheduler: linear
  gradient_accumulation: 4
  mixed_precision: fp16

# Hub credentials; the trained model is not pushed (push_to_hub: false).
hub:
  username: ${HF_USERNAME}
  token: ${HF_TOKEN}
  push_to_hub: false
EOF
B.开始训练
autotrain --config ./config.yaml
C.输出日志
Parameter Offload: Total persistent parameters: 121344 in 122 params
INFO | 2024-06-04 03:57:40 | autotrain.trainers.common:on_train_begin:231 - Starting to train...
10%|███████████████████▌ | 25/255 [00:35<05:14, 1.37s/it]INFO | 2024-06-04 03:58:16 | autotrain.trainers.common:on_log:226 - {'loss': 10.9163, 'grad_norm': 2.5211533373985793, 'learning_rate': 1.153846153846154e-05, 'epoch': 0.29411764705882354}
{'loss': 10.9163, 'grad_norm': 2.5211533373985793, 'learning_rate': 1.153846153846154e-05, 'epoch': 0.29}
20%|███████████████████████████████████████ | 50/255 [01:10<05:06, 1.50s/it]INFO | 2024-06-04 03:58:50 | autotrain.trainers.common:on_log:226 - {'loss': 9.6804, 'grad_norm': 1.5985785899614147, 'learning_rate': 2.8820960698689958e-05, 'epoch': 0.5882352941176471}
{'loss': 9.6804, 'grad_norm': 1.5985785899614147, 'learning_rate': 2.8820960698689958e-05, 'epoch': 0.59}
29%|██████████████████████████████████████████████████████████▌ | 75/255 [01:45<04:10, 1.39s/it]INFO | 2024-06-04 03:59:25 | autotrain.trainers.common:on_log:226 - {'loss': 8.7368, 'grad_norm': 1.570037248448336, 'learning_rate': 2.554585152838428e-05, 'epoch': 0.8823529411764706}
{'loss': 8.7368, 'grad_norm': 1.570037248448336, 'learning_rate': 2.554585152838428e-05, 'epoch': 0.88}
39%|█████████████████████████████████████████████████████████████████████████████▋ | 100/255 [02:19<03:30, 1.36s/it]INFO | 2024-06-04 03:59:59 | autotrain.trainers.common:on_log:226 - {'loss': 8.0304, 'grad_norm': 1.5732878074676733, 'learning_rate': 2.2270742358078603e-05, 'epoch': 1.1764705882352942}
{'loss': 8.0304, 'grad_norm': 1.5732878074676733, 'learning_rate': 2.2270742358078603e-05, 'epoch': 1.18}
49%|█████████████████████████████████████████████████████████████████████████████████████████████████ | 125/255 [02:55<03:08, 1.45s/it]INFO | 2024-06-04 04:00:35 | autotrain.trainers.common:on_log:226 - {'loss': 7.4482, 'grad_norm': 1.2109399657283213, 'learning_rate': 1.899563318777293e-05, 'epoch': 1.4705882352941178}
{'loss': 7.4482, 'grad_norm': 1.2109399657283213, 'learning_rate': 1.899563318777293e-05, 'epoch': 1.47}
59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 150/255 [03:31<02:34, 1.47s/it]INFO | 2024-06-04 04:01:11 | autotrain.trainers.common:on_log:226 - {'loss': 7.0319, 'grad_norm': 1.18086794951649, 'learning_rate': 1.5720524017467248e-05, 'epoch': 1.7647058823529411}
{'loss': 7.0319, 'grad_norm': 1.18086794951649, 'learning_rate': 1.5720524017467248e-05, 'epoch': 1.76}
69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 175/255 [04:06<01:57, 1.46s/it]INFO | 2024-06-04 04:01:46 | autotrain.trainers.common:on_log:226 - {'loss': 6.7086, 'grad_norm': 1.1956156171545111, 'learning_rate': 1.2445414847161574e-05, 'epoch': 2.0588235294117645}
{'loss': 6.7086, 'grad_norm': 1.1956156171545111, 'learning_rate': 1.2445414847161574e-05, 'epoch': 2.06}
78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 200/255 [04:42<01:19, 1.45s/it]INFO | 2024-06-04 04:02:22 | autotrain.trainers.common:on_log:226 - {'loss': 6.5593, 'grad_norm': 0.9094333521055719, 'learning_rate': 9.170305676855895e-06, 'epoch': 2.3529411764705883}
{'loss': 6.5593, 'grad_norm': 0.9094333521055719, 'learning_rate': 9.170305676855895e-06, 'epoch': 2.35}
88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 225/255 [05:16<00:39, 1.33s/it]INFO | 2024-06-04 04:02:57 | autotrain.trainers.common:on_log:226 - {'loss': 6.4095, 'grad_norm': 0.8512523656020207, 'learning_rate': 5.895196506550218e-06, 'epoch': 2.6470588235294117}
{'loss': 6.4095, 'grad_norm': 0.8512523656020207, 'learning_rate': 5.895196506550218e-06, 'epoch': 2.65}
98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 250/255 [05:51<00:06, 1.33s/it]INFO | 2024-06-04 04:03:31 | autotrain.trainers.common:on_log:226 - {'loss': 6.2869, 'grad_norm': 0.9246563410483989, 'learning_rate': 2.6200873362445413e-06, 'epoch': 2.9411764705882355}
{'loss': 6.2869, 'grad_norm': 0.9246563410483989, 'learning_rate': 2.6200873362445413e-06, 'epoch': 2.94}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [05:58<00:00, 1.33s/it]INFO | 2024-06-04 04:03:38 | autotrain.trainers.common:on_log:226 - {'train_runtime': 358.0521, 'train_samples_per_second': 45.488, 'train_steps_per_second': 0.712, 'train_loss': 7.757074094286152, 'epoch': 3.0}
{'train_runtime': 358.0521, 'train_samples_per_second': 45.488, 'train_steps_per_second': 0.712, 'train_loss': 7.757074094286152, 'epoch': 3.0}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 255/255 [05:58<00:00, 1.40s/it]
INFO | 2024-06-04 04:03:38 | autotrain.trainers.clm.utils:post_training_steps:287 - Finished training, saving model...
INFO | 2024-06-04 04:03:43 | autotrain.parser:run:190 - Job ID: 19029
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3090 On | 00000000:01:00.0 Off | N/A |
| 76% 61C P2 186W / 350W| 4454MiB / 24576MiB | 74% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce RTX 3090 On | 00000000:25:00.0 Off | N/A |
| 73% 61C P2 236W / 350W| 3024MiB / 24576MiB | 65% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 2 NVIDIA GeForce RTX 3090 On | 00000000:41:00.0 Off | N/A |
| 59% 60C P2 266W / 350W| 3064MiB / 24576MiB | 78% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 3 NVIDIA GeForce RTX 3090 On | 00000000:61:00.0 Off | N/A |
| 52% 55C P2 162W / 350W| 3024MiB / 24576MiB | 75% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 4 NVIDIA GeForce RTX 3090 On | 00000000:81:00.0 Off | N/A |
| 64% 61C P2 199W / 350W| 3098MiB / 24576MiB | 73% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 5 NVIDIA GeForce RTX 3090 On | 00000000:A1:00.0 Off | N/A |
| 74% 59C P2 231W / 350W| 5184MiB / 24576MiB | 67% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 6 NVIDIA GeForce RTX 3090 On | 00000000:C1:00.0 Off | N/A |
| 57% 58C P2 230W / 350W| 3098MiB / 24576MiB | 76% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 7 NVIDIA GeForce RTX 3090 On | 00000000:E1:00.0 Off | N/A |
| 53% 55C P2 157W / 350W| 3098MiB / 24576MiB | 67% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
(autotrain) root@x1:/home/autotrain# tree autotrain-opt-125m -L 1
autotrain-opt-125m
├── README.md
├── config.json
├── generation_config.json
├── merges.txt
├── model.safetensors
├── runs
├── special_tokens_map.json
├── tokenizer.json
├── tokenizer_config.json
├── training_args.bin
├── training_params.json
└── vocab.json
标签:loss,rate,epoch,+---,全过程,autotrain,norm,搭建
From: https://blog.csdn.net/m0_61864577/article/details/139440966