A single V100 has only 16GB of GPU memory, while the LLaMA-7B model weighs in at roughly 27GB in float32. How, then, can we fine-tune LLaMA-7B on a single 16GB V100? Three techniques make it possible: 1) LoRA fine-tuning; 2) mixed precision training; 3) gradient accumulation. There are already quite a few LoRA fine-tuning scripts for LLaMA on GitHub, but most are built entirely on the Hugging Face libraries; in this post the training loop is implemented directly in PyTorch.
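As a rough back-of-the-envelope check of why these techniques are needed (these are estimates only: activation memory and framework overhead are ignored, and the LoRA parameter count assumes LLaMA-7B's 32 layers and 4096 hidden size with r=8 on q_proj/v_proj):

# Rough memory estimates; activations and framework overhead are ignored.
params = 7e9  # LLaMA-7B parameter count

fp32_weights = params * 4 / 2**30        # ~26 GB: full-precision weights alone exceed 16 GB
fp16_weights = params * 2 / 2**30        # ~13 GB: float16 weights just fit
adamw_full = params * 2 * 4 / 2**30      # ~52 GB: two float32 moments per parameter if everything trains

# LoRA: 32 layers x 2 target matrices (q_proj, v_proj) x (A + B), each r x 4096 with r=8
lora_params = 32 * 2 * 2 * 8 * 4096      # ~4.2M trainable parameters
adamw_lora = lora_params * 2 * 4 / 2**30 # ~0.03 GB: optimizer states become negligible

print(f"fp32 weights: {fp32_weights:.1f} GB, fp16 weights: {fp16_weights:.1f} GB")
print(f"AdamW states, full fine-tune: {adamw_full:.1f} GB; LoRA: {adamw_lora:.3f} GB")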
1. Model and data preparation
Model: https://huggingface.co/decapoda-research/llama-7b-hf
Fine-tuning dataset: https://github.com/LC1332/Chinese-alpaca-lora/blob/main/data/trans_chinese_alpaca_data.json
2. Code implementation
2.1 config.py
In testing, a single V100 can only handle batch_size = 1 with a sequence length of 200; accumulating gradients over accu_steps = 128 gives an effective batch size of 128. config.py stores the configuration parameters.
class Config:
    epochs = 4
    log_every = 10
    eval_every = 500
    checkpoint_every = 500
    train_steps = 1500
    warmup_steps = 100
    batch_size = 1
    accu_steps = 128
    sequence_len = 200
    learning_rate = 3e-4
    weight_decay = 0
    load_in_8bit = False
    lora_r = 8
    lora_alpha = 16
    lora_dropout = 0.05
    lora_target_modules = ["q_proj", "v_proj"]
    val_set_size = 2000
    data_path = "xxx/data/trans_chinese_alpaca_data.json"
    base_model = "xxx/models/llama-7b-hf"
    lora_model = "./lora_model"
2.2 data_helper.py
data_helper.py handles the data processing. Two points deserve attention here: 1) padding on the left side works better for causal generation; 2) labels must be padded with -100 so that the loss computation skips those positions directly. A concrete toy example follows the code below.
import json
import random

import torch
from torch.utils.data import Dataset

from config import Config


def collate_fn(batch):
    r"""Puts each data field into a tensor with outer dimension batch size"""
    batch = list(zip(*batch))
    input_ids = torch.tensor(batch[0], dtype=torch.long)
    attention_mask = torch.tensor(batch[1], dtype=torch.float16)
    labels = torch.tensor(batch[2], dtype=torch.long)
    return input_ids, attention_mask, labels


class DataHelper:
    def __init__(self):
        self.data_path = Config.data_path
        self.val_set_size = Config.val_set_size

    def load_data(self):
        with open(self.data_path, "r") as fr:
            data = json.load(fr)
        return data

    def gen_data(self):
        data = self.load_data()
        random.shuffle(data)
        train_data = data[self.val_set_size:]
        valid_data = data[:self.val_set_size]
        return train_data, valid_data


class LlamaDataset(Dataset):
    def __init__(self, tokenizer, data):
        self.tokenizer = tokenizer
        self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
        self.tokenizer.padding_side = "left"
        self.data = data
        self.sequence_len = Config.sequence_len
        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.unk_token_id  # = 0 for the LLaMA tokenizer
        self.label_pad_token_id = -100  # PyTorch's cross-entropy loss ignores positions labeled -100

    def generate_prompt(self, data_point):
        if data_point["input"]:
            return f"""给定任务的描述和输入的问题,请返回结果。
描述:
{data_point["instruction"]}
输入:
{data_point["input"]}
回答:
{data_point["output"]}"""
        else:
            return f"""给定问题的描述,请返回结果。
描述:
{data_point["instruction"]}
回答:
{data_point["output"]}"""

    def tokenize(self, prompt, add_eos_token=True):
        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.sequence_len,
            padding=False,
            return_tensors=None
        )

        # append the eos token if it was not produced and there is room for it
        if (
            result["input_ids"][-1] != self.eos_token_id
            and len(result["input_ids"]) < self.sequence_len
            and add_eos_token
        ):
            result["input_ids"].append(self.eos_token_id)
            result["attention_mask"].append(1)

        pad_len = self.sequence_len - len(result["input_ids"])
        if pad_len <= 0:
            input_ids = result["input_ids"][:self.sequence_len]
            attention_mask = result["attention_mask"][:self.sequence_len]
            labels = input_ids.copy()
        else:
            # pad on the left; padded label positions get -100 so the loss skips them
            input_ids = [self.pad_token_id] * pad_len + result["input_ids"]
            attention_mask = [0] * pad_len + result["attention_mask"]
            labels = [self.label_pad_token_id] * pad_len + result["input_ids"]

        return input_ids, attention_mask, labels

    def generate_and_tokenize_prompt(self, data_point):
        full_prompt = self.generate_prompt(data_point)
        input_ids, attention_mask, labels = self.tokenize(full_prompt)
        return input_ids, attention_mask, labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data_point = self.data[index]
        input_ids, attention_mask, labels = self.generate_and_tokenize_prompt(data_point)
        return (input_ids, attention_mask, labels)
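To make the two padding points concrete, here is a toy illustration of what tokenize would return for a short input, assuming sequence_len = 10; the token ids are made up for illustration and are not real tokenizer output:

# Suppose the tokenizer produced [1, 338, 4559, 2] (4 tokens, eos already appended)
# with sequence_len = 10, pad_token_id = 0, label_pad_token_id = -100:
input_ids      = [0, 0, 0, 0, 0, 0, 1, 338, 4559, 2]      # left-padded with unk (id 0)
attention_mask = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]            # padding positions are masked out
labels         = [-100, -100, -100, -100, -100, -100, 1, 338, 4559, 2]  # -100 is skipped by the loss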
2.3 model.py
model.py loads the model directly through the transformers API in float16; after training, only the LoRA weights are saved.
import torch
from torch import nn
from transformers import LlamaForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training

from config import Config


class LlamaLoraModel(nn.Module):
    def __init__(self):
        super(LlamaLoraModel, self).__init__()

        model = LlamaForCausalLM.from_pretrained(
            Config.base_model,
            load_in_8bit=Config.load_in_8bit,
            torch_dtype=torch.float16,  # weights are loaded in float16
            device_map="auto"  # "auto" uses every visible GPU by default and shards the model across them
        )

        # if Config.load_in_8bit:
        #     model = prepare_model_for_int8_training(model)

        lora_config = LoraConfig(
            r=Config.lora_r,
            lora_alpha=Config.lora_alpha,
            target_modules=Config.lora_target_modules,
            lora_dropout=Config.lora_dropout,
            bias="none",
            task_type="CAUSAL_LM"
        )
        self.peft_model = get_peft_model(model, lora_config)
        self.peft_model.config.use_cache = False

    def forward(self, input_ids, attention_mask, labels):
        output = self.peft_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            return_dict=True
        )
        loss = output["loss"]
        logits = output["logits"]
        predictions = logits.argmax(dim=-1)
        return loss, predictions

    def print_trainable_parameters(self):
        self.peft_model.print_trainable_parameters()

    def save_lora_model(self, lora_model):
        # saves only the LoRA adapter weights, not the base model
        self.peft_model.save_pretrained(lora_model)
2.4 trainer.py
As mentioned above, when loading the model with transformers and device_map set to "auto", all GPUs are used by default. This code only supports a single GPU, so os.environ["CUDA_VISIBLE_DEVICES"] = "0" is used to restrict visibility to one GPU.
In mixed precision training, the loss is scaled up before computing gradients, and the gradients are scaled back down before the weight update; gradient clipping must therefore happen after the gradients have been unscaled.
Mixed precision training involves two steps. First, scale up the loss: before scaling, the loss is cast to float32 (PyTorch does this automatically, no explicit cast is needed), otherwise it could overflow; self.scaler.scale(loss) performs the scaling, so that the gradients computed in the backward pass do not underflow in float16. Second, scale the gradients back down: they too are handled in float32 before unscaling, otherwise they could underflow.
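The ordering matters: scale → backward → unscale → clip → step → update. Here is a minimal, self-contained sketch of this pattern with a toy linear model standing in for the LoRA model (the real trainer below wraps LlamaLoraModel and the actual DataLoader):

import torch
from torch import nn
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_

# Toy model and random data so the pattern runs on any CUDA machine.
model = nn.Linear(16, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = GradScaler()
accu_steps = 4  # trainer.py uses 128

for step in range(1, 17):
    x = torch.randn(2, 16, device="cuda")
    y = torch.randn(2, 1, device="cuda")

    with autocast():                              # forward pass runs in float16 where safe
        loss = nn.functional.mse_loss(model(x), y)

    loss = loss / accu_steps                      # average the loss over accumulation steps
    scaler.scale(loss).backward()                 # scale the loss so fp16 gradients don't underflow

    if step % accu_steps == 0:
        scaler.unscale_(optimizer)                # restore the true gradient scale first...
        clip_grad_norm_(model.parameters(), 1.0)  # ...so clipping sees the real gradient norm
        scaler.step(optimizer)                    # skips the step automatically on inf/nan gradients
        scaler.update()                           # adapt the scale factor for the next iteration
        optimizer.zero_grad()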
import os
import time

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from transformers import AdamW, get_polynomial_decay_schedule_with_warmup
from transformers import LlamaTokenizer
from peft import get_peft_model_state_dict

from model import LlamaLoraModel
from data_helper import DataHelper, LlamaDataset, collate_fn
from metric import mean, accuracy
from utils import get_logger
from config import Config


def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


class Trainer:
    def __init__(self):
        self.lora_model = Config.lora_model
        self.epochs = Config.epochs
        self.log_every = Config.log_every
        self.eval_every = Config.eval_every
        self.checkpoint_every = Config.checkpoint_every
        self.train_steps = Config.train_steps
        self.warmup_steps = Config.warmup_steps
        self.learning_rate = Config.learning_rate
        self.weight_decay = Config.weight_decay
        self.batch_size = Config.batch_size
        self.accu_steps = Config.accu_steps

        self.tokenizer = LlamaTokenizer.from_pretrained(Config.base_model)

        self.train_data_loader, self.valid_data_loader = self.get_data_loader()
        print("get data loader done")

        # initialize the model
        self.model = LlamaLoraModel()
        self.model.peft_model.print_trainable_parameters()

        # patch state_dict so that saving the model returns only the LoRA weights
        old_state_dict = self.model.state_dict
        self.model.state_dict = (
            lambda self, *_, **__: get_peft_model_state_dict(
                self, old_state_dict()
            )
        ).__get__(self.model, type(self.model))

        self.model = self.model.cuda()
        self.model.train()
        print("model load done")

        self.optimizer = AdamW(self.model.parameters(),
                               lr=self.learning_rate,
                               weight_decay=self.weight_decay)
        self.scheduler = get_polynomial_decay_schedule_with_warmup(optimizer=self.optimizer,
                                                                   num_warmup_steps=self.warmup_steps,
                                                                   num_training_steps=self.train_steps,
                                                                   lr_end=0.0)
        self.scaler = GradScaler()  # for mixed precision training

    def get_data_loader(self):
        # load the dataset
        data_obj = DataHelper()
        train_data, valid_data = data_obj.gen_data()
        logger.info("train data size: {}".format(len(train_data)))
        logger.info("valid data size: {}".format(len(valid_data)))
        train_data_set = LlamaDataset(self.tokenizer, train_data)
        valid_data_set = LlamaDataset(self.tokenizer, valid_data)
        train_data_loader = DataLoader(train_data_set, batch_size=self.batch_size, drop_last=True,
                                       num_workers=6, shuffle=True, collate_fn=collate_fn)
        valid_data_loader = DataLoader(valid_data_set, batch_size=self.batch_size,
                                       num_workers=6, collate_fn=collate_fn)
        return train_data_loader, valid_data_loader

    def train(self):
        current_step = 1
        start = time.time()

        train_losses = []
        train_word_preds = []
        train_word_labels = []
        train_masks = []
        for epoch in range(self.epochs):
            logger.info("----- Epoch {}/{} -----".format(epoch + 1, self.epochs))

            for batch_data in self.train_data_loader:
                input_ids = batch_data[0].cuda()
                attention_mask = batch_data[1].cuda()
                labels = batch_data[2].cuda()

                with autocast():
                    loss, predictions = self.model(input_ids, attention_mask, labels)

                train_losses.append(float(loss))
                train_word_preds.extend(predictions.tolist())
                train_word_labels.extend(labels.tolist())
                train_masks.extend(attention_mask.tolist())

                # gradient accumulation
                loss /= self.accu_steps
                # scale the loss, then compute gradients
                self.scaler.scale(loss).backward()

                if current_step % self.accu_steps == 0:
                    # unscale the gradients first, then apply gradient clipping
                    self.scaler.unscale_(self.optimizer)
                    clip_grad_norm_(self.model.parameters(), 1.0)
                    self.scaler.step(self.optimizer)
                    self.scheduler.step()
                    self.scaler.update()
                    self.optimizer.zero_grad()

                if current_step % (self.log_every * self.accu_steps) == 0:
                    acc = accuracy(pred_ys=train_word_preds, true_ys=train_word_labels, masks=train_masks)
                    logger.info("train: step: {}, loss: {}, acc: {}".format(
                        current_step // self.accu_steps, mean(train_losses), acc))
                    train_losses = []
                    train_word_preds = []
                    train_word_labels = []
                    train_masks = []

                if current_step % (self.eval_every * self.accu_steps) == 0:
                    self.model.eval()
                    with torch.no_grad():
                        eval_losses = []
                        eval_word_preds = []
                        eval_word_labels = []
                        eval_masks = []
                        for batch_data in self.valid_data_loader:
                            input_ids = batch_data[0].cuda()
                            attention_mask = batch_data[1].cuda()
                            labels = batch_data[2].cuda()

                            with autocast():
                                eval_loss, eval_predictions = self.model(input_ids, attention_mask, labels)

                            eval_losses.append(float(eval_loss))
                            eval_word_preds.extend(eval_predictions.tolist())
                            eval_word_labels.extend(labels.tolist())
                            eval_masks.extend(attention_mask.tolist())

                        acc = accuracy(pred_ys=eval_word_preds, true_ys=eval_word_labels, masks=eval_masks)
                        logger.info("\n")
                        logger.info("eval: loss: {}, acc: {}".format(mean(eval_losses), acc))
                        logger.info("\n")

                    self.model.train()

                if current_step % (self.checkpoint_every * self.accu_steps) == 0:
                    lora_model_path = self.lora_model + "/" + str(current_step // self.accu_steps)
                    if not os.path.exists(lora_model_path):
                        os.makedirs(lora_model_path)
                    self.model.save_lora_model(lora_model_path)

                current_step += 1
                if (current_step // self.accu_steps) > self.train_steps:
                    break

            if (current_step // self.accu_steps) > self.train_steps:
                break

        end = time.time()
        print("total train time: ", end - start)


if __name__ == "__main__":
    logger = get_logger("llama_lora", "log.txt")

    trainer = Trainer()
    trainer.train()
2.5 metric.py
metric.py reports the accuracy metric. Remember that generation is next-token prediction: the first n-1 tokens are used to predict the n-th token, so predictions and labels must be shifted by one position before comparison. A toy sanity check follows the code below.
def mean(item: list) -> float:
    """Compute the mean of a list; returns 0 for an empty list."""
    res = sum(item) / len(item) if len(item) > 0 else 0
    return res


def accuracy(pred_ys, true_ys, masks):
    total = 0
    corr = 0

    for pred_y, true_y, mask in zip(pred_ys, true_ys, masks):
        # shift by one position so predictions line up with targets,
        # i.e. the first n-1 tokens predict the n-th token
        pred_y = pred_y[:-1]
        true_y = true_y[1:]
        mask = mask[:-1]

        for p, t, m in zip(pred_y, true_y, mask):
            if m == 1:
                total += 1
                if p == t:
                    corr += 1

    return corr / total if total > 0 else 0
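A quick sanity check of the shift logic on made-up token ids (not real tokenizer output):

from metric import accuracy

# One sequence: the prediction at position i is compared with the label at position i+1.
pred_ys = [[11, 22, 33, 44, 55]]   # argmax prediction at each position (made-up ids)
true_ys = [[10, 11, 22, 33, 99]]   # labels for the same positions
masks   = [[0, 1, 1, 1, 1]]        # first position is (left) padding

# After shifting: preds [11, 22, 33, 44] vs targets [11, 22, 33, 99] with mask [0, 1, 1, 1]
# -> 3 counted positions, 2 correct
print(accuracy(pred_ys, true_ys, masks))  # 0.666...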
2.6 utils.py
import logging


def get_logger(name, log_path):
    """get logger"""
    logger = logging.getLogger(name)
    logger.setLevel(level=logging.INFO)

    # log to file
    handler = logging.FileHandler(log_path, encoding='UTF-8')
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)

    # log to console
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)

    # attach both handlers to the logger
    logger.addHandler(handler)
    logger.addHandler(console)

    return logger
That is all the code needed to fine-tune llama-7b on a single GPU: put all the files in one folder and run python trainer.py.
During training, essentially all 16GB of GPU memory is occupied.
Training log output:
2023-04-18 15:50:35,592 - llama_lora - INFO - train: step: 10, loss: 2.061204940266907, acc: 0.5903934252177503
2023-04-18 15:53:52,040 - llama_lora - INFO - train: step: 20, loss: 2.037644072668627, acc: 0.6015729358968587
2023-04-18 15:57:09,730 - llama_lora - INFO - train: step: 30, loss: 1.835213798424229, acc: 0.6303313571621855
2023-04-18 16:00:26,805 - llama_lora - INFO - train: step: 40, loss: 1.5541459571570158, acc: 0.6843670305522795
2023-04-18 16:03:44,602 - llama_lora - INFO - train: step: 50, loss: 1.389773446181789, acc: 0.7062043614818337
2023-04-18 16:07:01,532 - llama_lora - INFO - train: step: 60, loss: 1.320238757925108, acc: 0.713156766351226
2023-04-18 16:10:19,220 - llama_lora - INFO - train: step: 70, loss: 1.1323074967134743, acc: 0.7215333197594442
2023-04-18 16:13:36,530 - llama_lora - INFO - train: step: 80, loss: 1.090813200455159, acc: 0.7257552417417492
2023-04-18 16:16:53,352 - llama_lora - INFO - train: step: 90, loss: 1.0772152052028106, acc: 0.7281639132807461
2023-04-18 16:20:10,449 - llama_lora - INFO - train: step: 100, loss: 1.0679462264524773, acc: 0.7289817682330341
......
2.7 generate.py
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

from config import Config


class Generator:
    def __init__(self):
        self.tokenizer = LlamaTokenizer.from_pretrained(Config.base_model)
        model = LlamaForCausalLM.from_pretrained(
            Config.base_model,
            load_in_8bit=False,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        # load the saved LoRA adapter on top of the base model
        self.model = PeftModel.from_pretrained(
            model,
            Config.lora_model,
            torch_dtype=torch.float16,
        )

        self.model.config.pad_token_id = self.tokenizer.pad_token_id = 0  # unk
        self.model.config.bos_token_id = 1
        self.model.config.eos_token_id = 2

        # cast to float16 for inference
        self.model.half()
        self.model.eval()

    def generate_prompt(self, instruction, input=None):
        if input:
            return f"""给定任务的描述和输入的问题,请返回结果。
描述:
{instruction}
输入:
{input}
回答:
"""
        else:
            return f"""给定问题的描述,请返回结果。
描述:
{instruction}
回答:
"""

    def evaluate(
        self,
        instruction,
        input=None,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=4,
        max_new_tokens=128,
        **kwargs,
    ):
        prompt = self.generate_prompt(instruction, input)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )
        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = self.tokenizer.decode(s)
        print(output)
        # split on the "回答:" marker to keep only the generated answer
        # (str.strip would remove characters, not the prefix string)
        return output.split("回答:")[-1].strip()


if __name__ == "__main__":
    generator = Generator()

    instruction = "标注下面商品标题的实体词,乐事薯片500克"
    inp = ""
    res = generator.evaluate(instruction, inp)
    print(res)
Run python generate.py to generate with the fine-tuned model.