1. T5 fine-tuning
This notebook shows how to fine-tune the T5 model with Hugging Face Transformers on different NLP tasks, using the text-to-text approach proposed in the T5 paper. For the demo I picked 3 problems that are not naturally text-to-text, just to reiterate how widely applicable this text-to-text framework is and how it can be used for different tasks without changing the model at all.
This is just a draft. If you spot any problems in this notebook or have any questions, please reach out to me on Twitter.
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
INFO:transformers.file_utils:PyTorch version 1.5.0+cu101 available.
INFO:transformers.file_utils:TensorFlow version 2.2.0-rc4 available.
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)
2. Model
We'll use the excellent pytorch-lightning library for training. Most of the code below is adapted from: https://github.com/huggingface/transformers/blob/master/examples/lightning_base.py
The trainer is generic and can be used for any text-to-text task. You only need to change the dataset; the rest of the code stays the same for all tasks.
This is the most interesting and powerful aspect of the text-to-text format: you can fine-tune the model on a wide range of NLP tasks simply by formulating the problem in a text-to-text setup. No changes to hyperparameters, learning rate, optimizer, or loss function are needed. Just plug in your dataset and you're good to go!
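To make the idea concrete, here is a minimal, purely illustrative sketch of how different tasks look once phrased as text-to-text pairs (the task prefixes follow the convention from the T5 paper; nothing in the trainer below depends on them, and the example strings are made up):

# Illustrative only: every task becomes an (input_text, target_text) pair.
examples = [
    # sentiment classification, as used in this notebook (no task prefix)
    ("the acting was terrible and the plot made no sense", "negative"),
    # machine translation, as framed in the T5 paper
    ("translate English to German: That is good.", "Das ist gut."),
    # summarization, as framed in the T5 paper
    ("summarize: heavy rain flooded several streets in the city on tuesday, forcing schools to close early.",
     "rain floods city streets, schools close early"),
]
for source, target in examples:
    print(source, "->", target)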
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def is_logger(self):
        return self.trainer.proc_rank <= 0

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None,
                decoder_attention_mask=None, lm_labels=None):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            lm_labels=lm_labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        if self.trainer.use_tpu:
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
        return tqdm_dict

    def train_dataloader(self):
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size,
                                drop_last=True, shuffle=True, num_workers=4)
        t_total = (
            (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
            // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log and save results to file
            output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(key, str(metrics[key])))
Let's define the hyperparameters and other arguments. You can override this dict for a specific task as needed. In most cases you only need to change data_dir and output_dir.
Here the batch size is 8 and the gradient accumulation steps are 16, so the effective batch size is 128.
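As a quick sanity check on that arithmetic (the numbers are the ones set in the config below):

# effective batch size = per-device batch size * gradient accumulation steps * number of GPUs
train_batch_size = 8
gradient_accumulation_steps = 16
n_gpu = 1
print(train_batch_size * gradient_accumulation_steps * n_gpu)  # 128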
args_dict = dict(
    data_dir="",  # path for data files
    output_dir="",  # path to save the checkpoints
    model_name_or_path='t5-base',
    tokenizer_name_or_path='t5-base',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=2,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False,  # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1',  # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0,  # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)
3. IMDB review classification
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvf aclImdb_v1.tar.gz

train_pos_files = glob.glob('aclImdb/train/pos/*.txt')
train_neg_files = glob.glob('aclImdb/train/neg/*.txt')
len(train_pos_files), len(train_neg_files)
We'll take 2000 samples from the training set for validation: 1000 positive and 1000 negative reviews, moved into a val directory.
!mkdir aclImdb/val aclImdb/val/pos aclImdb/val/neg

random.shuffle(train_pos_files)
random.shuffle(train_neg_files)

val_pos_files = train_pos_files[:1000]
val_neg_files = train_neg_files[:1000]

import shutil
for f in val_pos_files:
    shutil.move(f, 'aclImdb/val/pos')
for f in val_neg_files:
    shutil.move(f, 'aclImdb/val/neg')
4. Preparing the dataset
tokenizer = T5Tokenizer.from_pretrained('t5-base')

ids_neg = tokenizer.encode('negative </s>')
ids_pos = tokenizer.encode('positive </s>')
len(ids_neg), len(ids_pos)
All the examples are converted to the text-to-text format as shown in the paper, although I haven't used any task prefix here. The examples are encoded as follows: if the review is positive the target is 'positive', otherwise it is 'negative'.
Input: I went to see this movie with my husband, and we both thought the acting was terrible!
Target: negative
Input: Despite what others say, I found this movie to be funny.
Target: positive
The dataset below takes care of reading the review files and processing the examples into the text-to-text format. It cleans up the review text by removing HTML tags, and it appends the eos token </s> to the end of both the input and the target, as required by the T5 model.
For T5 the maximum input length is 512, and we can choose the maximum length of the target sequence based on the dataset. The T5Tokenizer encodes both 'positive' and 'negative' as a single id, so I chose a maximum target length of 2: one id for the label and one for the </s> token.
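If you want to pick these limits for your own data rather than reuse the defaults, a small sketch like the one below can help; it assumes you have already loaded your raw texts into a Python list called reviews (a hypothetical variable, not defined in this notebook):

# Rough, illustrative check of how long the tokenized inputs and targets are.
lengths = [len(tokenizer.encode(r)) for r in reviews]
print(max(lengths), sum(lengths) / len(lengths))  # longest and average input token count

# The targets here are tiny: the label id plus the </s> token.
print(len(tokenizer.encode('positive </s>')), len(tokenizer.encode('negative </s>')))  # both 2, matching the cell above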
class ImdbDataset(Dataset):
    def __init__(self, tokenizer, data_dir, type_path, max_len=512):
        self.pos_file_path = os.path.join(data_dir, type_path, 'pos')
        self.neg_file_path = os.path.join(data_dir, type_path, 'neg')

        self.pos_files = glob.glob("%s/*.txt" % self.pos_file_path)
        self.neg_files = glob.glob("%s/*.txt" % self.neg_file_path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        return {"source_ids": source_ids, "source_mask": src_mask,
                "target_ids": target_ids, "target_mask": target_mask}

    def _build(self):
        self._build_examples_from_files(self.pos_files, 'positive')
        self._build_examples_from_files(self.neg_files, 'negative')

    def _build_examples_from_files(self, files, sentiment):
        REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
        REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

        for path in files:
            with open(path, 'r') as f:
                text = f.read()

            line = text.strip()
            line = REPLACE_NO_SPACE.sub("", line)
            line = REPLACE_WITH_SPACE.sub("", line)
            line = line + ' </s>'

            target = sentiment + " </s>"

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [line], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=2, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)
dataset = ImdbDataset(tokenizer, 'aclImdb', 'val', max_len=512)
len(dataset)

data = dataset[28]
print(tokenizer.decode(data['source_ids']))
print(tokenizer.decode(data['target_ids']))
!mkdir -p t5_imdb_sentiment

args_dict.update({'data_dir': 'aclImdb', 'output_dir': 't5_imdb_sentiment', 'num_train_epochs': 2})
args = argparse.Namespace(**args_dict)

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    early_stop_callback=False,
    precision=16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)
Define the get_dataset function to return the dataset; the model calls it to obtain the training and validation datasets. We wrap dataset creation in a function so the model code never needs to change: for a different problem, simply redefine the function to return a different dataset. This is not the cleanest solution, but it works for now.
def get_dataset(tokenizer, type_path, args):
    return ImdbDataset(tokenizer=tokenizer, data_dir=args.data_dir,
                       type_path=type_path, max_len=args.max_seq_length)
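Switching to another task would only mean redefining this function. A purely hypothetical sketch (EmotionDataset is a made-up name standing in for whatever Dataset class you write for your own data), kept commented out since that class does not exist in this notebook:

# Hypothetical: swapping tasks means swapping the Dataset returned here, nothing else.
# def get_dataset(tokenizer, type_path, args):
#     return EmotionDataset(tokenizer=tokenizer, data_dir=args.data_dir,
#                           type_path=type_path, max_len=args.max_seq_length)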
5. Initializing and training the model
model = T5FineTuner(args)
trainer = pl.Trainer(**train_params)
trainer.fit(model)
!mkdir t5_base_imdb_sentiment
model.model.save_pretrained('t5_base_imdb_sentiment')
# !cp -r t5_base_imdb_sentiment drive/My\ Drive/
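The saved directory can later be loaded back with from_pretrained, for example in a separate inference script. A minimal sketch, using the path from the cell above:

# Reload the fine-tuned weights from the saved directory.
reloaded_model = T5ForConditionalGeneration.from_pretrained('t5_base_imdb_sentiment')
reloaded_model.eval()
# The tokenizer was not changed by fine-tuning, so T5Tokenizer.from_pretrained('t5-base') still applies.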
6. Evaluation
For inference we'll use the generate method with greedy decoding and a maximum length of 2. The loop below runs predictions over the whole dataset and collects the decoded outputs and targets; the next section visualizes a few individual predictions from the test set.
from tqdm.auto import tqdm  # needed here for the progress bar; also imported in the next cell

loader = DataLoader(dataset, batch_size=32, num_workers=4)
model.model.eval()
outputs = []
targets = []
for batch in tqdm(loader):
    outs = model.model.generate(input_ids=batch['source_ids'].cuda(),
                                attention_mask=batch['source_mask'].cuda(),
                                max_length=2)

    dec = [tokenizer.decode(ids) for ids in outs]
    target = [tokenizer.decode(ids) for ids in batch["target_ids"]]

    outputs.extend(dec)
    targets.extend(target)
7. Predictions
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

dataset = ImdbDataset(tokenizer, 'aclImdb', 'test', max_len=512)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
it = iter(loader)

batch = next(it)
batch["source_ids"].shape

outs = model.model.generate(input_ids=batch['source_ids'].cuda(),
                            attention_mask=batch['source_mask'].cuda(),
                            max_length=2)

dec = [tokenizer.decode(ids) for ids in outs]
texts = [tokenizer.decode(ids) for ids in batch['source_ids']]
# use a separate name so the full-run `targets` list from the evaluation loop above is not overwritten
batch_targets = [tokenizer.decode(ids) for ids in batch['target_ids']]

for i in range(32):
    lines = textwrap.wrap("Review:\n%s\n" % texts[i], width=100)
    print("\n".join(lines))
    print("\nActual sentiment: %s" % batch_targets[i])
    print("Predicted sentiment: %s" % dec[i])
    print("=====================================================================\n")
for i, out in enumerate(outputs):
    if out not in ['positive', 'negative']:
        print(i, 'detected invalid prediction')
metrics.accuracy_score(targets, outputs)

print(metrics.classification_report(targets, outputs))
From: https://www.cnblogs.com/zhangxianrong/p/18227553