首页 > 编程语言 >【BERT-多标签文本分类实战】之七——训练-评估-测试与运行主程序

【BERT-多标签文本分类实战】之七——训练-评估-测试与运行主程序

时间:2023-01-16 21:38:20浏览次数:81  
标签:之七 BERT 主程序 loss dev train test model config



[1] 损失函数与评价指标

  多标签文本分类任务使用的损失函数是 `BCEWithLogitsLoss`,而不是交叉熵损失函数 `cross_entropy`!

`BCEWithLogitsLoss` 与 `cross_entropy` 有什么区别?

1)`cross_entropy` 用于计算单标签分类的损失:从它的公式可以看出,每个文本只对应一个正确类别,损失只与该类别的概率有关;

2)`BCEWithLogitsLoss` 对模型输出取的是 `sigmoid`,而 `cross_entropy` 对模型输出取的是 `softmax`。`sigmoid` 和 `softmax` 虽然都把一组数据映射到 (0,1) 区间,但 `softmax` 具有排斥性:映射后的一组数据之和为 1,所以一组标签概率中通常只会有一个较大值;而 `sigmoid` 对每个输出独立地做单调变换,各标签的概率互不影响,原来大的数变换后仍然大,可以同时存在多个较大的概率,所以 `sigmoid` 更适合多标签文本分类任务。因此要使用 `BCEWithLogitsLoss`。

  本次实战项目中使用的评价指标有:准确率 `accuracy`、精确率 `precision`、汉明损失 `hamming_loss`,它们都是基于 `sklearn` 库实现的。

# Compute multi-label accuracy, sample-averaged precision and Hamming loss.
def APH(y_true, y_pred):
    """Return ``(accuracy, precision, hamming_loss)`` for multi-label predictions.

    Args:
        y_true: Ground-truth label indicator matrix
            (presumably shape ``(n_samples, n_classes)`` with 0/1 entries).
        y_pred: Predicted label indicator matrix of the same shape.

    Returns:
        A 3-tuple of:
        * subset accuracy from ``metrics.accuracy_score``,
        * precision averaged per sample (``average='samples'``),
        * Hamming loss from ``metrics.hamming_loss``.
    """
    return (
        metrics.accuracy_score(y_true, y_pred),
        # A sample may have no predicted label at all; zero_division=0 keeps
        # the score at 0 for such samples without emitting a warning each time.
        metrics.precision_score(y_true, y_pred, average='samples', zero_division=0),
        metrics.hamming_loss(y_true, y_pred),
    )

还有其他评价指标,如召回率、F1 等;评价指标还可分为 micro 和 macro 等平均方式,种类较多,可以参考:https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics 。

[2] 采样

  采样是指:把模型输出的概率转化成多热(multi-hot)数组。通常使用阈值为 0.5 的阈值函数,即概率大于 0.5 的标签采样为 1,否则为 0。本项目设置阈值为 0.4,且每个样本最多只取概率最高的 2 个标签。

# Turn raw multi-label logits into a 0/1 indicator matrix.
def Predict(outputs, alpha=0.4, k=2):
    """Convert model logits into a multi-hot prediction matrix.

    A label is set to 1 only if it is among the ``k`` most probable labels of
    its row AND its sigmoid probability is strictly greater than ``alpha``.

    Args:
        outputs: 2-D tensor of raw logits, one row per sample.
        alpha: Probability threshold (exclusive).
        k: Maximum number of labels kept per sample. Clamped to the number of
            columns so that inputs with fewer than ``k`` classes do not fail.

    Returns:
        A CPU tensor with the same shape and dtype as ``sigmoid(outputs)``
        containing 0/1 entries.
    """
    probs = torch.sigmoid(outputs)
    k = min(k, probs.size(1))
    top_vals, top_idx = torch.topk(probs, k=k, dim=1, largest=True)
    one_hot = torch.zeros_like(probs)
    # Write 1 at every top-k position whose probability clears the threshold;
    # this replaces the element-by-element Python loop with one tensor op.
    one_hot.scatter_(1, top_idx, (top_vals > alpha).to(probs.dtype))
    return one_hot.cpu()

[3] 训练

  训练代码如下:

def train(config, model, train_iter, dev_iter, test_iter, is_write):
    """Fine-tune ``model`` with early stopping on dev loss, then run the test set.

    Every 100 batches the current batch's training metrics and the full
    dev-set metrics are printed (and optionally logged to TensorBoard). The
    model weights are saved to ``config.save_path`` whenever the dev loss
    improves. Training stops early once more than
    ``config.require_improvement`` batches pass without improvement.

    Args:
        config: Project configuration object. Reads ``learning_rate``,
            ``num_epochs``, ``log_path``, ``batch_size``, ``pad_size``,
            ``save_path`` and ``require_improvement``.
        model: The ``torch.nn.Module`` to train.
        train_iter: Iterable of ``(inputs, labels)`` training batches; must
            support ``len()``.
        dev_iter: Validation batches, passed to ``evaluate``.
        test_iter: Test batches, passed to ``test`` after training.
        is_write: If true, log scalars with ``SummaryWriter``.

    Returns:
        Whatever ``test(config, model, test_iter)`` returns.
    """
    start_time = time.time()
    model.train()

    # For non-BERT models a plain optimiser is enough:
    #   torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # BERT-style parameter groups: bias and LayerNorm parameters get no weight
    # decay, everything else uses 0.01.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=1e-8)

    # Linear decay to zero over the total number of optimiser steps (no
    # warm-up). num_training_steps counts batches, so the scheduler is stepped
    # once per batch below.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index at which the dev loss last improved
    flag = False  # set when early stopping triggers
    writer = None
    if is_write:
        writer = SummaryWriter(
            log_dir="{0}/{1}__{2}__{3}__{4}".format(config.log_path, config.batch_size, config.pad_size,
                                                    config.learning_rate,
                                                    time.strftime('%m-%d_%H.%M', time.localtime())))
    try:
        for epoch in range(config.num_epochs):
            print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))

            for i, (trains, labels) in enumerate(train_iter):
                outputs = model(trains)
                model.zero_grad()
                loss = Loss(outputs, labels)
                loss.backward()
                optimizer.step()
                scheduler.step()  # advance the per-batch linear learning-rate schedule
                if total_batch % 100 == 0:
                    # Periodically report metrics on the current training batch
                    # and on the whole dev set.
                    true = labels
                    predic = Predict(outputs)
                    # Detached CPU tensors, the same form evaluate() passes to OneError.
                    train_oe = OneError(outputs.data.cpu(), true.data.cpu())
                    train_acc, train_pre, train_hl = APH(true.data.cpu().numpy(), predic.data.cpu().numpy())

                    dev_acc, dev_pre, dev_hl, dev_oe, dev_loss = evaluate(config, model, dev_iter)
                    if dev_loss < dev_best_loss:
                        dev_best_loss = dev_loss
                        torch.save(model.state_dict(), config.save_path)
                        improve = '*'
                        last_improve = total_batch
                    else:
                        improve = ''
                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train=== Loss: {1:>6.2}, Acc: {2:>6.2%}, Pre: {3:>6.2%}, HL: {4:>5.2} OE: {' \
                          '5:>6.2%}, Val=== Loss: {6:>5.2}, Acc: {7:>6.2%}, Pre: {8:>6.2%}, HL: {9:>5.2}, ' \
                          'OE: {10:>6.2%}, Time: {11} {12} '
                    print(msg.format(total_batch, loss.item(), train_acc, train_pre, train_hl, train_oe,
                                     dev_loss, dev_acc, dev_pre, dev_hl, dev_oe, time_dif, improve))
                    if writer is not None:
                        writer.add_scalar('loss/train', loss.item(), total_batch)
                        writer.add_scalar("acc/train", train_acc, total_batch)
                        writer.add_scalar("pre/train", train_pre, total_batch)
                        writer.add_scalar("oe/train", train_oe, total_batch)
                        writer.add_scalar("hamming loss/train", train_hl, total_batch)
                        writer.add_scalar("loss/dev", dev_loss, total_batch)
                        writer.add_scalar("acc/dev", dev_acc, total_batch)
                        writer.add_scalar("pre/dev", dev_pre, total_batch)
                        writer.add_scalar("oe/dev", dev_oe, total_batch)
                        writer.add_scalar("hamming loss/dev", dev_hl, total_batch)
                    # evaluate() switches the model to eval mode; switch back.
                    model.train()
                total_batch += 1
                if total_batch - last_improve > config.require_improvement:
                    # Dev loss has not improved for more than
                    # config.require_improvement batches: stop training.
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            if flag:
                break
    finally:
        # Close the TensorBoard writer even if training raises.
        if writer is not None:
            writer.close()
    return test(config, model, test_iter)

  需要解释的几点:

  1、`bert` 模型采用 `AdamW` 做优化,并对参数分组设置权重衰减:`bias` 和 `LayerNorm` 的参数不做权重衰减(0.0),其余参数的权重衰减为 0.01;

  2、​​writer​​这个变量主要是做数据可视化的,参考博客:​​【深度学习】pytorch使用tensorboard可视化实验数据​​。

[4] 评估与测试

def test(config, model, test_iter):
    """Load the best checkpoint and report metrics on the test set.

    Args:
        config: Project configuration; reads ``save_path`` and ``device``.
        model: Model instance whose weights are replaced by the checkpoint.
        test_iter: Iterable of test batches, passed to ``evaluate``.

    Returns:
        ``(test_loss, test_acc, test_pre, test_hl, test_oe)``: the loss,
        accuracy, precision, Hamming loss and one-error that ``evaluate``
        returns, reordered so that the loss comes first.
    """
    # map_location lets a checkpoint saved on another device be loaded onto
    # the device this run is configured for.
    model.load_state_dict(torch.load(config.save_path, map_location=config.device))
    model.eval()
    start_time = time.time()
    # evaluate(test=True) returns (acc, pre, hl, oe, loss, report); the names
    # below follow that order.
    test_acc, test_pre, test_hl, test_oe, test_loss, test_report = evaluate(config, model, test_iter,
                                                                            test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}, Test Pre: {2:>6.2%}, Test HL: {3:>5.2}, Test OE: {4:>6.2%}'
    print(msg.format(test_loss, test_acc, test_pre, test_hl, test_oe))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
    return test_loss, test_acc, test_pre, test_hl, test_oe


def evaluate(config, model, data_iter, test=False):
    """Compute loss and multi-label metrics of ``model`` over ``data_iter``.

    Args:
        config: Project configuration; reads ``num_classes`` and, when
            ``test`` is true, ``class_list``.
        model: Model to evaluate; it is put into eval mode and left there.
        data_iter: Iterable of ``(inputs, labels)`` batches supporting ``len()``.
        test: If true, also build an sklearn classification report.

    Returns:
        ``(acc, pre, hl, oe, mean_loss)``, with the report appended as a sixth
        element when ``test`` is true. ``mean_loss`` is a Python float: the
        sum of per-batch losses divided by ``len(data_iter)``.

    Raises:
        ValueError: If ``data_iter`` yields no batches.
    """
    model.eval()
    loss_total = 0.0
    output_batches = []
    label_batches = []
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            # .item() keeps the running sum a float instead of a device tensor.
            loss_total += Loss(outputs, labels).item()
            output_batches.append(outputs.data.cpu())
            label_batches.append(labels.data.cpu())

    if not output_batches:
        raise ValueError("evaluate() received an empty data iterator")

    # Concatenate once at the end instead of np.append per batch, which
    # re-copies the whole accumulated array on every iteration.
    outputs_all = torch.cat(output_batches, dim=0)
    labels_tensor = torch.cat(label_batches, dim=0)

    # One-error over the whole split; previously only the last batch's value
    # was returned. NOTE(review): assumes OneError returns a per-sample rate,
    # as its use with a percent format in train() suggests - confirm.
    oe = OneError(outputs_all, labels_tensor)

    labels_all = labels_tensor.numpy().reshape(-1, config.num_classes)
    predict_all = Predict(outputs_all).numpy().reshape(-1, config.num_classes)
    acc, pre, hl = APH(labels_all, predict_all)
    mean_loss = loss_total / len(data_iter)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=3)
        return acc, pre, hl, oe, mean_loss, report
    return acc, pre, hl, oe, mean_loss

[5] 运行主程序run.py

if __name__ == '__main__':
    # Run configuration:
    #   dataSet    : name of the dataset (required).
    #   model_name : module under ``models`` to train; every entry of M is
    #                trained in turn.
    #   is_write   : whether to log to TensorBoard; one of I = [False, True].
    M = ['bert', 'bert_RNN', 'bert_RCNN', 'bert_DPCNN']
    I = [False, True]

    dataSet = 'Reuters-21578'
    is_write = I[0]

    for model_name in M:
        x = import_module('models.' + model_name)
        config = x.Config(dataSet)
        # Re-seed before every model so each run starts from the same
        # random state.
        np.random.seed(1)  # NumPy RNG
        torch.manual_seed(1)  # CPU RNG
        torch.cuda.manual_seed_all(1)  # RNGs of all GPUs
        torch.backends.cudnn.deterministic = True  # deterministic cuDNN kernels

        start_time = time.time()
        print("Loading data...")
        train_data, dev_data, test_data = build_dataset(config)
        train_iter = build_iterator(train_data, config)
        dev_iter = build_iterator(dev_data, config)
        test_iter = build_iterator(test_data, config)
        time_dif = get_time_dif(start_time)
        print("Time usage:", time_dif)

        # Build the model on the configured device, report its size, then
        # train it (train() also runs the final test).
        model = x.Model(config).to(config.device)
        print(model.parameters)
        print(f'The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters')
        train(config, model, train_iter, dev_iter, test_iter, is_write)

  代码还是比较好懂的,但是还是有一个整体能运行起来的项目体验更佳。


标签:之七,BERT,主程序,loss,dev,train,test,model,config
From: https://blog.51cto.com/u_15942590/6010812

相关文章