BERT Sentiment Analysis Demo
Dataset
The demo fine-tunes on the IMDB movie-review dataset: 25,000 training and 25,000 test reviews, each stored as a plain-text file under a pos or neg folder.
Code, with commentary
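For reference, the snippets below assume this import set (the standard PyTorch, Hugging Face, and scikit-learn stack; adjust to your environment):

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support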
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Use the GPU if CUDA is available; otherwise fall back to the CPU.
class CustomClassifier(nn.Module):
    def __init__(self, bert):
        super(CustomClassifier, self).__init__()
        self.bert = bert
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.dropout = nn.Dropout(0.1)  # dropout to reduce overfitting
        self.relu = nn.ReLU()           # activation; helps mitigate vanishing gradients
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask, labels=None):
        # pooled [CLS] hidden state, which the classification head continues to propagate forward
        _, cls_hidden_state = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)
        x = self.fc1(cls_hidden_state)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        if labels is not None:
            loss_fn = nn.NLLLoss()  # NLLLoss is the standard pairing for LogSoftmax
            loss = loss_fn(x, labels)
            return loss, x
        return x

    def save(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))
        self.bert.config.to_json_file(os.path.join(save_directory, "config.json"))

    @classmethod  # lets us construct an instance through cls
    def from_pretrained(cls, save_directory):
        # rebuild the backbone from config.json, then restore the full classifier weights
        bert_model = AutoModel.from_pretrained(save_directory)
        model = cls(bert_model)
        model.load_state_dict(torch.load(os.path.join(save_directory, 'pytorch_model.bin')))
        return model
This is the core class; its hyperparameters (hidden sizes, dropout rate) can be tuned.
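A quick shape check of the head (a sketch; it assumes the imports above and that the bert-base-uncased files are already in ./bert-base-uncased, as described next):

bert = AutoModel.from_pretrained("./bert-base-uncased")
clf = CustomClassifier(bert)
input_ids = torch.randint(0, 30522, (2, 16))   # fake batch: 2 sequences of 16 token ids
attention_mask = torch.ones_like(input_ids)
log_probs = clf(input_ids, attention_mask)
print(log_probs.shape)                          # torch.Size([2, 2]): one log-probability per class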
This demo uses bert-base; download the pretrained model and tokenizer from Hugging Face.
You mainly need the tokenizer_config.json, vocab.txt, config.json, and pytorch_model.bin files.
BERT Base (you can verify these numbers yourself; see the sketch below):
- Hidden (Transformer) layers: 12
- Hidden state dimension: 768
- Self-attention heads: 12
- Total parameters: 110M
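These numbers can be read straight off the downloaded config.json (a minimal check, assuming the local ./bert-base-uncased directory used below):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("./bert-base-uncased")
print(config.num_hidden_layers)    # 12
print(config.hidden_size)          # 768
print(config.num_attention_heads)  # 12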
def load_local_dataset(data_dir):
    data = {'train': [], 'test': []}
    labels = {'neg': 0, 'pos': 1}
    for split in ['train', 'test']:
        for label in ['neg', 'pos']:
            folder = os.path.join(data_dir, split, label)
            for filename in tqdm(os.listdir(folder), desc=f"Loading {split} {label} data"):
                with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
                    data[split].append({'text': f.read(), 'label': labels[label]})
    train_dataset = Dataset.from_pandas(pd.DataFrame(data['train']))
    test_dataset = Dataset.from_pandas(pd.DataFrame(data['test']))
    return DatasetDict({'train': train_dataset, 'test': test_dataset})
Load the local training data and convert it into a DatasetDict so it can be fed to the tokenizer and Trainer.
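The loader assumes the usual aclImdb-style directory layout, with one plain-text review per file:

./imdb/imdb/
├── train/
│   ├── neg/    # reviews labeled 0
│   └── pos/    # reviews labeled 1
└── test/
    ├── neg/
    └── pos/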
data_dir = './imdb/imdb'  # replace with your data path
dataset = load_local_dataset(data_dir)
model_name = "./bert-base-uncased"
# load the tokenizer and the pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
# build the custom classification model
model = CustomClassifier(bert_model).to(device)
def preprocess_function(examples):
    # pad everything to max_length so the default collator can stack fixed-size batches
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

encoded_dataset = dataset.map(preprocess_function, batched=True, desc="Tokenizing")
Preprocess the data: tokenization, truncation, and padding.
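To see what preprocessing produced, you can inspect one encoded example (a sketch; for a BERT tokenizer the columns should look roughly like this):

sample = encoded_dataset['train'][0]
print(sample.keys())
# expected: dict_keys(['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])
print(len(sample['input_ids']))  # 512, since we pad to max_length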
trainingargs = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch"
)
Define the training arguments.
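For context on warmup_steps=500, a rough step count (assuming the full 25k-review IMDB train split and a single GPU):

steps_per_epoch = 25000 // 8        # 3125 optimizer steps per epoch
total_steps = steps_per_epoch * 3   # 9375 steps over 3 epochs
# the first 500 steps warm the learning rate up, about 5% of training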
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
Define the evaluation function; it returns accuracy, precision, recall, and F1.
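A tiny offline check of compute_metrics with made-up predictions (hypothetical values, just to show the shapes it expects):

from types import SimpleNamespace

fake = SimpleNamespace(
    predictions=np.array([[0.1, 0.9], [0.8, 0.2]]),  # scores for 2 samples
    label_ids=np.array([1, 1]),
)
print(compute_metrics(fake))
# {'accuracy': 0.5, 'precision': 1.0, 'recall': 0.5, 'f1': 0.666...}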
trainer = Trainer(
    model=model,
    args=trainingargs,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()
model.save('./saved_model')
tokenizer.save_pretrained('./saved_model')
Train the model, then save the weights and the tokenizer.
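After these calls, ./saved_model should contain roughly the following (tokenizer.save_pretrained may write an extra file or two depending on the transformers version):

pytorch_model.bin         # full classifier state_dict, written by model.save
config.json               # BERT config, written by model.save
vocab.txt                 # from tokenizer.save_pretrained
tokenizer_config.json     # from tokenizer.save_pretrained
special_tokens_map.json   # from tokenizer.save_pretrained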
model = CustomClassifier.from_pretrained('./saved_model')
model.to(device)
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('./saved_model')
# define the prediction function
def predict(text):
    model.eval()  # disable dropout for inference
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.exp(outputs)  # the model returns log-probabilities, so exponentiate to recover probabilities
    label = torch.argmax(probs, dim=1).item()
    return 'pos' if label == 1 else 'neg'
# run a prediction
sample_text = "I failed the math exam this time."
prediction = predict(sample_text)
print(f"Prediction: {prediction}")
Prediction result.
Training takes roughly 30 to 50 minutes.