制作数据集
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
import datasets
import numpy as np
import torch
from llm2vec import LLM2Vec
from huggingface_hub import login
import os
# Optional: relocate the Hugging Face caches (left disabled), e.g. to
# /root/data/kczx/cacahe
# os.environ["HF_HOME"] = "/root/data/kczx/cacahe"
# os.environ["HF_DATASETS_CACHE"] = "/root/data/kczx/cacahe"
# Log in with your Hugging Face token here if required.
# NOTE(review): `login` is imported above but never called in the visible code.

# NOTE(review): `instruction` is not referenced anywhere else in the visible
# code, and its text contains the typo "onswers" -- confirm before reusing it.
instruction = "Divide the given yahoo_onswers_topics into 10 categories: "

# Download (or load from cache) every split of the Yahoo Answers topics dataset.
dataset = datasets.load_dataset("community-datasets/yahoo_answers_topics")

# Earlier experiment that sliced small subsets directly (left disabled):
# sentences_train, y_train = dataset["train"]["question_title"][0:2000], dataset["train"]["topic"][0:2000]
# sentences_test, y_test = dataset["test"]["question_title"][0:500], dataset["test"]["topic"][0:500]
# Integer topic id -> human-readable category name.
label_mapping = {
    0: "Society & Culture",
    1: "Science & Mathematics",
    2: "Health",
    3: "Education & Reference",
    4: "Computers & Internet",
    5: "Sports",
    6: "Business & Finance",
    7: "Entertainment & Music",
    8: "Family & Relationships",
    9: "Politics & Government",
}


def preprocess_for_instruction_tuning(example):
    """Convert one raw dataset row into an instruction-tuning record.

    Args:
        example: Mapping with at least the keys ``"question_title"`` (the text
            to classify) and ``"topic"`` (a key of ``label_mapping``).

    Returns:
        A dict with the keys ``"instruction"``, ``"input"`` and ``"output"``,
        where ``"output"`` is the category name for the row's topic id.

    Raises:
        KeyError: If the topic id is not present in ``label_mapping``. Failing
            here avoids silently writing a ``None`` target into the exported
            JSON.
    """
    topic = example["topic"]
    try:
        output_text = label_mapping[topic]
    except KeyError:
        raise KeyError(
            f"unknown topic id {topic!r}; expected one of {sorted(label_mapping)}"
        ) from None

    # Only the question title is used as model input. An alternative that
    # concatenates title and body was left disabled by the author:
    # input_text = f"Question: {example['question_title']}\nDetails: {example['question_content']}"
    return {
        "instruction": "Classify the following question into a topic:",
        "input": example["question_title"],
        "output": output_text,
    }
# Map both splits to the instruction format, dropping every original column so
# only "instruction" / "input" / "output" remain.
_processed_splits = {
    split_name: dataset[split_name].map(
        preprocess_for_instruction_tuning,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "test")
}
processed_train = _processed_splits["train"]
processed_test = _processed_splits["test"]

# Sanity check: show the first converted record of each split.
for _split in (processed_train, processed_test):
    print(_split[0])

# Persist the converted splits as JSON files (paths are relative to the
# current working directory).
processed_train.to_json("yahoo_topic_train.json")
processed_test.to_json("yahoo_topic_test.json")
训练
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer
import os
# Process-wide environment configuration, applied before `import torch` below.
# Turn off NCCL's peer-to-peer and InfiniBand transports.
# NOTE(review): presumably a workaround for multi-GPU communication problems
# on this machine -- confirm it is still needed.
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
# Expose only physical GPUs 1-4 to this process.
# NOTE(review): this overwrites any CUDA_VISIBLE_DEVICES value set by the
# launcher, and it runs after the transformers/peft imports above -- verify
# that nothing initialises CUDA before this point.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4"
import torch
# Release cached CUDA memory; no tensors have been created by this script yet.
torch.cuda.empty_cache()
# Load the instruction-format JSON files produced by the dataset-preparation step.
data_files = {
    'train': '/root/data/kczx/cby/EmbeddingCode/llm2vec-main/llm2vec-main/examples/Fine_tuning/yahoo_topic_train.json',
    'test': '/root/data/kczx/cby/EmbeddingCode/llm2vec-main/llm2vec-main/examples/Fine_tuning/yahoo_topic_test.json',
}
dataset = load_dataset('json', data_files=data_files)

# Keep only the first 1000 rows of each split.
subset_size = 1000
train_data = dataset['train'].select(range(subset_size))
test_data = dataset['test'].select(range(subset_size))
# Uncomment to inspect the first record of each subset:
# print(train_data[0])
# print(test_data[0])

# Base checkpoint: Mistral-7B Instruct (a smaller Qwen alternative is left disabled).
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token. Per the author's note this is needed for
# Mistral-7B and not for Qwen.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)
# Convert one instruction record into fixed-length causal-LM training features.
def preprocess_data(example, max_length=256):
    """Tokenise one record as ``prompt + answer`` for causal-LM fine-tuning.

    The previous version tokenised the prompt and the answer separately and
    used the padded answer ids as ``labels``. Those labels did not line up
    with ``input_ids``, padding positions contributed to the loss, and the
    answer never appeared in the model input. This version builds a single
    sequence and supervises only the answer tokens.

    Args:
        example: Mapping with the keys ``"instruction"``, ``"input"`` and
            ``"output"``.
        max_length: Length every returned sequence is truncated/padded to.

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list of exactly ``max_length`` ints. ``labels`` is ``-100`` on
        prompt and padding positions, so only the answer (plus its closing
        EOS token, when the tokenizer defines one) is scored.
    """
    prompt = f"{example['instruction']}\n\nInput: {example['input']}\n\nOutput:"

    # The prompt keeps the tokenizer's default special tokens (e.g. a leading
    # BOS); the answer is tokenised without any, then closed with EOS so the
    # model learns where the category name ends.
    prompt_ids = list(tokenizer(prompt)["input_ids"])
    answer_ids = list(
        tokenizer(example["output"], add_special_tokens=False)["input_ids"]
    )
    eos_id = tokenizer.eos_token_id
    if eos_id is not None:
        answer_ids.append(eos_id)

    # When the pair is too long, shorten the prompt rather than the answer so
    # that supervised tokens are never dropped.
    answer_ids = answer_ids[:max_length]
    prompt_ids = prompt_ids[: max_length - len(answer_ids)]

    input_ids = prompt_ids + answer_ids
    attention_mask = [1] * len(input_ids)
    # -100 is the label value the loss skips, which masks the prompt.
    labels = [-100] * len(prompt_ids) + answer_ids

    # Right-pad to a fixed length. Padding is excluded from attention and loss.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id
    pad_len = max_length - len(input_ids)
    input_ids += [pad_id] * pad_len
    attention_mask += [0] * pad_len
    labels += [-100] * pad_len

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }
# def tokenize_function(examples):
# # 将输入输出分别转换为token id
# inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=512)
# outputs = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=128)
# inputs['labels'] = outputs['input_ids'] # 将输出的input_ids作为标签
# return inputs
# Tokenise both subsets, dropping the raw text columns afterwards.
raw_columns = ["instruction", "input", "output"]
tokenized_dataset = train_data.map(preprocess_data, remove_columns=raw_columns)
tokenized_test_dataset = test_data.map(preprocess_data, remove_columns=raw_columns)
# 对数据集进行编码
# tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)
# def preprocess_data(example):
# """
# 将数据集进行预处理
# """
# MAX_LENGTH = 384
# input_ids, attention_mask, labels = [], [], []
# instruction = tokenizer(
# f"<|im_start|>system\n你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项,请输出文本内容的正确类型<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
# add_special_tokens=False,
# )
# response = tokenizer(f"{example['output']}", add_special_tokens=False)
# input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
# attention_mask = (
# instruction["attention_mask"] + response["attention_mask"] + [1]
# )
# labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
# if len(input_ids) > MAX_LENGTH: # 做一个截断
# input_ids = input_ids[:MAX_LENGTH]
# attention_mask = attention_mask[:MAX_LENGTH]
# labels = labels[:MAX_LENGTH]
# res = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
# return res
# Configure LoRA adapters for the causal language model.
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,
    lora_dropout=0.2,
    # Adapt the query and value attention projection layers.
    target_modules=["q_proj", "v_proj"],
    bias="none",
    # The base model is loaded with AutoModelForCausalLM and trained on
    # token-level labels, so the task type must be CAUSAL_LM rather than
    # SEQ_CLASSIFICATION.
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
# Report how many parameters exist in total and how many of them are trainable.
print("---- All Parameters ----")
_param_sizes = [
    (tensor.numel(), tensor.requires_grad)
    for _name, tensor in model.named_parameters()
]
total_params = sum(size for size, _ in _param_sizes)
trainable_params = sum(size for size, is_trainable in _param_sizes if is_trainable)
non_trainable_params = sum(
    size for size, is_trainable in _param_sizes if not is_trainable
)

print(f"\nTotal number of parameters: {total_params}")
print(f"Total number of trainable parameters: {trainable_params}")
print(f"Total number of non-trainable parameters: {non_trainable_params}")
# Training hyper-parameters.
training_args = TrainingArguments(
    output_dir="./mistral-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=32,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    # With 1000 training rows, batch size 4 and 32 accumulation steps, the
    # whole run is only a few dozen optimizer steps. A fixed warm-up of 100
    # steps therefore never finished and the learning rate never reached its
    # target, so warm-up is now a fraction of the actual run length.
    warmup_ratio=0.1,
    fp16=True,
    optim="adamw_torch",
    deepspeed="./ds_config.json",  # path to the DeepSpeed configuration file
    push_to_hub=False,
)
# Build the trainer, run fine-tuning and persist the result.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# The script is started through the deepspeed launcher, so every rank executes
# this code. Saving through the Trainer lets it coordinate the ranks instead of
# each one writing the same files at once.
trainer.save_model("./mistral-finetuned")
if trainer.is_world_process_zero():
    tokenizer.save_pretrained("./mistral-finetuned")
print()
# instruction = "Classify the following question into a topic:"
# input_text = "What makes friendship click?"
# output_text = "Family & Relationships"
# # 构建输入和目标
# prompt = f"{instruction}\n\nInput: {input_text}\n\nOutput:"
# target = f"{output_text}"
# # 分词处理
# tokenized_prompt = tokenizer(prompt, truncation=True, max_length=512)
# tokenized_target = tokenizer(target, truncation=True, max_length=128)
# # 构造 labels,非目标部分用 -100 掩盖
# labels = [-100] * len(tokenized_prompt["input_ids"]) + tokenized_target["input_ids"]
# print()
# def preprocess_for_instruction_tuning(example):
# input_text = example['input']
# output_text = example['output']
# # 拼接输入和输出
# input_prompt = f"Instruction: {example['instruction']}\nInput: {input_text}\nOutput:"
# # 编码
# inputs = tokenizer(input_prompt, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
# labels = tokenizer(output_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt").input_ids
# # Mistral模型是 causal LM,所以需要调整标签格式
# labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100) # 忽略填充部分
# inputs["labels"] = labels
# return inputs
#
# 处理数据集
# train_dataset = train_data.map(preprocess_for_instruction_tuning, remove_columns=["instruction", "input", "output"])
# validation_dataset = test_data.map(preprocess_for_instruction_tuning, remove_columns=["instruction", "input", "output"])
验证
未理解问题
直接用 python 运行会报显存不足(OOM);配置好 DeepSpeed 参数并改用 deepspeed 启动后可以正常运行,但显存并没有占满(24 GB 中只占用了约 15 GB),原因尚不清楚。
运行命令如下(注意:不能直接用 python 运行):
deepspeed --num_gpus=1 my_dataloader.py