第一次指令微调大模型记录

标签：instruction tokenizer 模型微调 dataset train 指令 test input

制作数据集

from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
import datasets
import numpy as np

import torch
from llm2vec import LLM2Vec
from huggingface_hub import login
import os

# /root/data/kczx/cacahe

# os.environ["HF_HOME"] = "/root/data/kczx/cacahe"
# os.environ["HF_DATASETS_CACHE"] = "/root/data/kczx/cacahe"

# 使用你的token登录

# Hugging Face Hub id of the Yahoo Answers topics dataset.
dataset_id = "community-datasets/yahoo_answers_topics"
# NOTE(review): `instruction` is not referenced again in the visible code, and
# the dataset name inside the text is misspelled ("onswers"); kept verbatim.
instruction = "Divide the given yahoo_onswers_topics into 10 categories: "

# Load the dataset; its "train" and "test" splits are used further down.
dataset = datasets.load_dataset(dataset_id)

# sentences_train, y_train = dataset["train"]["question_title"][0:2000], dataset["train"]["topic"][0:2000]
# sentences_test, y_test = dataset["test"]["question_title"][0:500], dataset["test"]["topic"][0:500]



# Integer topic id -> human-readable category name (ids 0..9, in order).
label_mapping = dict(
    enumerate(
        [
            "Society & Culture",
            "Science & Mathematics",
            "Health",
            "Education & Reference",
            "Computers & Internet",
            "Sports",
            "Business & Finance",
            "Entertainment & Music",
            "Family & Relationships",
            "Politics & Government",
        ]
    )
)


def preprocess_for_instruction_tuning(example):
    """Turn one raw dataset row into an instruction-tuning record.

    Only the question title is used as the model input; the question body is
    not included. The target text is the category name looked up from the
    row's ``topic`` id; an id that is missing from ``label_mapping`` yields
    ``None`` instead of raising.

    Args:
        example: Mapping with at least the keys ``question_title`` and
            ``topic``.

    Returns:
        A dict with the keys ``instruction``, ``input`` and ``output``.
    """
    return {
        "instruction": "Classify the following question into a topic:",
        "input": example["question_title"],
        "output": label_mapping.get(example["topic"]),
    }

# Convert the train and test splits, dropping every original column so that
# only the fields returned by preprocess_for_instruction_tuning remain.
processed_train, processed_test = (
    dataset[split].map(
        preprocess_for_instruction_tuning,
        remove_columns=dataset[split].column_names,
    )
    for split in ("train", "test")
)

# Sanity check: show the first converted record of each split.
for converted in (processed_train, processed_test):
    print(converted[0])

# Save the converted splits as JSON files in the current working directory.
processed_train.to_json("yahoo_topic_train.json")
processed_test.to_json("yahoo_topic_test.json")

训练

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer


import os

# Process-wide settings for NCCL and GPU visibility. They are set before
# `torch` is imported below so they are already in place when CUDA starts up.
for env_name, env_value in (
    ("NCCL_P2P_DISABLE", "1"),  # turn off NCCL peer-to-peer transport
    ("NCCL_IB_DISABLE", "1"),  # turn off NCCL InfiniBand transport
    ("CUDA_VISIBLE_DEVICES", "1,2,3,4"),  # expose only physical GPUs 1-4
):
    os.environ[env_name] = env_value



import torch
# Ask PyTorch to release any cached, unused GPU memory before loading.
torch.cuda.empty_cache()

# Load the instruction-tuning JSON files for both splits.
data_files = {
    "train": "/root/data/kczx/cby/EmbeddingCode/llm2vec-main/llm2vec-main/examples/Fine_tuning/yahoo_topic_train.json",
    "test": "/root/data/kczx/cby/EmbeddingCode/llm2vec-main/llm2vec-main/examples/Fine_tuning/yahoo_topic_test.json",
}
dataset = load_dataset("json", data_files=data_files)

# Work on a small subset: the first 1000 rows of each split.
subset_size = 1000
train_data = dataset["train"].select(range(subset_size))
test_data = dataset["test"].select(range(subset_size))

# 打印验证前1000条数据
# print(train_data[0])  # 打印train数据集的第一条数据
# print(test_data[0]) 



# Load the Mistral-7B model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # Hub id of the Mistral-7B model
# model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Per the author: required when the model is Mistral-7B, not needed for Qwen.
# Reuses the EOS token as the padding token (presumably because the Mistral
# tokenizer ships without a pad token -- TODO confirm); preprocess_data pads
# every example to a fixed length and therefore needs one.
tokenizer.pad_token = tokenizer.eos_token


# NOTE(review): no dtype is passed, so the weights are loaded in the library's
# default precision -- confirm this fits the available GPU memory.
model = AutoModelForCausalLM.from_pretrained(model_name)

# # 数据格式转换为训练用
def preprocess_data(example, max_length=256):
    """Tokenize one instruction record into a causal-LM training example.

    The prompt and the answer are concatenated into a single sequence so the
    model actually sees the answer tokens it has to predict. ``labels`` has
    the same length as ``input_ids`` and is position-aligned with it: prompt
    and padding positions are set to -100 (ignored by the loss), so only the
    answer tokens and the closing EOS contribute to training.

    Args:
        example: Mapping with the string fields ``instruction``, ``input``
            and ``output``.
        max_length: Fixed length every returned sequence is truncated or
            padded to.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        list of exactly ``max_length`` ints.
    """
    prompt = f"{example['instruction']}\n\nInput: {example['input']}\n\nOutput:"
    prompt_ids = tokenizer(prompt)["input_ids"]
    # The answer continues the prompt, so no extra special tokens are added at
    # its start; EOS is appended so the model learns where the answer stops.
    target_ids = tokenizer(example["output"], add_special_tokens=False)["input_ids"]
    target_ids = (target_ids + [tokenizer.eos_token_id])[:max_length]

    # When the pair is too long, shorten the prompt rather than the answer:
    # an example whose answer is cut off has nothing left to learn from.
    prompt_ids = prompt_ids[: max_length - len(target_ids)]

    input_ids = prompt_ids + target_ids
    labels = [-100] * len(prompt_ids) + target_ids
    attention_mask = [1] * len(input_ids)

    # Right-pad to the fixed length. Padding is hidden from attention and
    # excluded from the loss; this matters because the pad token is the EOS
    # token here, and only the real closing EOS should be learned.
    pad_count = max_length - len(input_ids)
    input_ids = input_ids + [tokenizer.pad_token_id] * pad_count
    attention_mask = attention_mask + [0] * pad_count
    labels = labels + [-100] * pad_count

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# def tokenize_function(examples):
#     # 将输入输出分别转换为token id
#     inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=512)
#     outputs = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=128)
#     inputs['labels'] = outputs['input_ids']  # 将输出的input_ids作为标签
#     return inputs


# Tokenize both subsets; the raw text columns are dropped afterwards so only
# the fields produced by preprocess_data remain.
text_columns = ("instruction", "input", "output")
tokenized_dataset = train_data.map(
    preprocess_data,
    remove_columns=list(text_columns),
)
tokenized_test_dataset = test_data.map(
    preprocess_data,
    remove_columns=list(text_columns),
)

# 对数据集进行编码
# tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)

# def preprocess_data(example):
#     """
#     将数据集进行预处理
#     """
#     MAX_LENGTH = 384 
#     input_ids, attention_mask, labels = [], [], []
#     instruction = tokenizer(
#         f"<|im_start|>system\n你是一个文本分类领域的专家，你会接收到一段文本和几个潜在的分类选项，请输出文本内容的正确类型<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
#         add_special_tokens=False,
#     )
#     response = tokenizer(f"{example['output']}", add_special_tokens=False)
#     input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
#     attention_mask = (
#         instruction["attention_mask"] + response["attention_mask"] + [1]
#     )
#     labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
#     if len(input_ids) > MAX_LENGTH:  # 做一个截断
#         input_ids = input_ids[:MAX_LENGTH]
#         attention_mask = attention_mask[:MAX_LENGTH]
#         labels = labels[:MAX_LENGTH]
    
#     res = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
#     return  res




# Configure LoRA
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16,
    lora_dropout=0.2,
    target_modules=["q_proj", "v_proj"],  # adapt the attention query/value projections
    bias="none",
    # The base model is loaded with AutoModelForCausalLM and trained to
    # generate the category name as text, so the adapter has to be built for
    # causal language modelling rather than for a sequence-classification head.
    task_type="CAUSAL_LM",
)
# Wrap the base model so that only the LoRA adapter weights are trainable.
model = get_peft_model(model, lora_config)


# Report how many parameters exist in total and how many of them will
# actually be updated during training.
print("---- All Parameters ----")
param_sizes = [
    (tensor.numel(), tensor.requires_grad)
    for _, tensor in model.named_parameters()
]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
non_trainable_params = sum(size for size, needs_grad in param_sizes if not needs_grad)

print(f"\nTotal number of parameters: {total_params}")
print(f"Total number of trainable parameters: {trainable_params}")
print(f"Total number of non-trainable parameters: {non_trainable_params}")


# Hyper-parameters for the Hugging Face Trainer, collected in one place.
# NOTE(review): with the 1000 training rows selected above, a per-device
# batch of 4 and 32 accumulation steps, a single-GPU run performs only about
# 8 optimizer steps per epoch (~24 over 3 epochs). warmup_steps=100 and
# save_steps=500 would then never be reached -- confirm this is intended.
training_kwargs = {
    "output_dir": "./mistral-finetuned",
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 32,
    "evaluation_strategy": "steps",
    "save_strategy": "steps",
    "learning_rate": 2e-4,
    "num_train_epochs": 3,
    "logging_steps": 10,
    "save_steps": 500,
    "warmup_steps": 100,
    "fp16": True,
    "optim": "adamw_torch",
    "deepspeed": "./ds_config.json",  # path to the DeepSpeed configuration file
    "push_to_hub": False,
}
training_args = TrainingArguments(**training_kwargs)

# Build the trainer on the tokenized subsets and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# Save the fine-tuned model and its tokenizer side by side.
final_dir = "./mistral-finetuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_dir)
print()
# instruction = "Classify the following question into a topic:"
# input_text = "What makes friendship click?"
# output_text = "Family & Relationships"

# # 构建输入和目标
# prompt = f"{instruction}\n\nInput: {input_text}\n\nOutput:"
# target = f"{output_text}"

# # 分词处理
# tokenized_prompt = tokenizer(prompt, truncation=True, max_length=512)
# tokenized_target = tokenizer(target, truncation=True, max_length=128)

# # 构造 labels，非目标部分用 -100 掩盖
# labels = [-100] * len(tokenized_prompt["input_ids"]) + tokenized_target["input_ids"]

# print()

# def preprocess_for_instruction_tuning(example):
#     input_text = example['input']
#     output_text = example['output']
    
#     # 拼接输入和输出
#     input_prompt = f"Instruction: {example['instruction']}\nInput: {input_text}\nOutput:"
    
#     # 编码
#     inputs = tokenizer(input_prompt, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
#     labels = tokenizer(output_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt").input_ids
    
#     # Mistral模型是 causal LM，所以需要调整标签格式
#     labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)  # 忽略填充部分
    
#     inputs["labels"] = labels
#     return inputs

#

 # 处理数据集
# train_dataset = train_data.map(preprocess_for_instruction_tuning, remove_columns=["instruction", "input", "output"])
# validation_dataset = test_data.map(preprocess_for_instruction_tuning, remove_columns=["instruction", "input", "output"])

验证

未理解问题

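直接用 python 运行脚本会报显存不足（OOM）；配置好 DeepSpeed 参数并改用 deepspeed 命令启动后可以正常运行，而且显存并没有占满：24 GB 只占用了约 15 GB。（可能的原因：脚本加载模型时没有指定精度，7B 模型默认以 fp32 加载，仅权重就约需 28 GB，超过单卡 24 GB；DeepSpeed 可能通过 ZeRO 分片或 CPU offload 降低了单卡占用，具体取决于 ds_config.json 的配置，有待确认。）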

运行命令为，注意不能用python 直接运行

deepspeed --num_gpus=1 my_dataloader.py

标签：instruction,tokenizer,模型,微调,dataset,train,指令,test,input
From： https://www.cnblogs.com/chenyi502/p/18556458

第一次指令微调大模型记录

制作数据集

训练

验证

未理解问题

运行命令为，注意不能用python 直接运行

相关文章

赞助商

阅读排行