思路
由于我们项目的任务并不属于通用的自然语言处理任务,没有现成的公开数据集可用,因此手工生成与模型生成相结合的方式是一个不错的选择。针对不同的使用方式和任务场景,我们先后尝试了以下几种数据集生成方式:
# v1
import json
import random
# v1 dataset generator: builds (content, summary) pairs for three step types
# (greeting, argument extraction, result summarisation) and dumps them as JSON.
#
# NOTE(review): the published listing lost its indentation. The loop nesting
# below is reconstructed; in particular the result-summarisation section is
# assumed to run once per outer iteration, outside the template loop -- confirm.

# Toggle between the training split and the dev split.
train_f = True
# train_f = False
# Samples per outer iteration:
#   train: 2 greetings + 12 * 2 extraction + 12 summaries = 38 -> 20 * 38 = 760
#   dev:   1 greeting  + 12 extraction     + 12 summaries = 25 -> 10 * 25 = 250
data_num = 20 if train_f else 10
tenplate_num = 12  # (sic) number of extraction utterance templates
task_num = 3
max_pnum = 50
file_path = "./train.json" if train_f else "./dev.json"
data_list = []
# "\\*" keeps the original two-character value (backslash + asterisk) without
# relying on the invalid escape sequence "\*".
const_content_prompt_0 = "步骤#问候\\*语音#%s"
const_content_prompt_1 = "步骤#理解并基于输入拆分函数参数\\*语音#%s"
const_content_prompt_2 = "步骤#基于函数调用结果进行总结\\*工单查询#应到%s人\\*目标检测#实到%s人\\*任务类型#%s"
place_num = 12
place_db = ["哈尔滨", "长春", "沈阳", "石家庄", "兰州", "西宁", "西安", "郑州", "济南", "太原", "乌鲁木齐", "呼和浩特"]
content_template_0 = [
    "你好", "您好", "早上好", "中午好", "晚上好", "好久不见", "你是谁",
    "你叫什么名字", "自我介绍下", "简单自我介绍下", "你能做什么", "你的用途",
]
summary_template_0 = "您好,我是电网大模型的语言核心,用于拆解执行行为库并向您汇报结果"
# Extraction utterances, grouped four per task in the same order as
# summary_template_1: headcount, safety, equipment.
content_template_1 = [
    "请尝试查看%s的工地的人数情况",
    "输出工地%s人数情况",
    "尝试查%s的人数",
    "查查在%s工地人数嘛",
    "开始检查%s的安全状况",
    "试试看输出%s携带安全帽的情况",
    "进行在%s的安全帽检测工作",
    "检测在%s的安全帽佩戴情况啊",
    "检测在%s的电笔配备情况",
    "检查%s有没有设备异常",
    "请输出%s工地的设备检查结果",
    "告诉我%s配备电笔检查结果",
]
summary_template_1 = [
    "因为出现地名为%s, 所以函数一参数为%s; 因为出现人数要求, 所以函数二参数为人数检测",
    "因为出现地名为%s, 所以函数一参数为%s; 因为出现安全要求, 所以函数二参数为安全检查",
    "因为出现地名为%s, 所以函数一参数为%s; 因为出现设备要求, 所以函数二参数为设备检查",
]
summary_template_2 = [
    "[人数检测] 应到%d人, 实到%d人, 无人缺勤, 正常出工",
    "[人数检测] 应到%d人, 实到%d人, 有人缺勤",
    "[人数检测] 应到%d人, 实到%d人, 人数异常, 可能存在非法进入",
    "[人数检测] 应到%d人, 实到%d人, 无工单出工, 可能存在非法进入",
    "[安全检查] 实到%d人, 均佩戴安全帽",
    "[安全检查] 实到%d人, %d人佩戴安全帽, 存在安全隐患",
    "[安全检查] 实到%d人, %d人佩戴安全帽, 存在安全隐患",
    "[安全检查] 现场无人施工",
    "[设备检查] 实到%d人, 均正常携带设备",
    "[设备检查] 实到%d人, %d人未携带设备",
    "[设备检查] 实到%d人, %d人未携带设备",
    "[设备检查] 现场无人施工",
]
content_template_2 = ["人数检测", "安全检查", "设备检查"]
mask_num = 7
mask_template = ["啊", "哦", "吗", "嘛", "哈", "是", "阿"]


def _mask_one_char(text):
    """Return *text* with one randomly chosen character replaced by a filler."""
    chars = list(text)
    chars[random.randint(0, len(chars) - 1)] = mask_template[random.randint(0, mask_num - 1)]
    return "".join(chars)


# Seeding once is enough; re-seeding on every iteration adds nothing.
random.seed()
# Four consecutive extraction templates share one task / summary template.
templates_per_task = tenplate_num // task_num

for i in range(data_num):
    # --- step 0: greeting ---------------------------------------------------
    # Cycle through the greetings with the loop index (the constant
    # data_num % 12 would select the same greeting every time).
    greeting = content_template_0[i % len(content_template_0)]
    data_list.append({
        "content": const_content_prompt_0 % greeting,
        "summary": summary_template_0,
    })
    if train_f:
        # Noisy variant goes into its own dict so the clean sample appended
        # above is not overwritten through a shared reference.
        data_list.append({
            "content": const_content_prompt_0 % _mask_one_char(greeting),
            "summary": summary_template_0,
        })

    # --- step 1: extract place and task from an utterance ---------------------
    for j in range(tenplate_num):
        cur_place = place_db[random.randint(0, place_num - 1)]
        utterance = content_template_1[j] % cur_place
        # p1: place, p2: task
        summary = summary_template_1[j // templates_per_task] % (cur_place, cur_place)
        data_list.append({
            "content": const_content_prompt_1 % utterance,
            "summary": summary,
        })
        if train_f:
            # The mask may land inside the place name; the summary still
            # carries the clean place.
            data_list.append({
                "content": const_content_prompt_1 % _mask_one_char(utterance),
                "summary": summary,
            })

    # --- step 2: summarise the function-call results --------------------------
    cur_n1 = random.randint(2, max_pnum - 1)       # reference head count
    cur_n2 = random.randint(1, max_pnum)           # people seen without a work order
    cur_n3 = random.randint(1, cur_n1 - 1)         # strictly fewer than cur_n1
    cur_n4 = random.randint(cur_n1 + 1, max_pnum)  # strictly more than cur_n1
    # One row per entry of summary_template_2, in the same order:
    # (first prompt number, second prompt number, task index, summary arguments)
    cases = [
        (cur_n1, cur_n1, 0, (cur_n1, cur_n1)),
        (cur_n1, cur_n3, 0, (cur_n1, cur_n3)),
        (cur_n1, cur_n4, 0, (cur_n1, cur_n4)),
        (0, cur_n2, 0, (0, cur_n2)),
        (cur_n1, cur_n1, 1, (cur_n1,)),
        (cur_n1, cur_n3, 1, (cur_n1, cur_n3)),
        (cur_n1, cur_n3, 1, (cur_n1, cur_n3)),
        (0, 0, 1, ()),
        (cur_n1, cur_n1, 2, (cur_n1,)),
        (cur_n1, cur_n3, 2, (cur_n1, cur_n1 - cur_n3)),
        (cur_n1, cur_n3, 2, (cur_n1, cur_n1 - cur_n3)),
        (0, 0, 2, ()),
    ]
    for k, (first_n, second_n, task_idx, summary_args) in enumerate(cases):
        data_list.append({
            "content": const_content_prompt_2 % (first_n, second_n, content_template_2[task_idx]),
            "summary": summary_template_2[k] % summary_args,
        })

random.shuffle(data_list)
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(data_list, f, indent=4, sort_keys=True, ensure_ascii=False)
# v2
import json
import random
import transformers
from transformers import (
AutoConfig,
AutoModel,
AutoTokenizer
)
# v2 dataset generator: every sample asks for a {"city", "module"} JSON answer.
# Greeting and templated samples are built by hand; "unknown" samples are
# utterances produced by the chat model itself.

# Toggle between the training split and the dev split.
# train_f = True
train_f = False
data_num = 36 if train_f else 12
tenplate_num = 12
file_path = "./trainVX.json" if train_f else "./devVX.json"
data_list = []
const_content_prompt = '要求#请根据输入准确提取出地点信息和模式信息,其中模式在[人数,安全帽,设备,问候,未知]列表中有且只有一个,以{"city":"提取出的地点信息","module":"提取出的模式信息"}形式返回输入#INPUT:%s, OUTPUT:'
place_num = 12
place_db = ["哈尔滨", "长春", "沈阳", "石家庄", "兰州", "西宁", "西安", "郑州", "济南", "太原", "乌鲁木齐", "呼和浩特"]
content_template_0 = [
    "你好", "您好", "早上好", "中午好", "晚上好", "好久不见", "你是谁",
    "你叫什么名字", "自我介绍下", "简单自我介绍下", "你能做什么", "你的用途",
]
summary_template_0 = '{"city":"", "module":"问候"}'
# Four consecutive utterance templates per module, in the order of
# summary_template_1.
content_template_1 = [
    "请尝试查看%s的工地的人数情况",
    "输出工地%s人数情况",
    "尝试查%s的人数",
    "查查在%s工地人数嘛",
    "开始检查%s的安全状况",
    "试试看输出%s携带安全帽的情况",
    "进行在%s的安全帽检测工作",
    "检测在%s的安全帽佩戴情况啊",
    "检测在%s的安全绳配备情况",
    "检查%s有没有设备异常",
    "请输出%s工地的设备检查结果",
    "告诉我%s配备安全绳检查结果",
]
summary_template_1 = [
    '{"city":"%s", "module":"人数"}',
    '{"city":"%s", "module":"安全帽"}',
    '{"city":"%s", "module":"设备"}',
]
mask_num = 7
mask_template = ["啊", "哦", "吗", "嘛", "哈", "是", "阿"]

for i in range(data_num):
    random.seed()
    # One greeting sample per iteration, cycling through the greeting list.
    data_list.append({
        "content": const_content_prompt % content_template_0[i % 12],
        "summary": summary_template_0,
    })
    # One sample per utterance template, each with a freshly drawn place.
    for j in range(tenplate_num):
        cur_place = place_db[random.randint(0, place_num - 1)]
        utterance = content_template_1[j] % (cur_place)
        # p1:place, p2:task
        data_list.append({
            "content": const_content_prompt % utterance,
            "summary": summary_template_1[j // 4] % cur_place,
        })

# load model
model_path = "/home/lyc/workspace/ChatGLM-6B/chatglm-6b"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
model.eval()
query = "请任意说one句之前没有说过的话".replace("one", "一")
history = []
# gen unknown content: the same query is repeated with the running chat
# history passed back in, and each reply becomes one "unknown" sample.
for _ in range(data_num):
    response, history = model.chat(tokenizer, query, history=history)
    data_list.append({
        "content": const_content_prompt % response,
        "summary": '{"city":"", "module":"未知"}',
    })

random.shuffle(data_list)
with open(file_path, 'w', encoding='utf-8') as f:
    json.dump(data_list, f, indent=4, sort_keys=True, ensure_ascii=False)
模型的输出格式主要通过prompt来约束,我们尝试了如下几种模板:
# v1: free-form instruction prompt. Limits "module" to three values, gives one
# worked INPUT/OUTPUT example, and ends on an opening python code fence.
"""
请帮我提取输入数据的city与module。module只有:[人数,安全帽,设备],只能选择一个module,不许创造未出现的module内容。将提取的信息字典返回:请回复如下格式:\\
```python\n{"city":"XXX", "module":"XXX"}\n```
---
EXAMPLE:
INPUT:执行在济南的设备要求
OUTPUT:{"city":"济南", "module":"设备"}
---
我的输入是:%s
下面是你的回复:
```python
"""
# V2: sectioned prompt (requirement / examples / input). Lists five modules and
# includes three INPUT/OUTPUT examples; %s is the utterance to classify.
"""
要求#请根据输入准确提取出地点信息和模式信息,其中模式在[人数,安全帽,设备,问候,未知]列表中有且只有一个,以{"city":"提取出的地点信息","module":"提取出的模式信息"}形式返回
例子#INPUT:请查看在A的工地的B情况, OUTPUT:{"city":"A","module":"B"}; INPUT:CCCCCCCCCDDDDDDDD, OUTPUT:{"city":"无","module":"未知"}; INPUT:早上好, OUTPUT:{"city":"无","module":"问候"}
输入#INPUT:%s, OUTPUT:
"""
# V3: single-line prompt with the same requirement and input sections as V2,
# but without the examples section.
'要求#请根据输入准确提取出地点信息和模式信息,其中模式在[人数,安全帽,设备,问候,未知]列表中有且只有一个,以{"city":"提取出的地点信息","module":"提取出的模式信息"}形式返回 输入#INPUT:%s, OUTPUT:'
在ptuning的基础上,prompt设计可以更为简单,并且具有很强的扩展性。
标签:cur,list,summary,content,v2,实训,template,data,Tuning From: https://www.cnblogs.com/yichengliu0219/p/18264223