import time
from urllib.parse import quote

import grequests
import pandas as pd
all_start = time.time()

df = pd.read_csv("new_clean.csv")
# Deduplicated pool of sentences still awaiting clustering.
all_sentence_list = list(df["句子"].unique())  # dedupe first, then keep as a list

serving_url = ""
auth_username = ""
auth_password = ""

BATCH = 500        # number of pairwise requests fired concurrently per round
THRESHOLD = 0.6    # distances below this mark two sentences as semantically similar
OUTPUT_FILE = "same_sentence.txt"

group_id = 0  # renamed from `id` to avoid shadowing the builtin
while all_sentence_list:
    current_sentence = all_sentence_list[0]  # pivot sentence for this group
    remaining = len(all_sentence_list)
    print(group_id, current_sentence, remaining)

    # Sentences judged equivalent to the pivot, pivot itself included.
    current_same_list = [current_sentence]

    # Walk the candidates (indices 1..end) in chunks of BATCH.
    # NOTE: the original `range(start, end)` used an exclusive `end`,
    # silently skipping every 500th candidate; slicing fixes that.
    for chunk_start in range(1, remaining, BATCH):
        candidates = all_sentence_list[chunk_start:chunk_start + BATCH]
        print(chunk_start, chunk_start + len(candidates) - 1)

        # URL-encode both sentences so '&', '#', spaces etc. cannot corrupt the query.
        req_list = [
            grequests.get(
                serving_url
                + "/sentence_distance?s1={}&s2={}".format(
                    quote(current_sentence), quote(candidate)
                ),
                auth=(auth_username, auth_password),
                timeout=4,  # per-request timeout so one slow call cannot stall the batch
            )
            for candidate in candidates
        ]

        t0 = time.time()
        # Fire concurrently; map() returns once the slowest request finishes.
        # Failed or timed-out requests come back as None.
        res_list = grequests.map(req_list)
        print("耗时:{}".format(time.time() - t0))

        # Pair each response with its candidate sentence via zip instead of
        # re-parsing (and having to URL-decode) the request URL.
        # grequests.map yields Response objects, not (ok, response) tuples,
        # so the old res[0]/res[1] indexing always raised and was swallowed.
        for candidate, res in zip(candidates, res_list):
            if res is None:  # request failed or timed out
                continue
            try:
                distance = res.json()["distance"]
            except (ValueError, KeyError):  # non-JSON body or missing field
                continue
            if distance < THRESHOLD:
                print(distance, res.url)
                current_same_list.append(candidate)

    # Drop every sentence of this group (pivot included) from the pool in one
    # O(n) pass instead of repeated O(n) list.remove calls.
    same_set = set(current_same_list)
    all_sentence_list = [s for s in all_sentence_list if s not in same_set]

    # Append this group to the output file as one "----"-joined line.
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        f.write("----".join(current_same_list) + "\n")

    group_id += 1

all_end = time.time()
print("总耗时:{}".format(all_end - all_start))
# Tags: async, end, batch, sentence, write, list, res, start, time
# From: https://www.cnblogs.com/tiansz/p/17180565.html