自然语言处理当中经常需要字符串的查找操作,比如通过查找返回子串在文本当中的位置,或者通过匹配实现 NER(命名实体识别)。
# Keyword tagging script: loads a keyword list into a marisa trie, finds every
# keyword occurrence in each row of the "text" column of a CSV, and writes the
# rows back out with an added "matches" column.
import asyncio

import marisa_trie
import pandas as pd

# Alternate input used previously: pd.read_csv("guba_fc_result_20230413.csv")
data = pd.read_csv("guba_all_post_20230413.csv")
filename = "cate_group.txt"


def read_list_from_file(filename):
    """Return the lines of *filename* with surrounding whitespace stripped.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so non-ASCII keywords load identically on every machine.
    NOTE(review): assumes the keyword file is saved as UTF-8 -- confirm.

    Args:
        filename: Path of a text file holding one entry per line.

    Returns:
        A list with one stripped string per line, in file order.
    """
    with open(filename, 'r', encoding="utf-8") as f:
        return [line.strip() for line in f]


cate_group = read_list_from_file(filename)

# Build the trie once at import time; every lookup below reuses it.
trie = marisa_trie.Trie(cate_group)


async def match_text(long_string):
    """Collect every trie keyword that occurs anywhere in *long_string*.

    For each start offset, the remaining suffix is handed to
    ``trie.prefixes``; whatever keys it returns are keywords beginning at
    that offset. Overlapping and repeated occurrences are all kept.

    For example, ``await match_text('宁德时代是做锂电池的')`` returns whichever
    keywords from the trie appear in that sentence.

    Args:
        long_string: The value to scan. It is converted with ``str()`` first,
            so non-string cells are scanned by their string form.

    Returns:
        A list of matched keywords ordered by start offset, then in the order
        ``trie.prefixes`` returned them.
    """
    text = str(long_string)
    results = []
    for start in range(len(text)):
        # prefixes() yields an empty list when nothing matches at this offset,
        # so extending unconditionally is equivalent to guarding with `if`.
        results.extend(trie.prefixes(text[start:]))
    return results


async def main():
    """Match every row of ``data["text"]`` and save the annotated table.

    One task is created per row and all are awaited with ``asyncio.gather``,
    which returns results in the same order the tasks were passed in, so the
    list lines up with the DataFrame rows. ``match_text`` contains no
    ``await``, so each task runs to completion before the next one starts.
    """
    tasks = [asyncio.create_task(match_text(text)) for text in data["text"]]
    matches_list = await asyncio.gather(*tasks)

    data["matches"] = matches_list
    print(matches_list)
    data.to_csv("guba_all_matches_20230413.csv")


if __name__ == '__main__':
    asyncio.run(main())
标签:string,python,matches,results,long,trie,csv,asyncio From: https://www.cnblogs.com/LiuXinyu12378/p/17391914.html