import os

import jieba


def load_stopwords(stopwords_path):
    """Load a stop-word list from a text file.

    Args:
        stopwords_path: Path to a text file whose stop words are separated
            by any whitespace (spaces, tabs or newlines).

    Returns:
        A set containing every distinct stop word found in the file.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading BOM.
    # With plain 'utf-8' the BOM stays attached to the first stop word, which
    # then never matches any token.
    with open(stopwords_path, 'r', encoding='utf-8-sig') as file:
        return set(file.read().split())


def remove_consecutive_stopwords(words, stopwords):
    """Return the tokens of ``words`` that are not stop words.

    Despite the name, every stop word is dropped, not only those that occur
    in consecutive runs. The earlier implementation tracked a "skip next"
    flag and popped a trailing stop word, but neither could influence the
    result because stop words were never appended in the first place; the
    output here is the same as before.

    Args:
        words: Iterable of tokens (for example the output of a tokenizer).
        stopwords: Container of stop words supporting ``in``; a set gives
            constant-time membership tests.

    Returns:
        A new list of the non-stop-word tokens in their original order.
    """
    return [word for word in words if word not in stopwords]


# NOTE: the source text is cut off inside the next definition, so it cannot be
# restored as code. The surviving fragment is recorded here:
#
#   def process_text(text, stopwords):
#       docstring (truncated): "Remove stop words and segment the text,
#       while also trying to remove consecutive stop words"
#
# Trailing page residue from the scrape:
#   Tags: skip, next, stopwords, words, stop words, filtered, word segmentation
#   From: https://blog.csdn.net/m0_63990585/article/details/140946989