# Core module: compares the paragraph text of two Word documents and
# highlights paragraphs that appear in both.
import re
from collections import Counter

from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from tqdm import tqdm

# Punctuation ignored when comparing paragraphs.  The original list repeated
# the ASCII marks , ; ? ! : in its CJK row; the full-width forms are included
# here so that Chinese punctuation is stripped as well (presumably what the
# original list contained before the text was normalized).
_PUNCTUATION = frozenset(
    [',', '.', '<', '>', ';', '(', ')', '?', '!', ':',
     ',', '。', '《', '》', ';', '?', '!', '、', ':']
)

# Compiled once; raw string avoids the invalid-escape warning of '\s+'.
_WHITESPACE_RE = re.compile(r'\s+')


def strContrastInit(str):
    """
    Normalize a string for comparison by dropping whitespace and punctuation.

    The parameter keeps its original name ``str`` (it shadows the builtin
    inside this function only) so keyword callers keep working.

    :param str: the text to normalize;
    :return: list of the remaining characters, in their original order;
    """
    compact = _WHITESPACE_RE.sub('', str)
    return [char for char in compact if char not in _PUNCTUATION]


def strContrastValue(str_s_list, str_c_list):
    """
    Return the share of source characters that also occur in the contrast list.

    Characters are matched as a multiset: each contrast character can pair
    with at most one source character.  Neither argument is modified (the
    original implementation removed matched items from ``str_c_list``).

    :param str_s_list: characters of the source paragraph (hashable items);
    :param str_c_list: characters of the contrast paragraph (hashable items);
    :return: ratio in [0, 1] rounded to two decimals; 0.0 for an empty source;
    """
    if not str_s_list:
        # Nothing to match against; avoids a ZeroDivisionError.
        return 0.0
    shared = Counter(str_s_list) & Counter(str_c_list)
    return round(sum(shared.values()) / len(str_s_list), 2)


def macthParagraph_appendColor(paragraph, color, insert_str):
    """
    Highlight every run of a paragraph and append an italic annotation run.

    :param paragraph: paragraph object to modify in place;
    :param color: name of a ``WD_COLOR_INDEX`` member, e.g. ``'YELLOW'``;
    :param insert_str: annotation text appended after the existing runs;
    :return: None
    :raises AttributeError: if ``color`` is not a ``WD_COLOR_INDEX`` member;
    """
    # Attribute lookup replaces the former exec() on a formatted string.
    highlight = getattr(WD_COLOR_INDEX, color)
    for run in paragraph.runs:
        run.font.highlight_color = highlight
    # The annotation is added after highlighting, so it is not highlighted
    # by this call.
    paragraph.add_run(insert_str).italic = True


def _mark_pair(source_paragraph, contrast_paragraph, color, label):
    """Apply the same highlight colour and annotation to both paragraphs."""
    macthParagraph_appendColor(source_paragraph, color, label)
    macthParagraph_appendColor(contrast_paragraph, color, label)


# Word document comparison entry point.
def document_contrast(word_source, word_contrast, ration_number=3):
    """
    Detect similar paragraphs in two Word documents and highlight them.

    Every non-empty source paragraph is compared with every non-empty
    contrast paragraph of similar length.  Matches are highlighted in both
    documents and annotated with the 1-based index of the source paragraph:

    * similarity above 0.85       -> RED     (always reported)
    * similarity in (0.7, 0.85]   -> YELLOW  (``ration_number`` >= 2)
    * similarity in [0.5, 0.7]    -> GRAY_50 (``ration_number`` == 3)

    Every match in an enabled tier is highlighted and counted; the original
    code stopped after the first hit in the two lower tiers because the hit
    counter doubled as the "tier enabled" flag.

    :param word_source: source document object (modified in place);
    :param word_contrast: contrast document object (modified in place);
    :param ration_number: reporting level, 1 to 3;
    :return: ``(word_source, word_contrast)``; ``None`` (after printing a
        message) when ``ration_number`` is outside 1..3;
    """
    if not 1 <= ration_number <= 3:
        print('"ration_number"参数不符合规则!')
        return None

    report_low = ration_number >= 3
    report_mid = ration_number >= 2
    ratio_bottom = 0
    ratio_centre = 0
    ratio_top = 0

    # Normalize the contrast paragraphs once, before any annotation is added:
    # this avoids repeating the work for every source paragraph and keeps
    # annotation text appended by earlier matches out of later comparisons.
    contrast_items = []
    for contrast_paragraph in word_contrast.paragraphs:
        contrast_chars = strContrastInit(contrast_paragraph.text)
        if contrast_chars:
            contrast_items.append((contrast_paragraph, contrast_chars))

    for for_count, source_paragraph in enumerate(
            tqdm(word_source.paragraphs), start=1):
        source_text = source_paragraph.text
        str_s_list = strContrastInit(source_text)
        if not str_s_list:
            continue

        # Only candidates within +/-20% of the source length are compared.
        # NOTE(review): the window uses the raw text length (whitespace and
        # punctuation included) while candidates are measured after
        # normalization; kept as in the original - confirm this is intended.
        length_floor = len(source_text) * 0.8
        length_ceiling = len(source_text) * 1.2

        for contrast_paragraph, str_c_list in contrast_items:
            if not length_floor < len(str_c_list) < length_ceiling:
                continue
            contrast_value = strContrastValue(str_s_list, str_c_list)
            if contrast_value > 0.85:
                label = '匹配(%s)符合度:百分之85-100' % for_count
                _mark_pair(source_paragraph, contrast_paragraph, 'RED', label)
                ratio_top += 1
            elif contrast_value > 0.7:
                if report_mid:
                    label = '匹配(%s)符合度:百分之70-85' % for_count
                    _mark_pair(source_paragraph, contrast_paragraph,
                               'YELLOW', label)
                    ratio_centre += 1
            elif contrast_value >= 0.5:
                if report_low:
                    label = '匹配(%s)符合度:百分之50-70' % for_count
                    _mark_pair(source_paragraph, contrast_paragraph,
                               'GRAY_50', label)
                    ratio_bottom += 1

    # Summary: one line per enabled tier, strongest tier first.
    print('符合度在百分之85-100的存在:%s处' % ratio_top)
    if report_mid:
        print('符合度在百分之70-85的存在:%s处' % ratio_centre)
    if report_low:
        print('符合度在百分之50-70的存在:%s处' % ratio_bottom)
    return word_source, word_contrast


if __name__ == '__main__':
    word_objS = Document('/Users/fgh/Desktop/test/技术文件.docx')
    word_objC = Document('/Users/fgh/Desktop/test/技术文件+****服务项目.docx')
    marked_source, marked_contrast = document_contrast(
        word_source=word_objS, word_contrast=word_objC)
    marked_source.save('/Users/fgh/Desktop/test/对比源文件技术文件.docx')
    marked_contrast.save('/Users/fgh/Desktop/test/对比文件技术文件+****服务项目.docx')
注意:
本脚本只能比较两个 Word 文档的段落文本,不能比较表格内的数据。
标签:docx,word,python,list,param,source,str,return From: https://www.cnblogs.com/noobXF/p/16884185.html