首页 > 编程语言 >python-docx实现两个word文档的比较

python-docx实现两个word文档的比较

时间:2022-11-12 17:25:04浏览次数:51  
标签:docx word python list param source str return

  1 # 这个核心文件是对两个word文档进行对比的文件;
  2 from docx import Document
  3 from docx.enum.text import WD_COLOR_INDEX
  4 import re
  5 from tqdm import tqdm
  6 
  7 
  8 def strContrastInit(str):
  9     """
 10     这是一个将字符串进行初始化的函数,它会去掉字符串中的标点符号;
 11     :param str: 传入需要处理的字符串;
 12     :return: 以列表的形式返回处理完成的字符串;
 13     """
 14     symbol_list = [',', '.', '<', '>', ';', '(', ')', '?', '!', ':',
 15                    ',', '。', '《', '》', ';', '?', '!', '、', ':']
 16     match_str = re.sub('\s+', '', str)
 17     str_list = list(match_str)
 18     for i in range(len(str_list)):
 19         for i_symbol_list in symbol_list:
 20             if str_list[i] == i_symbol_list:
 21                 str_list[i] = ''
 22             else:
 23                 continue
 24     re_list = list(''.join(str_list))
 25     return re_list
 26 
 27 
 28 def strContrastValue(str_s_list, str_c_list):
 29     """
 30     这是一个比较两个字符串,是否相同的函数、并会返回它们相似的比例;
 31     :param str_s_list: 源字符串列表;
 32     :param str_c_list: 对比字符串列表;
 33     :return: 返回比例值;
 34     """
 35     count = 0
 36     for i_s in str_s_list:
 37         for i_c in str_c_list:
 38             if i_s == i_c:
 39                 count += 1
 40                 str_c_list.remove(i_c)
 41                 break
 42             else:
 43                 continue
 44     value = round(count/len(str_s_list), 2)
 45     return value
 46 
 47 
 48 def macthParagraph_appendColor(paragraph, color, insert_str):
 49     """
 50     这是一个将匹配到的文件进行,填充底色和添加标注的函数;
 51     :param paragraph: 传入操作的对象段落;
 52     :param color: 填充颜色;
 53     :param insert_str: 插入的标注内容;
 54     :return:
 55     """
 56     for i_piece in paragraph.runs:
 57         exec('i_piece.font.highlight_color = WD_COLOR_INDEX.%s' % color)
 58     paragraph.add_run(insert_str).italic = True
 59 
 60 
 61 # word文件对比函数;
 62 def document_contrast(word_source, word_contrast, ration_number=3):
 63     """
 64     这是一个检测word重复内容的函数,将识别到重复的内容进行填充底色;
 65     :param word_source:源文件对象;
 66     :param word_contrast:对比文件对象;
 67     :param ration_number:匹配率;以供三个等级;
 68     :return:返回处理完成的文件对象;
 69     """
 70     if 1 <= ration_number <= 3:
 71         if ration_number == 3:
 72             ratio_bottom = 0
 73             ratio_centre = 0
 74             ratio_top = 0
 75         elif ration_number == 2:
 76             ratio_bottom = '000'
 77             ratio_centre = 0
 78             ratio_top = 0
 79         elif ration_number == 1:
 80             ratio_bottom = '000'
 81             ratio_centre = '000'
 82             ratio_top = 0
 83         for_count = 0
 84         for i_paragraphSource in tqdm(word_source.paragraphs):
 85             for_count += 1
 86             str_s_list = strContrastInit(i_paragraphSource.text)
 87             if len(str_s_list) > 0:
 88                 source_floatNumber_left = len(i_paragraphSource.text) * 0.8  # 形参;
 89                 source_floatNumber_right = len(i_paragraphSource.text) * 1.2  # 形参;
 90                 for i_paragraphContrast in word_contrast.paragraphs:
 91                     str_c_list = strContrastInit(i_paragraphContrast.text)
 92                     if len(str_c_list) > 0:
 93                         if source_floatNumber_left < len(str_c_list) < source_floatNumber_right:
 94                             contrast_value = strContrastValue(str_s_list, str_c_list)
 95                             if contrast_value == 0:
 96                                 continue
 97                             elif 0.5 <= contrast_value <= 0.7:  # 符合度60-75%,标注灰色;
 98                                 if ratio_bottom == 0:
 99                                     insert_str1 = '匹配(%s)符合度:百分之50-70' % for_count
100                                     macthParagraph_appendColor(i_paragraphSource, 'GRAY_50',insert_str1)
101                                     macthParagraph_appendColor(i_paragraphContrast, 'GRAY_50', insert_str1)
102                                     ratio_bottom += 1
103                                     continue
104                                 else:
105                                     continue
106                             elif 0.7 < contrast_value <= 0.85:  # 符合度70-85%,标注黄色;
107                                 if ratio_centre == 0:
108                                     insert_str2 = '匹配(%s)符合度:百分之70-85' % for_count
109                                     macthParagraph_appendColor(i_paragraphSource, 'YELLOW', insert_str2)
110                                     macthParagraph_appendColor(i_paragraphContrast, 'YELLOW', insert_str2)
111                                     ratio_centre += 1
112                                     continue
113                                 else:
114                                     continue
115                             elif 0.85 < contrast_value:
116                                 insert_str3 = '匹配(%s)符合度:百分之85-100' % for_count
117                                 macthParagraph_appendColor(i_paragraphSource, 'RED', insert_str3)
118                                 macthParagraph_appendColor(i_paragraphContrast, 'RED', insert_str3)
119                                 ratio_top += 1
120                                 continue
121                         else:
122                             continue
123             else:
124                 continue
125         if ration_number == 3:
126             print('符合度在百分之85-100的存在:%s处' % ratio_top)
127             print('符合度在百分之70-85的存在:%s处' % ratio_centre)
128             print('符合度在百分之50-70的存在:%s处' % ratio_bottom)
129         elif ration_number == 2:
130             print('符合度在百分之85-100的存在:%s处' % ratio_top)
131             print('符合度在百分之70-85的存在:%s处' % ratio_centre)
132         else:
133             print('符合度在百分之85-100的存在:%s处' % ratio_top)
134         return word_source, word_contrast
135     else:
136         print('"ration_number"参数不符合规则!')
137 
138 
if __name__ == '__main__':
    # Example run: load both documents, mark the matching paragraphs, then
    # save the annotated documents under new file names.
    source_doc = Document('/Users/fgh/Desktop/test/技术文件.docx')
    contrast_doc = Document('/Users/fgh/Desktop/test/技术文件+****服务项目.docx')
    marked_source, marked_contrast = document_contrast(source_doc, contrast_doc)
    marked_source.save('/Users/fgh/Desktop/test/对比源文件技术文件.docx')
    marked_contrast.save('/Users/fgh/Desktop/test/对比文件技术文件+****服务项目.docx')

注意:

  只能比较两个word文档的段落文本,不能比较表格内数据;

标签:docx,word,python,list,param,source,str,return
From: https://www.cnblogs.com/noobXF/p/16884185.html

相关文章