用 Python 实现 Word 文档比对的功能较简单,笔者这里将其界面化,可以指定输入比对的文档、相似度、最小相似参数等。输出的结果以 Word 文档的形式保存,重复部分会被标出,基本实现了商业软件的功能。
先看界面
这里不废话了,直接给出全部源码,觉得好的点个赞。程序打包的话,自己百度。
from tkinter import Tk, Button, Label, filedialog, Entry, Frame, TOP, LEFT, RIGHT, X, HORIZONTAL
from tkinter.ttk import Progressbar
from tkinter import messagebox
from docx import Document
from docx.shared import RGBColor, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.enum.text import WD_COLOR_INDEX
import re, datetime
def getText(wordname):
    """Return the text of every paragraph of a Word document.

    Args:
        wordname: Value handed unchanged to ``docx.Document`` to open the
            document.

    Returns:
        A list holding ``para.text`` for each entry of the document's
        ``paragraphs`` collection, in iteration order.
    """
    return [para.text for para in Document(wordname).paragraphs]
def is_Chinese(word):
    """Return True if *word* contains at least one Chinese character.

    A character counts as Chinese when it lies in the range
    U+4E00..U+9FFF (inclusive). An empty string yields False.

    Args:
        word: The string to inspect.

    Returns:
        True as soon as one character falls inside the range, else False.
    """
    return any('\u4e00' <= ch <= '\u9fff' for ch in word)
def msplit(s, seperators=r'[,.?!,。?!]'):
    """Split *s* into segments at sentence punctuation.

    The default pattern is a character class, so ``.`` and ``?`` are
    matched literally; a bare ``?`` alternative would make the pattern
    invalid ("nothing to repeat"). Both the ASCII and the full-width
    forms of comma, question mark and exclamation mark are covered,
    together with the ASCII and the ideographic full stop.

    Args:
        s: The text to split.
        seperators: Regular expression matching one separator; callers
            may pass their own pattern.

    Returns:
        The list produced by ``re.split``; empty strings appear where
        separators are adjacent or sit at either end of *s*.
    """
    return re.split(seperators, s)
def readDocx(docfile):
    """Load a Word document and break it into per-paragraph segments.

    Each paragraph returned by ``getText`` is split with ``msplit``.
    Pieces of more than two characters are kept and have their ASCII
    spaces removed. Paragraphs that yield no piece are dropped.
    Progress and elapsed time are printed to standard output.

    Args:
        docfile: Document location, forwarded to ``getText``.

    Returns:
        A list of paragraphs, each a non-empty list of segment strings.
    """
    print('*' * 80)
    print('文件', docfile, '加载中……')
    started = datetime.datetime.now()
    segs = []
    for para in getText(docfile):
        # The length filter runs on the raw piece, before spaces are removed.
        kept = [piece.replace(' ', "") for piece in msplit(para) if len(piece) > 2]
        if kept:
            segs.append(kept)
    finished = datetime.datetime.now()
    print('加载完成,用时: ', finished - started)
    return segs
def compareParagraph(doc1, i, doc2, j, filter_doc, min_segment=5, min_same_chars=10, min_similarity_ratio=0.5):
p1 = doc1[i]
p2 = doc2[j]
len1 = sum([len(s) for s in p1])
len2 = sum([len(s) for s in p2])
if len1 < min_same_chars or len2 < min_same_chars:
return {}
same_characters = []
for s1 in p1:
if len(s1) < min_segment:
continue
for s2 in p2:
if len(s2) < min_segment:
continue
if s2 in s1 and not any(s2 in p for p in filter_doc):
same_characters.append(s2)
elif s1 in s2 and not any(s1 in p for p in filter_doc):
same_characters.append(s1)
count = sum([len(s) for s in same_characters])
ratio = float(count) / min(len1, len2)
if count &
标签:word,min,python,s2,s1,same,文档,len,import
From: https://blog.csdn.net/tomelrg/article/details/140662116