import re from openpyxl import Workbook pdf_name = 'D:/beifangzhongzhi/zhongye/百保科技/疾病/疾病.pdf' import PyPDF2 def get_text(pdf_name): with open(pdf_name, 'rb') as file: reader = PyPDF2.PdfReader(file) num_pages = len(reader.pages) text_str = '' for i in range(num_pages): page = reader.pages[i] # 处理每一页的内容 text = page.extract_text() text_str += text return text_str # 创建一个新的工作簿 wb = Workbook() # 选择默认的工作表 ws = wb.active # 获取pdf文字内容 text = get_text(pdf_name) # 获取重大疾病定义内容 zhongdajibing_old = \ ''.join(text.split('第五十六条 重大疾病定义')[1]).split('第五十七条 中症疾病定义 ')[0] '''通过数字加小数点分段,再根据第一个空格切割疾病名称和症状''' zhongdajibing = zhongdajibing_old.replace('\n', '') num_list = re.findall('\d+\. \d+', zhongdajibing_old) for num in num_list: zhongdajibing = zhongdajibing.replace(num, '###') zhongdajibing_list = zhongdajibing.split('###') print('zhongdajibing_list:',len(zhongdajibing_list),zhongdajibing_list) num=2 for zhongdajibing_new in zhongdajibing_list: if zhongdajibing_new == ' ': pass else: # 删除瑞华新瑞保终身重大疾病保险( 2.0版) zhongdajibing_new = re.sub('\d+ 瑞华新瑞保终身重大疾病保险( 2.0版) ', '', zhongdajibing_new) zhongdajibing = zhongdajibing_new.split(' ') # 删除空格和开头数字 zhongdajibing_z = [z for z in zhongdajibing if z != '' and z.isdigit() == False] zhongdajibing_z = ''.join(zhongdajibing_z).split(' ') zhongdajibing_y = [y.strip() for y in zhongdajibing if y != '' and y.isdigit() == False] name_old = zhongdajibing_y[0] name_old = re.sub('\d+', '', name_old) print(f'zhongdajibing_y:{len(zhongdajibing_y),zhongdajibing_y}') print(f'name_old:{name_old}') name = name_old.strip().split(' ')[0] if len(zhongdajibing_y)==1: zhengzhuang = ''.join(name_old.strip().split(' ')[1:]) else: zhengzhuang = ''.join(name_old.strip().split(' ')[1:] + zhongdajibing_y[1:]) print(name) print(zhengzhuang) # 数据可以直接分配到单元格中 ws['A1'] = "疾病名字" ws['B1'] = "疾病描述" ws[f'A{num}'] = name ws[f'B{num}'] = zhengzhuang # 保存工作簿到文件 wb.save("重大疾病定义.xlsx") print('\n') num+=1
# 标签: old, 读取, text, excel, num, split, zhongdajibing, PDF, name
# From: https://www.cnblogs.com/wolvies/p/18179791