Processing PDFs with Python: installing and using pdfplumber
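pdfplumber is a third-party library, so it has to be installed first; `pip install pdfplumber` pulls it from PyPI, and the script below also needs `requests` (installed the same way) for downloading online documents. The core API is small: open a document, iterate over its pages, and call `extract_text()` or `extract_tables()` on each page. A minimal sketch, assuming a local file named `example.pdf` (the file name is only illustrative):

```python
import pdfplumber

# Open a local PDF and read its first page
# ("example.pdf" is a placeholder name for illustration).
with pdfplumber.open("example.pdf") as pdf:
    first_page = pdf.pages[0]
    print(first_page.extract_text())    # the page's plain text as a str
    print(first_page.extract_tables())  # the page's tables: a list of tables, each a list of rows
```

The script below wraps exactly these calls: it accepts either a local path or a URL, downloads online documents with requests into a temporary file, and then merges table rows that were split across page boundaries.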
```python
# -*- coding:utf-8 -*-
"""
@Time :2023/XX/XX
@Auth :Stone
@File :parse_online_pdf.py
@DESC :Parse an online PDF document
"""
import os
import re
import time

import pdfplumber
import requests


def online_pdf_parse(path_or_url, mode=1, url_params=None, proxies=None, save_as=None):
    '''
    <Syntax>
    path_or_url: path or URL of the PDF document
    mode: parsing mode
        [1, '1', 'text']           -> return the document text as str
        [2, '2', 'table']          -> return the tables as list
        [3, '3', 'text_and_table'] -> return both as a tuple
    url_params: extra keyword arguments passed to requests when fetching an online PDF (dict)
    proxies: proxies passed to requests when fetching an online PDF
    save_as: when fetching an online PDF, also save a local copy under this path for later use
    </Syntax>
    '''
    url_mode = False
    # Decide whether the argument is a local path or a URL
    if re.search(r'''(?x)\A([a-z][a-z0-9+\-.]*)://([a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])''', path_or_url):
        url_mode = True
    else:
        pdf_path = path_or_url

    if url_mode:
        pdf_url = path_or_url
        headers_d = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; '
                          '.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; '
                          'Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko'}
        if not proxies:
            proxies = {}
        if not url_params:
            url_params = {'headers': headers_d, 'data': None, 'params': None}
        url_params['proxies'] = proxies
        if not url_params.get('headers'):
            url_params['headers'] = headers_d
        # Use POST when a body or query parameters were supplied, otherwise GET
        if url_params.get('data') or url_params.get('params'):
            response = requests.post(pdf_url, **url_params)
        else:
            response = requests.get(pdf_url, **url_params)
        # Write the download to a (temporary) file before parsing
        pdf_path = save_as if save_as else f'~temp{time.time()}~.pdf'
        with open(pdf_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
        pdf_path = os.path.abspath(pdf_path)

    # Parse the PDF document with pdfplumber
    pdf_text = ''
    pdf_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if str(mode).lower() in ['1', 'text', '0', '3']:
                pdf_text += str(page.extract_text())
            if str(mode).lower() in ['2', 'table', '0', '3']:
                pdf_tables += page.extract_tables()

    # Remove the temporary PDF file
    if url_mode and not save_as:
        try:
            os.remove(pdf_path)
        except Exception:
            pass

    if str(mode).lower() in ['1', 'text']:
        return pdf_text
    elif str(mode).lower() in ['2', 'table']:
        return pdf_tables
    elif str(mode).lower() in ['3', 'text_and_table']:
        return pdf_text, pdf_tables


def replace_str(str_font):
    """Strip newlines and spaces from a cell value"""
    str_font = str(str_font).replace('\n', '').replace(' ', '').replace(': ', ':')
    return str_font


def link_last_list(need_list):
    """Merge a row into the previous one when its first cell is empty"""
    result_total = []
    for current_list in need_list:
        if current_list[0] == '':  # first cell is empty: this row is a continuation
            if result_total:  # there is a previous row to merge into
                # Concatenate the current row onto the previous row, cell by cell
                new_list = []
                for i in range(len(result_total[-1])):
                    new_value = str(result_total[-1][i]) + str(current_list[i])
                    new_list.append(new_value)
                result_total[-1] = new_list
            else:
                result_total.append(current_list)
        else:
            result_total.append(current_list)
    print(f"merged rows = {result_total}")
    return result_total


if __name__ == '__main__':
    pdf_url = "********************************"
    pdf_tables = online_pdf_parse(pdf_url, mode='table')
    # extract_tables() returns results page by page, so a logical row can be
    # split across two pages; flatten the rows, clean the cells, then re-join them
    data = []
    for item in pdf_tables:
        for dd in item:
            data.append([replace_str(str_item) for str_item in dd])
    result_list = link_last_list(data)
    print(f"merged table rows = {result_list}")
```
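For reference, here is how the helpers might be called; the file path and URL below are placeholders, not real addresses:

```python
# Hypothetical call: parse a local file and get both text and tables.
# "report.pdf" is a placeholder path; replace it with a real document.
text, tables = online_pdf_parse("report.pdf", mode="text_and_table")
print(text[:200])   # first 200 characters of the extracted text
print(len(tables))  # number of tables found across all pages

# Hypothetical call: parse an online document and keep a local copy.
# The URL is a placeholder and must point to an actual PDF.
tables = online_pdf_parse("https://example.com/some.pdf", mode="table",
                          save_as="some.pdf")
```

And a tiny made-up example of how `link_last_list` repairs a row that was split across a page break (a continuation row is recognised by its empty first cell):

```python
rows = [
    ['Name', 'Description'],
    ['item-1', 'first half of a long cell '],
    ['', 'second half that spilled onto the next page'],
]
merged = link_last_list(rows)
# merged == [['Name', 'Description'],
#            ['item-1', 'first half of a long cell second half that spilled onto the next page']]
```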