初次尝试用 Python 将 PDF 转换为 Excel 表格,如有错误欢迎指出。
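需要用到的库如下(下文代码导入的 pdfminer.pdfpage 模块由 pdfminer.six 提供,pdfminer3k 中没有该模块;tabula-py 运行时需要本机已安装 Java,pandas 会作为 tabula-py 的依赖一并安装):
pip install pdfminer.six
pip install tabula-py
pip install openpyxl
如果是pip3,则:
pip3 install pdfminer.six
pip3 install tabula-py
pip3 install openpyxl
在终端中执行以上命令即可安装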
新建一个IDLE文件,源码如下:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTText, LTFigure, LTImage, LTChar, LTTextBoxHorizontal
from pdfminer.pdfpage import PDFPage
from io import StringIO
def extract_table(pdf_path):
    """Extract the text of every horizontal text box in a PDF.

    Despite its name, this function does not reconstruct table structure:
    it walks each page's layout and concatenates the text of the
    ``LTTextBoxHorizontal`` objects it finds.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string containing the text of each horizontal text box,
        each followed by a newline, in the order the pages and their
        layout objects are iterated.
    """
    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(pdf_path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            # The aggregator exposes the layout of the page just processed.
            layout = device.get_result()
            for lt_obj in layout:
                # Only horizontal text boxes are kept; other layout
                # objects (figures, images, ...) are skipped.
                if isinstance(lt_obj, LTTextBoxHorizontal):
                    outfp.write(lt_obj.get_text())
                    outfp.write('\n')
    return outfp.getvalue()
# Replace the path inside the parentheses with the real path to your PDF file.
table = extract_table('/Users/1.pdf')
print(table)
import tabula
def convert_to_csv(pdf_path, csv_path):
    """Convert a PDF to a CSV file using tabula.

    Args:
        pdf_path: Path of the PDF file to read.
        csv_path: Path of the CSV file to write.
    """
    # pages="all" asks tabula to process every page rather than a subset.
    tabula.convert_into(pdf_path, csv_path, output_format="csv", pages="all")
# Replace the paths inside the parentheses with your real input PDF and output CSV paths.
convert_to_csv("/Users/1.pdf", "/Users/1.csv")
import pandas as pd
# Replace the path inside the parentheses with the real path to your CSV file.
df = pd.read_csv("/Users/1.csv")
print(df)
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
def convert_to_excel(csv_path, excel_path):
    """Copy the contents of a CSV file into a new Excel workbook.

    The CSV is loaded into a DataFrame, its rows are appended to the
    active sheet of a freshly created workbook, and the workbook is saved.

    Args:
        csv_path: Path of the CSV file to read.
        excel_path: Path of the Excel file to write.
    """
    df = pd.read_csv(csv_path)
    book = Workbook()
    sheet = book.active
    # index=False leaves the DataFrame index out of the sheet;
    # header=True emits the column names as a row before the data.
    for r in dataframe_to_rows(df, index=False, header=True):
        sheet.append(r)
    book.save(excel_path)
# Replace the paths inside the parentheses with your real input CSV and output Excel paths.
convert_to_excel("/Users/1.csv", "/Users/1.xlsx")
执行后即可在指定路径下看到输出的文件
标签:python,excel,install,import,pdf,path,csv
From: https://www.cnblogs.com/fannyLi/p/17744858.html