工作中遇到的问题:需要从PDF中提取表格数据,并数据落到MySQL库中。具体情况如下图所示:
目录页:
代码数据页:
使用python实现代码如下(水平有限,此代码仅供自用):
import pdfplumber
pdf = r'XXX.pdf'
wookroot = pdfplumber.open(pdf)
pages = wookroot.pages
table_text = ''
for page in pages[5:9]:
text = page.extract_text()
# print(type(text))
table_text += text
post_start = table_text.rfind("7 代码集")
post_end = table_text.rfind("附录1")
# print(post_start, post_end)
table_text = table_text[post_start:post_end]
# print(table_text)
tableName_list = []
for _ in range(111):
tableName = []
post1 = table_text.rfind("(")
post2 = table_text.rfind(")")
post3 = table_text.rfind(" ")
# print(post1, post2, post3)
# print(table_text[post3:post1], table_text[post1 + 1:post2])
tableName.append(table_text[post3 + 1:post1])
tableName.append(table_text[post1 + 1:post2])
tableName_list.append(tableName)
table_text = table_text[:post3 - 1]
# 获取表名和表注释列表tableName_list
tableName_list.sort()
print(tableName_list)
print(len(tableName_list))
value_list = []
for page in pages[112:166]:
tables = page.extract_tables()
for ele in tables:
for elele in ele:
value_list.append(elele)
value_list = value_list[10:]
print((value_list))
# print(len(vlalue_list))
xh = []
for i, value in enumerate(value_list):
values =[]
if value == ['代码', '名称']:
xh.append(i)
print(len(xh))
value_list_new = []
for i in range(111):
if i != 110:
value_list_new.append(value_list[xh[i]:xh[i + 1]])
else:
value_list_new.append(value_list[xh[i]:])
#获取111个表的值的列表value_list_new
print(len(value_list_new))
print(value_list_new)
# 打开文件并写入数据
file = open("output.sql", "w") # 创建或覆盖名为"output.sql"的文本文件
# file.write("Hello, World!") # 向文件中写入字符串
for i, value in enumerate(value_list_new):
for j in value:
# print(j)
if j == ['代码', '名称']:
print("-- %s.dm_%s %s" % ((i + 1), tableName_list[i][0].lower(), tableName_list[i][1]))
file.write("-- %s.dm_%s %s\r\n" % ((i + 1), tableName_list[i][0].lower(), tableName_list[i][1]))
print("create table dm_%s(dm varchar(200) comment '代码', mc varchar(200) comment '名称') comment='%s';" \
% (tableName_list[i][0].lower(), tableName_list[i][1]))
file.write("drop table if exists dm_%s;\r\n" % tableName_list[i][0].lower())
file.write("create table dm_%s(dm varchar(200) comment '代码', mc varchar(200) comment '名称') comment='%s';\r\n"\
% (tableName_list[i][0].lower(), tableName_list[i][1]))
else:
print("insert into dm_%s values('%s','%s');" % (tableName_list[i][0].lower(), j[0], j[1]))
file.write("insert into dm_%s(dm, mc) values('%s','%s');\r\n" % (tableName_list[i][0].lower(), j[0], j[1]))
file.close() # 关闭文件
wookroot.close()
标签:表格,python,list,tableName,value,text,print,PDF,table
From: https://www.cnblogs.com/qixinghaitang/p/18001154