"""Walk a Word document in body order and export its table data with pandas."""
import docx
from docx.document import Document
from docx.table import _Cell, Table
from docx.oxml.text.paragraph import CT_P
from docx.text.paragraph import Paragraph
from docx.oxml.table import CT_Tbl
import pandas as pd

# Input document and export targets used when the file is run as a script.
DOCX_PATH = 'test.docx'
CSV_PATH = 'output.csv'
XLSX_PATH = 'output.xlsx'


def iter_block_items(parent):
    """Yield the paragraphs and tables directly inside *parent*, in order.

    Args:
        parent: A ``Document`` or a table ``_Cell``.

    Yields:
        A ``Paragraph`` for every ``CT_P`` child element and a ``Table`` for
        every ``CT_Tbl`` child element. Other child elements are skipped.

    Raises:
        ValueError: If *parent* is neither a ``Document`` nor a ``_Cell``.
    """
    if isinstance(parent, Document):
        # A document keeps its block-level content under the body element.
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        # A cell keeps its block-level content under its own XML element.
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)


def get_table_dataframe(table):
    """Convert a table into a DataFrame, using its first row as the header.

    Args:
        table: An object exposing ``rows``; each row exposes ``cells`` and
            each cell exposes ``text``.

    Returns:
        A ``pandas.DataFrame`` with one row per table row after the header.
        Repeated header texts collapse into a single column (the last value
        in the row wins). A table that only has a header row yields an empty
        DataFrame that still carries the header as its columns.
    """
    header = None
    records = []
    for row in table.rows:
        values = [cell.text for cell in row.cells]
        if header is None:
            # The first row supplies the column names.
            header = values
            continue
        records.append(dict(zip(header, values)))

    # Passing the columns explicitly keeps the header even when there are no
    # data rows; dict.fromkeys drops repeated names while preserving order.
    columns = None if header is None else list(dict.fromkeys(header))
    return pd.DataFrame(records, columns=columns)


def main():
    """Print the document's content and export its last 'Table Grid' table.

    Every table styled 'Table Grid' is printed as a DataFrame, and the text
    of every paragraph styled 'Normal' or 'Heading 1' is printed (add further
    style names such as 'Heading 2' to handle more heading levels). The last
    table found is written to ``CSV_PATH`` and ``XLSX_PATH`` and then read
    back from both files.

    Returns:
        A ``(csv_df, xlsx_df)`` tuple with the DataFrames read back from the
        exported files, or ``None`` if the document has no matching table.
    """
    doc = docx.Document(DOCX_PATH)

    df = None
    for block in iter_block_items(doc):
        style_name = block.style.name
        # Dispatch on the block type first so that ``.text`` is only ever
        # read from paragraphs, never from a table.
        if isinstance(block, Table):
            if style_name == 'Table Grid':
                df = get_table_dataframe(block)
                print(df)
        elif style_name in ('Normal', 'Heading 1'):
            print(block.text)

    if df is None:
        # No matching table was found, so there is nothing to export.
        return None

    # The 'utf_8_sig' encoding writes a BOM at the start of the CSV file.
    df.to_csv(CSV_PATH, sep=',', index=False, header=True, encoding='utf_8_sig')
    # ``to_excel`` no longer accepts ``encoding`` (removed in pandas 2.0).
    df.to_excel(XLSX_PATH, index=False, header=True)

    csv_df = pd.read_csv(CSV_PATH)
    # 'Sheet1' is the sheet name ``to_excel`` uses by default.
    xlsx_df = pd.read_excel(XLSX_PATH, sheet_name='Sheet1')
    return csv_df, xlsx_df


if __name__ == '__main__':
    main()
# Tags: docx, traversal, Word, parent, Python, text, df, import, block
# Source: https://www.cnblogs.com/QQ-77Ly/p/17681357.html