import docx2txt
import fitz
import docx
from docx.oxml import parse_xml
def _table_to_text(table):
    """Render a python-docx table as text.

    Cells of a row are joined with "|" and rows are joined with newlines.
    python-docx yields the same cell once per grid column that a horizontally
    merged cell spans; each repeat is rendered as an empty string so the
    merged text appears only once while the column count is preserved.
    """
    rendered_rows = []
    for row in table.rows:
        seen_cells = []  # underlying <w:tc> elements already emitted in this row
        row_texts = []
        for cell in row.cells:
            tc = cell._tc
            # Compare by element identity, not by text: two distinct cells
            # that merely contain the same value must both be kept.
            if any(tc is seen for seen in seen_cells):
                row_texts.append("")
            else:
                seen_cells.append(tc)
                row_texts.append(cell.text)
        rendered_rows.append("|".join(row_texts))
    return "\n".join(rendered_rows)


def get_doc_content(filepath):
    """Extract the text content of a Word (.docx) document.

    Walks the document body in order so paragraphs and tables keep their
    relative positions. Each paragraph contributes its text; each table
    contributes one line per row with cells separated by "|". All pieces are
    joined with newlines.

    Args:
        filepath: Path of the .docx file, passed to ``docx.Document``.

    Returns:
        The extracted text. If the python-docx based extraction raises, the
        result of ``docx2txt.process(filepath)`` is returned instead.

    Raises:
        Whatever ``docx2txt.process`` raises if the fallback fails as well.
    """
    try:
        doc = docx.Document(filepath)
        parts = []
        for element in doc.element.body:
            kind = element.__class__.__name__
            if kind == 'CT_P':  # paragraph
                # Wrap the body element directly; serialising it to XML and
                # re-parsing it first yields the same paragraph text.
                paragraph = docx.text.paragraph.Paragraph(element, parent=None)
                parts.append(paragraph.text)
            elif kind == 'CT_Tbl':  # table
                table = docx.table.Table(element, parent=None)
                parts.append(_table_to_text(table))
        content = "\n".join(parts)
    except Exception:
        # Best-effort fallback: let docx2txt handle the file when the
        # structured walk above fails.
        content = docx2txt.process(filepath)
    return content
def get_pdf_content(filepath):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        filepath: Path of the PDF file, passed to ``fitz.Document``.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator added between pages (empty string for a document that
        yields no pages).
    """
    # The context manager closes the document once all pages are read.
    with fitz.Document(filepath) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)
def get_file_content(filepath):
    """Read the text content of a .docx, .pdf or .txt file.

    The file type is chosen from the path's extension, matched
    case-insensitively (so ``REPORT.PDF`` is handled like ``report.pdf``).
    ``.txt`` files are read as UTF-8.

    Args:
        filepath: Path of the file to read, as a string.

    Returns:
        The extracted text, or ``None`` when the extension is not supported
        or when reading/extraction fails for any reason (missing file,
        undecodable text, corrupt document, non-string path, ...).
    """
    try:
        # Lower-case only the copy used for matching; the original path is
        # what gets opened, so case-sensitive file systems keep working.
        lowered = filepath.lower()
        if lowered.endswith(".docx"):
            return get_doc_content(filepath)
        if lowered.endswith(".pdf"):
            return get_pdf_content(filepath)
        if lowered.endswith(".txt"):
            with open(filepath, "r", encoding="utf-8") as f:
                return f.read()
    except Exception:
        # Deliberate best-effort: callers receive None on failure. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        return None
    return None
# Tags: docx, word, filepath, python, content, texts, table, pdf, row
# Source: https://www.cnblogs.com/lanjianhua/p/18685456