import docx
import re
import pandas as pd

# Edit manually: source-document name written to the "文档来源" column.
word_name = ""
# Edit manually: path of the .docx file to read.
DOCX_PATH = r".\doc\已梳理\未添加(手动添加)\新建 DOCX 文档.docx"
# Excel workbook produced by the script.
OUTPUT_PATH = "49.xlsx"

# Output columns, in the order they appear in the workbook.
SOURCE_COLUMN = "文档来源"
QUESTION_COLUMN = "问题"
ANSWER_COLUMN = "答案"


def normalize_text(text: str) -> str:
    """Return *text* with full-width colons replaced by ASCII colons."""
    # NOTE(review): the original call was replace(":", ":"), i.e. a no-op.
    # Presumably the first argument was the full-width colon (U+FF1A) before
    # the file was ASCII-normalised -- confirm against the source documents.
    return text.replace("\uff1a", ":")


def extract_between(text: str, start: str, end: str) -> list:
    """Return every shortest span of *text* enclosed by *start* and *end*.

    The markers themselves are not included in the results.
    """
    # DOTALL lets a span contain "\n"; without it, any span that includes a
    # line break would be silently skipped by ".".
    pattern = re.compile(
        re.escape(start) + r"(.*?)" + re.escape(end), re.DOTALL
    )
    return pattern.findall(text)


def build_qa_frame(questions: list, answers: list, source: str) -> pd.DataFrame:
    """Build the export table with columns source, question, answer.

    Raises:
        ValueError: if the number of questions and answers differ.
    """
    if len(questions) != len(answers):
        raise ValueError(
            f"found {len(questions)} questions but {len(answers)} answers; "
            "fix the qstart/qend and astart/aend markers in the document"
        )
    return pd.DataFrame(
        {
            SOURCE_COLUMN: [source] * len(questions),
            QUESTION_COLUMN: list(questions),
            ANSWER_COLUMN: list(answers),
        },
        columns=[SOURCE_COLUMN, QUESTION_COLUMN, ANSWER_COLUMN],
    )


def main() -> None:
    """Read the Word document, extract Q/A pairs and write them to Excel."""
    document = docx.Document(DOCX_PATH)
    # Concatenate the text of all paragraphs into one string.
    word_text = normalize_text(
        "".join(paragraph.text for paragraph in document.paragraphs)
    )

    question_list = extract_between(word_text, "qstart", "qend")
    answer_list = extract_between(word_text, "astart", "aend")

    # Printed before the table is built so the answers can be inspected
    # (and the document fixed by hand) when the counts do not match.
    print(answer_list)
    print(len(answer_list))

    frame = build_qa_frame(question_list, answer_list, word_name)
    frame.to_excel(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()
# Tags: docx, word, 后缀, text, list, 获取, df, columns
# Source: https://www.cnblogs.com/tiansz/p/16872693.html