- 提取所有 Word 文档中的指定部分内容,并合并到一个 txt 文件中:
import glob
import docx
def get_text_between_headings(doc, heading1, heading2):
    """Return the text of the paragraphs between two heading paragraphs.

    Args:
        doc: Object exposing ``paragraphs``, a list of items that each have
            a ``text`` attribute (for example a python-docx ``Document``).
        heading1: Exact paragraph text that opens the section.
        heading2: Exact paragraph text that closes the section.

    Returns:
        The texts of the paragraphs strictly between the last ``heading1``
        seen and the first ``heading2`` that follows it, joined with
        newlines. An empty string when ``heading1`` is absent or no
        ``heading2`` follows it.
    """
    # Read the paragraph list once instead of re-evaluating ``doc.paragraphs``
    # for every index.
    paragraphs = doc.paragraphs
    start = -1
    for i, paragraph in enumerate(paragraphs):
        text = paragraph.text
        if text == heading1:
            start = i
        elif start >= 0 and text == heading2:
            # Only a closing heading found *after* the opening one counts;
            # an earlier occurrence must not cut the search short.
            return '\n'.join(p.text for p in paragraphs[start + 1:i])
    return ''
# Collect the paths of all Word files in the current directory. Sorted so the
# merge order is the same on every run; glob's own result order depends on
# the file system.
file_paths = sorted(glob.glob('./*.docx'))
# Create (or overwrite) the merged text file.
with open('./output.txt', 'w', encoding='utf-8') as f:
    # Walk through every Word file and write its selected section to the
    # text file.
    for file_path in file_paths:
        doc = docx.Document(file_path)
        fruit = get_text_between_headings(doc, "Done", "Introspection")
        # Skip sections whose text starts with 'Figure' or '['.
        if not fruit.startswith(('Figure', '[')):
            f.write(fruit)
- 给txt文档中的所有空行按顺序添加指定内容:
# output.txt is UTF-8 text, so read it with an explicit encoding instead of
# relying on the platform default (which is not UTF-8 on every system).
with open('output.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
week_no = 0
# Write the result as UTF-8 as well so the text round-trips unchanged.
with open('processed_file.txt', 'w', encoding='utf-8') as file:
    for line in lines:
        if not line.strip():  # blank (empty or whitespace-only) line
            # Prefix the blank line with a running "Week N" label; the
            # line's own newline is written right after it below.
            week_no += 1
            file.write(f'Week {week_no}')
        file.write(line)
作者:艾孜尔江
转载请务必标明出处!