确保安装了以下Python库:
pip install requests beautifulsoup4 python-docx
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Pt

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10


def get_blog_article(url):
    """Download the HTML of a cnblogs article.

    Args:
        url: Address of the article page.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None (a message describing the failure is printed).
    """
    headers = {
        'User-Agent': 'Your User Agent'  # replace with your browser's User-Agent
    }
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "return None"
        # contract as a non-200 status instead of crashing the script.
        print(f"Failed to retrieve article. Error: {exc}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to retrieve article. Status code: {response.status_code}")
    return None


def parse_article(html):
    """Extract the title and body text from an article page.

    The title is read from the <a id="cb_post_title_url"> element and the
    body from the <p> elements inside <div id="cnblogs_post_body">.

    Args:
        html: HTML source of the article page.

    Returns:
        A (title, content) tuple; content holds the text of each <p>
        element joined with newlines.

    Raises:
        ValueError: If the title or body element is not present in the page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    title_tag = soup.find('a', id='cb_post_title_url')
    content_tag = soup.find('div', id='cnblogs_post_body')
    if title_tag is None or content_tag is None:
        # find() returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on None.
        raise ValueError("Article title or body element not found in the page.")

    title = title_tag.text.strip()
    paragraphs = content_tag.find_all('p')
    content = '\n'.join(p.get_text(strip=True) for p in paragraphs)
    return title, content


def create_word_document(title, content, filename):
    """Write the article to a Word (.docx) file.

    Args:
        title: Text of the centred level-1 heading.
        content: Body text, written as a single paragraph.
        filename: Path of the .docx file to create.
    """
    doc = Document()

    # Centred document title.
    heading = doc.add_heading(title, level=1)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # The text must live in the run that carries the font settings;
    # styling a separate empty run would leave the body text unformatted.
    paragraph = doc.add_paragraph()
    run = paragraph.add_run(content)
    run.font.size = Pt(12)
    run.font.name = '宋体'
    # font.name does not set the East Asian font attribute, so set it
    # explicitly for the Chinese text.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

    doc.save(filename)
    print(f"Word document '{filename}' created successfully.")


def main():
    """Fetch one cnblogs article and export it to a Word document."""
    # Example cnblogs article link; replace with the article to export.
    blog_url = 'https://www.cnblogs.com/your-article-url'

    html_content = get_blog_article(blog_url)
    if not html_content:
        print("Failed to fetch article content.")
        return

    try:
        article_title, article_content = parse_article(html_content)
    except ValueError as exc:
        print(f"Failed to parse article. {exc}")
        return

    output_filename = 'blog_article.docx'
    create_word_document(article_title, article_content, output_filename)


if __name__ == "__main__":
    main()
主要注意的是:
1.'User-Agent': 'Your User Agent' # 替换为你的浏览器User-Agent
打开 Chrome 或 Edge 浏览器,在地址栏输入 about:version(Firefox 请输入 about:support),按回车,即可查看你自己浏览器的用户代理(User-Agent)。
一些示例用户代理字符串:
- Chrome浏览器:
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36
- Firefox浏览器:
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0
- Safari浏览器:
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15
2.
# 示例博客园文章链接
blog_url = 'https://www.cnblogs.com/your-article-url'
示例:我的其中一篇博客url:https://www.cnblogs.com/2022-yang/p/18252952
3.
# 生成Word文档
output_filename = 'blog_article.docx'
blog_article.docx 为生成的 Word 文档的名称。
标签:blog,word,title,python,博客园,content,url,article,浏览器
From: https://www.cnblogs.com/2022-yang/p/18257420
唯一的不足就是只能一篇一篇地导出。