简易代码
# Export every post of a cnblogs.com blog to local Markdown files.
#
# The script walks the blog's paginated index, follows each post link found
# inside the "day" blocks, converts the post's HTML to Markdown with html2text
# and writes the result to ./博客园/<title>.md.

import os
import re

import html2text
import requests
from bs4 import BeautifulSoup

session = requests.session()

cookies = {
    # Replace with your own cookies.
}

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

total_page = 31  # number of index pages plus 1 (range() excludes its end)
name = "angelyan"  # blog name, i.e. the path segment after cnblogs.com/

OUTPUT_DIR = r"./博客园"  # directory the Markdown files are written to
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs forever

# Characters that cannot appear in a file name on Windows ('/' is also
# rejected on POSIX), plus ASCII control characters.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
_POST_LINK_CLASS = re.compile('^postTitle2')


def _safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced by '_'.

    Titles that contain no such characters are returned unchanged; a title
    that is empty after cleaning becomes "untitled".
    """
    cleaned = _ILLEGAL_FILENAME_CHARS.sub("_", title).strip()
    return cleaned or "untitled"


def _fetch_soup(url, params=None):
    """GET *url* with the shared session and return the parsed page.

    Returns None (after printing the reason) when the request fails, so that
    one bad page does not abort the whole export.
    """
    try:
        response = session.get(
            url,
            cookies=cookies,
            headers=headers,
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        print("request failed: %s (%s)" % (url, exc))
        return None
    return BeautifulSoup(response.text, "lxml")


# open(..., "w") does not create missing parent directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for page in range(1, total_page):
    index_soup = _fetch_soup('https://www.cnblogs.com/%s' % name, params={'page': page})
    if index_soup is None:
        continue

    for day in index_soup.find_all("div", class_="day"):
        # find_all so that every matching link in a day block is exported,
        # not only the first one.
        for link in day.find_all("a", class_=_POST_LINK_CLASS):
            a_url = link.get("href")
            if not a_url:
                continue
            print(a_url)

            post_soup = _fetch_soup(a_url)
            if post_soup is None:
                continue

            heading = post_soup.find("h1", class_="postTitle")
            if heading is None:
                # Page has no post title element; nothing to name the file after.
                continue
            title = heading.text.strip()
            print(title)

            html = post_soup.find("div", class_="post")
            if html is None:
                print("no post body found, skipped: %s" % a_url)
                continue

            markdown = html2text.html2text(str(html))
            target = os.path.join(OUTPUT_DIR, "%s.md" % _safe_filename(title))
            with open(target, "w", encoding="utf-8") as f:
                f.write(markdown)
标签:session,cookies,python,text,博客园,headers,Markdown,import,page From: https://www.cnblogs.com/angelyan/p/17789203.html