import json
# Tags: text, cnblogs, update, item, href, print, post, online, resp
# From: https://www.cnblogs.com/dspython/p/17748947.html
import re
import requests
from bs4 import BeautifulSoup
# Scrape the first pages of the cnblogs aggregated post list and save each
# post's link, title and "digg" span text to a tab-separated text file.

# Path of the output file (written in the current working directory).
OUTPUT_PATH = "博客爬取文章列表标题及地址.txt"
# Endpoint that returns one page of the post list as an HTML fragment.
LIST_URL = "https://www.cnblogs.com/AggSite/AggSitePostList"
# Number of pages to fetch; pages are requested as 1..PAGE_COUNT.
PAGE_COUNT = 20
# Per-request timeout in seconds.
REQUEST_TIMEOUT = 3
REQUEST_HEADERS = {
    "Content-Type": "application/json; charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47"
}
# Matches the id of the span read for each post (ids starting with "digg");
# presumably this span holds the post's recommendation count -- TODO confirm.
DIGG_ID_PATTERN = re.compile(r"^digg")


def build_payload(page_index):
    """Return the request body (as a dict) for page *page_index* (1-based)."""
    return {
        "CategoryType": "SiteHome",
        "ParentCategoryId": 0,
        "CategoryId": 808,
        "PageIndex": page_index,
        "TotalPostCount": 4000,
        "ItemListActionName": "AggSitePostList"
    }


def format_record(href, text, number):
    """Return one output line: ``href<TAB>text<TAB>number<NEWLINE>``.

    The original format string put the newline between the title and the
    number, so every record was split across two lines and fused with the
    next one; each post now occupies exactly one line.
    """
    return "%s\t%s\t%s\n" % (href, text, number)


def fetch_page(page_index):
    """POST the list request for *page_index* and return the response text.

    Raises:
        Exception: if the server answers with any status other than 200.
    """
    resp = requests.post(
        LIST_URL,
        data=json.dumps(build_payload(page_index)),
        headers=REQUEST_HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    if resp.status_code != 200:
        print(resp.status_code)
        raise Exception(
            "unexpected HTTP status %s for page %s" % (resp.status_code, page_index)
        )
    return resp.text


def parse_posts(html):
    """Yield ``(href, text, number)`` for every post article found in *html*.

    Articles missing either the title link or the "digg" span are reported
    and skipped instead of crashing the whole run with an AttributeError.
    """
    soup = BeautifulSoup(html, "html.parser")
    for post_item in soup.find_all("article", class_="post-item"):
        link = post_item.find("a", class_="post-item-title")
        span = post_item.find("span", id=DIGG_ID_PATTERN)
        if link is None or span is None:
            print("skip: article without title link or digg span")
            continue
        # Surrounding whitespace would break the one-record-per-line layout.
        yield link.get("href"), link.get_text().strip(), span.get_text().strip()


def main():
    """Fetch every page and append its posts to the output file."""
    # The context manager closes the file even when a request or parse fails.
    with open(OUTPUT_PATH, "w", encoding="utf8") as f_out:
        for idx in range(PAGE_COUNT):
            print("#" * 50, idx + 1)
            for href, text, number in parse_posts(fetch_page(idx + 1)):
                print(href, text, number)
                # Write the record to the output file; flush so partial
                # results survive if a later page fails.
                f_out.write(format_record(href, text, number))
                f_out.flush()
                print("success:%s, %s, %s" % (href, text, number))


if __name__ == "__main__":
    main()