# 对于刚入门爬虫的小伙伴来说,多练习代码、积累经验是非常有必要的。下面是一个爬虫小案例,欢迎大家指正。
import requests
# Tags: info, 示例, Python, text, 爬虫, df, article, getTargetInfo, div
# From: https://blog.51cto.com/u_13488918/6024759
from bs4 import BeautifulSoup
# import pandas
def GetBlogByPage(pageNum):
    """Scrape one page of a blog's article list and append it to ``blog.txt``.

    The page is fetched with a desktop-browser User-Agent, parsed with
    BeautifulSoup, and every ``.article-item-box`` element is turned into a
    dict. Each dict is printed and written to ``blog.txt`` on its own line.

    The output file is opened in append mode so that calling this function
    for several pages in a row keeps every page's records instead of
    overwriting the file each time. Delete ``blog.txt`` beforehand if a fresh
    file is wanted.

    Args:
        pageNum: Page number substituted into ``targetUrl`` via ``str.format``.

    Returns:
        A list of dicts, one per article, with the keys ``title``,
        ``source``, ``sendTime``, ``ReadNum`` and ``writeNum``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        AttributeError, IndexError, KeyError: If an article box does not have
            the markup the selectors below expect.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    # Placeholder: replace with the real list URL containing a "{}" slot for
    # the page number. Without that slot format() returns the string
    # unchanged and every call requests the same address.
    targetUrl = "目标网站"

    # A timeout keeps a stalled server from hanging the crawl forever.
    response = requests.get(targetUrl.format(pageNum), headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as results.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")

    getTargetInfo = []
    for article in soup.select('.article-item-box'):
        spans = article.div.select('span')
        getTargetInfo.append({
            "title": article.a.text.strip(),
            "source": article.a['href'].strip(),
            "sendTime": article.div.p.span.text.strip(),
            "ReadNum": spans[1].text,
            "writeNum": spans[3].text,
        })

    # Append (not truncate) so earlier pages survive; explicit UTF-8 so
    # non-ASCII titles are written identically on every platform.
    with open("blog.txt", 'a', encoding='utf-8') as f:
        for info in getTargetInfo:
            print(info)
            # One record per line so the file can be read back line by line.
            f.write(str(info) + "\n")

    # Optional spreadsheet export (needs the commented-out pandas import):
    # df=pandas.DataFrame(getTargetInfo)
    # df.head()
    # df.to_excel('blog.xlsx')
    return getTargetInfo
# Crawl page by page: request list pages 1 through 8 in ascending order.
for page in range(1, 9):
    GetBlogByPage(page)