爬取网站:当当网
代码
import re
import requests
import time
import xlwt
# Base search URL; the keyword and paging parameters are appended per request.
url_basic = 'http://search.dangdang.com/?key='

# Browser-like request headers sent with every search request.
#
# The long values are assembled from adjacent string literals instead of a
# backslash continuation inside the literal: a backslash continuation splices
# the next source line's leading whitespace (or the lack of it) into the
# header value, which previously produced "AppleWebKit/537.36(KHTML" with no
# separating space.
heads = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;'
        'q=0.9,image/webp,image/apng,*/*;q=0.8'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}
# Accumulates one 5-tuple of captured strings per matched search result,
# across all fetched pages.
xinxi = []

# Search keyword; replace with any book you want to search for.
key = "多元统计分析"

# Number of result pages to fetch. Check on dangdang.com how many pages the
# search actually returns and set this to match.
page_count = 18

# Compiled once, outside the loop. One match per search-result <li>; the five
# capture groups are, in order: the title attribute of the first link, the
# number following "search_now_price", the number following
# "search_pre_price", the digits inside the link carrying "ddclick=", and the
# text of the link after that. The spreadsheet step labels these as name,
# current price, original price, review count and author/publisher.
pattern = re.compile(
    r'<li.*?<a title="(.*?)".*?>.*?search_now_price">.*?(\d+\D\d+)</span>.*?search_pre_price">.*?(\d+\D\d+)</span>.*?<a href=.*?ddclick=.*?>(\d+).*?</a>.*?<a href=.*?>(.*?)</a>.*?</span>.*?</li>',
    re.S,
)

for page_index in range(1, page_count + 1):
    url = url_basic + key + "&act=input&page_index=" + str(page_index)
    print(url)
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, headers=heads, timeout=10)
    except requests.RequestException as err:
        # Stop paging on a network failure, but report it instead of
        # swallowing it silently; results gathered so far are kept.
        print("获取失败:", err)
        break
    content = response.text
    results = pattern.findall(content)
    xinxi += results
    print("获取成功")
    # Pause between requests so the site is not hit in a tight loop.
    time.sleep(1)
# Save the collected results to an Excel workbook.
# Original author's note: run this step from IDLE; for direct or plain-text
# output, see the author's previous/next blog post.
headee = ["序号", "名称", "现价", "原价", "评论数", "作者/出版社"]

# xlwt only produces the legacy binary .xls format, so the file is named
# ".xls"; the same bytes saved under an ".xlsx" name are rejected by Excel as
# a format/extension mismatch.
output_path = r"C:\Users\AdamCY\Desktop\wenjian\python爬虫\py_beiyesi.xls"

# No open() around this: Workbook.save() creates and writes the file itself,
# so a separate text-mode handle on the same path is redundant.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet('sheet_1')

# Header row.
for column, title in enumerate(headee):
    sheet.write(0, column, title)

# One row per scraped result: a 1-based serial number in column 0, followed
# by the captured fields in their original order.
for serial, result in enumerate(xinxi, start=1):
    sheet.write(serial, 0, serial)
    for column, value in enumerate(result, start=1):
        sheet.write(serial, column, value)

workbook.save(output_path)
print("excel数据保存成功")
结果: