爬取当当网图书数据并保存到本地,使用requests发送请求、lxml的etree模块解析页面,并通过pandas将数据保存为Excel文件。
爬取网页的url为:
http://search.dangdang.com/?key={}&act=input&page_index={}
其中{}为搜索关键字,page_index为页码。
爬取的数据包括:书名、作者、图书简介、出版社、出版日期、评论数量、价格、编辑推荐、作者简介、排名。
代码如下:
import random
import requests
from lxml import etree
import pandas as pd
import time
data = []
data.append(['书名', '作者', '图书简介', '出版社', '出版日期', '评论数量', '价格', '编辑推荐', '作者简介', '排名'])
def get_book_info(url):
    """Fetch one Dangdang search-result page and scrape every listed book.

    For each <li> in the result list (ul#component_59), follows the first
    link to the book's detail page via get_book_detail(), sleeping a random
    1-5 seconds between books to avoid hammering the server.

    :param url: full search-result URL (keyword and page already encoded in it)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    # BUGFIX: added a timeout so a stalled connection cannot hang the scraper.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        selector = etree.HTML(response.text)
        book_list = selector.xpath('//ul[@id="component_59"]/li')
        for book in book_list:
            hrefs = book.xpath('.//a/@href')
            # BUGFIX: original indexed [0] unconditionally and raised
            # IndexError on a list item with no link (e.g. an ad slot).
            if not hrefs:
                continue
            detail_url = hrefs[0]
            # Dangdang result links are usually scheme-relative
            # ('//product.dangdang.com/...'); only prepend the scheme when
            # it is actually missing, instead of unconditionally as before.
            if detail_url.startswith('//'):
                detail_url = 'http:' + detail_url
            get_book_detail(detail_url)
            time.sleep(random.randint(1, 5))
def get_book_detail(url):
    """Fetch one book's detail page and append a row to the module-level `data`.

    The appended row follows the header-row column order:
    书名, 作者, 图书简介, 出版社, 出版日期, 评论数量, 价格, 编辑推荐, 作者简介, 排名.

    :param url: absolute URL of a product.dangdang.com detail page
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    # Timeout guards against a hung connection stalling the whole run.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    if response.status_code != 200:
        return
    selector = etree.HTML(response.text)

    def _extract(xpath_expr):
        # Join every matched text node with commas; '无' ("none") marks
        # a field that is absent on this page.
        parts = selector.xpath(xpath_expr)
        return ','.join(parts) if parts else '无'

    book_name = _extract('//*[@id="product_info"]/div[1]/h1/text()')
    book_author = _extract('//*[@id="author"]/a//text()')
    book_intro = _extract('//*[@id="product_info"]/div[1]/h2/span[1]/text()')
    book_publisher = _extract('//*[@id="product_info"]/div[2]/span[2]/a//text()')
    book_date = _extract('//*[@id="product_info"]/div[2]/span[3]/text()')
    book_comments = _extract('//*[@id="comm_num_down"]/text()')
    book_price = _extract('//*[@id="dd-price"]/text()')
    book_recommend = _extract('//*[@id="abstract"]/div[2]/p/text()')
    author_intro = ''.join(selector.xpath('//*[@id="authorIntroduction"]/div[2]//text()'))
    book_rank = ''.join(selector.xpath('//*[@id="product_info"]/div[2]/div/span[1]//text()'))
    # BUGFIX: the original appended [.., publisher, intro, .., price, comments, ..],
    # swapping columns 3/4 (图书简介/出版社) and 6/7 (评论数量/价格) relative to
    # the header row, so the exported spreadsheet had mislabeled columns.
    data.append([book_name, book_author, book_intro, book_publisher, book_date,
                 book_comments, book_price, book_recommend, author_intro, book_rank])
if __name__ == '__main__':
    keyword = input('请输入搜索关键字:')
    # Scrape the first five result pages for the given keyword.
    for page_index in range(1, 6):
        search_url = f'http://search.dangdang.com/?key={keyword}&act=input&page_index={page_index}'
        get_book_info(search_url)
    # First row of `data` is the header; the rest are scraped book rows.
    frame = pd.DataFrame(data[1:], columns=data[0])
    # Persist the collected rows as an Excel workbook.
    frame.to_excel('output.xlsx', index=False)
标签:xpath,join,批量,text,当当网,selector,book,id,图书
From: https://blog.csdn.net/svygh123/article/details/139456709