import requests
import parsel
import csv
import re
# Open the CSV file in append mode and write the header row
with open('xxxgame.csv', mode='a', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['title', 'num', 'server', 'price', 'href', 'pic'])
    csv_writer.writeheader()
    url = 'http://www.***.com/list.aspx?gid=30&page=1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'
    }
    try:
        response = requests.get(url=url, headers=headers)
        # Keep the returned page source in a variable
        html_data = response.text
        # Wrap the HTML in a parsel Selector so it can be queried with CSS selectors
        selector = parsel.Selector(html_data)
        # The second-to-last link in the "div.digg" pager holds the last page number
        max_page = int(selector.css('div.digg a')[-2].css('::text').get())
        print(f'----------{max_page} pages in total----------')
        for page in range(1, max_page + 1):
            print(f'----------scraping page {page}----------')
            url = f'http://www.***.com/list.aspx?gid=30&page={page}'
            response = requests.get(url=url, headers=headers)
            html_data = response.text
            selector = parsel.Selector(html_data)
            divs = selector.css('.pro_list_item')
            tags = selector.css('.pro_list_b a')
            for div, tag in zip(divs, tags):
                title = div.css('.pld1 span::text').get()
                num = div.css('.pld2 span::text').get()
                server = div.css('.pld3 span::text').get()
                price = div.css('.pld4 span::text').get()
                href = 'http://www.***.com' + tag.css('::attr(href)').get()
                # Follow the href to the detail page and collect its image URLs
                response1 = requests.get(url=href, headers=headers)
                html_data1 = response1.text
                selector1 = parsel.Selector(html_data1)
                pic = selector1.css('div.fcgoods_detail p').getall()
                image_urls = []
                for p in pic:
                    # Search the paragraph HTML itself (not the literal string 'p') for image links
                    urls = re.findall(r'(https?://[^\s"]+\.(?:jpg|png))', p)
                    image_urls.extend(urls)
                # Write the extracted URLs rather than the raw paragraph HTML
                data = {'title': title, 'num': num, 'server': server, 'price': price, 'href': href, 'pic': image_urls}
                csv_writer.writerow(data)
    # parsel defines no SelectorError; a failed selector returns None, which
    # surfaces as a TypeError/AttributeError when the value is used above
    except (requests.RequestException, TypeError, AttributeError) as e:
        print(f'----------an error occurred while scraping: {e}----------')

# Signal that scraping has finished
print('----------scraping finished----------')
Problems encountered
1. My grasp of XPath/CSS tag and class-attribute selectors is still far from sufficient; I should practise on more static pages (a small parsel sketch follows this list).
2. Carelessness made the work inefficient; check the code carefully before running it.
3. Pay more attention to the nesting levels of the code to avoid unnecessary errors.
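A minimal parsel sketch of the selector practice mentioned in item 1, picking out the same element with a CSS class selector and with the equivalent XPath; the HTML fragment is made up for illustration and is not taken from the real site.

import parsel

# A made-up fragment shaped like one listing item, just for selector practice
html = '<div class="pro_list_item"><div class="pld1"><span>example title</span></div></div>'
sel = parsel.Selector(html)

# CSS: match by class attribute, then take the inner span's text
print(sel.css('.pld1 span::text').get())                     # -> example title

# XPath: the same element, matched by tag name and @class attribute
print(sel.xpath('//div[@class="pld1"]/span/text()').get())   # -> example title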
Unresolved problems
1. When extracting images from the detail pages, the number of images picked up is incomplete (a possible alternative approach is sketched after this list).
2. Build a visual (GUI) front end for the scraper.
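A likely cause of item 1 is that the regex only finds .jpg/.png URLs written literally inside the <p> tags, so images carried in src or lazy-loading data-src attributes elsewhere under the detail container, or in other formats, are missed. Below is a hedged sketch that reads the <img> attributes directly; the div.fcgoods_detail class comes from the script above, while the data-src fallback is an assumption about lazy-loaded images and needs checking against the real page source.

import parsel

def extract_image_urls(detail_html):
    """Collect image URLs from a detail page by reading <img> attributes
    instead of regex-matching the surrounding HTML."""
    sel = parsel.Selector(detail_html)
    urls = []
    for img in sel.css('div.fcgoods_detail img'):
        # data-src is a common lazy-loading attribute (assumption); fall back to src
        src = img.attrib.get('data-src') or img.attrib.get('src')
        if src:
            urls.append(src)
    return urls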
August 11, 2023, 18:47:28