Spider代码
class BizhizolSpider(scrapy.Spider):
    """Spider for game wallpapers on desk.zol.com.cn.

    Crawls the listing page, follows each wallpaper's detail link, and
    yields items of the form ``{"img_src": <big image URL>}``.
    """

    name = "bizhizol"
    allowed_domains = ["zol.com.cn"]
    start_urls = ["https://desk.zol.com.cn/youxi/"]

    def parse(self, response, **kwargs):
        """Parse the listing page and request every wallpaper detail page."""
        res_list_li = response.xpath('//*[@class="pic-list2 clearfix"]/li')
        for res_list in res_list_li:
            img_url = res_list.xpath('./a/@href').extract_first()
            # Guard: extract_first() returns None when the <li> has no
            # matching link; calling .endswith() on None would crash.
            # Also skip ad entries that link to .exe downloads.
            if not img_url or img_url.endswith(".exe"):
                continue
            # response.urljoin resolves relative hrefs against response.url
            # (it wraps urllib.parse.urljoin under the hood).
            child_url = response.urljoin(img_url)
            # The detail page must be fetched separately to reach the image.
            # GET is Scrapy's default method, so it need not be spelled out.
            yield Request(
                url=child_url,
                callback=self.suibianqimignzi,
            )

    def suibianqimignzi(self, response, **kwargs):
        """Extract the full-size image URL from a wallpaper detail page."""
        img_src = response.xpath("//*[@id='bigImg']/@src").extract_first()
        yield {
            "img_src": img_src,
        }
Pipeline代码
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
# ImagesPipeline 图片专用的管道
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class BizhiPipeline:
    """Default project pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # No transformation needed; pass the item along to the next
        # pipeline in ITEM_PIPELINES.
        return item
class MyTuPipeline(ImagesPipeline):
    """Image-download pipeline.

    Downloads each item's ``img_src`` and stores it under
    ``IMAGES_STORE/youxi/<original file name>``.
    """

    # 1. Issue the download request for the image.
    def get_media_requests(self, item, info):
        url = item['img_src']
        # zol.com.cn blocks hot-linking: a browser-like User-Agent and a
        # Referer from the site are required for the image request.
        headers = {
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'Referer': 'https://desk.zol.com.cn/showpic/1920x1080_100899_144.html',
            'sec-ch-ua-mobile': '?0',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'sec-ch-ua-platform': '"Windows"',
        }
        # Stash the URL in meta so file_path() can recover it reliably
        # (response.url is not dependably available in file_path).
        yield scrapy.Request(url=url, headers=headers, meta={"sss": url})

    # 2. Decide the storage path.
    # Full path = IMAGES_STORE + return value of file_path();
    # Scrapy creates missing folders automatically.
    def file_path(self, request, response=None, info=None, *, item=None):
        # Derive the file name from the URL saved in request.meta by
        # get_media_requests. (The same value is available via
        # item['img_src']; meta is used because it travels with the request.)
        file_name = request.meta['sss'].split("/")[-1]
        # Return a path RELATIVE to IMAGES_STORE; the original code
        # produced "/youxi//<name>" with a leading and a doubled slash.
        return "youxi/" + file_name

    # 3. Optionally post-process the item after all downloads finish.
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples.
        # Must return the item so downstream pipelines still receive it.
        return item
效果展示