记录一下scrapy爬取图片遇到的坑
- 目标站点:站长素材图片 https://sc.chinaz.com/tupian
- 我的核心源代码
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
# Import the image-handling pipeline base class
from scrapy.pipelines.images import ImagesPipeline


class ImagesPipeLine(ImagesPipeline):
    """ImagesPipeline subclass driven by the ``img_path`` field of each item.

    The request URL is taken from ``item['img_path']`` and the stored file
    is named after the last ``/``-separated segment of that URL.
    """

    def get_media_requests(self, item, info):
        """Yield a single request for the item's ``img_path`` value.

        The value is handed to ``scrapy.Request`` exactly as it is stored
        on the item; nothing is prepended or otherwise altered here.
        """
        yield scrapy.Request(url=item['img_path'])

    def file_path(self, request, response=None, info=None):
        """Return the text after the final ``/`` of the request URL."""
        _, _, file_name = request.url.rpartition('/')
        return file_name

    def item_completed(self, results, item, info):
        """Return the item unchanged."""
        return item
- 报如下错
self._line = linecache.getline(self.filename, self.lineno).strip() File "D:\install\anaconda3\lib\linecache.py", line 30, in getline lines = getlines(filename, module_globals) File "D:\install\anaconda3\lib\linecache.py", line 46, in getlines return updatecache(filename, module_globals) File "D:\install\anaconda3\lib\linecache.py", line 137, in updatecache lines = fp.readlines() File "D:\install\anaconda3\lib\codecs.py", line 322, in decode (result, consumed) = self._buffer_decode(data, self.errors, final) UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 288: invalid start byte Call stack: File "D:\install\anaconda3\Scripts\scrapy-script.py", line 10, in <module> sys.exit(execute()) File "D:\install\anaconda3\lib\site-packages\scrapy\cmdline.py", line 154, in execute _run_print_help(parser, _run_command, cmd, args, opts) File "D:\install\anaconda3\lib\site-packages\scrapy\cmdline.py", line 109, in _run_print_help func(*a, **kw) File "D:\install\anaconda3\lib\site-packages\scrapy\cmdline.py", line 162, in _run_command cmd.run(args, opts) File "D:\install\anaconda3\lib\site-packages\scrapy\commands\crawl.py", line 27, in run self.crawler_process.start() File "D:\install\anaconda3\lib\site-packages\scrapy\crawler.py", line 348, in start reactor.run(installSignalHandlers=False) # blocking call File "D:\install\anaconda3\lib\site-packages\twisted\internet\base.py", line 1315, in run self.mainLoop() File "D:\install\anaconda3\lib\site-packages\twisted\internet\base.py", line 1325, in mainLoop reactorBaseSelf.runUntilCurrent() File "D:\install\anaconda3\lib\site-packages\twisted\internet\base.py", line 991, in runUntilCurrent call.func(*call.args, **call.kw) File "D:\install\anaconda3\lib\site-packages\twisted\internet\task.py", line 680, in _tick taskObj._oneWorkUnit() File "D:\install\anaconda3\lib\site-packages\twisted\internet\task.py", line 526, in _oneWorkUnit result = next(self._iterator) File 
"D:\install\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 86, in <genexpr> work = (callable(elem, *args, **named) for elem in iterable) File "D:\install\anaconda3\lib\site-packages\scrapy\core\scraper.py", line 207, in _process_spidermw_output dfd.addBoth(self._itemproc_finished, output, response, spider) File "D:\install\anaconda3\lib\site-packages\twisted\internet\defer.py", line 538, in addBoth return self.addCallbacks( File "D:\install\anaconda3\lib\site-packages\twisted\internet\defer.py", line 477, in addCallbacks self._runCallbacks() File "D:\install\anaconda3\lib\site-packages\twisted\internet\defer.py", line 857, in _runCallbacks current.result = callback( # type: ignore[misc] File "D:\install\anaconda3\lib\site-packages\scrapy\core\scraper.py", line 267, in _itemproc_finished logger.log(*logformatter_adapter(logkws), extra={'spider': spider}, Message: 'Error processing %(item)s' Arguments: {'item': {'img_path': '//scpic1.chinaz.net/files/default/imgs/2023-08-14/d0ef55a2d710556e_s.jpg'}} --- Logging error ---
- 排查步骤
其实上述报错我是没看明白的,不知道从何下手,报错信息太多了。之后我改造我的代码,把可能报错的代码用try except包起来,然后打印异常,结果就更简洁了。改造后代码和打印信息如下:
def get_media_requests(self, item, info):
    """Yield a request for ``item['img_path']``, printing any error raised.

    Debugging variant: any ``Exception`` raised while reading the item,
    building the request or yielding it is printed and swallowed, so the
    generator simply ends without producing a request.
    """
    try:
        # Build and send the request
        image_request = scrapy.Request(item['img_path'])
        yield image_request
    except Exception as err:
        print(err)
打印结果
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/175877a01a833b7b_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-14/d0ef55a2d710556e_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-05/df791032f53f5179_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-05/c3cbafa5efd7c9ab_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-09/83640b0005a19b14_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-07/da2193a1cc52969b_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/4283b939393b18f6_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/8aaae68f0126a080_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/7a4ea6cc5a00bd3b_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/e898b7f60eed9d1f_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-14/e241fbe4d407735c_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-13/0a126e9116b6e42a_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/c2f8d261e10a52d8_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/cc7e32b1e8668549_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/707accf28feb2e19_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/30755627eaa7e8a9_s.png
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/cfb35c6a5806f448_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/5c66bd93443af90d_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/256cfb52805042c2_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/0fc78e89f09b6ee1_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/89c3c26897434973_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-13/68f8aa9f8e3d2721_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-14/51476979c441e36d_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-13/c4ec043560a2d430_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/e1c7a0385d287cf7_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/2c58476097790381_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/bd272dac73b8076e_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-12/047e9c5da4f3bf35_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-09/785dc12b836a4c30_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-16/ab27815136279b59_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/a61a30d1cc7a57f5_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/895682e8d1955b77_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/20728bc7977fd9db_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-14/a6d94e3ba56f8e24_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-13/6f58296053538a39_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-14/a7f68bc50f78ebb9_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/f1a3803dfe4137ce_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/ba608ba73e84a56d_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-15/fa99e71609dcfa67_s.jpg
Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-09/c2d2991420a43430_s.jpg
看着这条日志信息“Missing scheme in request url: //scpic1.chinaz.net/files/default/imgs/2023-08-09/c2d2991420a43430_s.jpg”,意思是请求的 url 缺少 scheme(协议)。仔细看这个 url:“//scpic1.chinaz.net/files/default/imgs/2023-08-09/c2d2991420a43430_s.jpg”,乍一看感觉也没有问题,我把这个地址复制到浏览器运行也是能正常显示图片的。但是总感觉这个地址有问题,查阅了很多资料才明白:这种以“//”开头的地址省略了协议,浏览器会自动补全,而 scrapy.Request 要求 url 必须带上协议。最终解决办法是给这个 url 地址拼接上协议前缀 http: 或 https:,改造后的代码如下
def get_media_requests(self, item, info):
    """Yield a download request for the image URL stored on the item.

    The scraped ``img_path`` values are protocol-relative
    (``//host/path``), which ``scrapy.Request`` rejects with
    "Missing scheme in request url". Such values already carry the
    leading ``//``, so only the scheme part ``https:`` is prepended;
    prepending ``https://`` would produce ``https:////host/path``, a URL
    with an empty host. URLs that do not start with ``//`` are passed
    through untouched.

    Args:
        item: Scraped item; must provide the image URL under ``img_path``.
        info: Media pipeline info object (unused here).

    Yields:
        One ``scrapy.Request`` for the image, or nothing when the item has
        no usable URL (the error is printed instead of being raised).
    """
    try:
        img_path = item['img_path']
        if img_path.startswith('//'):
            # Protocol-relative URL: add only the scheme, the slashes exist.
            img_path = 'https:' + img_path
        # Send the request
        request = scrapy.Request(img_path)
    except (KeyError, AttributeError, TypeError, ValueError) as e:
        # Missing field, non-string value, or a URL Scrapy refuses to accept.
        print(e)
        return
    yield request
修改后再去运行就可以正常爬取图片了。这里记录一下:爬取时使用的地址要尽量完整,必须带上协议(http: 或 https:)。
标签:Missing,url,08,request,chinaz,报错,net,scheme From: https://www.cnblogs.com/xlisteven/p/17637099.html