import asyncio
import hashlib

import aiohttp
import chardet
import redis
from fake_useragent import UserAgent
from lxml import etree
from motor.motor_asyncio import AsyncIOMotorClient


class CarSpider:
    """Asynchronous crawler for che168.com listing pages.

    For every listing page the spider extracts the ``specid`` attribute of
    each list entry, queries the parameter API for that id, and stores the
    selected parameter values in MongoDB. An MD5 fingerprint of each item is
    added to the Redis set ``car:filter`` so that an item already seen is not
    inserted again.
    """

    # These three objects are class attributes: they are created once, when
    # the class body is executed, and shared by every instance.
    user_agent = UserAgent()
    redis_client = redis.Redis()
    # Despite the name, this is the ``car_info`` collection of the
    # ``py_spider`` database, not the client object itself.
    mongo_client = AsyncIOMotorClient('localhost', 27017)['py_spider']['car_info']

    def __init__(self):
        # ``{}`` in ``url`` is filled with the page number, ``{}`` in
        # ``api_url`` with a spec id taken from the listing page.
        self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
        self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'

    def __del__(self):
        # Close the Redis client when the spider object is destroyed.
        # NOTE(review): the client is shared at class level, so destroying
        # any instance closes it for all instances.
        self.redis_client.close()

    async def get_car_id(self, page, session):
        """Fetch one listing page and fan out a detail task per spec id.

        Args:
            page: Page number substituted into ``self.url``.
            session: The ``aiohttp.ClientSession`` used for all requests.

        Returns:
            None. Progress and skipped pages are reported with ``print``.
        """
        headers = {'User-Agent': self.user_agent.random}
        async with session.get(self.url.format(page), headers=headers) as response:
            content = await response.read()

        # Only pages detected as one of these two encodings are parsed;
        # anything else is reported as an error page.
        encoding = chardet.detect(content)['encoding']
        if encoding not in ('GB2312', 'ISO-8859-1'):
            print('错误页面...')
            return

        tree = etree.HTML(content.decode('gbk'))
        id_list = tree.xpath('//ul[@class="viewlist_ul"]/li/@specid')
        if not id_list:
            print('id为空...')
            return

        # Schedule one detail request per spec id. ``asyncio.create_task``
        # is used instead of a module-level ``loop`` variable, which exists
        # only when this file is run as a script.
        tasks = [
            asyncio.create_task(self.get_car_info(spec_id, session))
            for spec_id in id_list
        ]
        await asyncio.wait(tasks)

    async def get_car_info(self, spec_id, session):
        """Fetch the parameter JSON for one spec id and save selected fields.

        Args:
            spec_id: Value substituted into ``self.api_url``.
            session: The ``aiohttp.ClientSession`` used for all requests.

        Returns:
            None. The assembled item is passed to ``save_car_info``.

        Raises:
            KeyError, IndexError: If the response lacks the keys or list
                positions read below.
        """
        headers = {'User-Agent': self.user_agent.random}
        async with session.get(self.api_url.format(spec_id), headers=headers) as response:
            result = await response.json()

        param_groups = result['result'].get('paramtypeitems')
        if not param_groups:
            print('数据不存在...')
            return

        # The positions below are fixed indexes into the API response.
        # NOTE(review): assumes the response always lists these parameters
        # in this order; confirm against the API.
        first_group = param_groups[0]['paramitems']
        second_group = param_groups[1]['paramitems']

        # Key order matters: the dedupe hash is computed from ``str(item)``.
        item = {
            'name': first_group[0]['value'],
            'price': first_group[1]['value'],
            'brand': first_group[2]['value'],
            'altitude': second_group[2]['value'],
            'breadth': second_group[1]['value'],
            'length': second_group[0]['value'],
        }
        await self.save_car_info(item)

    @staticmethod
    def get_md5(dict_item):
        """Return the hex MD5 digest of ``str(dict_item)`` encoded as UTF-8.

        Used as the dedupe fingerprint of an item.
        """
        return hashlib.md5(str(dict_item).encode('utf-8')).hexdigest()

    async def save_car_info(self, item):
        """Insert ``item`` into MongoDB unless its fingerprint was seen before.

        Args:
            item: Dict of car fields built by ``get_car_info``.

        Returns:
            None. The outcome is reported with ``print``.
        """
        md5_hash = self.get_md5(item)
        # A truthy ``sadd`` result is treated as "fingerprint is new".
        # NOTE(review): ``redis.Redis`` is the synchronous client, so this
        # call runs inline in the coroutine rather than being awaited.
        is_new = self.redis_client.sadd('car:filter', md5_hash)
        if not is_new:
            print('数据重复...')
            return

        await self.mongo_client.insert_one(item)
        print('数据插入成功:', item)

    async def main(self):
        """Crawl listing pages 1 through 100 concurrently over one session."""
        async with aiohttp.ClientSession() as session:
            tasks = [
                asyncio.create_task(self.get_car_id(page, session))
                for page in range(1, 101)
            ]
            await asyncio.wait(tasks)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    car_spider = CarSpider()
    loop.run_until_complete(car_spider.main())
# Tags: item, car, self, get, 爬虫 (crawler), 并发 (concurrency), result, motor, id
# From: https://www.cnblogs.com/kojya/p/18461167