CSV storage
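The script below searches Taobao for a keyword, pages through the results, extracts the `g_page_config` JSON embedded in each page with a regular expression, and writes every item row to disk through Scrapy's `CsvItemExporter`.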
```python
import json
import re

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from scrapy.exporters import CsvItemExporter


class TBSpider(object):
    executor = ThreadPoolExecutor(max_workers=8)

    def __init__(self):
        # product keyword entered by the user
        self.user_input = '香水'
        # request headers; Taobao's search pages need logged-in cookies here
        self.headers = {}
        # storage setup: CSV bytes written into an .xls file so Excel opens it
        # directly, encoded as gbk for Chinese-locale Excel
        self.file = open(f'{self.user_input}.xls', 'wb')
        self.exporter = CsvItemExporter(file=self.file, include_headers_line=False, encoding='gbk')
        self.exporter.start_exporting()

    def parse_start_url(self):
        """Request every result page and hand each response to the parser."""
        all_tasks = []
        for i in range(1, 100):
            # each page holds 44 items, so s=i*44 is the result offset
            start_url = f'https://s.taobao.com/search?spm=a21bo.jianhua.201867-main.5.1c1611d9bwOdR9&q={self.user_input}&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=-8&ntoffset=-8&p4ppushleft=2%2C48&s={i * 44}'
            try:
                response = requests.get(url=start_url, headers=self.headers).content.decode('utf-8')
                # parse pages concurrently in the thread pool
                task = self.executor.submit(self.parse, response)
                all_tasks.append(task)
            except BaseException as e:
                print(e)
        # surface any exception raised inside a worker thread
        for task in as_completed(all_tasks):
            if task.exception():
                print(task.exception())

    def parse(self, response):
        global num
        match = re.findall('g_page_config = (.*?)g_srp_loadCss', response, re.S)
        if not match:
            return
        # the capture ends with ";\n    " -- strip it so only the JSON object remains
        data = match[0].strip().rstrip(';')
        json_dict = json.loads(data)
        li = json_dict['mods']['itemlist']['data']['auctions']
        if li:
            for i in li:
                # title
                raw_title = i['raw_title']
                # price
                view_price = i['view_price']
                # shipping origin
                item_loc = i['item_loc']
                # comment count
                comment_count = i['comment_count']
                # shop name
                nick = i['nick']
                # shop link
                shop_link = i['shopLink']
                if not shop_link.startswith('https:'):
                    shop_link = 'https:' + shop_link
                # pack the fields for export
                detail_data = {
                    'num': num,
                    'raw_title': raw_title,
                    'view_price': view_price,
                    'item_loc': item_loc,
                    'comment_count': comment_count,
                    'nick': nick,
                    'shopLink': shop_link,
                }
                num += 1  # global row counter (not strictly thread-safe)
                print(detail_data)
                self.exporter.export_item(detail_data)

    def __del__(self):
        self.exporter.finish_exporting()
        self.file.close()


if __name__ == '__main__':
    # row counter shared by the worker threads
    num = 1
    s = TBSpider()
    s.parse_start_url()
```
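The `submit` / `as_completed` pattern in `parse_start_url` is what keeps page parsing off the main thread: futures are collected first, then drained as they finish so worker errors are not silently lost. A stripped-down sketch of that pattern, with a hypothetical `work` function standing in for `parse`:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def work(page):                      # toy stand-in for the real parse step
    if page == 3:
        raise ValueError('bad page')
    return page * 44

executor = ThreadPoolExecutor(max_workers=8)
tasks = [executor.submit(work, page) for page in range(1, 6)]

for task in as_completed(tasks):     # yields futures as they complete, in any order
    if task.exception():             # surface worker errors instead of losing them
        print('failed:', task.exception())
    else:
        print('offset:', task.result())
```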
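The extraction in `parse` works because Taobao inlines the search results as a JavaScript assignment of the form `g_page_config = {...};` just before a `g_srp_loadCss` call, so the lazy capture grabs the object plus a trailing `";\n    "` that has to be stripped before `json.loads`. A toy demonstration on a fabricated page fragment:

```python
import json
import re

# fabricated fragment mimicking how Taobao inlines search data into the page
html = 'var g_page_config = {"mods": {"itemlist": {"data": {"auctions": []}}}};\n    g_srp_loadCss();'

match = re.findall('g_page_config = (.*?)g_srp_loadCss', html, re.S)
data = match[0].strip().rstrip(';')   # drop the trailing ";\n    " left by the capture
print(json.loads(data)['mods']['itemlist']['data']['auctions'])  # -> []
```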
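As for the CSV storage itself, `CsvItemExporter` follows a fixed start/export/finish lifecycle and accepts plain dicts as items. A minimal sketch with made-up field values, here using utf-8 and a headers line rather than the headerless gbk setup above:

```python
from scrapy.exporters import CsvItemExporter

with open('demo.csv', 'wb') as f:    # the exporter expects a binary file object
    exporter = CsvItemExporter(f, include_headers_line=True, encoding='utf-8')
    exporter.start_exporting()
    # plain dicts work as items; keys become the CSV columns
    exporter.export_item({'raw_title': 'sample', 'view_price': '88.00'})
    exporter.finish_exporting()
```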