
03. Taobao


CSV storage
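The spider below borrows Scrapy's `CsvItemExporter` to write plain dicts to disk without running a full Scrapy project. A minimal sketch of just that pattern, on its own (the file name and record here are made up for illustration):

from scrapy.exporters import CsvItemExporter

# Hypothetical file name and record, only to show the exporter lifecycle.
with open('demo.csv', 'wb') as f:  # the exporter expects a binary file object
    exporter = CsvItemExporter(file=f, encoding='utf-8')
    exporter.start_exporting()     # must be called before export_item()
    exporter.export_item({'raw_title': 'sample', 'view_price': '9.90'})
    exporter.finish_exporting()    # flush pending rows before the file closes

The spider follows the same start/export/finish sequence, spread across `__init__`, `parse`, and `__del__`.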

import requests
import re
import json
from scrapy.exporters import CsvItemExporter
from concurrent.futures import ThreadPoolExecutor, as_completed


class TBSpider(object):
    executor = ThreadPoolExecutor(max_workers=8)

    def __init__(self):
        # Product name to search for
        self.user_input = '香水'

        # Request headers: Taobao search pages require a real User-Agent and
        # a logged-in Cookie; fill these in before running
        self.headers = {

        }

        # Running row counter for the exported records
        self.num = 1

        # Set up CSV storage; the exporter writes to a binary file object
        self.file = open(f'{self.user_input}.csv', 'wb')
        self.exporter = CsvItemExporter(file=self.file, include_headers_line=False, encoding='gbk')
        self.exporter.start_exporting()

    """发送请求,获取响应"""
    def parse_start_url(self):
        all_tasks = []
        for i in range(1, 100):
            start_url = f'https://s.taobao.com/search?spm=a21bo.jianhua.201867-main.5.1c1611d9bwOdR9&q={self.user_input}&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=-8&ntoffset=-8&p4ppushleft=2%2C48&s={i*44}'
            try:
                # 发起请求
                response = requests.get(url=start_url, headers=self.headers).content.decode('utf-8')
                task = self.executor.submit(self.parse_start_url, i, 2)
                all_tasks.append(task)

                for result in as_completed(all_tasks):
                    exception = result.exception()
                    if exception:

                        self.parse(response)
            except BaseException as e:
                print(e)

    def parse(self, response):
        # The item data is embedded in the page as a JS object: g_page_config = {...};
        matches = re.findall('g_page_config = (.*?)g_srp_loadCss', response, re.S)
        if matches:
            # Strip the trailing characters left before g_srp_loadCss
            data = matches[0][0:-6]
            json_dict = json.loads(data)

            li = json_dict['mods']['itemlist']['data']['auctions']
            if li:
                for i in li:
                    # Title
                    raw_title = i['raw_title']
                    # Price
                    view_price = i['view_price']
                    # Shipping origin
                    item_loc = i['item_loc']
                    # Comment count
                    comment_count = i['comment_count']
                    # Shop name
                    nick = i['nick']
                    # Shop link (protocol-relative, so prepend https: if missing)
                    shop_link = i['shopLink']
                    if 'https:' not in shop_link:
                        shop_link = 'https:' + shop_link

                    # Assemble the record
                    detail_data = {
                        'num': self.num,
                        'raw_title': raw_title,
                        'view_price': view_price,
                        'item_loc': item_loc,
                        'comment_count': comment_count,
                        'nick': nick,
                        'shopLink': shop_link,
                    }
                    self.num += 1
                    print(detail_data)

                    self.exporter.export_item(detail_data)

    def __del__(self):
        # Close out the CSV file when the spider object is destroyed
        self.exporter.finish_exporting()
        self.file.close()


if __name__ == '__main__':
    s = TBSpider()
    s.parse_start_url()
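The extraction in `parse` hinges on the `g_page_config` blob that Taobao embeds in the search page. A quick way to sanity-check the regex and the `[0:-6]` trim offline, against a made-up snippet rather than a real response (the HTML fragment and field values below are illustrative only, not actual Taobao output):

import re
import json

# Illustrative stand-in for a real search-results page
sample = '''<script>g_page_config = {"mods": {"itemlist": {"data": {"auctions": [
    {"raw_title": "demo", "view_price": "1.00", "item_loc": "上海",
     "comment_count": "0", "nick": "demo-shop", "shopLink": "//shop.example.com"}
]}}}};
    g_srp_loadCss</script>'''

data = re.findall('g_page_config = (.*?)g_srp_loadCss', sample, re.S)[0]
# data[0:-6] drops the ";\n" and indentation captured before g_srp_loadCss
auctions = json.loads(data[0:-6])['mods']['itemlist']['data']['auctions']
print(auctions[0]['raw_title'])  # -> demo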

From: https://www.cnblogs.com/modly/p/16907576.html
