首页 > 其他分享 > 04. 中国招标投标公众服务平台

04. 中国招标投标公众服务平台

时间:2022-11-28 09:14:11  浏览次数:36  
标签:__ xpath 04 title self replace 服务平台 投标 data

中国招标投标公众服务平台  https://bulletin.cebpubservice.com/  

 

import httpx
import warnings
from lxml import etree
from scrapy.exporters import CsvItemExporter
from loguru import logger

# Suppress every Python warning process-wide: this is a blanket "ignore"
# filter with no category or module restriction.
warnings.filterwarnings("ignore")


class Spider:
    """Crawl the bulletin search listing and export every row to ``data.csv``.

    The listing pages of https://bulletin.cebpubservice.com/ are fetched one
    by one, each table row is turned into a dict and written through a
    Scrapy ``CsvItemExporter`` (GBK-encoded output).

    The instance owns an open file handle.  Call :meth:`close` (or use the
    instance as a context manager) when done; ``__del__`` remains as a
    last-resort fallback only.
    """

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52"
        }

        # Running count of exported rows; stored in each row's 'num' column.
        # Kept on the instance so the class also works when imported, not
        # only when a module-level ``num`` happens to exist.
        self.num = 0

        # Output file setup: the exporter is given a binary handle and is
        # told to encode the CSV text as GBK.
        self.file = open('data.csv', 'wb')
        self.exporter = CsvItemExporter(file=self.file, include_headers_line=True, encoding='gbk')
        self.exporter.start_exporting()

    def __enter__(self):
        """Return the spider itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the output file when the ``with`` block ends."""
        self.close()

    def request_url(self, max_page=500):
        """Fetch listing pages ``1..max_page`` and pass each one to parse_data.

        A failure on one page (network error, decoding error, export error)
        is logged and the crawl continues with the next page.

        Args:
            max_page: Number of the last listing page to request (inclusive).
        """
        for page in range(1, max_page + 1):  # pagination
            url = f'https://bulletin.cebpubservice.com/xxfbcmses/search/bulletin.html?searchDate=1997-10-26&dates=300&word=&categoryId=88&industryName=&area=&status=&publishMedia=&sourceInfo=&showStatus=1&page={page}'
            try:
                # NOTE(security): verify=False turns off TLS certificate
                # verification for this request.
                response = httpx.get(url=url, headers=self.headers, verify=False, timeout=30)
                self.parse_data(response)
            except Exception:
                # Exception (not BaseException) so that KeyboardInterrupt and
                # SystemExit still stop the crawl instead of being swallowed.
                logger.exception('Failed to fetch or parse page {}', page)

    def parse_data(self, response):
        """Extract all listing rows from one response and export them.

        Args:
            response: Object exposing ``status_code`` and ``content`` (bytes),
                as returned by ``httpx.get``.  Responses whose status is not
                200 are skipped with a warning.
        """
        if response.status_code != 200:
            logger.warning('Skipping page, unexpected HTTP status {}', response.status_code)
            return

        result = response.content.decode('utf-8')

        html = etree.HTML(result)
        # Every row of the result table except the first one.
        data_list = html.xpath('//body/table[@class="table_text"]/tr[position()>1]')

        for data in data_list:
            self.num += 1  # count of scraped rows
            # Announcement title.  The three characters are replaced/removed
            # presumably because they cannot be encoded as GBK -- verify.
            title = ''.join(data.xpath('./td[1]/a/@title')).replace('•', '-').replace('\xb3', '').replace('\u2082', '')
            # Announcement link.  The slice drops 20 leading and 2 trailing
            # characters of the href -- presumably a "javascript:urlOpen('"
            # prefix and "')" suffix; TODO confirm against the live markup.
            title_url = ''.join(data.xpath('./td[1]/a/@href')).replace('\t', '')[20:-2].replace('\n', '')
            # Industry
            bussine = ''.join(data.xpath('./td[2]/span/text()')).replace('\t', '').replace('\n', '').replace('\r', '')
            # Region
            address = ''.join(data.xpath('./td[3]/span/@title'))
            # Publishing channel
            tools = ''.join(data.xpath('./td[4]/text()')).replace('\t', '').replace('\n', '')
            # Publication time
            times = ''.join(data.xpath('./td[5]/text()')).replace('\t', '').replace('\n', '').replace('\r', '')
            # Bid-opening time (read from the cell's id attribute)
            open_time = ''.join(data.xpath('./td[6]/@id'))

            # Assemble the row
            dict_data = {
                'num': self.num,
                'title': title,
                'title_url': title_url,
                'bussine': bussine,
                'address': address,
                'tools': tools,
                'times': times,
                'open_time': open_time,
            }

            logger.info('{}', dict_data)
            self.exporter.export_item(dict_data)

    def close(self):
        """Finish the CSV export and close the output file.

        Safe to call more than once, and safe on an instance whose
        ``__init__`` did not complete (missing ``exporter``/``file``).
        """
        exporter = getattr(self, 'exporter', None)
        output = getattr(self, 'file', None)
        # Drop the exporter reference first so a repeated call cannot
        # finish the same exporter twice.
        self.exporter = None
        try:
            if exporter is not None:
                exporter.finish_exporting()
        finally:
            # Close the file even if finishing the export raised.
            if output is not None and not output.closed:
                output.close()

    def __del__(self):
        # Last-resort cleanup for callers that never call close().
        self.close()


if __name__ == '__main__':
    # Module-level row counter, starting at zero.
    num = 0
    # Build the spider (this opens data.csv) and crawl the listing pages.
    spider = Spider()
    spider.request_url()

 

标签:__,xpath,04,title,self,replace,服务平台,投标,data
From: https://www.cnblogs.com/modly/p/16931289.html

相关文章