中国招标投标公众服务平台 https://bulletin.cebpubservice.com/
import httpx
import warnings

from lxml import etree
from scrapy.exporters import CsvItemExporter
from loguru import logger

# Suppress every warning for the whole process.
warnings.filterwarnings("ignore")


class Spider:
    """Crawl bulletin list pages and export every table row to a CSV file.

    The spider owns an open output file and a Scrapy ``CsvItemExporter``.
    Release them with :meth:`close`, or use the spider as a context manager::

        with Spider() as spider:
            spider.request_url()

    Attributes:
        headers: HTTP headers sent with every request.
        file: Binary file handle the exporter writes to.
        exporter: ``CsvItemExporter`` producing GBK-encoded CSV rows.
        num: Running count of exported rows (also written to the ``num``
            column).
    """

    # Listing URL; ``{page}`` is the 1-based page number.
    URL_TEMPLATE = (
        'https://bulletin.cebpubservice.com/xxfbcmses/search/bulletin.html'
        '?searchDate=1997-10-26&dates=300&word=&categoryId=88&industryName='
        '&area=&status=&publishMedia=&sourceInfo=&showStatus=1&page={page}'
    )

    # Title clean-up: the bullet becomes '-', superscript three and subscript
    # two are dropped.
    # NOTE(review): presumably these characters cannot be written to the
    # GBK-encoded CSV -- confirm.
    _TITLE_TABLE = str.maketrans({'•': '-', '\xb3': None, '\u2082': None})

    def __init__(self, path='data.csv'):
        """Open the output file and start the CSV export.

        Args:
            path: Destination CSV file; it is overwritten if it exists.
        """
        # Mark as closed first so that close()/__del__ are safe even when
        # the set-up below fails half way through.
        self._closed = True
        self.num = 0
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52"
        }
        # Initialise file storage.
        self.file = open(path, 'wb')
        try:
            self.exporter = CsvItemExporter(file=self.file, include_headers_line=True, encoding='gbk')
            self.exporter.start_exporting()
        except Exception:
            self.file.close()
            raise
        self._closed = False

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Finish the export and close the output file. Safe to call twice."""
        if self._closed:
            return
        self._closed = True
        try:
            self.exporter.finish_exporting()
        finally:
            self.file.close()

    def request_url(self, start_page=1, end_page=500):
        """Fetch each listing page in turn and hand it to :meth:`parse_data`.

        A failure on one page is logged and the crawl continues with the
        next page. ``KeyboardInterrupt`` and ``SystemExit`` are not caught,
        so the crawl can be interrupted.

        Args:
            start_page: First page number to fetch (inclusive).
            end_page: Last page number to fetch (inclusive).
        """
        # One client for the whole crawl so connections are reused.
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept from the original behaviour -- confirm it is still required.
        with httpx.Client(headers=self.headers, verify=False, timeout=30) as client:
            for page in range(start_page, end_page + 1):  # pagination
                url = self.URL_TEMPLATE.format(page=page)
                try:
                    # Send the request.
                    response = client.get(url)
                    self.parse_data(response)
                except Exception as e:
                    logger.error('Page {} failed: {}', page, e)

    @staticmethod
    def _joined(node, path):
        """Return the concatenated string results of ``node.xpath(path)``."""
        return ''.join(node.xpath(path))

    def parse_data(self, response):
        """Extract the bulletin rows from one listing page and export them.

        Args:
            response: HTTP response of a listing page. Responses whose
                status code is not 200 are logged and skipped.
        """
        if response.status_code != 200:
            logger.warning('Skipping page: HTTP status {}', response.status_code)
            return

        html = etree.HTML(response.content.decode('utf-8'))
        if html is None:
            return

        # Every row of the result table except the first one.
        rows = html.xpath('//body/table[@class="table_text"]/tr[position()>1]')
        for row in rows:
            self.num += 1  # running count of collected rows

            # Bulletin title.
            title = self._joined(row, './td[1]/a/@title').translate(self._TITLE_TABLE)

            # Bulletin link: tabs are removed, then the first 20 and the last
            # 2 characters are cut off, then newlines are removed.
            # NOTE(review): the slice presumably strips a
            # "javascript:urlOpen('...')" wrapper -- confirm against the page.
            title_url = (
                self._joined(row, './td[1]/a/@href')
                .replace('\t', '')[20:-2]
                .replace('\n', '')
            )

            # Industry.
            bussine = (
                self._joined(row, './td[2]/span/text()')
                .replace('\t', '').replace('\n', '').replace('\r', '')
            )
            # Region.
            address = self._joined(row, './td[3]/span/@title')
            # Channel.
            tools = self._joined(row, './td[4]/text()').replace('\t', '').replace('\n', '')
            # Publication time.
            times = (
                self._joined(row, './td[5]/text()')
                .replace('\t', '').replace('\n', '').replace('\r', '')
            )
            # Bid opening time (read from the cell's ``id`` attribute).
            open_time = self._joined(row, './td[6]/@id')

            # Assemble the record.
            dict_data = {
                'num': self.num,
                'title': title,
                'title_url': title_url,
                'bussine': bussine,
                'address': address,
                'tools': tools,
                'times': times,
                'open_time': open_time,
            }
            logger.info('{}', dict_data)
            self.exporter.export_item(dict_data)

    def __del__(self):
        # Fallback only; prefer close() or the context manager. The getattr
        # guard covers instances whose __init__ never ran.
        if not getattr(self, '_closed', True):
            self.close()


if __name__ == '__main__':
    with Spider() as s:
        s.request_url()
标签:__,xpath,04,title,self,replace,服务平台,投标,data From: https://www.cnblogs.com/modly/p/16931289.html