import re
import time

import requests
from lxml import etree

from 常用功能.mongodb链接 import connect_mongodb

# --- MongoDB connection ------------------------------------------------------
# NOTE(review): credentials and host are hard-coded; consider moving them to
# environment variables or a config file before sharing this script.
user_value = 'bk_spider'
pwd_value = 'ke@spider01'
mongo_id_value = '119.45.40.170'
port_value = 27017
db_value = 'spider'
coll_value = "channel_directional_crawl"

db_coll = connect_mongodb(user_value, pwd_value, mongo_id_value, port_value,
                          db_value, coll_value)
db = db_coll[0]
coll = db_coll[1]

# Browser-like headers reused for every request in this script.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0',
}

# Pre-compiled once at module level (the original re-imported `re` and rebuilt
# this pattern inside the per-school loop). Matches the four rating scores
# ("environment / faculty / service / effect") shown on a detail page.
_SCORE_PATTERN = re.compile(
    r'环境(\d+\.\d+)分\s*师资(\d+\.\d+)分\s*服务(\d+\.\d+)分\s*效果(\d+\.\d+)分'
)


def get_gaozhong_url(num_list: int):
    """Return senior-high-school (gaozhong) list-page URLs for pages 1..num_list-1.

    :param num_list: exclusive upper bound of the page number range.
    :return: list of list-page URL strings.
    """
    return [f'https://www.hxx.net/school/gaozhong/list_{num}.html'
            for num in range(1, num_list)]


def get_chuzhong_url(num_list: int):
    """Return junior-high-school (chuzhong) list-page URLs for pages 1..num_list-1.

    :param num_list: exclusive upper bound of the page number range.
    :return: list of list-page URL strings.
    """
    return [f'https://www.hxx.net/school/chuzhong/list_{num}.html'
            for num in range(1, num_list)]


def get_detail(url):
    """Scrape one school-list page and each school detail page it links to.

    For every school on the page, extract its attributes (nature, level,
    form, founding date, address, popularity and the four rating scores),
    build a document and insert it into MongoDB unless a document with the
    same school name already exists.

    :param url: a list-page URL produced by get_gaozhong_url()/get_chuzhong_url().
    """
    session = requests.Session()
    html = session.get(url=url, headers=HEADERS).content.decode()
    tree = etree.HTML(html)
    second_url_list = tree.xpath("//div[@class='zhongxueSchList']//li//dt/a/@href")
    scool_name_list = tree.xpath("//div[@class='zhongxueSchList']//li//dt/a/text()")

    # enumerate() instead of list.index(): index() is O(n) per iteration and
    # pairs a school with the wrong name when two entries share a detail URL.
    for num, second_url in enumerate(second_url_list):
        print(f'第{num}个')
        scool_name = scool_name_list[num]
        second_html = session.get(url=second_url, headers=HEADERS).content.decode()
        second_tree = etree.HTML(second_html)

        # School nature (院校性质)
        yuanxiaoxingzhi = ''.join(second_tree.xpath(
            "//dd/span[contains(text(), '院校性质')]/text()")).replace('院校性质:', '')
        # Schooling level (办学层次)
        banxuecengci = ''.join(second_tree.xpath(
            "//dd/span[contains(text(), '办学层次')]/i/text()"))
        # School form (学校形式)
        xuexiaoxingshi = ''.join(second_tree.xpath(
            "//dd/span[contains(text(), '学校形式')]/text()")).replace('学校形式:', '')
        # Founding date (建校时间)
        jianxiaoshijian = ''.join(second_tree.xpath(
            "//div[@class='dl']/dd[2]/span[2]/i/text()"))
        # School address (学校地址)
        xuexiaodizhi = ''.join(second_tree.xpath(
            "//div[@class='dl']/dd[4]/span/text()")).replace('学校地址:', '')

        # Rating box: guard against pages without a class="pf" div — the
        # original indexed [0] unconditionally and would raise IndexError.
        pf_divs = second_tree.xpath('//div[@class="pf"]')
        if not pf_divs:
            print(f'{scool_name}: 无评分信息,跳过')
            continue
        text = pf_divs[0].xpath('string()').strip()

        match = _SCORE_PATTERN.search(text)
        if not match:
            continue

        # Popularity (热度)
        redu = second_tree.xpath('//dt/span/text()')[0]
        # The four scores, in pattern order.
        huanjingpingfen = match.group(1)   # environment
        shizipingfen = match.group(2)      # faculty
        fuwupingfen = match.group(3)       # service
        xiaoguopingfen = match.group(4)    # effect
        # School introduction (学校简介); strips the "click to expand" glyph
        # and full-width spaces.
        jianjie = ''.join(second_tree.xpath(
            '//div[@class="schoo_introduction nomargintop"]/div[@class="content"]//text()'
        )).replace('\ue614点击展开', '').replace('\u3000', '')

        dict1 = {
            'task_name': '好学校',
            'domain': 'https://www.hxx.net/',
            'url': 'https://www.hxx.net/',
            'ts': int(time.time()),
            'tag': '',
            # NOTE(review): 'channal' (sic) is the field name already stored in
            # the collection — kept as-is so existing documents stay consistent.
            'channal': '好学校',
            'quality': '非官方',  # subject to change
            'batch': 'batch 20240711_batch_1',  # channel batch id, subject to change
            # cos_url layout: <bucket>/<file type>/<channel>/<uploaded file name>
            # e.g. html/<site>/<title>.html, videos/<channel>/<title>.mp4,
            #      IMG/<channel>/<title>.jpg, PDF/<channel>/<title>.pdf,
            #      excel/<channel>/<title>.xlsx, json/<channel>/<title>.json
            'cos_url': 'https://data-crawler-1325559378.cos.ap-beijing.myqcloud.com/json/240711/全国各省-城市-中学.json',
            'meta': {
                '一级分类': '教育配套',
                '二级分类': '学校基础信息',
                '三级分类': '全国',
                '性质': '非官方',
                '信息来源': '好学校',
                '说明': '全国各省-城市-中学',
                '链接': 'https://www.hxx.net/',
            },
            'content': {
                '学校名称': scool_name,
                '详情页链接': second_url,
                '院校性质': yuanxiaoxingzhi,
                '办学层次': banxuecengci,
                '学校形式': xuexiaoxingshi,
                '建校时间': jianxiaoshijian,
                '学校地址': xuexiaodizhi,
                '热度': redu,
                '环境评分': huanjingpingfen,
                '师资评分': shizipingfen,
                '服务评分': fuwupingfen,
                '效果评分': xiaoguopingfen,
                # NOTE(review): comma missing between 师资评分 and 服务评分 in the
                # original output string — kept byte-identical so new documents
                # match the ones already in the collection.
                'ocr_content': f'学校名称:{scool_name},热度:{redu},环境评分:{huanjingpingfen},师资评分:{shizipingfen}服务评分:{fuwupingfen},效果评分:{xiaoguopingfen}' + jianjie,
            },
        }

        # De-duplicate on school name before inserting.
        print({"content['学校名称']": scool_name})
        count = coll.count_documents({"content.学校名称": scool_name})
        print(count)
        if count != 0:
            print(f"{scool_name}已存在")
            print('\n')
        else:
            # Insert intentionally disabled in the original source; uncomment
            # to actually write to MongoDB.
            # coll.insert_one(dict1)
            print(f'{scool_name}插入完成')
            print('\n')


if __name__ == '__main__':
    gaozhong_url_list = get_gaozhong_url(1103)
    chuzhong_url_list = get_chuzhong_url(189)
    print(f'gaozhong_url_list:{gaozhong_url_list}')
    for gaozhong_url in gaozhong_url_list:
        print(f'gaozhong_url:{len(gaozhong_url),gaozhong_url}')
        get_detail(gaozhong_url)
    # print(f'chuzhong_url_list:{chuzhong_url_list}')
    # for chuzhong_url in chuzhong_url_list:
    #     get_detail(chuzhong_url)
标签:贝壳,项目,url,text,list,scool,gaozhong,second,入库 From: https://www.cnblogs.com/wolvies/p/18305679