#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
File       : chengyu-001.py
Time       : 2024/1/16 11:55
Author     : lrtao2010
Version    : python 3.10.1
Description: Record idioms (chengyu) of a specified character count.

Each listing page is downloaded, every ``<a ...><span>PIN</span>CI</a>`` entry
is extracted with a regular expression, and one ``PIN|CI`` line per entry is
appended to a UTF-8 text file.
"""

import random
import re
import time
from pathlib import Path

import requests  # third-party: downloads the web pages

# Default output file; records are appended, never truncated.
OUTPUT_PATH = "./shuju/chengyu-4.txt"

# Custom request headers sent with every page download.
REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    # 'Accept-Encoding': 'gzip, deflate, br' was commented out in the original.
    # NOTE(review): presumably because "br" responses need an extra decoder
    # package -- confirm before re-enabling.
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Host': 'XXXX.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}

# One listing entry: <a href=... target="_blank"><span>PIN</span>CI</a>.
# re.S lets "." match newlines, so an entry may span several lines of HTML.
ENTRY_PATTERN = re.compile(
    r'<a href=.*? target="_blank"><span>(?P<PIN>.*?)</span>(?P<CI>.*?)</a>',
    re.S,
)


def spider(url, out_path=OUTPUT_PATH):
    """Download one listing page and append its entries to a text file.

    Args:
        url: Address of the listing page to download.
        out_path: File that receives the records. Defaults to
            ``./shuju/chengyu-4.txt``. Missing parent directories are
            created. The file is only touched when at least one entry
            was found on the page.

    Returns:
        None.

    Raises:
        requests.RequestException: On connection problems, on timeouts
            (30 s connect / 30 s read), or when the server answers with an
            HTTP error status.
    """
    resp = requests.get(url, headers=REQUEST_HEADERS, timeout=(30, 30))
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    resp.raise_for_status()

    # NOTE(review): resp.text relies on the charset requests detects for the
    # response; verify the site declares its encoding correctly.
    records = [
        f"{match['PIN']}|{match['CI']}\n"
        for match in ENTRY_PATTERN.finditer(resp.text)
    ]
    if not records:
        return

    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Open the file once per page rather than once per matched entry.
    with target.open("a", encoding="utf-8") as out_file:
        out_file.writelines(records)


# Main program. The original note says to remember to adjust the count when
# targeting a different idiom length.
# NOTE(review): presumably this means the "4" in the URL and in the output
# file name -- confirm.
if __name__ == '__main__':
    # The first listing page has no "_p<N>" suffix; the original kept this
    # commented-out snippet for fetching it:
    # url = f"https://XXXX.com/zishu_4.html"
    # print(url)
    # spider(url)
    # time.sleep(random.randint(2, 5))
    for page in range(13, 21):
        url = f"https://XXXX.com/zishu_4_p{page}.html"
        print(url)
        spider(url)
        # Random 2-5 second pause between page requests.
        time.sleep(random.randint(2, 5))
标签:__,url,text,个数,指定,re,print,import,成语 From: https://www.cnblogs.com/lrtao2010/p/18041423