# -*- coding: utf-8 -*-
import os
import re
from urllib.parse import urljoin

import requests
# Download the chapters of a novel from yibige.cc and save each chapter as a
# UTF-8 text file inside OUTPUT_DIR.

# Index page that lists every chapter of the novel.
INDEX_URL = 'https://www.yibige.cc/110006/index.html'
# Directory the chapter files are written to (created when missing).
OUTPUT_DIR = '青春无悔'
# Only the first MAX_CHAPTERS links of the index page are downloaded.
MAX_CHAPTERS = 308
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4506.400'
}

# Patterns are compiled once instead of on every loop iteration.
CHAPTER_LINK_RE = re.compile(r'<dd><a href="(.*?)">.*?</a></dd>', re.S)
TITLE_RE = re.compile(r'<h1>(.*?)</h1>', re.S)
CONTENT_RE = re.compile(r'<div id="content" class="contentjs">(.*?)</div>', re.S)
# Advertisement snippet that trails the chapter text.
AD_SUFFIX = "</p><script>site_con_ad('亿笔阁','https://www.yibige.cc');</script>"
# Characters that are invalid (or unsafe) in file names on common platforms.
UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')


def extract_chapter_links(index_html, limit=MAX_CHAPTERS):
    """Return the chapter hrefs found in *index_html*, in page order.

    At most *limit* links are returned; pass ``None`` to return all of them.
    An index page without matching links yields an empty list.
    """
    return CHAPTER_LINK_RE.findall(index_html)[:limit]


def build_chapter_url(href):
    """Resolve a chapter *href* against the index page URL.

    ``urljoin`` gives the same result as the plain concatenation for bare
    file names and also copes with absolute or root-relative hrefs.
    """
    return urljoin(INDEX_URL, href)


def clean_content(raw_html):
    """Turn the raw HTML of the content ``<div>`` into plain text."""
    return (
        raw_html
        # NOTE(review): the pasted source showed a space-for-space replace,
        # which does nothing; presumably it was meant to normalise
        # non-breaking spaces -- TODO confirm against the live page.
        .replace('&nbsp;', ' ')
        .replace('\xa0', ' ')
        .replace('</p><p>', '\n')
        .replace('<p>', '')
        .replace(AD_SUFFIX, '')
    )


def parse_chapter(chapter_html):
    """Extract ``(title, text)`` from a chapter page.

    Returns ``None`` when the page has no ``<h1>`` title or no content
    ``<div>``, so the caller can skip it instead of crashing on an
    ``IndexError``.
    """
    title_match = TITLE_RE.search(chapter_html)
    content_match = CONTENT_RE.search(chapter_html)
    if title_match is None or content_match is None:
        return None
    return title_match.group(1).strip(), clean_content(content_match.group(1))


def safe_filename(title):
    """Replace characters that cannot appear in a file name with ``_``."""
    return UNSAFE_FILENAME_RE.sub('_', title).strip()


def fetch_html(session, page_url):
    """Download *page_url* and return its decoded text.

    Raises ``requests.RequestException`` on network errors, timeouts and
    non-2xx responses.
    """
    response = session.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Let requests guess the charset from this response's own body.
    response.encoding = response.apparent_encoding
    return response.text


def main():
    """Download every listed chapter into OUTPUT_DIR, one file per chapter."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # One session reuses the connection and sends the same headers on every
    # request, including the chapter pages.
    with requests.Session() as session:
        session.headers.update(HEADERS)
        index_html = fetch_html(session, INDEX_URL)

        for href in extract_chapter_links(index_html):
            chapter_url = build_chapter_url(href)

            # A single broken chapter must not abort the whole download.
            try:
                chapter_html = fetch_html(session, chapter_url)
            except requests.RequestException as err:
                print('下载失败:', chapter_url, err)  # "download failed"
                continue

            parsed = parse_chapter(chapter_html)
            if parsed is None:
                print('解析失败:', chapter_url)  # "could not parse page"
                continue

            title, text = parsed
            # os.path.join keeps the path portable (no hard-coded backslash).
            path = os.path.join(OUTPUT_DIR, safe_filename(title) + '.txt')
            with open(path, mode='w', encoding='utf-8') as f:
                f.write(text)
            print('下载成功:', title)  # "download succeeded"


if __name__ == '__main__':
    main()
# Tags: 陷入, Python, list, re, html, result, url, response, 下载
# Source: https://www.cnblogs.com/Clx23977/p/17064380.html