ssr2
重点:关闭证书验证和警告提示
ssr2和ssr1的网站类型相似,只是在请求的基础上加了SSL证书验证,只需要在requests的请求中把证书验证关掉就可以了。
-
verify=False
关掉验证再发送请求会出现如上的警告,虽然不会影响代码的运行,但是从美观度来看总归不太好,我们可以利用warnings或urllib3模块关掉警告。
- warnings
import warnings
# 设置忽略警告
warnings.filterwarnings("ignore")
- urllib3
from requests.packages import urllib3
# 关闭警告
urllib3.disable_warnings()
接下来,关于数据的解析就不再赘述,这样就可以获取该类型的电影网站数据了。
具体代码如下:
ssr2
import requests
from lxml import etree
import warnings
from requests.packages import urllib3
# Suppress the InsecureRequestWarning emitted when requests are sent
# with verify=False (ssr2 serves an invalid certificate).
urllib3.disable_warnings()
# warnings.filterwarnings("ignore")
"""
1、根据第一层的目录获取url请求第二层的内容,解析保存至txt文件
"""
# Browser-like request headers so the site treats us as a normal client.
headers = {
"Accept": "application/json, text/plain, */*",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Pragma": "no-cache",
"Referer": "https://spa2.scrape.center/page/1",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57",
}
# Page 1: fetch a listing page and dispatch every movie detail link.
def get_parse_page1(i=None):
    """Fetch one listing page from ssr2 and pass each movie's detail
    href to ``get_parse_page2``.

    :param i: optional page number; when given, request
        ``/page/{i}``, otherwise the site root (equivalent to page 1).
        Backward compatible: calling with no argument behaves as before.
    """
    # ssr2 serves an invalid certificate, so verification must be off.
    if i is not None:
        url = f'https://ssr2.scrape.center/page/{i}'
    else:
        url = 'https://ssr2.scrape.center/'
    resp = requests.get(url=url, headers=headers, verify=False)
    html = etree.HTML(resp.text)
    # Each matching div wraps one movie entry on the listing page.
    content_list = html.xpath('.//div[@class="p-h el-col el-col-24 el-col-xs-9 el-col-sm-13 el-col-md-16"]')
    for data in content_list:
        href = data.xpath('.//a[@class="name"]/@href')[0]
        get_parse_page2(href)
# Page 2: fetch one movie's detail page, parse it, save to a txt file.
def get_parse_page2(href):
    """Request the detail page for ``href`` on ssr2, extract the title,
    categories, area/duration info, synopsis and score, then append
    them to ``{title}.txt``.

    :param href: relative detail path taken from the listing page,
        e.g. ``/detail/1``.
    """
    # BUGFIX: the hrefs come from the ssr2 listing, so the detail pages
    # must be requested from ssr2 as well (was ssr1), and ssr2's invalid
    # certificate requires verify=False here too, otherwise the request
    # raises an SSLError.
    url = f'https://ssr2.scrape.center{href}'
    resp = requests.get(url, headers=headers, verify=False)
    html = etree.HTML(resp.text)
    content_list = html.xpath('.//div[@class="p-h el-col el-col-24 el-col-xs-16 el-col-sm-12"]')
    # print(resp.text)
    for data in content_list:
        title = data.xpath('.//a/h2/text()')[0]
        # print(title)
        category = data.xpath('.//div[@class="categories"]//button/span/text()')
        area = data.xpath('./div[@class="m-v-sm info"]//span/text()')
        content = str(data.xpath('.//div[@class="drama"]/p/text()')[0]).strip()
        # The score lives outside this div, hence the lookup on the page root.
        score = str(html.xpath('.//div[@class="el-col el-col-24 el-col-xs-8 el-col-sm-4"]/p/text()')[0]).strip()
        # Append so repeated runs accumulate rather than overwrite.
        with open(f'{title}.txt', 'a', encoding='utf-8') as f:
            f.write(title + '\n' + str(category) + '\n' + str(area) + '\n' + content + '\n' + score)
        print(title, category, area, score)
if __name__ == '__main__':
    # Scrape the first listing page only; uncomment the loop below to
    # paginate through multiple pages.
    get_parse_page1()
    # for i in range(10):
    #     get_parse_page1(i)