from lxml import etree
import re
import requests
import os
import time
from fake_useragent import UserAgent
# Crawler for a page that loads more items as the user scrolls ("scroll to
# refresh"): instead of rendering the page, the listing endpoint is requested
# directly with an increasing ``start`` offset.
base_url = 'https://www.pearvideo.com/'

PAGE_SIZE = 24            # items skipped per listing page (offset step)
REQUEST_TIMEOUT = 30      # seconds; without a timeout a stalled socket hangs forever
CHUNK_SIZE = 64 * 1024    # bytes per write when streaming a video to disk
DOWNLOAD_DIR = './video'
DOWNLOAD_DELAY = 100      # seconds to pause after each video


def listing_url(page: int, category_id: int = 1, page_size: int = PAGE_SIZE) -> str:
    """Return the listing URL for the 1-based *page* of a category.

    Page 1 maps to ``start=0``, page 2 to ``start=page_size``, and so on.
    """
    start = (page - 1) * page_size
    return (
        'https://www.pearvideo.com/category_loading.jsp'
        f'?reqType=5&categoryId={category_id}&start={start}'
    )


def extract_cont_id(href: str) -> str:
    """Return the part of *href* after its last underscore.

    Example: ``'video_1784567'`` -> ``'1784567'``.
    """
    return href.split('_')[-1]


def resolve_video_url(status: dict, cont_id: str) -> str:
    """Build the downloadable URL from a ``videoStatus.jsp`` JSON payload.

    The ``srcUrl`` in the payload contains the value of ``systemTime``; that
    substring is replaced with ``'cont-<cont_id>'``.

    Raises:
        KeyError: if ``videoInfo``/``videos``/``srcUrl`` or ``systemTime`` is
            missing from *status*.
    """
    src_url = status['videoInfo']['videos']['srcUrl']
    # str() guards against the timestamp arriving as a JSON number.
    return src_url.replace(str(status['systemTime']), 'cont-' + cont_id)


def download_video(video_url, cont_id, headers, out_dir=DOWNLOAD_DIR):
    """Stream *video_url* to ``<out_dir>/<cont_id>.mp4`` and return that path.

    The body is written in chunks to a ``.part`` file that is renamed once the
    transfer finishes, so an interrupted download never leaves a truncated
    ``.mp4`` behind (a stale ``.part`` file is overwritten on the next run).

    Raises:
        requests.RequestException: on network failure or a non-2xx status.
        OSError: if the output file cannot be written.
    """
    os.makedirs(out_dir, exist_ok=True)  # the original assumed ./video existed
    target = os.path.join(out_dir, f'{cont_id}.mp4')
    part_path = target + '.part'
    with requests.get(url=video_url, headers=headers, stream=True,
                      timeout=REQUEST_TIMEOUT) as video_res:
        video_res.raise_for_status()
        with open(part_path, 'wb') as f:
            for chunk in video_res.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)
    os.replace(part_path, target)
    return target


def fetch_video(video_id, headers):
    """Resolve and download the video behind one listing link.

    *video_id* is the ``href`` value taken from the listing page; *headers*
    are the base request headers and are not modified.
    """
    cont_id = extract_cont_id(video_id)
    status_url = f'https://www.pearvideo.com/videoStatus.jsp?contId={cont_id}'
    # Add the referer (the video's own page) on a copy so the caller's
    # headers stay untouched between videos.
    video_headers = dict(headers, referer=base_url + video_id)
    res = requests.get(url=status_url, headers=video_headers,
                       timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    video_url = resolve_video_url(res.json(), cont_id)
    download_video(video_url, cont_id, video_headers)
    print(f'{cont_id}下载完成')


def main(pages=range(1, 4), delay=DOWNLOAD_DELAY):
    """Download every video linked from the given listing *pages*.

    A failure on a single video is reported and skipped instead of aborting
    the whole crawl. File-system errors (other than those raised by
    ``requests``) still propagate.
    """
    for page in pages:
        headers = {'user-agent': UserAgent().random}
        res = requests.get(url=listing_url(page), headers=headers,
                           timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        html = res.text
        if not html.strip():
            continue  # nothing to parse on this page
        e = etree.HTML(html)
        if e is None:
            continue
        href = e.xpath('//a[@class="vervideo-lilink actplay"]/@href')
        for video_id in href:
            try:
                fetch_video(video_id, headers)
            except (requests.RequestException, KeyError, TypeError,
                    ValueError) as err:
                print(f'{video_id} failed: {err!r}')
            # NOTE(review): the original paste lost its indentation; the pause
            # is assumed to follow each video rather than each page - confirm.
            time.sleep(delay)


if __name__ == '__main__':
    main()
# Tags: scrolling, url, import, cont, headers, video, refresh, web page, id
# From: https://www.cnblogs.com/hacker-dvd/p/17572972.html