import requests
from lxml import html
import random
import os
from multiprocessing.dummy import Pool
# Default HTTP request headers: a desktop Chrome User-Agent string, so the
# requests carry a browser-like identity instead of the library default.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/107.0.0.0 Safari/537.36'
    ),
}
# Request the video data (video + title) from the "People" category of Pear Video.
# Category listing page that is scraped for video links and titles.
URL = 'https://www.pearvideo.com/category_1'
# A timeout keeps the script from hanging forever on a stalled connection.
page_text = requests.get(url=URL, headers=headers, timeout=10).text
etree = html.etree
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="categoryList" ]/li')

# Download jobs: one {'name': <file name>, 'url': <video address>} dict per video.
urls = []
for li in li_list:
    # Extract the video id from the link target.
    # NOTE(review): assumes the href has the form "video_<id>" -- confirm
    # against the page markup. (Splitting on an empty separator, as the
    # code did before, always raises ValueError.)
    video_no = str(li.xpath('./div/a/@href')[0])
    video_id = video_no.split('_')[1]
    video_url = 'https://www.pearvideo.com/videoStatus.jsp?'
    # Video title, reused as the name of the saved file.
    video_title = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    # Query the ajax endpoint that describes the video.
    params = {
        'contId': video_id,
        'mrd': str(random.random()),
    }
    ajax_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
                      '537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        # Referer is the video's own page, i.e. ".../video_<id>".
        # NOTE(review): presumably the endpoint checks this header -- confirm.
        'Referer': 'https://www.pearvideo.com/video_' + video_id,
    }
    response = requests.get(
        url=video_url, headers=ajax_headers, params=params, timeout=10
    ).json()
    # The returned address is a disguised one: turn it into the real address
    # by replacing the leading "-"-separated token of its file name with
    # "cont-<video id>".
    camouflage_url = str(response["videoInfo"]["videos"]["srcUrl"])
    camouflage_url_id = camouflage_url.split('/')[-1].split('-')[0]
    really_url = camouflage_url.replace(camouflage_url_id, 'cont-' + video_id)
    dic_ = {
        'name': video_title,
        'url': really_url,
    }
    urls.append(dic_)
# Download the video data and save it in .mp4 format.
def get_video(dic):
    """Download one video and save it inside the ``./梨视频`` directory.

    Args:
        dic: Mapping with the keys ``'name'`` (file name to save as) and
            ``'url'`` (address the video bytes are fetched from).
    """
    url = dic['url']
    print(dic['name'], "开始下载……")
    # A timeout keeps a stalled connection from blocking the worker forever.
    video_data = requests.get(url=url, headers=headers, timeout=30).content
    # exist_ok=True avoids the check-then-create race: when this function
    # runs in several threads at once, two of them could both see the
    # directory missing and the second mkdir would raise FileExistsError.
    os.makedirs('./梨视频', exist_ok=True)
    # The name comes from scraped page text; replace path separators and
    # characters that are invalid in Windows file names so open() cannot
    # fail on them or write outside the target directory.
    safe_name = dic['name'].translate(
        str.maketrans(dict.fromkeys('\\/:*?"<>|', '_'))
    )
    video_path = os.path.join('./梨视频', safe_name)
    with open(video_path, 'wb') as fp:
        fp.write(video_data)
    print(dic['name'], "下载成功!")
# Run the downloads concurrently with a thread pool.
# Thread pool with 25 workers (multiprocessing.dummy provides thread-backed pools).
pool = Pool(processes=25)
# map() blocks until get_video has been called for every entry in urls.
pool.map(func=get_video, iterable=urls)
# Stop accepting new work, then wait for the worker threads to exit.
pool.close()
pool.join()