from lxml import etree
# Source: https://www.cnblogs.com/charleyhoot/p/16630237.html
import requests
import random
import os
import time
from multiprocessing.dummy import Pool
# 高性能异步爬虫
# 多进程,多线程异步爬虫(不推荐)
# 好处:可以为相关阻塞的操作单独开启线程或者进程,这些阻塞操作可以异步执行
# 缺点:无法无限制开启多线程多进程,因为CPU资源会被严重的耗费,以至于导致CPU对外界数据的响应,从而降低性能
# 线程池和进程池(推荐且下面的例子就是对此的联系)
# 优点:可以降低系统对进程或者线程的创建,销毁的频率,从而降低系统的开销
# 弊端:池中线程或进程的数量是有上限的
if __name__ == "__main__":
    start_time = time.time()
    # Make sure the download directory exists before any worker writes to it.
    if not os.path.exists('./Video'):
        os.mkdir('./Video')
    # Front page of pearvideo's "video" category listing.
    url = 'https://www.pearvideo.com/category_1'
    # Spoof a desktop-browser User-Agent. (Fixed: the original string had
    # stray spaces, e.g. "Mozilla /5.0(", which is not a valid UA token.)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) '
                      'Gecko/20100101 Firefox/104.0'
    }
    # timeout= so a stalled connection cannot hang the script forever.
    page_text = requests.get(url=url, headers=headers, timeout=10)
    # Parse the listing page; each <li> under this xpath is one video teaser.
    tree = etree.HTML(page_text.text)
    list_li = tree.xpath('/html/body/div[2]/div[1]/div/ul/li')
    detail_url = []
    # The page loads the actual video via an AJAX request of the form
    #   https://www.pearvideo.com/videoStatus.jsp?contId=<id>&mrd=<random 0..1>
    # where contId is the numeric video id and mrd is a random float in [0, 1),
    # so we reproduce that request for every listed video.
    for li in list_li:
        href = li.xpath('./div/a/@href')[0]          # e.g. "video_1731718"
        video_id = href.split('_')[1]
        # .strip() so stray whitespace/newlines in the title cannot end up
        # in the output filename.
        name = li.xpath('./div/a/div[2]/text()')[0].strip()
        ajax_url = 'https://www.pearvideo.com/videoStatus.jsp?'
        # The AJAX endpoint rejects requests that lack a matching Referer.
        new_headers = {
            'User-Agent': headers['User-Agent'],
            'Referer': 'https://www.pearvideo.com/video_' + video_id
        }
        params = {
            'contId': video_id,
            'mrd': str(random.random())
        }
        dic_url = requests.get(url=ajax_url, params=params,
                               headers=new_headers, timeout=10).json()
        # Expired/offline videos return an error payload with no 'videoInfo'
        # key; skip them instead of crashing the whole run with a KeyError.
        if 'videoInfo' not in dic_url:
            print(name, 'is unavailable, skipped.')
            continue
        fake_url = dic_url['videoInfo']['videos']['srcUrl']
        # srcUrl contains a timestamp segment where the real URL needs the
        # "cont-<id>" segment; rebuild the working URL:
        #   .../<timestamp>-<a>-<b>.mp4  ->  .../cont-<id>-<a>-<b>.mp4
        dash_parts = fake_url.split('-')
        prefix = fake_url.rsplit('/', 1)[0] + '/'
        real_url = (prefix + 'cont-' + video_id + '-'
                    + dash_parts[1] + '-' + dash_parts[2])
        # Queue the (name, url) pair for the download pool below.
        detail_url.append({'name': name, 'url': real_url})

    def get_video(dic):
        """Download one video described by {'name': ..., 'url': ...}
        into ./Video/<name>.mp4."""
        path = './Video/' + dic['name'] + '.mp4'
        data = requests.get(url=dic['url'], headers=headers, timeout=30).content
        with open(path, 'wb') as fp:
            fp.write(data)
        print(dic['name'], 'Download Successfully!!!!')

    # Thread pool (multiprocessing.dummy) with 4 workers: downloads are
    # I/O-bound, so threads overlap the network waits.
    pool = Pool(4)
    pool.map(get_video, detail_url)
    pool.close()
    pool.join()
    end_time = time.time()
    # Report total wall-clock time for the whole scrape + download.
    print('Total Time Is :', (end_time - start_time))