首页 > 编程语言 >Python爬虫之使用单线程+协程高性能扒取梨视频人物版块视频源码

Python爬虫之使用单线程+协程高性能扒取梨视频人物版块视频源码

时间:2022-08-29 15:25:17 浏览次数:63
标签:视频 扒取 name url li 源码 video time div

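# The requests library is synchronous (blocking), so calling it inside a
# coroutine would stall the event loop and cancel out asyncio's concurrency.
# The video downloads therefore go through aiohttp instead.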
import requests
import asyncio
import time
import os
import random
from lxml import etree
import aiohttp

# Listing page of the Pear Video "people" category.
CATEGORY_URL = 'https://www.pearvideo.com/category_1'
# AJAX endpoint that returns the (obfuscated) video source URL as JSON.
AJAX_URL = 'https://www.pearvideo.com/videoStatus.jsp?'
# Browser User-Agent sent with the listing and AJAX requests.
USER_AGENT = 'Mozilla /5.0(Windows NT 10.0;Win64;x64;rv:104.0) Gecko /20100101 Firefox /104.0'
# Directory the downloaded videos are written to.
VIDEO_DIR = './Video'
# Timeout in seconds for each blocking requests call.
REQUEST_TIMEOUT = 10
# Number of bytes read from the response per iteration while downloading.
CHUNK_SIZE = 64 * 1024

# Characters that are path separators or rejected in Windows file names.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {char: '_' for char in '\\/:*?"<>|\r\n\t'}
)


def build_real_url(src_url, cont_id):
    """Turn the ``srcUrl`` returned by the AJAX endpoint into a playable URL.

    The last path component of ``src_url`` starts with a segment followed by
    ``-``; that leading segment is replaced with ``cont-<cont_id>`` and the
    rest of the URL is kept unchanged, however many hyphens it contains.

    Args:
        src_url: The ``srcUrl`` value from the AJAX response.
        cont_id: The video id (the ``contId`` request parameter).

    Returns:
        The rewritten URL.

    Raises:
        ValueError: If ``src_url`` has no ``/`` or its file name has no ``-``.
    """
    directory, slash, filename = src_url.rpartition('/')
    _leading, dash, remainder = filename.partition('-')
    if not slash or not dash:
        raise ValueError('Unexpected video URL format: ' + src_url)
    return directory + '/cont-' + cont_id + '-' + remainder


def safe_filename(name):
    """Return ``name`` with path separators and reserved characters replaced.

    Surrounding whitespace is stripped; an empty result becomes ``untitled``.
    """
    cleaned = name.translate(_UNSAFE_FILENAME_CHARS).strip()
    return cleaned or 'untitled'


def fetch_real_url(cont_id):
    """Ask the AJAX endpoint for the source URL of one video and rewrite it.

    Raises:
        requests.RequestException: On a network error or HTTP error status.
        KeyError: If the JSON reply lacks ``videoInfo.videos.srcUrl``.
        ValueError: If the reply is not JSON or the URL has an unknown format.
    """
    headers = {
        'User-Agent': USER_AGENT,
        # The endpoint is queried with the detail page as Referer.
        'Referer': 'https://www.pearvideo.com/video_' + cont_id,
    }
    params = {
        'contId': cont_id,
        # "mrd" is a random number in [0, 1), as sent by the site's own AJAX call.
        'mrd': str(random.random()),
    }
    reply = requests.get(
        url=AJAX_URL, params=params, headers=headers, timeout=REQUEST_TIMEOUT
    )
    reply.raise_for_status()
    src_url = reply.json()['videoInfo']['videos']['srcUrl']
    return build_real_url(src_url, cont_id)


def fetch_video_list():
    """Scrape the category page and resolve every listed video.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per video whose
        real URL could be resolved. Entries that cannot be parsed or resolved
        are skipped with a message instead of aborting the whole run.
    """
    page = requests.get(
        url=CATEGORY_URL,
        headers={'User-Agent': USER_AGENT},
        timeout=REQUEST_TIMEOUT,
    )
    page.raise_for_status()
    tree = etree.HTML(page.text)
    if tree is None:
        return []

    videos = []
    for li in tree.xpath('/html/body/div[2]/div[1]/div/ul/li'):
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div[2]/text()')
        # Skip list items that are not video entries (no link, no title, or
        # a link that does not follow the "video_<id>" pattern).
        if not hrefs or not titles or '_' not in hrefs[0]:
            continue
        cont_id = hrefs[0].split('_')[1]
        name = titles[0]
        try:
            real_url = fetch_real_url(cont_id)
        except (KeyError, ValueError, requests.RequestException) as err:
            print(name, 'Skipped:', repr(err))
            continue
        videos.append({'name': name, 'url': real_url})
    return videos


async def get_video(url, name, session=None):
    """Download one video to ``VIDEO_DIR`` as ``<name>.mp4``.

    The body is streamed in chunks into a ``.part`` file that is renamed to
    the final name only after the download finished, so an interrupted
    download never leaves a truncated ``.mp4`` behind.

    Args:
        url: Direct URL of the video file.
        name: Video title; sanitized before being used as the file name.
        session: Optional shared ``aiohttp.ClientSession``. When omitted, a
            session is created for this single download.

    Raises:
        aiohttp.ClientError: On a network error or HTTP error status.
        OSError: If the file cannot be written.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            await get_video(url, name, own_session)
        return

    target = os.path.join(VIDEO_DIR, safe_filename(name) + '.mp4')
    partial = target + '.part'
    async with session.get(url) as response:
        # Fail instead of saving an error page as a video.
        response.raise_for_status()
        with open(partial, 'wb') as fp:
            async for chunk in response.content.iter_chunked(CHUNK_SIZE):
                fp.write(chunk)
    os.replace(partial, target)
    print(name, 'Download Successfully!!!')


async def download_all(videos):
    """Download all ``videos`` concurrently over one shared session.

    Returns:
        A list parallel to ``videos``: ``None`` for each successful download,
        or the exception raised by the failed one.
    """
    if not videos:
        return []
    async with aiohttp.ClientSession() as session:
        tasks = [
            get_video(video['url'], video['name'], session)
            for video in videos
        ]
        # return_exceptions=True keeps one failed download from cancelling
        # the others and lets the caller report every failure.
        return await asyncio.gather(*tasks, return_exceptions=True)


def main():
    """Scrape the category page, download every video, print the total time."""
    start_time = time.time()
    os.makedirs(VIDEO_DIR, exist_ok=True)

    videos = fetch_video_list()
    results = asyncio.run(download_all(videos))
    for video, result in zip(videos, results):
        if isinstance(result, BaseException):
            print(video['name'], 'Download Failed:', repr(result))

    # Print the total elapsed time.
    print('Total Time:', time.time() - start_time)


if __name__ == "__main__":
    main()














标签:视频,扒取,name,url,li,源码,video,time,div
From: https://www.cnblogs.com/charleyhoot/p/16636040.html

相关文章