首页 > 其他分享 >爬取BiliBili视频

爬取BiliBili视频

时间:2022-09-23 01:56:05浏览次数:76  
标签:info BiliBili play 视频 get 爬取 mp4 content

https://github.com/BtbN/FFmpeg-Builds/releases/tag/latest

这是ffmpeg下载地址,下载好要配置环境变量,合成视频要用到,因为B站的视频和音频是分开的
花了一段时间分析了一下BiliBili的网页结构。根据XHR请求可以看出,B站的视频由音频流和视频流两部分组成;分别爬取音频与视频,再用ffmpeg工具将两者合并到一起。

import json
import requests
import os
from lxml import etree
import re
 
# Request headers attached to every HTTP call made by this script:
# a desktop-browser user agent plus a bilibili.com referer.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/96.0.4664.110 Safari/537.36'
    ),
    'referer': 'https://www.bilibili.com/',
}
 
 
def get_requests(url, num):
    """Download ``url`` and hand the response body to ``write_down``.

    Args:
        url: Address of the resource to fetch.
        num: Identifier forwarded to ``write_down``, which uses it as the
            file stem of the saved ``.mp4``.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    res = requests.get(url, headers=headers, timeout=30)
    # Fail loudly instead of saving an HTTP error body as a ".mp4" file.
    res.raise_for_status()
    print(res.url)
    write_down(res.content, num)
 
 
# Creates the output directory inside the current working directory.
def write_down(content, num):
    """Write ``content`` to ``BiliBili/<num>.mp4`` under the current directory.

    The ``BiliBili`` directory is created when it does not exist yet.

    Args:
        content: Raw bytes to store.
        num: Value used (via ``str``) as the file name without extension.
    """
    path = 'BiliBili'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path, exist_ok=True)

    # os.path.join picks the right separator for the host OS; the previous
    # hard-coded backslash only produced a nested path on Windows.
    # The with-statement closes the file, so no explicit close() is needed.
    with open(os.path.join(path, str(num) + '.mp4'), 'wb') as f:
        f.write(content)
 
 
def _video_name(video_url):
    """Derive a file-system-safe output name from a video page URL."""
    # Prefer the alphanumeric id that follows "video/" in the URL.
    match = re.search(r'video/([0-9a-zA-Z]+)', video_url)
    if match:
        return match.group(1)
    # Otherwise use the last non-empty path segment, without the query string.
    tail = video_url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
    # Replace anything that could be unsafe inside a file name.
    safe = re.sub(r'[^0-9A-Za-z._-]', '_', tail)
    return safe or 'output'


def get_together(video_url=None):
    """Merge ``BiliBili/3.mp4`` and ``BiliBili/2.mp4`` into one file with ffmpeg.

    The merged file is written to ``BiliBili/<name>.mp4`` in the current
    working directory, where ``<name>`` is derived from the video URL. The
    two input files are deleted only after ffmpeg exits successfully.

    Args:
        video_url: Page URL used to name the output. When omitted, the
            module-level ``url`` variable is used, as before.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status; the input files are left in place in that case.
    """
    # Local import so this function does not depend on the file's import block.
    import subprocess

    if video_url is None:
        # Backward-compatible fallback to the global set by the script entry.
        video_url = url

    folder = os.path.join(os.getcwd(), 'BiliBili')
    first_input = os.path.join(folder, '3.mp4')
    second_input = os.path.join(folder, '2.mp4')
    merged = os.path.join(folder, _video_name(video_url) + '.mp4')

    # An argument list (no shell) keeps URL-derived text from being
    # interpreted as shell syntax. "-c copy" remuxes without re-encoding.
    # If the console output looks garbled, run the same ffmpeg command
    # manually in this directory; the installed ffmpeg may be too old.
    subprocess.run(
        ['ffmpeg', '-i', first_input, '-i', second_input, '-c', 'copy', merged],
        check=True,
    )

    # Only reached when ffmpeg succeeded, so a failed merge never loses data.
    os.remove(first_input)
    os.remove(second_input)
    print('下载成功')
 
 
# Fetch the JSON that describes the video and audio streams, then download both.
def get_json(url):
    """Locate the stream metadata on a video page and download both streams.

    The page is fetched and its ``<script>`` elements are scanned for one
    whose text has the form ``<something>=<JSON object>`` where the object
    contains ``data.dash``. The chosen video stream is saved through
    ``get_requests`` as file ``3`` and the audio stream as file ``2``.

    Args:
        url: Address of the video page.

    Raises:
        requests.HTTPError: If the page request returns a 4xx/5xx status.
        ValueError: If no script on the page carries the expected JSON.
        KeyError, IndexError: If the JSON lacks the expected stream entries.
    """
    res = requests.get(url, headers=headers, timeout=30)
    res.raise_for_status()
    tree = etree.HTML(res.text)

    decoder = json.JSONDecoder()
    play_info_js = None
    # Parse the raw script text directly. Converting the XPath result list
    # with str() would yield a Python repr and add a layer of escaping.
    for script_text in tree.xpath('//script/text()'):
        _, sep, payload = str(script_text).partition('=')
        if not sep:
            continue
        try:
            # raw_decode tolerates trailing characters after the JSON value.
            candidate, _ = decoder.raw_decode(payload.lstrip())
        except ValueError:
            continue
        if (isinstance(candidate, dict)
                and isinstance(candidate.get('data'), dict)
                and 'dash' in candidate['data']):
            play_info_js = candidate
            break
    if play_info_js is None:
        raise ValueError('no script with stream information found on the page')

    dash = play_info_js['data']['dash']
    video_streams = dash['video']
    audio_streams = dash['audio']
    # Keep the original preference for the second entry, but fall back to
    # the only entry when just one stream is offered.
    video_info_js = video_streams[min(1, len(video_streams) - 1)]['baseUrl']
    audio_info_js = audio_streams[min(1, len(audio_streams) - 1)]['baseUrl']

    get_requests(video_info_js, 3)
    get_requests(audio_info_js, 2)
 
 
if __name__ == '__main__':
    # Script entry point: ask for a video page URL, download both streams,
    # then merge them. `url` stays a module-level name because
    # get_together() reads it when called without arguments.
    # strip() drops stray whitespace that often comes along with a pasted URL.
    url = input('请输入BiliBili的视频URL:').strip()
    get_json(url)
    get_together()

标签:info,BiliBili,play,视频,get,爬取,mp4,content
From: https://www.cnblogs.com/miyol/p/16721394.html

相关文章