Python爬取抖音创作者所有短视频

标签：info 视频 get Python 爬取抖音 video print ID

小伙伴们，大家好呀，上次给大家分享了如何爬取快手up主所有的短视频后，不少人在后台留言说，想要爬取抖音up主的所有短视频，那么今天代码就来了。

其实有了爬取快手up主的经验后，想必大家对于爬取抖音up主的所有视频已经不陌生了，毕竟一通百通，无非是请求接口的改变，思路大致是一样的，请看如下分析。

首先当然是要获取作者所有短视频的ID。本次获取up主主页所有视频的ID是通过 selenium 来实现的：通过浏览器自动化，配合 xpath 和页面滚动来获取所有的视频ID，然后再通过接口逐个获取短视频的详细信息。如下图：

Python爬取抖音创作者所有短视频_ide

在获取到所有的视频ID后，则通过请求api接口来获取抖音短视频的具体视频地址，详情请参考 抖音无水印下载，拿到详细的视频信息后就可以下载视频了。在这里又使用了iter_content进行视频下载，具体介绍可参考 人人视频 中的文章讲解。

还有需要注意的是，请在确保安装了 selenium 并配置了 chromedriver 的前提下运行代码，否则会报错。具体怎么安装和配置在这里不做过多赘述，如有不懂可自行百度，我也可以下次写一篇关于如何使用 selenium 进行爬虫的文章。

代码如下：

import os
import time
import re
from lxml import etree
import requests
from selenium import webdriver




class Dyopen(object):
    """Download every short video listed on a Douyin creator's home page.

    The workflow (see ``sprider_start``) has three stages: collect the video
    IDs by scrolling the creator page in headless Chrome, resolve each ID to
    a download URL and title through the web API, then stream each video to
    disk.
    """

    # XPath of the <li> elements that wrap one video each on the creator page.
    # NOTE(review): the class name looks site-generated and presumably changes
    # when the site's front end is redeployed -- adjust here if no IDs are found.
    VIDEO_ITEM_XPATH = "//li[@class='ECMy_Zdt']"
    # Headers sent with every HTTP request made through ``requests``.
    REQUEST_HEADERS = {"user-agent": "Mozilla/5.0"}
    # Seconds to wait for a connection / for the next bytes before giving up,
    # so that a stalled server cannot hang the crawler forever.
    REQUEST_TIMEOUT = 30
    # Characters removed from titles before they are used as file names
    # (the full set of characters Windows forbids, plus the dot).
    _ILLEGAL_FILENAME_CHARS = re.compile(r'[?*/\\|.:><"]')

    def get_video_ids(self, author_url):
        """Return the IDs of all videos shown on a creator's home page.

        The page is opened in headless Chrome and scrolled down repeatedly.
        Scrolling stops once the number of video items has failed to grow
        six times in total; the IDs are then read from the link of each item.

        Args:
            author_url: URL of the creator's home page, e.g.
                ``https://www.douyin.com/user/<user id>``.

        Returns:
            A list of video ID strings, in page order.
        """
        max_stalls = 5
        chrome_option = webdriver.ChromeOptions()
        chrome_option.add_argument('headless')  # run Chrome without a window
        driver = webdriver.Chrome(options=chrome_option)
        items = []
        try:
            driver.get(author_url)
            seen = 0
            stalls = 0
            scrolls = 0
            while True:
                # scrollBy(x, y) scrolls the page content by the given pixels.
                driver.execute_script("window.scrollBy(0,2000)")
                scrolls += 1
                time.sleep(2)  # give the page time to load more items
                items = etree.HTML(driver.page_source).xpath(self.VIDEO_ITEM_XPATH)
                print("操作页面内容滚动{0:0>3}次后,获取视频ID{1:0>4}个。".format(scrolls, len(items)))
                if len(items) != seen:
                    seen = len(items)
                    continue
                if stalls >= max_stalls:
                    break
                stalls += 1
        finally:
            # Always shut the browser down, otherwise every call (and every
            # failure) leaves a Chrome process behind.
            driver.quit()

        print("已经达到可获取视频ID的最大数量,开始逐个获取视频ID:\n")
        ids = []
        for item in items:
            hrefs = item.xpath("a/@href")
            if not hrefs:
                continue  # list item without a link: nothing to extract
            # The ID is the last path segment; drop any query string or
            # trailing slash so only the bare ID is passed on to the API.
            video_id = hrefs[0].split("?")[0].rstrip("/").split("/")[-1]
            if not video_id:
                continue
            print("获取短视频ID:{}".format(video_id))
            ids.append(video_id)
        return ids

    def get_video_info(self, video_id):
        """Return ``(download_url, title)`` for one video.

        Args:
            video_id: the unique ID of the Douyin short video.

        Returns:
            A tuple ``(url, title)``. ``url`` is the target of the play
            address redirect (or the play address itself when the server does
            not redirect). ``title`` is the video ID, a dash, and the share
            title cut at the first ``#`` or ``@``.

        Raises:
            requests.RequestException: on network failure or timeout.
            KeyError, IndexError: when the API reply lacks the expected
                fields, e.g. an empty ``item_list``.
        """
        # This endpoint is visible in the browser dev tools when the page is
        # viewed in mobile emulation mode.
        url0 = "https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids=" + video_id
        r_url0 = requests.get(url0, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT)
        item = r_url0.json()["item_list"][0]  # parse the body only once
        url1 = item['video']["play_addr"]["url_list"][0]
        # Prefix the ID so the title is never empty and never collides.
        title = video_id + "-" + item['share_info']["share_title"].split("#")[0].split("@")[0]
        # Only the redirect header is needed; stream=True avoids downloading
        # the body when the server answers with the video directly.
        with requests.get(url1, headers=self.REQUEST_HEADERS, allow_redirects=False,
                          stream=True, timeout=self.REQUEST_TIMEOUT) as r_url1:
            url = r_url1.headers.get('Location', url1)
        return url, title

    def get_file_name(self, string):
        """Return ``string`` with the characters ``? * / \\ | . : > < "`` removed."""
        return self._ILLEGAL_FILENAME_CHARS.sub('', string)

    def download_video(self, save_path, url, title):
        """Stream one video to ``<save_path>/<sanitised title>.mp4``.

        Progress is printed on a single console line while downloading.

        Args:
            save_path: target directory; created if it does not exist.
            url: download URL of the video.
            title: video title; sanitised with ``get_file_name`` and stripped
                of spaces and line breaks to form the file name.

        Raises:
            requests.RequestException: on network failure, timeout or an
                HTTP error status.
            OSError: when the directory or file cannot be written.
        """
        os.makedirs(save_path, exist_ok=True)
        video_name = self.get_file_name(title).strip().replace(" ", "")\
            .replace("\n", "").replace("\r", "")
        # os.path.join works whether or not save_path ends with a separator.
        full_path = os.path.join(save_path, video_name + ".mp4")
        with requests.get(url, headers=self.REQUEST_HEADERS, stream=True,
                          timeout=self.REQUEST_TIMEOUT) as r:
            # Do not save an HTTP error page as if it were a video.
            r.raise_for_status()
            # The header is absent for chunked responses; 0 means "unknown".
            total_bytes = int(r.headers.get("Content-Length", 0))
            total_size = int(total_bytes / 1024 + 0.5)
            with open(file=full_path, mode="wb") as f:
                print('当前下载:【{}】,视频文件大小:【{}KB】'.format(video_name, total_size))
                received = 0
                scale = 50
                start = time.perf_counter()
                for chunk in r.iter_content(chunk_size=1024):
                    if not chunk:
                        continue
                    f.write(chunk)
                    # Count real bytes: chunks may be shorter than requested.
                    received += len(chunk)
                    if total_bytes > 0:
                        fraction = min(received / total_bytes, 1.0)
                    else:
                        fraction = 0.0  # size unknown: bar stays empty
                    i = int(scale * fraction)
                    a = "=" * i
                    b = "." * (scale - i)
                    c = (i / scale) * 100
                    dur = time.perf_counter() - start
                    speed = received / 1024 / dur if dur > 0 else 0.0
                    print("\r下载进度:{0:^3.0f}%[{1:}>{2:}] 耗时:{3:.2f}s 平均下载速度:{4:.2f}KB/S。".format(c, a, b, dur, speed),
                          end="")
                print("\n视频文件下载完毕,存放于:【{0:}】。".format(full_path))

    def sprider_start(self, author_url):
        """Run the whole crawl for one creator.

        Collects the video IDs, resolves each to a download URL and title,
        and downloads every video into ``<current working dir>/videos``.
        A video whose information or download fails is reported and skipped
        so that one bad entry does not abort the batch.

        Args:
            author_url: URL of the creator's home page.
        """
        # Stage 1: collect the IDs of all videos of the creator.
        print("\n获取UP主全部短视频的ID...")
        ids = self.get_video_ids(author_url)
        print("获取完毕!共获取短视频ID{}个!".format(len(ids)))

        # Stage 2: resolve every ID to (download URL, title).
        print("\n根据短视频的ID获取短视频的下载地址、标题信息...")
        videos_info = []
        for video_id in ids:
            try:
                video_info = self.get_video_info(video_id)
            except (requests.RequestException, KeyError, IndexError,
                    TypeError, ValueError) as err:
                print("获取短视频【{}】的信息失败,已跳过:{}".format(video_id, err))
                continue
            videos_info.append(video_info)
            print("短视频标题:【{0:}】;下载地址:【{1:}】".format(video_info[1], video_info[0]))

        # Stage 3: download everything that could be resolved.
        print("\n开始批量下载短视频:")
        path = os.path.join(os.getcwd(), "videos")
        total = len(videos_info)
        for index, (url, title) in enumerate(videos_info, start=1):
            print("\n将下载第【{0:0>4}/{1:0>4}】个短视频:".format(index, total))
            print("=" * 50)
            try:
                self.download_video(path, url, title)
            except (requests.RequestException, OSError) as err:
                print("\n短视频【{}】下载失败,已跳过:{}".format(title, err))




if __name__ == "__main__":
    # Script entry point: crawl and download every short video published on
    # the creator home page given below.
    Dyopen().sprider_start(
        "https://www.douyin.com/user/MS4wLjABAAAAN0DV9zb5-zgJ2TYFLfAtxptcn_1qWHpzy3QTJFfrztc"
    )

标签：info,视频,get,Python,爬取,抖音,video,print,ID
From： https://blog.51cto.com/u_15924937/5976121

Python爬取抖音创作者所有短视频

相关文章

赞助商

阅读排行