
【15.0】Case Studies


【一】Multiprocessing and Multithreading

This case scrapes images from the pic.netbian.com 4K anime listing pages and compares three download strategies: serial, one process per image, and one thread per image.

import os
import time
from multiprocessing import Process
from threading import Thread
import requests
from lxml import etree
from fake_useragent import UserAgent


class BaseSpider(object):
    def __init__(self):
        self.url_list = self.create_url_list()
        # self.url_list = ['https://pic.netbian.com/4kdongman/']
        self.headers = {
            'User-Agent': UserAgent().random
        }
        self.BASE_DIR = os.path.dirname(__file__)
        self.file_name_path = self.create_file_name()

    # Build the list of listing-page URLs (page 1 has no index_N suffix)
    def create_url_list(self):
        url_list = []
        for i in range(1, 10):
            if i == 1:
                index_url = 'https://pic.netbian.com/4kdongman/'
                url_list.append(index_url)
            else:
                index_url = f'https://pic.netbian.com/4kdongman/index_{i}.html'
                url_list.append(index_url)
        return url_list

    def get_tree(self, page_text):
        tree = etree.HTML(page_text)
        return tree

    def get_page_text(self, url, encoding='gbk'):
        # pic.netbian.com serves GBK-encoded pages, so decode as gbk by default
        response = requests.get(url, headers=self.headers)
        response.encoding = encoding
        return response.text

    def create_file_name(self, path='img'):
        file_name_path = os.path.join(self.BASE_DIR, path)
        os.makedirs(file_name_path, exist_ok=True)
        return file_name_path


class SpiderImg(BaseSpider):

    def __init__(self):
        super().__init__()

    # Note: using a staticmethod as a decorator inside the class body
    # requires Python 3.10+, where staticmethod objects became callable.
    @staticmethod
    def timer(func):
        def inner(*args, **kwargs):
            start_time = time.time()
            res = func(*args, **kwargs)
            print(f" {func.__name__} | 总耗时 :>>>> {time.time() - start_time} s")
            return res

        return inner

    def spider_index_tree(self):
        tree_list = []
        for url in self.url_list:
            # fetch the page source for each listing page
            page_text = self.get_page_text(url=url)
            tree = self.get_tree(page_text=page_text)
            tree_list.append(tree)
        return tree_list

    def __get_tree_data(self, tree):
        img_data_list = []
        li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            # ./a/img
            img_title = li.xpath('./a/img/@alt')[0]
            img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            img_data_list.append({'img_title': img_title, 'img_src': img_src})
        return img_data_list

    def spider_index_img_data(self):
        img_data_list = []
        tree_list = self.spider_index_tree()
        for tree in tree_list:
            img_list = self.__get_tree_data(tree=tree)
            # extend flattens the per-page lists into one list of dicts
            img_data_list.extend(img_list)
        return img_data_list

    def download(self, img_src, img_title):
        response = requests.get(url=img_src, headers=self.headers)
        file_path = os.path.join(self.file_name_path, f'{img_title}.png')
        with open(file_path, mode='wb') as fp:
            # write in 1 KB chunks; the default chunk_size=1 iterates byte by byte
            for data in response.iter_content(chunk_size=1024):
                fp.write(data)
        print(f"Image :>>>> {img_title} saved!")

    @timer
    def download_normal(self):
        img_data_list = self.spider_index_img_data()
        for img_data in img_data_list:
            img_title = img_data.get('img_title')
            img_src = img_data.get('img_src')
            self.download(img_src=img_src, img_title=img_title)

    @timer
    def download_process(self):
        img_data_list = self.spider_index_img_data()
        # one process per image; each process pays startup and pickling overhead
        task_list = [Process(target=self.download, args=(img_data.get('img_src'), img_data.get('img_title'))) for
                     img_data in img_data_list]
        for task in task_list:
            task.start()
        for task in task_list:
            task.join()

    @timer
    def download_thread(self):
        img_data_list = self.spider_index_img_data()
        # one thread per image; the GIL is released while waiting on the network
        task_list = [Thread(target=self.download, args=(img_data.get('img_src'), img_data.get('img_title'))) for
                     img_data in img_data_list]
        for task in task_list:
            task.start()
        for task in task_list:
            task.join()


if __name__ == '__main__':
    spider = SpiderImg()
    # Downloading is I/O-bound, so threads win: process startup and argument
    # pickling make the multiprocessing version even slower than the serial one.
    # spider.download_normal()  #  download_normal | total time :>>>> 31.3393292427063 s
    # spider.download_process()  #  download_process | total time :>>>> 34.51722550392151 s
    spider.download_thread()  # download_thread | total time :>>>> 15.272460699081421 s

# A quick reminder of how list.extend merges lists, as used in spider_index_img_data:
# num_list_one = [1, 2, 3, 4]
# num_list_two = [7, 8, 9, 10]
# num_list_new = []
# num_list_new.extend(num_list_one)
# num_list_new.extend(num_list_two)
# print(num_list_new)  # [1, 2, 3, 4, 7, 8, 9, 10]
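
One thread per image works for a few hundred downloads, but a bounded pool puts less strain on the server and the OS. Below is a minimal sketch of the same download step using concurrent.futures, assuming the SpiderImg class above is available; the pool size of 10 and the download_pool name are my assumptions, not from the original.

from concurrent.futures import ThreadPoolExecutor

def download_pool(spider, max_workers=10):
    # same work as download_thread, but at most max_workers downloads run at once
    img_data_list = spider.spider_index_img_data()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for img_data in img_data_list:
            pool.submit(spider.download,
                        img_src=img_data.get('img_src'),
                        img_title=img_data.get('img_title'))
    # leaving the with-block waits for all submitted downloads to finish

# usage: download_pool(SpiderImg())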

【二】Coroutines

The same spider rewritten with asyncio and aiohttp: a single thread drives all downloads concurrently through the event loop.

import asyncio
import os
import time

from fake_useragent import UserAgent
import aiohttp
from lxml import etree

headers = {
    'User-Agent': UserAgent().random
}
BASE_DIR = os.path.dirname(__file__)


def create_file_name(path='img'):
    file_name_path = os.path.join(BASE_DIR, path)
    os.makedirs(file_name_path, exist_ok=True)
    return file_name_path


file_name_path = create_file_name()


# (nothing in here awaits; it is async only so it composes with the other coroutines)
async def create_url_list():
    url_list = []
    for i in range(1, 10):
        if i == 1:
            index_url = 'https://pic.netbian.com/4kdongman/'
            url_list.append(index_url)
        else:
            index_url = f'https://pic.netbian.com/4kdongman/index_{i}.html'
            url_list.append(index_url)
    return url_list


async def get_tree(page_text):
    tree = etree.HTML(page_text)
    return tree


async def get_page_text(tag_url, encoding='gbk'):
    async with aiohttp.ClientSession() as session:
        # if you hit an SSL error here, passing ssl=False usually works around it
        async with session.get(url=tag_url, headers=headers, ssl=False) as response:
            # use the encoding parameter instead of hardcoding 'gbk'
            page_text = await response.text(encoding=encoding)
    return page_text


async def spider_index_tree():
    tree_list = []
    url_list = await create_url_list()
    # url_list = ['https://pic.netbian.com/4kdongman/']
    for url in url_list:
        # fetch the page source for each listing page
        page_text = await get_page_text(tag_url=url)
        tree = await get_tree(page_text=page_text)
        tree_list.append(tree)
    return tree_list


async def get_tree_data(tree):
    img_data_list = []
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in li_list:
        # ./a/img
        img_title = li.xpath('./a/img/@alt')[0]
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_data_list.append({'img_title': img_title, 'img_src': img_src})
    return img_data_list


async def spider_index_img_data():
    img_data_list = []
    tree_list = await spider_index_tree()
    for tree in tree_list:
        img_list = await get_tree_data(tree=tree)
        # extend flattens the per-page lists into one list of dicts
        img_data_list.extend(img_list)
    return img_data_list


async def download(img_src, img_title):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=img_src, headers=headers, ssl=False) as response:
            data_all = await response.read()
            file_path = os.path.join(file_name_path, f'{img_title}.png')
            # note: the file write itself is blocking; fine for small images
            with open(file_path, mode='wb') as fp:
                fp.write(data_all)
            print(f"Image :>>>> {img_title} saved!")


async def main():
    img_data_list = await spider_index_img_data()
    # wrap each download in a Task so they all run concurrently
    task_list = [asyncio.create_task(download(img_src=img_data.get('img_src'), img_title=img_data.get('img_title'))) for
                 img_data in img_data_list]
    # wait for every download task to finish
    await asyncio.wait(task_list)


if __name__ == '__main__':
    start_time = time.time()
    # start the event loop
    asyncio.run(main())
    print(f"Total time :>>>> {time.time() - start_time} s")

    # Total time :>>>> 6.5860209465026855 s
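
Creating a task per image fires every request at once, which a real site may rate-limit. Below is a minimal sketch that bounds concurrency with asyncio.Semaphore, reusing the coroutines above; the limit of 10 and the main_limited name are my assumptions, not from the original.

async def main_limited(limit=10):
    img_data_list = await spider_index_img_data()
    sem = asyncio.Semaphore(limit)  # at most `limit` downloads in flight

    async def guarded(img_data):
        async with sem:  # wait for a free slot before downloading
            await download(img_src=img_data.get('img_src'),
                           img_title=img_data.get('img_title'))

    await asyncio.gather(*(guarded(img_data) for img_data in img_data_list))

# usage: asyncio.run(main_limited())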

From: https://www.cnblogs.com/dream-ze/p/17982405
