
Two examples of asynchronous requests


Downloading Baidu images (the search API is queried page by page with requests; the images on each page are then downloaded concurrently with httpx + asyncio):

import asyncio
import json
import random
import re
import httpx
import requests
import time
from urllib import parse
import os
from loguru import logger

logger.add("children.log", rotation="23:59", encoding="utf-8")


class BaiduPicture:

    def __init__(self, query_list):
        self.query_list = query_list
        # four fields are formatted into the URL: word, queryWord, pn, and a timestamp
        self.base_url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10807585482968436429&ipn=rj&ct=201326592&is=&fp=result&fr=&word={}&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm=5a&{}='
        self.search_headers = {
            "Accept": "text/plain, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Referer": "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=detail&fr=&hs=0&xthttps=111110&sf=1&fmq=1652751245395_R&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%93%88%E5%93%88&oq=%E5%93%88%E5%93%88&rsp=-1",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Google Chrome\";v=\"101\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
        self.search_cookies = {
            "BDqhfp": "%E5%93%88%E5%93%88%26%260-10-1undefined%26%26708%26%262",
            "BIDUPSID": "29D534423307903C62A41306DE256BDB",
            "PSTM": "1646999926",
            "BAIDUID": "44E4B3A47C120FD98AC4C1C1B43B1641:FG=1",
            "indexPageSugList": "%5B%22%E5%84%BF%E7%AB%A5%E8%B6%B4%E7%88%AC%22%2C%22%E5%84%BF%E7%AB%A5%E8%B6%B3%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E7%AF%AE%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E4%B9%92%E4%B9%93%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%8B%8D%E7%9A%AE%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%B8%B8%E6%B3%B3%22%2C%22%E5%84%BF%E7%AB%A5%E8%8D%A1%E7%A7%8B%E5%8D%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%BB%91%E6%BB%91%E6%A2%AF%22%2C%22%E5%84%BF%E7%AB%A5%E5%90%83%E9%A5%AD%22%5D",
            "BDORZ": "B490B5EBF6F3CD402E515D22BCDA1598",
            "BAIDUID_BFESS": "44E4B3A47C120FD98AC4C1C1B43B1641:FG=1",
            "BA_HECTOR": "058h252g8l8h2k04n91h843p10r",
            "H_PS_PSSID": "31253_35911_36165_34584_35979_36055_36337_26350_36301_36311",
            "delPer": "0",
            "PSINO": "6",
            "BDRCVFR[X_XKQks0S63]": "mk3SLVN4HKm",
            "userFrom": "www.baidu.com",
            "firstShowTip": "1",
            "BDRCVFR[dG2JNJb_ajR]": "mk3SLVN4HKm",
            "ab_sr": "1.0.1_ZWE1OWY2NmRkNTUzYmRhMjFmYmNlNGQxMjQzOGEzNmQxNmYxYTgxZjgyNzNmOTYxMWI3MDczMWI3Nzc1ODk1OGM3YzU3Mjk5NTc5NzQwNWU2Nzg5OTc4MmIwNDg4MTZjMzI1ZGUxZTA4NmQwZGU4YzBhNWEzZmZiODgxYWUxMjhhMTU0YTljNmYzY2QyMTYxOWFmMzEwNTk3YTRhNzgzYg==",
            "BDRCVFR[-pGxjrCMryR]": "mk3SLVN4HKm"
        }
        self.img_headers = {
            "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://image.baidu.com/",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Google Chrome\";v=\"101\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "image",
            "sec-fetch-mode": "no-cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"
        }

    def get_search_data(self, current_query, pn):
        try:
            quote_query = parse.quote(current_query)
            temp_times = str(int(time.time() * 1000))
            url = self.base_url.format(quote_query, quote_query, pn, temp_times)
            response = requests.get(url=url, headers=self.search_headers, cookies=self.search_cookies)
            if response.status_code == 200:
                result = response.content.decode('utf-8')
                return result
            else:
                logger.error(
                    f"get_search status is {response.status_code} ,query is {current_query} ,page is {pn}")
                logger.info(f"response is {response.content.decode('utf-8')}")
        except Exception as search_err:
            # note: response may be unbound here if requests.get itself raised
            logger.error(f"get_search error, query is {current_query}, page is {pn}, err is {search_err}")

    def get_img_content(self, urls, current_query, pn):

        for url in urls:
            logger.info(f"get img ,url is {url}")
            try:
                content = requests.get(url=url, headers=self.img_headers).content
                if len(content) > 0:
                    with open('./{}/{}.jpg'.format(current_query, url[28:35]), 'wb') as f:
                        f.write(content)
                        f.flush()
                    print(f'{current_query}-{pn} downloaded one image')
                else:
                    print("image download failed: the response body is empty")
            except Exception as err:
                logger.error(f"get image content err,err is {err} ")

    async def get_content(self, client, url, current_query, pn):
        logger.info(f"get img ,url is {url}")
        try:
            res = await client.get(url)
            with open('./{}/{}.jpg'.format(current_query, url[28:35]), 'wb') as f:
                f.write(res.content)
                f.flush()
            print(f'{current_query}-{pn} downloaded one image')
        except Exception as err:
            logger.error(f"get image content err,err is {err} ")

    async def get_img_content_async(self, urls, current_query, pn):
        async with httpx.AsyncClient(headers=self.img_headers) as client:
            task_list = []
            for url in urls:
                req = self.get_content(client, url, current_query, pn)
                task = asyncio.create_task(req)
                task_list.append(task)
            await asyncio.gather(*task_list)

    def parse_search(self, html):
        num_pattern = re.compile('"displayNum":(.*?),')
        num_list = num_pattern.findall(html)
        return num_list

    def parse_search_img_url(self, html):
        stop = False
        url_pattern = re.compile('"middleURL":"(.*?)"')
        url_list = url_pattern.findall(html)
        if not url_list:
            try:
                dic = json.loads(html)
                data = dic.get("data")[0]
                if not data:
                    stop = True
                    print('no more image data returned, stop paging')
            except Exception as err:
                logger.error(f"parse json err.err is {err}")

        return stop, url_list

    # whether to fetch the following result pages
    def run(self, next_page=True):
        """
        next_page 是否翻页
        :param next_page:
        :return:
        """

        for query in self.query_list:
            try:
                if not os.path.exists('./{}/'.format(query)):
                    os.mkdir('./{}'.format(query))
                response = self.get_search_data(query, 0)

                num_list = self.parse_search(response)
                if num_list:
                    num = num_list[0]
                    logger.info(f'{query} has {num} images in total')
                else:
                    logger.error(f'{query}: could not get the total result count')
                    continue
                # 30 results per page; round the total up to a page count
                if int(num) % 30 == 0:
                    pages = int(num) // 30
                else:
                    pages = int(num) // 30 + 1

                if not next_page:
                    pages = 1

                for pn in range(pages):
                    try:
                        resp = self.get_search_data(query, pn * 30)
                        stop, urls = self.parse_search_img_url(resp)
                        logger.info(f"query is {query},page is {pn},urls is \n {urls}")
                        if stop:
                            break
                        if urls:
                            # self.get_img_content(urls, query, pn)  # synchronous version
                            # download the images for this page asynchronously
                            asyncio.run(self.get_img_content_async(urls, query, pn))
                        else:
                            logger.error(f"get origin err,can not get picture urls,query is {query},page is {pn}")
                            logger.info(f"parse img is none,resp  is \n {resp} ")
                    except Exception as for_num_err:
                        logger.error(f"for_num_err ,query is {query},err is {for_num_err}")
                    finally:
                        time.sleep(random.randint(3, 5))
            except Exception as for_query_err:
                logger.error(f"for_query_err ,query is {query},err is {for_query_err}")


if __name__ == '__main__':
    query = [
        # '少儿 乒乓 照片',
        '萌娃趴在床上', '婴儿趴在床上', '婴儿趴在地上', '萌娃趴在地上', '躺在床上小孩', '躺在地上小孩', '萌娃躺在床上', '萌娃躺在地上', '萌娃跑步', '小孩跑步', '儿童室内搭积木',
        '婴儿电动车', '小朋友滑滑梯照片', '儿童滑滑梯照片', '小朋友荡秋千照片', '小孩荡秋千照片', '儿童游泳照片', '儿童拍皮球照片', '儿童篮球照片', '儿童足球照片',
        '儿童舞台照片', '舞台上的小朋友', '儿童在卧室', '卧室里的小朋友', '儿童学步车', '看书的小朋友', '画画的小朋友', '小朋友写作业']



    print(len(query))
    bp = BaiduPicture(query_list=query)
    bp.run(next_page=False)
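
The part that actually runs concurrently is get_img_content_async: one httpx.AsyncClient is shared by all requests for a page, each download becomes a task, and asyncio.gather waits for the whole batch. Below is a minimal stand-alone sketch of the same pattern, with an asyncio.Semaphore added to cap how many downloads are in flight at once (fetch_one, fetch_all and the limit of 5 are illustrative names and values, not part of the script above):

import asyncio
import httpx

async def fetch_one(client, sem, url, path):
    # the shared semaphore caps the number of requests in flight
    async with sem:
        resp = await client.get(url)
        resp.raise_for_status()
        with open(path, 'wb') as f:
            f.write(resp.content)

async def fetch_all(urls, limit=5):
    sem = asyncio.Semaphore(limit)
    async with httpx.AsyncClient() as client:
        tasks = [asyncio.create_task(fetch_one(client, sem, url, f'{i}.jpg'))
                 for i, url in enumerate(urls)]
        await asyncio.gather(*tasks)

# usage: asyncio.run(fetch_all(list_of_image_urls))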

 

 

Downloading a novel asynchronously (the chapter catalog is fetched synchronously, then each chapter's content is fetched and saved with aiohttp + aiofiles):

# URL notes
# chapter catalog (chapter ids)
# http://dushu.baidu.com/api/pc/getCatalog?data={book_id:4306063500}

# first chapter
# http://dushu.baidu.com/api/pc/getChapterContent?data={book_id:4306063500,cid:4306063500|11348571,need_bookinfo:1}

import requests
import aiohttp
import asyncio
import json
import aiofiles

url = ''
# html = requests.get(url)
'''
Steps:
1. Synchronously fetch the catalog (all chapter titles and ids).
2. Asynchronously fetch the content of each chapter.
'''


async def downwenzhang(cid, bid, title):
    # build the request URL for a single chapter
    data = {
        'book_id': bid,
        'cid': f"{bid}|{cid}",
        'need_bookinfo': 1
    }
    data = json.dumps(data)
    url = f'http://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    # request the chapter content and write it to <title>.txt
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()

            async with aiofiles.open(title+".txt",mode='w',encoding='utf-8') as f:
                await f.write(dic['data']['novel']['content'])

async def getCatalog(url):
    # step 1: fetch the catalog synchronously
    resp = requests.get(url)
    txt = resp.json()
    tasks = []
    print(txt)
    for item in txt["data"]["novel"]["items"]:
        title = item["title"]
        cid = item["cid"]
        tasks.append(downwenzhang(cid, "4306063500", title))
    # step 2: run all chapter downloads concurrently
    # (asyncio.wait no longer accepts bare coroutines on recent Python, so gather is used)
    await asyncio.gather(*tasks)


if __name__ == '__main__':
    bid = "4306063500"
    url ='http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"4306063500"}'
    asyncio.run(getCatalog(url))
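
One caveat: getCatalog calls the blocking requests.get from inside a coroutine, so the event loop is stalled until the catalog response arrives. That matches step 1 of the plan above, but the same step can also be done with aiohttp so nothing blocks. A possible fully asynchronous variant, reusing the downwenzhang coroutine defined earlier (the getCatalogAsync name and the content_type=None relaxation are assumptions, not from the original post):

async def getCatalogAsync(url, bid):
    # fetch the catalog with aiohttp instead of blocking requests.get
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # content_type=None skips aiohttp's strict application/json check
            catalog = await resp.json(content_type=None)
    # download every chapter concurrently, reusing downwenzhang from above
    tasks = [asyncio.create_task(downwenzhang(item["cid"], bid, item["title"]))
             for item in catalog["data"]["novel"]["items"]]
    await asyncio.gather(*tasks)

# usage: asyncio.run(getCatalogAsync(url, bid))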

 

From: https://www.cnblogs.com/tjp40922/p/17585472.html
