Downloading Baidu images:

The script below queries Baidu's image-search JSON API for each keyword, extracts the middleURL of every result, and saves the pictures into a per-query folder. A synchronous download path (requests) is kept for reference; the run loop uses the asynchronous path (httpx + asyncio) by default.
import asyncio
import json
import os
import random
import re
import time
from urllib import parse

import httpx
import requests
from loguru import logger

logger.add("children.log", rotation="23:59", encoding="utf-8")


class BaiduPicture:
    def __init__(self, query_list):
        self.query_list = query_list
        # Four fields get formatted into the URL: word, queryWord, pn and a millisecond timestamp
        self.base_url = ('https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10807585482968436429'
                         '&ipn=rj&ct=201326592&is=&fp=result&fr=&word={}&queryWord={}&cl=2&lm=-1&ie=utf-8'
                         '&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height='
                         '&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm=5a&{}=')
        self.search_headers = {
            "Accept": "text/plain, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Referer": "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=detail&fr=&hs=0&xthttps=111110&sf=1&fmq=1652751245395_R&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%93%88%E5%93%88&oq=%E5%93%88%E5%93%88&rsp=-1",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Google Chrome\";v=\"101\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
        self.search_cookies = {
            "BDqhfp": "%E5%93%88%E5%93%88%26%260-10-1undefined%26%26708%26%262",
            "BIDUPSID": "29D534423307903C62A41306DE256BDB",
            "PSTM": "1646999926",
            "BAIDUID": "44E4B3A47C120FD98AC4C1C1B43B1641:FG=1",
            "indexPageSugList": "%5B%22%E5%84%BF%E7%AB%A5%E8%B6%B4%E7%88%AC%22%2C%22%E5%84%BF%E7%AB%A5%E8%B6%B3%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E7%AF%AE%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E4%B9%92%E4%B9%93%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%8B%8D%E7%9A%AE%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%B8%B8%E6%B3%B3%22%2C%22%E5%84%BF%E7%AB%A5%E8%8D%A1%E7%A7%8B%E5%8D%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%BB%91%E6%BB%91%E6%A2%AF%22%2C%22%E5%84%BF%E7%AB%A5%E5%90%83%E9%A5%AD%22%5D",
            "BDORZ": "B490B5EBF6F3CD402E515D22BCDA1598",
            "BAIDUID_BFESS": "44E4B3A47C120FD98AC4C1C1B43B1641:FG=1",
            "BA_HECTOR": "058h252g8l8h2k04n91h843p10r",
            "H_PS_PSSID": "31253_35911_36165_34584_35979_36055_36337_26350_36301_36311",
            "delPer": "0",
            "PSINO": "6",
            "BDRCVFR[X_XKQks0S63]": "mk3SLVN4HKm",
            "userFrom": "www.baidu.com",
            "firstShowTip": "1",
            "BDRCVFR[dG2JNJb_ajR]": "mk3SLVN4HKm",
            "ab_sr": "1.0.1_ZWE1OWY2NmRkNTUzYmRhMjFmYmNlNGQxMjQzOGEzNmQxNmYxYTgxZjgyNzNmOTYxMWI3MDczMWI3Nzc1ODk1OGM3YzU3Mjk5NTc5NzQwNWU2Nzg5OTc4MmIwNDg4MTZjMzI1ZGUxZTA4NmQwZGU4YzBhNWEzZmZiODgxYWUxMjhhMTU0YTljNmYzY2QyMTYxOWFmMzEwNTk3YTRhNzgzYg==",
            "BDRCVFR[-pGxjrCMryR]": "mk3SLVN4HKm"
        }
        self.img_headers = {
            "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://image.baidu.com/",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Google Chrome\";v=\"101\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "image",
            "sec-fetch-mode": "no-cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"
        }

    def get_search_data(self, current_query, pn):
        try:
            quote_query = parse.quote(current_query)
            temp_times = str(int(time.time() * 1000))
            url = self.base_url.format(quote_query, quote_query, pn, temp_times)
            response = requests.get(url=url, headers=self.search_headers, cookies=self.search_cookies)
            if response.status_code == 200:
                return response.content.decode('utf-8')
            logger.error(f"get_search status is {response.status_code}, query is {current_query}, page is {pn}")
            logger.info(f"response is {response.content.decode('utf-8')}")
        except Exception as search_err:
            logger.error(f"get_search error, query is {current_query}, page is {pn}, err is {search_err}")

    def get_img_content(self, urls, current_query, pn):
        # Synchronous download path, kept for reference; run() uses the async path below
        for url in urls:
            logger.info(f"get img, url is {url}")
            try:
                content = requests.get(url=url, headers=self.img_headers).content
                if len(content) > 0:
                    # A slice of the URL doubles as a (crude) file name
                    with open('./{}/{}.jpg'.format(current_query, url[28:35]), 'wb') as f:
                        f.write(content)
                        f.flush()
                    print(f'{current_query}-{pn}: saved one picture')
                else:
                    print("download failed: the image response is empty")
            except Exception as err:
                logger.error(f"get image content err, err is {err}")

    async def get_content(self, client, url, current_query, pn):
        logger.info(f"get img, url is {url}")
        try:
            res = await client.get(url)
            with open('./{}/{}.jpg'.format(current_query, url[28:35]), 'wb') as f:
                f.write(res.content)
                f.flush()
            print(f'{current_query}-{pn}: saved one picture')
        except Exception as err:
            logger.error(f"get image content err, err is {err}")

    async def get_img_content_async(self, urls, current_query, pn):
        # One shared client; every image download runs as its own task
        async with httpx.AsyncClient(headers=self.img_headers) as client:
            task_list = []
            for url in urls:
                req = self.get_content(client, url, current_query, pn)
                task_list.append(asyncio.create_task(req))
            await asyncio.gather(*task_list)

    def parse_search(self, html):
        num_pattern = re.compile('"displayNum":(.*?),')
        return num_pattern.findall(html)

    def parse_search_img_url(self, html):
        # Returns (stop, url_list); stop=True means the result set is exhausted
        stop = False
        url_pattern = re.compile('"middleURL":"(.*?)"')
        url_list = url_pattern.findall(html)
        if not url_list:
            try:
                dic = json.loads(html)
                data = dic.get("data")[0]
                if not data:
                    stop = True
            except Exception as err:
                logger.error(f"parse json err, err is {err}")
        return stop, url_list

    def run(self, next_page=True):
        """
        :param next_page: whether to page through all results (False downloads only the first page)
        """
        for query in self.query_list:
            try:
                if not os.path.exists('./{}/'.format(query)):
                    os.mkdir('./{}'.format(query))
                response = self.get_search_data(query, 0)
                if not response:
                    logger.error(f'{query}: empty search response')
                    continue
                num_list = self.parse_search(response)
                if num_list:
                    num = num_list[0]
                    logger.info(f'{query} has {num} pictures in total')
                else:
                    logger.error(f'{query}: could not get the total result count')
                    continue
                # 30 results per page, so round the page count up
                if int(num) % 30 == 0:
                    pages = int(num) // 30
                else:
                    pages = int(num) // 30 + 1
                if not next_page:
                    pages = 1
                for pn in range(pages):
                    try:
                        resp = self.get_search_data(query, pn * 30)
                        stop, urls = self.parse_search_img_url(resp)
                        logger.info(f"query is {query}, page is {pn}, urls is \n {urls}")
                        if stop:
                            break
                        if urls:
                            # self.get_img_content(urls, query, pn)  # synchronous version
                            asyncio.run(self.get_img_content_async(urls, query, pn))
                        else:
                            logger.error(f"get origin err, can not get picture urls, query is {query}, page is {pn}")
                            logger.info(f"parse img is none, resp is \n {resp}")
                    except Exception as for_num_err:
                        logger.error(f"for_num_err, query is {query}, err is {for_num_err}")
                    finally:
                        time.sleep(random.randint(3, 5))
            except Exception as for_query_err:
                logger.error(f"for_query_err, query is {query}, err is {for_query_err}")


if __name__ == '__main__':
    query = [
        # '少儿 乒乓 照片',
        '萌娃趴在床上', '婴儿趴在床上', '婴儿趴在地上', '萌娃趴在地上',
        '躺在床上小孩', '躺在地上小孩', '萌娃躺在床上', '萌娃躺在地上',
        '萌娃跑步', '小孩跑步', '儿童室内搭积木', '婴儿电动车',
        '小朋友滑滑梯照片', '儿童滑滑梯照片', '小朋友荡秋千照片', '小孩荡秋千照片',
        '儿童游泳照片', '儿童拍皮球照片', '儿童篮球照片', '儿童足球照片',
        '儿童舞台照片', '舞台上的小朋友', '儿童在卧室', '卧室里的小朋友',
        '儿童学步车', '看书的小朋友', '画画的小朋友', '小朋友写作业']
    print(len(query))
    bp = BaiduPicture(query_list=query)
    bp.run(next_page=False)
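The async path above shares a single httpx.AsyncClient across all per-image coroutines and lets asyncio.gather drive them concurrently on one event loop. Here is a minimal, self-contained sketch of that same pattern, with example.com/example.org placeholders standing in for the real image URLs:

import asyncio

import httpx


async def fetch_one(client, url):
    # Each coroutine issues one GET; all of them share the client's connection pool
    resp = await client.get(url)
    resp.raise_for_status()
    return resp.content


async def fetch_all(urls):
    async with httpx.AsyncClient() as client:
        # gather() schedules every coroutine concurrently and returns results in input order
        return await asyncio.gather(*(fetch_one(client, u) for u in urls))


if __name__ == '__main__':
    # Placeholder URLs, for illustration only
    bodies = asyncio.run(fetch_all(['https://example.com', 'https://example.org']))
    print([len(b) for b in bodies])

One design note on the original: run() calls asyncio.run() once per result page, which creates and tears down a fresh event loop each time. That is fine at this scale, but a single long-lived loop for the whole crawl would avoid the repeated setup cost.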
Asynchronously downloading a novel:

This script fetches the chapter catalog of a Baidu Dushu novel synchronously, then downloads all chapters concurrently with aiohttp and writes each one to its own text file via aiofiles.
# URL notes
# chapter catalog (all chapter ids):
# http://dushu.baidu.com/api/pc/getCatalog?data={book_id:4306063500}
# chapter one:
# http://dushu.baidu.com/api/pc/getChapterContent?data={book_id:4306063500,cid:4306063500|11348571,need_bookinfo:1}
import asyncio
import json

import aiofiles
import aiohttp
import requests

'''
Steps:
1. Synchronous: fetch the titles of all chapters.
2. Asynchronous: fetch the content of every chapter.
'''


async def downwenzhang(cid, bid, title):
    # Build the chapter-content URL
    data = {
        'book_id': bid,
        'cid': f"{bid}|{cid}",
        'need_bookinfo': 1
    }
    data = json.dumps(data)
    url = f'http://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    # Request the chapter and write it straight to its own text file
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            async with aiofiles.open(title + ".txt", mode='w', encoding='utf-8') as f:
                await f.write(dic['data']['novel']['content'])


async def getCatalog(url):
    resp = requests.get(url)
    txt = resp.json()
    tasks = []
    for item in txt["data"]["novel"]["items"]:
        title = item["title"]
        cid = item["cid"]
        # asyncio.wait() needs Tasks; bare coroutines are rejected on Python 3.11+
        tasks.append(asyncio.create_task(downwenzhang(cid, "4306063500", title)))
    await asyncio.wait(tasks)


if __name__ == '__main__':
    bid = "4306063500"
    url = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"4306063500"}'
    asyncio.run(getCatalog(url))
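A detail worth calling out in getCatalog(): on Python 3.8+ passing bare coroutines to asyncio.wait() is deprecated, and 3.11 rejects them outright, which is why each coroutine is wrapped in asyncio.create_task() first. A minimal sketch of the two idiomatic ways to run a batch concurrently (work() is a stand-in coroutine, not part of the script above):

import asyncio


async def work(n):
    # Stand-in for any I/O-bound coroutine
    await asyncio.sleep(0.1)
    return n * n


async def main():
    # Option 1: gather() accepts coroutines directly and returns results in submission order
    results = await asyncio.gather(*(work(i) for i in range(5)))
    print(results)

    # Option 2: wait() needs Tasks and returns (done, pending) sets
    tasks = [asyncio.create_task(work(i)) for i in range(5)]
    done, _pending = await asyncio.wait(tasks)
    print(sorted(t.result() for t in done))


asyncio.run(main())

gather() is usually the better default here: it keeps result order and propagates the first exception directly, whereas wait() leaves exception handling to the caller via each task's result().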