An example of an asynchronous crawler:
import aiohttp
import asyncio
import re
import os

os.environ['NO_PROXY'] = 'www.baidu.com'


class Asyn():
    def __init__(self):
        self.__headers = {
            'authority': 'go.drugbank.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cache-control': 'no-cache',
            'cookie': 'xxxxx',
            'referer': 'https://go.drugbank.com/unearth/q?query=*&button=&searcher=drugs',
            'sec-ch-ua': '"Chromium";v="112", "Microsoft Edge";v="112", "Not:A-Brand";v="99"',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
        }
    async def __fetch(self, session, url):
        print("Sending request:", url)
        async with session.get(url, ssl=False, headers=self.__headers) as response:
            content = await response.text()
            try:
                # first list: drug names
                info1 = re.findall(r'href="/indications/.*?">(.*?)</a', content)
                if not info1:
                    print("Reached the last page")
                    return
            except Exception as e:
                print(f"Reached the last page, error={e}")
                return
            info2 = re.findall(r'<div class="db-matches"><a (.*?)</a></div>', content)
            info2_new = []  # second list: ingredients
            for i in info2:
                # strip the residual HTML and normalise the separator
                i = (i.replace('href="/drugs/', '')
                     .replace('">', ':')
                     .replace('</a>', '')
                     .replace('<a', '')
                     .replace(' / ', '【/】'))
                info2_new.append(i)
            for yaoming, chenfen in zip(info1, info2_new):
                dic = {
                    "drug_name": yaoming,
                    "ingredients": chenfen
                }
                print(dic)
            with open('async_scrape.txt', 'a', encoding='utf-8') as f:
                f.write(f'{len(info1), info1}\n{len(info2_new), info2_new}\n')
    async def main(self):
        page = int(input("Enter the number of pages: "))
        async with aiohttp.ClientSession() as session:
            url_list = [
                f'https://go.drugbank.com/unearth/q?approved=1&ca=0&eu=0&page={i}&query=%2A&searcher=indications'
                for i in range(1, page + 1)
            ]
            tasks = [asyncio.create_task(self.__fetch(session, url)) for url in url_list]
            await asyncio.wait(tasks)


if __name__ == '__main__':
    spider = Asyn()
    asyncio.run(spider.main())
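A side note, not from the original post: asyncio.wait(tasks) only waits for the tasks to finish and does not collect their return values in order. If the fetch coroutine returned the page data instead of writing it to a file, asyncio.gather would be the usual alternative. A minimal sketch, assuming the same URL list; the run_with_gather name and its inner fetch helper are made up for illustration:

import aiohttp
import asyncio

async def run_with_gather(urls):
    """Fetch all pages concurrently and return the bodies in the same order as urls."""
    async with aiohttp.ClientSession() as session:
        async def fetch(url):
            async with session.get(url, ssl=False) as resp:
                return await resp.text()
        # gather schedules every coroutine concurrently; return_exceptions=True
        # keeps one failed request from cancelling the whole batch
        return await asyncio.gather(*(fetch(u) for u in urls), return_exceptions=True)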
Besides GET requests, aiohttp also supports the other HTTP methods, such as POST, PUT and DELETE, and they are used in much the same way as in requests.
What you can read from the response object:
data = {'key': 'value'}  # form payload to send
async with aiohttp.ClientSession() as session:
    async with session.post('https://www.httpbin.org/post', data=data) as response:
        print('status:', response.status)      # status code
        print('headers:', response.headers)    # response headers
        print('body:', await response.text())  # response body as text
        print('bytes:', await response.read()) # raw response body bytes
        print('json:', await response.json())  # response body parsed as JSON
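As a minimal sketch of the other methods mentioned above (the httpbin URLs and the payload are only illustrative, not from the original post), PUT and DELETE follow exactly the same pattern as session.post:

import aiohttp
import asyncio

async def other_methods_demo():
    async with aiohttp.ClientSession() as session:
        # PUT with a JSON body (the payload is made up for the demo)
        async with session.put('https://www.httpbin.org/put', json={'k': 'v'}) as resp:
            print('PUT status:', resp.status)
        # DELETE usually needs no body
        async with session.delete('https://www.httpbin.org/delete') as resp:
            print('DELETE status:', resp.status)

asyncio.run(other_methods_demo())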
Timeouts can also be configured, for example a 1-second total timeout:
async def main():
    # set a 1-second total timeout for every request made through this session
    timeout = aiohttp.ClientTimeout(total=1)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        url_list = [
            f'https://go.drugbank.com/unearth/q?approved=1&ca=0&eu=0&page={i}&query=%2A&searcher=indications'
            for i in range(1, 200)
        ]
        # fetch is the request coroutine from the example above, written as a plain function
        tasks = [asyncio.create_task(fetch(session, url)) for url in url_list]
        await asyncio.wait(tasks)
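When that timeout is exceeded, the request raises asyncio.TimeoutError. A minimal sketch of catching it per request so that one slow page does not abort the whole batch (the fetch_with_timeout name is made up for illustration):

import asyncio

async def fetch_with_timeout(session, url):
    try:
        async with session.get(url, ssl=False) as response:
            return await response.text()
    except asyncio.TimeoutError:
        # the ClientTimeout configured on the session was exceeded
        print('timeout:', url)
        return None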
Further reading: https://z.itpub.net/article/detail/602E65B824B2FC8A6AB5BDC2A1279822