异步爬虫
- 多线程
- 多进程
- 协程
多线程与多进程
进程:运行中的程序。每次我们执行一个程序,操作系统都会自动为这个程序准备一些必要的资源(如:分配内存、创建一个能够执行的线程)
线程:程序内,可以直接被CPU调度的执行过程,是操作系统能够进行运算调度的最小单位,它被包含在进程之中,是进程中的实际运作单位
# 两种写法
# 第一种
from threading import Thread
# 创建任务
def func(name, count=10):
    """Print ``name`` next to a running counter, one pair per line.

    This is the task each demo thread runs.

    Args:
        name: Label printed at the start of every line.
        count: Number of lines to print; the counter runs from 0 to
            ``count - 1``. Defaults to 10.
    """
    for i in range(count):
        print(name, i)
def main():
    """Run ``func`` in three threads at once, one per name.

    Every thread is started before any is joined, so the workers run
    concurrently; the call returns only after all of them have finished.
    """
    names = ("周杰伦", "马斯克", "周星驰")
    # Create the threads. ``args`` must be a tuple, hence ``(name,)``.
    workers = [Thread(target=func, args=(name,)) for name in names]
    # Start the threads.
    for worker in workers:
        worker.start()
    # Wait for the threads so main() does not return while work is pending.
    for worker in workers:
        worker.join()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# 第二种 用方法来写 不会
多线程写法
# 两种写法
# 第一种
from threading import Thread
# 创建任务
def func(name, count=10):
    """Print ``name`` next to a running counter, one pair per line.

    This is the task each demo thread runs.

    Args:
        name: Label printed at the start of every line.
        count: Number of lines to print; the counter runs from 0 to
            ``count - 1``. Defaults to 10.
    """
    for i in range(count):
        print(name, i)
def main():
    """Run ``func`` in three threads at once, one per name.

    Every thread is started before any is joined, so the workers run
    concurrently; the call returns only after all of them have finished.
    """
    names = ("周杰伦", "马斯克", "周星驰")
    # Create the threads. ``args`` must be a tuple, hence ``(name,)``.
    workers = [Thread(target=func, args=(name,)) for name in names]
    # Start the threads.
    for worker in workers:
        worker.start()
    # Wait for the threads so main() does not return while work is pending.
    for worker in workers:
        worker.join()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
多线程练习
from threading import Thread
import requests
from bs4 import BeautifulSoup
def func(index):
    """Download the thumbnail images listed on one page of the site.

    Fetches ``http://2chsck.cc/vodtype/1-{index}.html``, collects the
    ``<a>`` tags whose class is ``stui-vodlist__thumb lazyload`` and saves
    the image each one points to (its ``data-original`` attribute) as
    ``images/{index}- {title}.jpg``.

    Network failures are reported with ``print`` and skipped instead of
    raised, so one bad request does not kill the worker thread.

    Args:
        index: Page number; interpolated into the URL and the file names.
    """
    # Local import: ``os`` is only needed here to create the output folder.
    import os

    url = f"http://2chsck.cc/vodtype/1-{index}.html"
    try:
        # Without a timeout a stalled server would block this thread forever.
        resp = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"{url} failed: {exc}")
        return
    page = BeautifulSoup(resp.text, "html.parser")

    # open() does not create missing directories, so make sure it exists.
    os.makedirs("images", exist_ok=True)

    # Thumbnail anchors; the image URL is in their ``data-original`` attribute.
    thumbs = page.find_all("a", attrs={"class": "stui-vodlist__thumb lazyload"})
    for item in thumbs:
        jpg_href = item.get("data-original")
        if not jpg_href:
            # No image URL on this tag; requests.get(None) would raise.
            continue
        try:
            photo = requests.get(jpg_href, timeout=10)
        except requests.RequestException as exc:
            print(f"{jpg_href} failed: {exc}")
            continue
        # The title becomes part of the file name, so neutralise path
        # separators that would otherwise point into another directory.
        jpg_name = str(item.get("title")).replace("/", "_").replace("\\", "_")
        with open(f"images/{index}- {jpg_name}.jpg", mode="wb") as f:
            f.write(photo.content)
def main():
    """Download pages 101 through 200, using one thread per page.

    All threads are started first and then joined, so the call returns
    only after every page has been processed.
    """
    # The page number is passed to func as a string, as before.
    workers = [Thread(target=func, args=(f"{i}",)) for i in range(101, 201)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
线程池
# import requests
# from bs4 import BeautifulSoup
# from concurrent.futures import ThreadPoolExecutor
#
#
# def func(page):
# url = f"https://pic.netbian.com/4kmeinv/index_{page}.html"
# resp = requests.get(url)
# resp.encoding = "gbk"
# html = resp.text
# page = BeautifulSoup(html, "html.parser")
# ul = page.find("div", attrs={"class": "slist"})
# img = ul.findAll("img")
#
# for item in img:
# src = "https://pic.netbian.com" + item.get("src")
# photo = requests.get(src)
# # print(src)
# try:
# with open(f"{src}.jpg", mode="wb") as f:
# f.write(photo.content)
# except:
# print("error")
#
#
# def main():
# # 线程池
# with ThreadPoolExecutor(3) as t:
# for i in range(2, 10):
# t.submit(func, i)
#
#
# if __name__ == "__main__":
# main()
# ---------------------- ^待查询^ -----------------------------
from concurrent.futures import ThreadPoolExecutor
线程池案例
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
from threading import Lock

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
}

# Output file; rows are appended, so repeated runs accumulate data.
CSV_PATH = "新发地.csv"

# download() runs on several pool threads at once; this lock keeps the rows
# of one page together instead of interleaving them with another page's rows.
_csv_lock = Lock()


def download(url):
    """Scrape one listing page and append its rows to ``CSV_PATH``.

    For every ``<a>`` under ``<li class="market-list-item">`` the text of
    at most the first four ``<span>`` children is joined with commas and
    written as one line.

    Args:
        url: Address of the listing page to fetch.
    """
    # Without a timeout a stalled request would pin a pool thread forever.
    resp = requests.get(url, headers=headers, timeout=10)
    tree = etree.HTML(resp.text)
    a_list = tree.xpath("//li[@class='market-list-item']/a")
    rows = [",".join(a.xpath("./span/text()")[:4]) + "\n" for a in a_list]
    # Open, write and close under the lock: no handle is left open and each
    # page's rows land in the file as one contiguous chunk.
    with _csv_lock, open(CSV_PATH, mode="a", encoding="utf-8") as out:
        out.writelines(rows)
    print(f"{url} 已完成")


if __name__ == "__main__":
    futures = {}
    with ThreadPoolExecutor(10) as t:
        for i in range(1, 100):
            # Note from the author: too many requests trigger a verification
            # challenge on the site; unresolved as of 2023-04-18 13:16:28.
            url = f"https://www.cnhnb.com/hangqing/cdlist-2003192-0-0-0-0-{i}/"
            futures[url] = t.submit(download, url)
    # submit() stores a task's exception on its future; report failures here
    # so they are not lost silently.
    for url, future in futures.items():
        error = future.exception()
        if error is not None:
            print(f"{url} 失败: {error!r}")
多进程
from multiprocessing import Process
def func(name, count=1000):
    """Print ``name`` next to a running counter, one pair per line.

    This is the task each demo process runs.

    Args:
        name: Label printed at the start of every line.
        count: Number of lines to print; the counter runs from 0 to
            ``count - 1``. Defaults to 1000.
    """
    for i in range(count):
        print(name, i)
# The guard matters for multiprocessing: start methods that re-import this
# module in the child would otherwise run this code again in every child.
if __name__ == '__main__':
    p1 = Process(target=func, args=("周杰伦",))
    p2 = Process(target=func, args=("林俊杰",))
    # Start both processes first so they run in parallel ...
    p1.start()
    p2.start()
    # ... then wait for both before the parent continues.
    p1.join()
    p2.join()
"""
何时使用多线程 何时使用多进程
1. 多线程:任务相对统一,互相特别地相似
2. 多进程:多个任务相互独立,很少有交集
"""
标签:__,异步,Thread,爬虫,func,import,main,def
From: https://www.cnblogs.com/sroot/p/17414902.html