import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures

# Read the URLs from the .txt file
with open("urls.txt", "r") as file:
    urls = file.read().splitlines()

# Collected (URL, title) pairs
data = []

def fetch_title(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else ""
    except requests.RequestException as exc:
        # Don't let one bad URL kill the whole run; record the error instead
        title = f"ERROR: {exc}"
    return (url, title)

with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    futures = [executor.submit(fetch_title, url) for url in urls]

    for future in concurrent.futures.as_completed(futures):
        data.append(future.result())

# Write the URLs and titles to an Excel file. openpyxl only needs to be
# installed as the engine; the manual Workbook/ExcelWriter setup is
# unnecessary, and assigning writer.book or calling writer.save() breaks
# on current pandas versions.
df = pd.DataFrame(data, columns=["URL", "Title"])
df.to_excel("titles.xlsx", index=False, engine="openpyxl")

Since all results are collected in memory and only written to Excel in one pass at the end, keep the number of URLs per run moderate.
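If the URL list is large, one way around that single-bulk-write limit is to append rows to the workbook in batches as results arrive, so memory only ever holds one chunk at a time. The sketch below is not from the original post: it reuses the fetch_title function and urls list defined above, and the batch size of 100 is an arbitrary assumption.

import concurrent.futures
from openpyxl import Workbook

CHUNK = 100  # assumed batch size; tune to taste

book = Workbook()
sheet = book.active
sheet.append(["URL", "Title"])  # header row

with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    for start in range(0, len(urls), CHUNK):
        batch = urls[start:start + CHUNK]
        # executor.map preserves input order within the batch
        for url, title in executor.map(fetch_title, batch):
            sheet.append([url, title])
        book.save("titles.xlsx")  # persist after each batch

Saving after every batch also means a crash partway through loses at most the current chunk rather than the entire run.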