"""Scrape poem titles and poem text from so.gushiwen.cn into a DataFrame.

The index page (``gushi/tangshi.aspx``) is fetched with ``requests`` and
parsed with BeautifulSoup to collect the poem links. Each linked page is
then opened in a Selenium-driven Edge browser, and the text of the first
element with class ``contson`` is stored next to the link text.

NOTE(review): a real browser is used because the poem pages reportedly load
their content via script, which a plain ``requests.get`` does not execute --
confirm this is still the case; it makes the run considerably slower.
"""
from urllib.parse import urljoin

import bs4
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service

# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences (the non-raw form triggers invalid-escape warnings).
driverPath = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
url = 'https://so.gushiwen.cn/'
url2 = 'gushi/tangshi.aspx'

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops us from parsing an HTTP error page as the index.
msg = requests.get(url=url + url2, timeout=30)
msg.raise_for_status()
soup = bs4.BeautifulSoup(msg.text, "html.parser")

# Presumably each 'typecont' element is one section of the index holding the
# poem links -- verify against the live page markup.
titles = soup.find_all(class_='typecont')

result = []

# Selenium 4.10+ no longer accepts the driver path as a positional argument;
# it has to be supplied through a Service object.
driver = webdriver.Edge(service=Service(executable_path=driverPath))
try:
    for title in titles:
        # href=True skips anchors without a link target instead of raising
        # KeyError on item['href'].
        for item in title.find_all('a', href=True):
            # urljoin resolves relative, root-relative and absolute hrefs
            # correctly, unlike plain string concatenation.
            driver.get(urljoin(url, item['href']))
            a = driver.find_element(By.CLASS_NAME, 'contson').text
            result.append((item.text, a))
finally:
    # Always shut the browser down, even if a page fails to load or parse,
    # so no orphaned browser/driver process is left behind.
    driver.quit()

result = pd.DataFrame(result, columns=['诗名', '诗词'])
print(result)
有个爬取《唐诗三百首》的任务。目标网站的内容是通过 script 动态加载的,普通的 requests.get 无法获取动态加载的内容,因此引入 selenium 库来模拟浏览器访问;但由于每个页面都需要通过浏览器打开,运行时间也大大增加。
标签:url,selenium,driver,学习,item,result,import From: https://www.cnblogs.com/cocotun/p/18085945