通过selenium框架爬取图片
环境搭建
搭建python环境就不在此展开了, 首先安装selenium框架:
pip install selenium
下载浏览器驱动, 此处仅仅以Chrome浏览器为例子, 驱动版本必须与当前系统安装的Chrome对应。
chromedriver下载,如果找不到对应版本, 是因为版本太新, 需要到此链接找到对应版本。
版本对应关系在链接中存在说明:
基本过程
首先将下载的chromedriver放置到环境变量:
current_script_path = os.path.abspath(os.path.dirname(__file__))
original_path = os.environ.get("PATH")
os.environ["PATH"] = original_path +";" + os.path.join(current_script_path,"chromedriver-win64","chromedriver.exe")
接下来启动浏览器驱动:
from selenium import webdriver
driver = webdriver.Chrome()
# ....
driver.get(search_link)
查找指定元素
如果要找到指定元素, 可通过xpath:
from selenium.webdriver.common.by import By
first_link = driver.find_element(By.XPATH, '//ul[@class="dgControl_list "][1]//li[1]//a[1]')
点击指定元素
如果要点击指定元素:
first_link = driver.find_element(By.XPATH, '//ul[@class="dgControl_list "][1]//li[1]//a[1]')
first_link.click()
等待元素出现
如果要等待某元素出现:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
image = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@id="mainImageWindow"]//img')))
获取元素属性
image = driver.find_element(By.XPATH, '//div[@id="mainImageWindow"]//img')
src = image.get_attribute("src")
关闭驱动
driver.close()
完整示例
该示例展示了如何从bing爬取指定关键字的图片:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_image_url_from_keyword(keyword, num_of_image):
search_link = f"https://cn.bing.com/images/search?q={keyword}"
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
current_image_amount = 0
urls = []
try:
driver.get(search_link)
first_link = driver.find_element(By.XPATH, '//ul[@class="dgControl_list "][1]//li[1]//a[1]')
# root = etree.HTML(driver.page_source)
# first_link = root.xpath('//ul[@class="dgControl_list "][1]//li[1]//a[1]')
first_link.click()
while True:
driver.refresh()
image = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@id="mainImageWindow"]//img')))
src = image.get_attribute("src")
urls.append(src)
current_image_amount+=1
if current_image_amount >= num_of_image:
break
next = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@id="mainImageContainer"]/div[@id="navr"]')))
next.click()
driver.implicitly_wait(0.5)
sys.stdout.write('\rprogress: {}/{}'.format(current_image_amount, num_of_image))
sys.stdout.flush()
except Exception as e:
print(e)
driver.close()
return set(urls)
if __name__ == "__main__":
key_words = ["美女", "帅哥"]
for key in key_words:
# 最多爬取500张截止
urls = list(get_image_url_from_keyword(key, 500))
标签:webdriver,框架,selenium,image,driver,爬取,link,import
From: https://www.cnblogs.com/quenwaz/p/18118754