selenium爬虫
简介:
能够模拟浏览器运行
是自动化测试工具
pip install selenium安装selenium
针对不同的浏览器需要安装不同的驱动
确认浏览器的版本
https://www.cnblogs.com/aiyablog/articles/17948703
下载驱动
# @Author: Alice
# @Time: 2024/12/7 22:10
from selenium import webdriver
# Chrome browser: the chromedriver path is handed straight to webdriver.Chrome.
chromedriver_path = r"./chromedriver-win64/chromedriver.exe"
driver = webdriver.Chrome(executable_path=chromedriver_path)
# In Selenium 4 the webdriver.Chrome initializer no longer accepts the
# executable_path argument for specifying the Chrome WebDriver path.
selenium4之后不支持executable_path参数的解决办法:
# @Author: Alice
# @Time: 2024/12/7 22:16
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# The chromedriver location is wrapped in a Service object, which is then
# passed to webdriver.Chrome through the service keyword.
driver_path = './chromedriver-win64/chromedriver.exe'
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(service=service)
解决了驱动的问题,打开浏览器之后出现闪退的解决办法:
pip install selenium==4.5.0,通过降低当前selenium的版本来解决
详细解释代码:
# @Author: Alice
# @Time: 2024/12/7 22:16
from selenium import webdriver  # "selenium" is the chemical element; webdriver is the browser driver API
from selenium.webdriver.chrome.service import Service  # Service describes the driver executable

# Location of the Chrome driver binary.
CHROMEDRIVER_PATH = './chromedriver-win64/chromedriver.exe'

# The resulting object prints as, e.g.
# <selenium.webdriver.chrome.service.Service object at 0x000001E74A970F10>
service = Service(executable_path=CHROMEDRIVER_PATH)

# Launch Chrome; the resulting object prints as, e.g.
# <selenium.webdriver.chrome.webdriver.WebDriver (session="f1bf13922c749ba9b55dd155538b4d32")>
driver = webdriver.Chrome(service=service)
关闭浏览器
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

PAUSE_SECONDS = 2  # how long the window stays open before it is closed

# Start Chrome through the local chromedriver binary.
service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Wait briefly, then close the window.
time.sleep(PAUSE_SECONDS)
driver.close()
访问页面
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

HOME_PAGE = "https://www.baidu.com"

# Start Chrome through the local chromedriver binary.
service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Load the page, wait two seconds, then close the window.
driver.get(HOME_PAGE)
time.sleep(2)
driver.close()
设置浏览器的大小
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

url = "https://www.baidu.com"
WINDOW_WIDTH, WINDOW_HEIGHT = 900, 500  # explicit window size in pixels

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Maximize the window first and load the page.
driver.maximize_window()
driver.get(url)
time.sleep(2)

# Then switch to an explicit width and height.
driver.set_window_size(WINDOW_WIDTH, WINDOW_HEIGHT)
time.sleep(2)
driver.close()
前进和后退
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

baidu_url = "https://www.baidu.com"
jd_url = "https://www.jd.com"
PAUSE_SECONDS = 2

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

# First page, maximized.
driver.maximize_window()
driver.get(baidu_url)
time.sleep(PAUSE_SECONDS)

# Second page, in a smaller window, so there is history to navigate.
driver.set_window_size(900, 500)
driver.get(jd_url)

# Step back through the history, then forward again.
driver.back()
time.sleep(PAUSE_SECONDS)
driver.forward()
time.sleep(PAUSE_SECONDS)
driver.close()
获取页面基本信息
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service


def report_page(browser):
    """Print the title and the URL of the page currently loaded in ``browser``."""
    print(browser.title)
    print(browser.current_url)


baidu_url = "https://www.baidu.com"
jd_url = "https://www.jd.com"

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

# First page: maximize, load, report.
driver.maximize_window()
driver.get(baidu_url)
report_page(driver)
time.sleep(2)

# Second page: resize, load, report.
driver.set_window_size(900, 500)
driver.get(jd_url)
report_page(driver)

# Walk the history back and forward.
driver.back()
time.sleep(2)
driver.forward()

print(driver.name)  # browser name
time.sleep(2)
print(driver.page_source)  # HTML source of the current page
driver.close()
定位页面元素
id、class name、tag name、link text、XPath等等信息都可以进行定位
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By  # locator strategies for page elements


def report_page(browser):
    """Print the title and the URL of the page currently loaded in ``browser``."""
    print(browser.title)
    print(browser.current_url)


baidu_url = "https://www.baidu.com"
jd_url = "https://www.jd.com"

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

driver.maximize_window()
driver.get(baidu_url)
report_page(driver)
time.sleep(2)

driver.set_window_size(900, 500)
driver.get(jd_url)
report_page(driver)

driver.back()
time.sleep(2)
driver.forward()

print(driver.name)  # browser name
time.sleep(2)

# Locate one element by id and one by class name; each call yields a
# WebElement whose repr looks like
# <selenium.webdriver.remote.webelement.WebElement (session="...", element="...")>
search = driver.find_element(By.ID, 'key')
btn = driver.find_element(By.CLASS_NAME, 'button')
driver.close()
退出整个浏览器与关闭当前页面
# The two calls are listed side by side for comparison.
# NOTE(review): presumably alternatives rather than a sequence to run as-is - confirm.
driver.quit() # quit the whole browser
driver.close() # close the current page
截取当前浏览器页面,并且保存截图
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

PAGE_URL = "https://www.baidu.com"
SCREENSHOT_FILE = "baidu.png"

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Load the page, wait two seconds, then save a screenshot to a PNG file.
driver.get(PAGE_URL)
time.sleep(2)
driver.save_screenshot(SCREENSHOT_FILE)
driver.close()
可以使用selenium爬取的练手网站
https://www.gushiwen.cn/
输入数据
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

url = "https://www.baidu.com"
KEYWORD = "python爬虫"

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)
driver.get(url)

# Find the element with id "kw" and type the keyword into it.
search_box = driver.find_element(By.ID, 'kw')
time.sleep(2)
search_box.send_keys(KEYWORD)
time.sleep(2)
driver.close()
点击搜索
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

url = "https://www.gushiwen.cn/"
SUBMIT_XPATH = '//*[@id="search"]/form/input[3]'

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)
driver.get(url)

# Type the query into the element with id "txtKey".
search_box = driver.find_element(By.ID, 'txtKey')
time.sleep(2)
search_box.send_keys("长恨歌")
time.sleep(2)

# Click the search button.
driver.find_element(By.XPATH, SUBMIT_XPATH).click()
time.sleep(2)
driver.close()
获取所有指定元素,指定该元素的属性值
# @Author: Alice
# @Time: 2024/12/7 22:16
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
from selenium.webdriver.common.by import By

url = "https://www.gushiwen.cn/"
service = Service('./chromedriver-win64/chromedriver.exe')  # Chrome driver binary
driver = webdriver.Chrome(service=service)
try:
    driver.get(url=url)

    # Type the query into the element with id "txtKey" and submit it.
    search = driver.find_element(by=By.ID, value='txtKey')
    time.sleep(2)
    search.send_keys("长恨歌")
    time.sleep(2)
    submit = driver.find_element(By.XPATH, '//*[@id="search"]/form/input[3]')
    submit.click()  # click the search button
    time.sleep(2)

    # find_elements (plural) collects every <a> element; print each one's
    # href attribute.
    a_list = driver.find_elements(By.TAG_NAME, 'a')
    for a in a_list:
        print(a.get_attribute('href'))
finally:
    # Always end the session so no browser/driver process is left behind,
    # even when one of the lookups above raises.
    driver.quit()
获取超链接文本内容是指定文本的元素
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

url = "https://www.gushiwen.cn/"
LINK_LABEL = '唐诗三百'

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)
driver.get(url)

# The search box is looked up as in the earlier examples (result unused here).
search = driver.find_element(By.ID, 'txtKey')
time.sleep(2)

# Look up hyperlinks by the given link text and print the resulting list.
print(driver.find_elements(By.LINK_TEXT, LINK_LABEL))
driver.close()
模拟登录图片验证码网站,遇到验证码首先将验证码图片进行截图
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By  # element locators

url = "https://www.gushiwen.cn/"  # site to visit
username = 'xxxxxx'
userpass = 'xxxxxx'
MY_LINK_XPATH = '/html/body/div[1]/div[1]/div/div[2]/div/a[6]'

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)
driver.maximize_window()
driver.get(url)

# Click the "My" entry.
my_link = driver.find_element(By.XPATH, MY_LINK_XPATH)
my_link.click()
time.sleep(2)

# Fill in the user name.
email_box = driver.find_element(By.ID, 'email')
email_box.send_keys(username)
time.sleep(2)

# Fill in the password.
password_box = driver.find_element(By.ID, 'pwd')
password_box.send_keys(userpass)
time.sleep(2)

# Capture only the captcha image element and save it as code.png.
captcha_img = driver.find_element(By.ID, 'imgCode')
captcha_img.screenshot('code.png')
time.sleep(2)
driver.close()
使用打码平台处理图片登录验证码
打码平台提供的示例代码如果是python2的写法,需要把其中的print改成python3的输出方式(加上括号)
需要的东西:超级鹰的用户名,超级鹰的密码,超级鹰的软件ID,需要处理的图片路径,处理的验证码类型
# Uses the Chaojiying captcha-solving platform: https://www.chaojiying.com/
# @Author: Alice
# @Time: 2024/12/7 22:16
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By  # element locators
from chaojiying import run  # Chaojiying client

url = "https://www.gushiwen.cn/"  # site to visit
username = 'xxxxxxxxx'
userpass = 'xxxxxxxxx'
MY_LINK_XPATH = '/html/body/div[1]/div[1]/div/div[2]/div/a[6]'

service = Service(executable_path='./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)
driver.maximize_window()
driver.get(url)

# Click the "My" entry.
my_link = driver.find_element(By.XPATH, MY_LINK_XPATH)
my_link.click()
time.sleep(2)

# Fill in the credentials.
email_box = driver.find_element(By.ID, 'email')
email_box.send_keys(username)
time.sleep(2)
password_box = driver.find_element(By.ID, 'pwd')
password_box.send_keys(userpass)
time.sleep(2)

# Save the captcha image, then hand the file and the captcha type to run().
captcha_img = driver.find_element(By.ID, 'imgCode')
captcha_img.screenshot('code.png')
path = "./code.png"
img_id = "1004"
answer = run(path, img_id)  # per the original note the result is already a dict, no JSON parsing needed
pic_str = answer["pic_str"]
time.sleep(2)

# Type the captcha text and click the login button.
code_box = driver.find_element(By.ID, 'code')
code_box.send_keys(pic_str)
time.sleep(2)
login_button = driver.find_element(By.ID, 'denglu')
login_button.click()
time.sleep(2)
driver.close()
鼠标滚动,利用execute_script方法执行js代码
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
def scroll_windwo(driver, stop_lengt=None, step_length=2000, pause=0.5):
    """Scroll the page down by ``stop_lengt`` pixels in fixed-size steps.

    Scrolling is done by running ``window.scrollBy`` through
    ``driver.execute_script``. Every step but the last moves
    ``step_length`` pixels and is followed by a pause; the last step
    moves whatever distance is left and is not followed by a pause.

    Args:
        driver: Object exposing ``execute_script(script)``.
        stop_lengt: Total distance to scroll, in pixels. ``None`` or a
            non-positive value scrolls nothing.
        step_length: Distance of a single step, in pixels; must be positive.
        pause: Seconds to sleep between steps.

    Raises:
        ValueError: If ``step_length`` is not positive (the loop could
            never finish).
    """
    if stop_lengt is None or stop_lengt <= 0:
        return
    if step_length <= 0:
        raise ValueError("step_length must be positive")

    remaining = stop_lengt
    # Full steps while more than one step of distance is left.
    while remaining > step_length:
        print(f"当前的总路程为:{remaining}")
        driver.execute_script(f'window.scrollBy(0, {step_length})')
        remaining -= step_length
        time.sleep(pause)
    # Final (possibly shorter) step covers the rest.
    driver.execute_script(f'window.scrollBy(0, {remaining})')
url = "https://news.163.com/"
step_length = 2000  # distance of each scroll step (pixels)
stop_lengt = 30000  # total distance scrolled per round (pixels)
service = Service('./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)
try:
    driver.maximize_window()
    driver.get(url=url)
    for i in range(1, 6):
        # Scroll one full distance, then click the "more" link.
        scroll_windwo(driver, stop_lengt, step_length)
        more = driver.find_element(By.XPATH, '//*[@id="index2016_wrap"]/div[3]/div[2]/div[3]/div[2]/div[5]/div/a[3]')
        # The click is issued through JavaScript instead of WebElement.click().
        driver.execute_script('arguments[0].click();', more)
        print(f'第{i}次点击')
finally:
    # End the session even if the lookup or the click fails.
    driver.quit()
不显示浏览器,即:无头模式
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

url = "https://www.gushiwen.cn/"
opt = Options()
service = Service('./chromedriver-win64/chromedriver.exe')
opt.add_argument("--headless")  # headless mode: no browser window is shown
driver = webdriver.Chrome(service=service, options=opt)
try:
    driver.get(url=url)
    print(driver.page_source)
finally:
    # A headless browser has no window to close by hand, so end the
    # session explicitly to avoid leaving the process running.
    driver.quit()
使用打码平台登录之后获取cookie然后将cookie使用json模块序列化保存到本地
# @Author: Alice
# @Time: 2024/12/7 22:16
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By  # element locators
from chaojiying import run  # Chaojiying client

url = "https://www.gushiwen.cn/"  # site to visit
username = 'xxxxxxxx'
userpass = 'xxxxxxxx'
service = Service('./chromedriver-win64/chromedriver.exe')  # Chrome driver binary
driver = webdriver.Chrome(service=service)
try:
    driver.maximize_window()
    driver.get(url=url)

    # Click the "My" entry.
    driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/div/div[2]/div/a[6]').click()
    time.sleep(2)

    # Fill in the user name and the password.
    driver.find_element(By.ID, 'email').send_keys(username)
    time.sleep(2)
    driver.find_element(By.ID, 'pwd').send_keys(userpass)
    time.sleep(2)

    # Capture the captcha image as code.png and pass it to run().
    driver.find_element(By.ID, 'imgCode').screenshot('code.png')
    path = "./code.png"
    img_id = "1004"
    pic_str = run(path, img_id)["pic_str"]  # per the original note the result is already a dict
    time.sleep(2)

    # Type the captcha text and click the login button.
    driver.find_element(By.ID, 'code').send_keys(pic_str)
    time.sleep(2)
    driver.find_element(By.ID, 'denglu').click()
    time.sleep(2)

    cookies = driver.get_cookies()  # a list
finally:
    # End the session whether or not the login steps succeeded.
    driver.quit()

# Serialize the cookies as JSON into a local file.
with open('gsw_cookies.txt', 'w', encoding='utf-8') as f:
    json.dump(cookies, f)
使用本地的cookie进行登录
# @Author: Alice
# @Time: 2024/12/7 22:16
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# NOTE(review): the original note calls this the login page; the URL looks
# like a login-protected "collect" page - confirm.
url = "https://so.gushiwen.cn/user/collect.aspx"
service = Service('./chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)
try:
    driver.maximize_window()
    # First visit happens before any cookie is set, i.e. not logged in.
    driver.get(url=url)
    time.sleep(2)

    # Deserialize the saved cookie list and add each cookie to the session.
    with open('gsw_cookies.txt', 'r', encoding='utf-8') as f:
        cookies = json.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)

    driver.refresh()  # reload so the cookies take effect
    # Visit the page again; with the cookies in place it should now show
    # the logged-in state.
    driver.get(url)
    time.sleep(2)  # keep the result visible briefly, as the other examples do
finally:
    # Release the browser and driver process.
    driver.quit()
标签:webdriver,04selenium,入门,service,url,driver,爬虫,time,import
From: https://www.cnblogs.com/wumouhao/p/18593773
好了,以上就结束了,只是一个简单的使用,之后直接看官方文档吧,
没啥说的,又是无聊的一天。
最近太忙也不知道忙什么。。