1.处理cookie
为什么要处理cookie?
保存客户端的相关状态
在请求中携带cookie,在爬虫中如果遇到了cookie的反爬如何处理?
#手动处理
在抓包工具中捕获cookie,将其封装在headers中
#自动处理
使用session机制
使用场景:动态变化的cookie
session对象:该对象和requests模块用法几乎一致.如果在请求的过程中产生了cookie,如果该请求使用session发起的,则cookie会被自动存储到session中
爬取雪球网的数据
import requests
# A Session keeps the cookies set by earlier responses and sends them
# automatically on later requests made through the same object.
s = requests.Session()
main_url = "https://xueqiu.com"  # requested first so the session receives its cookies
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
}
# Query-string parameters of the hot-stock endpoint. They are handed to
# requests through `params=` instead of being duplicated inside the URL.
params = {
    "size": "8",
    '_type': "10",
    "type": "10",
}
# Step 1: visit the home page; the response cookies are stored in the session.
s.get(main_url, headers=headers)
# Step 2: call the JSON endpoint with the same session (cookies attached).
url = 'https://stock.xueqiu.com/v5/stock/hot_stock/list.json'
page_text = s.get(url, headers=headers, params=params).json()
print(page_text)
爬取中文网登录之后的数据
import requests
from lxml import etree
s =requests.Session()
url = "https://passport.china.com/logon"
data = {
"userName": "18873",
"password": "zbb521521"
}
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
"Referer": "https://passport.china.com/logon",
"Host": "passport.china.com",
"Origin": "https://passport.china.com"
}
s.post(url=url, data=data, headers=headers)
# cookies = res.cookies.get_dict()
res = s.get(
url="https://passport.china.com/main",
# cookies=cookies
)
page_text =res.text
tree = etree.HTML(page_text)
phone = tree.xpath('//*[@id="usernick"]/@title')
print(phone)
2.JS逆向
XX大学BBS论坛
import time
import hashlib
import requests
# 1.首页
res = requests.get(url="https://bbs.pku.edu.cn/v2/home.php")
cookie_dict = res.cookies.get_dict()
# 2.登录
# Login credentials and the request timestamp (whole seconds since the epoch).
user = "ugbjm"
pwd = "123123"
ctime = int(time.time())
# Signature input: password + username + timestamp + password.
data_string = "".join([pwd, user, str(ctime), pwd])
# MD5 hex digest of the UTF-8 encoded signature input.
obj = hashlib.md5(data_string.encode('utf-8'))
md5_string = obj.hexdigest()
res = requests.post(
url="https://bbs.pku.edu.cn/v2/ajax/login.php",
data={
"username": user,
"password": pwd,
"keepalive": "0",
"time": ctime,
"t": md5_string
},
cookies=cookie_dict
)
print(res.text)
3.验证码识别
基于Python的模块 ddddocr
可以实现对图片验证码的识别。
pip3.11 install ddddocr==1.4.9 -i https://mirrors.aliyun.com/pypi/simple/
pip3.11 install Pillow==9.5.0
本地识别
import ddddocr
with open("img.png",mode='rb') as f:
body = f.read()
ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(body)
print(code)
在线识别
import ddddocr
import requests
res = requests.get(url="https://console.zbox.filez.com/captcha/create/reg?_t=1701511836608")
ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(res.content)
print(code)
但是有些验证码并不是图片格式,而是base64编码的字符串,需要先解码才能识别
import base64
import ddddocr
content = base64.b64decode("iVBORw0KGgoAAAANSUhEUgAAAGQAAAAoCAYAAAAIeF9DAAAHGElEQVR4Xu2a2VNTZxTAHZ/62of+BX3rdPrUmaq1da3WQWur1mqntrQWLe7UkUoQlEWFqFDZZN8hUBWKQUVpQDCyVUeltVWIIiAEZHWBAEk4zffZe+bmS+6SEEzE/GbOkHvPuXeY85t7vyWZBV48ilnsCS/uxSvEw3hthJydXWITnsiMFyLWfLGcu5jRQuQ2W27dy8ArBOTXvQxmrBBHm+xo/XQhKkTffRu0NSfgt8KvISttGaQlfQyFOWtBXboDbt7Ig6HBdvYSj6C7awDC3kqGg4oC8N0YC6uWhcPCOUHgszQMtvudgsK8Gnj2dNTqGmeErH47Z8rBYlfI6MgAXLqwH5Lj50iGJzL//UDJWP1pBDTfasNrnBEyHdgIef6sF1R5X9o0Xig8Ebb5QrF8QajlLTDoMTIIVkLMZiOcKfrWquGXLyqgo70BDKNDNG8wDFleCTehsS4JivK/4l/uMWxco4T4WDUo30yH+zo9PH0yavn/x+nnuBg1LPhgP0qJjjzjuUJuNGWiiJSEedByt4KffiWxt9bIz65GISveO2CVczcoxGQah+y05SikqT6ZX/fKw4khkfNGEQpZNDeILXUrKKT13mWUkZmyBIxGA7/O5XT2PoL0c9kQcPIX+D5yK2w/vgcis6Oh5qYWJicnac2msM0YrmRw4BkK+cwyA3OEuLh4CApS0GhoaGTTNtTXN2B9fHwCm7YBhdRWR6GQK5rD/BqXU9FQCb4RflYN50d0fgwYxsemTUjJ6ToUEh6iYtOi1NXVY4MTEhLZtA2khquvr69n0zagEP5gfu/uBXruga4aykr8ISv1E0g/tRBUueug+o8I6NH/hTdwFO3tOhsB9iK5NN2lQsbHjZbJSR9kplbC4nkKKsNnySHoejTAlooyOjoKisAXDSYRN2cvW4Lo9XqsCwkJpddKgUJyM3xQSH9fC9RUHbWabbFBnigy63KEEcMIbI3agU0mrynN9WoYfDoIJrOJ/iXH5DwryFnYqS4XP26Kg86OPrZcklPvBIBKVYSNVqvL6Tl7qNVqrCsqKmbTdkEhaUkfYbOv1cbaCLAXNVVR/HtJQl5VXIO3Ru+E3sHHbAmFnCf56RKyb3cmPLjfw5bKgjS/tVWHjY6IiISkd22FmEwmmuPqdDodW2IXFJKaOB8bnZr4IRQXbITWlst01U6ehJGRfnpM1h58KY68vo4VxGKD1doXr0UhSH66hHCx/+dsePJkhL1EkiSLlJDdQdjs5mbbHjQ3N2NeqTyGExUpUAgZJ7gmny32BeOE/ffdhOX8adUmrK2qlD9L2RWzFxvc3adn01Z09XW7RAiH0WiCx73DoKm8DVt8E1DK+tVRTknRaDTY8MzMLDZNz3F5jaaKTQuCQgpy1mCTH3U08Wts6LSs3Lnawty1bFoQMr3lGjxhnGDTVpC8K4XwMZvNELwvF6XEnTjHlkgyPDwMCkUwbTj5S47l5KRAIefL9mCThZ4ODpLnasnsSy6eIoTQ/vAxClm36iibloXQUyD19IiBQhquJTgnJHkRmxbEna8slgnLNJgT4uxq3d44QYJ8FhtfxEAhXZ3XscmdHeIrULLZyNWq8tazaUHcMagL0XKvC4WQ70ucwXYmdZ/OprhjkiM1joBCJifNkJe5kjaZDuoCWyfsoO7I1Ncd0157jI1NwC7/FBQSGJDFlsiGv9YoLi6m6w3uuLy8nC2XxGq395+/S7HRZNqra6m0rC4HLYOgif4lx8X5G7AmOX4uDPTLm18T7C0Mq65fsSwIh/5fGA7R46ksDDd8oYSTx89BnfZfut1O9q1MJjPdfm970EO3Tcj2PH/6e7XmDnsb2bCrcRLcsV7v+FrHSgh5SviDu1RUrAyB1tlX+beQRO7WSUppBn7+LtyPvY0g/EbLicOH5K2gxeDvV3GRmJjElsnC
SgiBDNgV5wNtms8P8l3Jn41pluoXix0ixRExcjYXh58/weOflLvYWwjCNlwoyECennzJ8vTLW7CJQXZ9WSGNjeLjsBA2Qjja27RQeTEYcsJW0G2VjJTF9McO2toYwR83OCKFbL+nlWVBwK+BdDq87Zj19jvJc0ICE4PZywUhP3BQ/94E4QdU8MM3J+HzFZH0Bw5L5wfDGp8jsHdnBuRlVdNFoqswGAwQGnoQZZDPY2NjbJksBIUQHGkwhzPX2KPkShkKSTqbwqZnLIJCptLYqVxLaNd3gN/RbSik9paWLZmxuEVIcMohKL92EVo6dNA/PABGk5F+IdXW/RBOV5XA5iP+KMNfuRvGJ8bZW8xY3CKEHcTFouGO+L7aTMNjhWw+7P9avao4BIUQpBprDznXtPd0wJmqUjiSo4R98Qq6WPSN2EIXhBFZUXRAJ9Pe1xFRIQQ5DeZwpNaLfSSFEEijxZotlfciH1lCOLjGs+HFdTgkxMv08x9BPe61Ol73uQAAAABJRU5ErkJggg==")
# with open('x.png', mode='wb') as f:
# f.write(content)
ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(content)
print(code)
案例:x文街
import requests
import ddddocr
# 获得图片验证码地址
res = requests.post(url="https://api.ruanwen.la/api/auth/captcha/generate")
res_dict = res.json()
captcha_token = res_dict['data']['captcha_token']
captcha_url = res_dict['data']['src']
# 访问并获取图片验证码
res = requests.get(captcha_url)
# 识别验证码
ocr = ddddocr.DdddOcr(show_ad=False)
code = ocr.classification(res.content)
print(code)
# 登录认证
res = requests.post(
url="https://api.ruanwen.la/api/auth/authenticate",
json={
"mobile": "手机号",
"device": "pc",
"password": "密码",
"captcha_token": captcha_token,
"captcha": code,
"identity": "advertiser"
}
)
print(res.json())
10.selenium
概念
基于浏览器自动化的一个模块.
环境的安装:
下载selenium模块
pip install selenium
selenium和爬虫之间的关联是什么?
便捷的获取页面中动态加载的数据
requests模块进行数据爬取:可见非可得
selenium:可见即可得
实现模拟登录
基本操作:
谷歌驱动的下载:
114及之前版本: http://chromedriver.storage.googleapis.com/index.html
117/118/119版本: https://googlechromelabs.github.io/chrome-for-testing/
动作链
一系列的行为动作
无头浏览器
无可视化界面的浏览器
PhantomJS
1.处理反爬selenium
像淘宝很多网站都禁止selenium爬取
正常在浏览器输入window.navigator.webdriver返回的是undefined
用代码打开浏览器返回的是true
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
service = Service("driver/chromedriver.exe")
opt = webdriver.ChromeOptions()
opt.add_argument('--disable-infobars')
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(service=service, options=opt)
# Selenium在打开任何页面之前,先运行这个Js文件。
with open('driver/hide.js') as f:
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": f.read()})
driver.get('https://www.5xclass.cn')
time.sleep(2000)
driver.close()
2.g美茅台操作示例
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# 元素定位需要导入webdriver.common.by
from selenium.webdriver.common.by import By
from time import sleep
import datetime
# 防止自动关闭
# Chrome options: drop the "enable-automation" switch and keep the browser
# window open after the script finishes ("detach").
option = webdriver.ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option("detach", True)
# 1. Instantiate a browser object.
service = Service(executable_path='chromedriver.exe')
bro = webdriver.Chrome(service=service, options=option)
# 2. Open the reservation page.
url = 'https://myhome.gome.com.cn/member/myReservation'
bro.get(url)
# 3. Maximize the browser window.
bro.maximize_window()
# Give the page time to finish loading.
time.sleep(2)
# Click the reservation button repeatedly from just after 19:59:50 until
# 20:00:30. Zero-padded "%H:%M:%S" strings sort in the same order as the times
# they represent, so plain string comparison is sufficient here.
while True:
    now = datetime.datetime.now().strftime("%H:%M:%S")
    if now > "19:59:50":
        submit = bro.find_element(By.CLASS_NAME, "appointmentBtn")
        submit.click()
    else:
        # Outside the click window: pause briefly instead of spinning the CPU.
        sleep(0.2)
    # `>=` rather than `==`: if one iteration takes longer than a second the
    # exact string "20:00:30" is never observed and the loop would never end.
    if now >= "20:00:30":
        break
sleep(100000000)
bro.close()
3.执行JS代码
比如说点击下拉菜单
浏览器console中执行
document.querySelector(".area-code-select").children[2].click()
代码中执行 点击按钮
bro.execute_script('document.querySelector(".area-code-select").children[2].click()')
获取cookie
# get_cookie(name) needs a cookie name; get_cookies() returns every cookie of the current page.
bro.get_cookies()
# Alternative: read the cookie string that page JavaScript can see.
bro.execute_script('return document.cookie')
获取title
bro.execute_script('return document.title')
4.等待
如果页面加载比较慢,需要等待某个元素加载成功后,再执行某些操作。
示例1:基于lambda表达式
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get('https://passport.bilibili.com/login')
# ############# 方式1:点击短信登录 #############
time.sleep(3)
sms_btn = driver.find_element(
By.XPATH,
'//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
)
sms_btn.click()
# ############# 方式2:点击短信登录(推荐) #############
sms_btn = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
By.XPATH,
'//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
))
sms_btn.click()
示例2:自定义函数
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get('https://passport.bilibili.com/login')
def func(dv):
    """Wait condition handed to ``WebDriverWait(...).until``.

    Looks up the target tag on ``dv`` and returns it only when its ``xxx``
    attribute has a truthy value; otherwise returns None.
    """
    print("无返回值,则间隔0.5s执行一次此函数;如有返回值,则复制给sms_btn变量")
    # Example markup this condition is written for:
    #   <div xxx="123" id="uuu"></div>
    #   <img src="..."/>
    locator = '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
    tag = dv.find_element(By.XPATH, locator)
    if not tag.get_attribute("xxx"):
        return None
    return tag
sms_btn = WebDriverWait(driver, 30, 0.5).until(func)
sms_btn.click()
time.sleep(250)
driver.close()
示例3:全局配置
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
# 后续找元素时,没找到时则最多等待30秒去寻找(一旦找到则继续)
driver.implicitly_wait(30)
driver.get('https://passport.bilibili.com/login')
sms_btn = driver.find_element(
By.XPATH,
# '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
'//*[@id="xxxxxxxxxapp"]/div[2]/div[2]/div[3]/div[1]/div[3]'
)
sms_btn.click()
print("找到了")
time.sleep(250)
driver.close()
5.获取值
例如:<input type='text' value="?" placeholder="?" />
例如:<select ><option value='1'>北京</option> <option value='2'>上海</option> </select>
,获取select标签的value属性
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.implicitly_wait(10)
driver.get('https://www.bilibili.com/')
time.sleep(10)
tag = driver.find_element(
By.XPATH,
'//*[@id="nav-searchform"]/div[1]/input'
)
print(tag)
#print(tag.text) #获取文本
print(tag.get_attribute("placeholder")) #获取属性
print(tag.get_attribute("value")) #获取值
time.sleep(1000)
driver.close()
如果是单选框,需要先找到所有标签,查看是否被选中,判断之后再执行代码
<input type="radio" name="findcar" value="1" checked="">新车
<input type="radio" name="findcar" value="2">二手机
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.implicitly_wait(10)
driver.get('https://www.autohome.com.cn/beijing/')
# ############### 1.单独找到每一个 ###############
tag = driver.find_element(
By.XPATH,
'/html/body/div[1]/div[11]/div[2]/div[1]/div[1]/label[1]/span/input'
)
print(tag.get_property("checked")) # True
tag = driver.find_element(
By.XPATH,
'/html/body/div[1]/div[11]/div[2]/div[1]/div[1]/label[2]/span/input'
)
print(tag.get_property("checked")) # False
# ############### 2.循环找到每一个 ###############
parent = driver.find_element(
By.XPATH,
'/html/body/div[1]/div[11]/div[2]/div[1]/div[1]'
)
tag_list = parent.find_elements(
By.XPATH,
'label/span/input'
)
for tag in tag_list:
print( tag.get_property("checked"), tag.get_attribute("value") )
driver.close()
6.Xpath
获取页面的html
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
service = Service("chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.implicitly_wait(10)
driver.get('https://car.yiche.com/')
# Take the HTML as rendered by the browser and parse it once with lxml,
# so the extraction below runs on the parsed tree rather than through WebDriver.
html_string = driver.page_source
tree = etree.HTML(html_string)
div_list = tree.xpath("/html/body/div[7]/div[1]/div[2]/div/div[1]/div")
for i in div_list:
    # All text nodes below the <div> children of this entry's <a>.
    zz = i.xpath('./a/div//text()')
    print(zz)
driver.close()
7.IP代理
普通代理
import requests
#请求地址
targetUrl = "https://myip.ipip.net/"
# Proxy server address.
proxyHost = "140.250.91.127"
proxyPort = "44125"
proxyMeta = f"http://{proxyHost}:{proxyPort}"
# requests selects the proxy by the scheme of the target URL. Both schemes are
# mapped so that an http:// target does not silently bypass the proxy.
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}
resp = requests.get(targetUrl, proxies=proxies)
print(resp.status_code)
print(resp.text)
selenium代理
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# 换成自己生成的代理
service = Service("chromedriver.exe")
opt = webdriver.ChromeOptions()
opt.add_argument(f'--proxy-server=220.187.41.3:44121') # 代理
driver = webdriver.Chrome(service=service, options=opt)
driver.get('https://myip.ipip.net/')
time.sleep(2000)
driver.close()
8.携带Cookie
driver.add_cookie({'name': 'foo', 'value': 'bar'})
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
# 注意:一定要先访问,不然Cookie无法生效
driver.get('https://dig.chouti.com/about')
# 加cookie
driver.add_cookie({
'name': 'token',
'value': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiJjZHVfNDU3OTI2NDUxNTUiLCJleHBpcmUiOiIxNzA0MzI5NDY5OTMyIn0.8n_tWcEHXsBSXWIY9rBoGWwaLPF8iWIruryhKTe5_ks'
})
# 再访问
driver.get('https://dig.chouti.com/')
time.sleep(2000)
driver.close()
9.无头和其他
如果不想显示展示在浏览器上的操作,只想偷偷的在后台运行。
opt.add_argument('--headless')
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
service = Service("driver/chromedriver.exe")
opt = webdriver.ChromeOptions()
opt.add_argument('--headless')
driver = webdriver.Chrome(service=service, options=opt)
driver.get('https://www.5xclass.cn')
tag = driver.find_element(
By.XPATH,
'/html/body/div/div[2]/div/div[2]/div/div[2]/div[2]/div/div/div/div/div[2]/a[1]'
)
print(tag.text)
print(tag.get_attribute("target"))
print(tag.get_attribute("data-toggle"))
driver.close()
```
10.其他配置
# Commonly used Chrome switches. All are written in the canonical "--name" or
# "--name=value" form.
opt.add_argument('--disable-infobars')  # hide the infobar
opt.add_argument('--no-sandbox')  # works around the "DevToolsActivePort file doesn't exist" error
opt.add_argument('--window-size=1920,3000')  # browser window size, written as "width,height"
opt.add_argument('--disable-gpu')  # per the original note: recommended by Google's docs to avoid a bug
opt.add_argument('--incognito')  # incognito (private) mode
opt.add_argument('--disable-javascript')  # disable JavaScript
opt.add_argument('--start-maximized')  # start maximized; per the original note, locating elements may fail without it
opt.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
opt.add_argument('--lang=en-US')  # UI language
opt.add_argument('--blink-settings=imagesEnabled=false')  # do not load images, for speed
opt.add_argument('--user-agent=Mozilla/5.0 (Linux; U; Androi....')  # override the User-Agent
opt.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # explicit path of the browser binary to use
11.截屏
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get('https://www.5xclass.cn')
tag = driver.find_element(
By.XPATH,
'/html/body/div/div[2]/div/div[2]/div/div[2]'
)
# 截图&保存
tag.screenshot("demo.png")
# 截图&图片内容
body = tag.screenshot_as_png
print(body)
# 截图&Base64编码格式图片内容
b64_body = tag.screenshot_as_base64
print(b64_body)
driver.close()
12.X东搜索
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
# 换成自己生成的代理
res = requests.get(url="http://www.zhiliandaili.com/api3/getapi2?linePoolIndex=-1&pme=1&port=1&format=json&ss=5&ipport=1&dt=0&isp=0&ct=0&uid=42342&usertype=17")
proxy_string = res.json()['data'][0]['ip']
print(f"获取代理:{proxy_string}")
service = Service("chromedriver.exe")
opt = webdriver.ChromeOptions()
opt.add_argument(f'--proxy-server={proxy_string}') # 代理
opt.add_argument('blink-settings=imagesEnabled=false') # 不加载图片
#不显示自动化控制
opt.add_argument('--disable-infobars')
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(service=service, options=opt)
#等待页面加载完
driver.implicitly_wait(10)
with open('hide.js') as f:
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": f.read()})
# 1.打开京东
driver.get('https://www.jd.com/')
# 2.搜索框+输入
tag = driver.find_element(
By.XPATH,
'//*[@id="key"]'
)
tag.send_keys("iphone手机")
# 3.点击搜索
tag = driver.find_element(
By.XPATH,
'//*[@id="search"]/div/div[2]/button'
)
tag.click()
# 4.查询列表
tag_list = driver.find_elements(
By.XPATH,
'//*[@id="J_goodsList"]/ul/li'
)
for tag in tag_list:
title = tag.find_element(By.XPATH, 'div/div[@class="p-name p-name-type-2"]/a/em').text
print(title)
driver.close()
13.大麦网
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
# 换成自己生成的代理
res = requests.get(url="http://www.zhiliandaili.com/api3/getapi2?linePoolIndex=-1&packid=2&qty=1&time=1&port=1&format=json&ss=5&ipport=1&dt=0&isp=0&ct=0&uid=42342&usertype=17")
proxy_string = res.json()['data'][0]['ip']
print(f"获取代理:{proxy_string}")
service = Service("chromedriver.exe")
opt = webdriver.ChromeOptions()
opt.add_argument(f'--proxy-server={proxy_string}') # 代理
opt.add_argument('blink-settings=imagesEnabled=false') # 不加载图片
#不显示自动化控制
opt.add_argument('--disable-infobars')
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(service=service, options=opt)
#等待页面加载完
driver.implicitly_wait(10)
with open('hide.js') as f:
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": f.read()})
# 1.打开大麦网
driver.get('https://www.damai.cn/')
# 2.搜索框+输入
tag = driver.find_element(
By.XPATH,
'//input[@class="input-search"]'
)
tag.send_keys("周杰伦")
# 3.点击搜索
tag = driver.find_element(
By.XPATH,
'//div[@class="btn-search"]'
)
tag.click()
# 4.查询列表
tag_list = driver.find_elements(
By.XPATH,
'//div[@class="search__itemlist"]//div[@class="items"]'
)
for tag in tag_list:
title = tag.find_element(By.XPATH, 'div[@class="items__txt"]/div[1]/a').text
print(title)
driver.close()
14.滑块验证
基于selenium实现过滑块验证核心需要三步:
- 获取验证码图片
import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
service = Service("chromedriver.exe")
driver = webdriver.Chrome(service=service)
# 1.打开首页
driver.get('https://www.geetest.com/adaptive-captcha-demo')
# 2.点击【滑动拼图验证】
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
By.XPATH,
'//*[@id="gt-showZh-mobile"]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[3]'
))
tag.click()
# 3.点击开始验证
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
By.CLASS_NAME,
'geetest_btn_click'
))
tag.click()
# 4.读取背景图片
def fetch_bg_func(dv):
    """Wait condition: return the background image URL once it is present.

    Finds the element with class ``geetest_bg`` on ``dv`` and extracts the
    URL from a ``url("...");`` entry of its ``style`` attribute.

    Returns:
        The captured URL string, or None when the style holds no such entry
        (a falsy result makes ``WebDriverWait.until`` poll again).
    """
    tag_object = dv.find_element(By.CLASS_NAME, 'geetest_bg')
    # get_attribute may return None when the attribute is absent; fall back to
    # an empty string so re.findall does not raise TypeError.
    style_string = tag_object.get_attribute("style") or ""
    # Raw string: "\(" is not a valid Python string escape and triggers a
    # SyntaxWarning on recent Python versions when written in a plain literal.
    match_list = re.findall(r'url\("(.*)"\);', style_string)  # ["http..."] or []
    if match_list:
        return match_list[0]
    return None
bg_image_url = WebDriverWait(driver, 30, 0.5).until(fetch_bg_func) # 新的函数 = 某个函数('geetest_bg')
print("背景图:", bg_image_url)
# 4.读取缺口图片
def fetch_slice_func(dv):
    """Wait condition: return the slice (puzzle piece) image URL once it is present.

    Finds the element with class ``geetest_slice_bg`` on ``dv`` and extracts
    the URL from a ``url("...");`` entry of its ``style`` attribute.

    Returns:
        The captured URL string, or None when the style holds no such entry
        (a falsy result makes ``WebDriverWait.until`` poll again).
    """
    tag_object = dv.find_element(By.CLASS_NAME, 'geetest_slice_bg')
    # get_attribute may return None when the attribute is absent; fall back to
    # an empty string so re.findall does not raise TypeError.
    style_string = tag_object.get_attribute("style") or ""
    # Raw string: "\(" is not a valid Python string escape and triggers a
    # SyntaxWarning on recent Python versions when written in a plain literal.
    match_list = re.findall(r'url\("(.*)"\);', style_string)
    if match_list:
        return match_list[0]
    return None
slice_image_url = WebDriverWait(driver, 30, 0.5).until(fetch_slice_func) # 新的函数 = 某个函数('geetest_slice_bg')
print("缺口图:", slice_image_url)
time.sleep(2000)
driver.close()
- 识别图片,计算轨迹距离
方式1ddddocr
import ddddocr
import requests
slice_bytes = requests.get("缺口图片地址").content
bg_bytes = requests.get("背景图片地址").content
slide = ddddocr.DdddOcr(det=False, ocr=False, show_ad=False)
res = slide.slide_match(slice_bytes, bg_bytes, simple_target=True)
x1, y1, x2, y2 = res['target']
print(x1, y1, x2, y2) # 114 45 194 125
方式2opencv
import cv2
import numpy as np
import requests
def get_distance(bg_bytes, slice_bytes):
    """Return the x coordinate at which the slice image best matches the background.

    Both arguments are encoded image bytes. Each image is decoded, converted
    to grayscale, reduced to Canny edges and converted to a 3-channel image;
    the slice is then template-matched against the background and the x of
    the highest-scoring location is returned.
    """
    def _to_gray(raw):
        # Decode the byte buffer as a colour image, then drop to grayscale.
        decoded = cv2.imdecode(np.frombuffer(raw, dtype=np.uint8), 1)
        return cv2.cvtColor(decoded, cv2.COLOR_BGR2GRAY)

    def _edges_as_rgb(gray):
        # Edge detection followed by conversion back to three channels.
        return cv2.cvtColor(cv2.Canny(gray, 255, 255), cv2.COLOR_GRAY2RGB)

    background = _edges_as_rgb(_to_gray(bg_bytes))
    piece = _edges_as_rgb(_to_gray(slice_bytes))
    scores = cv2.matchTemplate(background, piece, cv2.TM_CCOEFF_NORMED)
    # minMaxLoc returns (min_val, max_val, min_loc, max_loc); only max_loc is needed.
    _, _, _, best_loc = cv2.minMaxLoc(scores)
    return best_loc[0]
slice_bytes = requests.get("缺口图片地址").content
bg_bytes = requests.get("背景图片地址").content
distance = get_distance(bg_bytes, slice_bytes)
print(distance)
方式三打码平台识别
http://www.ttshitu.com/
- 寻找滑块,控制滑动
from selenium.webdriver import ActionChains
tag = driver.find_element(By.CLASS_NAME, 'geetest_btn')
ActionChains(driver).click_and_hold(tag).perform() # 点击并抓住标签 不放开
ActionChains(driver).move_by_offset(xoffset=114, yoffset=0).perform() # 向右滑动114像素(向左是负数)
ActionChains(driver).release().perform() # 释放
案例
import re
import time
import ddddocr
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ActionChains
service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
# 1.打开首页
driver.get('https://www.geetest.com/adaptive-captcha-demo')
# 2.点击【滑动拼图验证】
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
By.XPATH,
'//*[@id="gt-showZh-mobile"]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[3]'
))
tag.click()
# 3.点击开始验证
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
By.CLASS_NAME,
'geetest_btn_click'
))
tag.click()
# 4.读取背景图片
def fetch_image_func(class_name):
    """Build a wait condition that extracts an image URL from an element's style.

    Args:
        class_name: CSS class of the element whose ``style`` attribute holds
            a ``url("...");`` entry.

    Returns:
        A function taking the driver; it returns the captured URL string, or
        None when the style holds no such entry (a falsy result makes
        ``WebDriverWait.until`` poll again).
    """
    def inner(dv):
        tag_object = dv.find_element(By.CLASS_NAME, class_name)
        # get_attribute may return None when the attribute is absent; fall
        # back to an empty string so re.findall does not raise TypeError.
        style_string = tag_object.get_attribute("style") or ""
        # Raw string: "\(" is not a valid Python string escape and triggers a
        # SyntaxWarning on recent Python versions when written in a plain literal.
        match_list = re.findall(r'url\("(.*)"\);', style_string)
        if match_list:
            return match_list[0]
        return None
    return inner
bg_image_url = WebDriverWait(driver, 30, 0.5).until(fetch_image_func("geetest_bg"))
slice_image_url = WebDriverWait(driver, 30, 0.5).until(fetch_image_func("geetest_slice_bg"))
slice_bytes = requests.get(slice_image_url).content
bg_bytes = requests.get(bg_image_url).content
slide = ddddocr.DdddOcr(det=False, ocr=False, show_ad=False)
res = slide.slide_match(slice_bytes, bg_bytes, simple_target=True)
x1, y1, x2, y2 = res['target']
print("滑动距离",x1)
def show_func(dv):
    """Wait condition: return the slider button once the captcha box style contains "block".

    Returns None while the ``geetest_box`` element's ``style`` attribute does
    not contain "block"; otherwise pauses two seconds and returns the
    ``geetest_btn`` element.
    """
    box_style = dv.find_element(By.CLASS_NAME, "geetest_box").get_attribute("style")
    if "block" not in box_style:
        return None
    # Fixed pause after the box is shown, before handing back the button.
    time.sleep(2)
    return dv.find_element(By.CLASS_NAME, 'geetest_btn')
btn_tag = WebDriverWait(driver, 30, 0.5).until(show_func)
ActionChains(driver).click_and_hold(btn_tag).perform() # 点击并抓住标签
ActionChains(driver).move_by_offset(xoffset=x1, yoffset=0).perform() # 向右滑动114像素(向左是负数)
ActionChains(driver).release().perform()
time.sleep(2000)
driver.close()
15.中文点击
ActionChains(driver).move_to_element_with_offset(标签对象, xoffset=x, yoffset=y).click().perform() #注意因为是相对位置,所以(0.0)是图片的中心
封装第三方平台方法
# http://www.ttshitu.com/
import base64
import json
import requests
def base64_api(uname, pwd, img, typeid):
    """Send an image file to the ttshitu recognition API and return its answer.

    Args:
        uname: Account user name for the service.
        pwd: Account password for the service.
        img: Path of the image file to recognize.
        typeid: Recognition type identifier forwarded to the API.

    Returns:
        ``result["data"]["result"]`` when the response's ``success`` field is
        truthy; otherwise the response's ``message`` field.
    """
    with open(img, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode()
    data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
    result = json.loads(requests.post("http://api.ttshitu.com/predict", json=data).text)
    if result['success']:
        return result["data"]["result"]
    # NOTE: on failure (e.g. not enough human workers) the error message is
    # returned in place of a result; callers should detect this and retry
    # rather than parse it as coordinates.
    return result["message"]
def www(img_path, id):
    """Recognize the image at ``img_path`` via ``base64_api`` with the fixed account, using type ``id``."""
    return base64_api(uname='5483', pwd='Z52121', img=img_path, typeid=id)
实现
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from tujie import www
from selenium.webdriver import ActionChains
service = Service("chromedriver.exe")
driver = webdriver.Chrome(service=service)
#############################################################################################获取图片
# 1.打开首页
driver.get('https://www.geetest.com/adaptive-captcha-demo')
# 2.点击【滑动拼图验证】
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
By.XPATH,
'//*[@id="gt-showZh-mobile"]/div/section/div/div[2]/div[1]/div[2]/div[3]/div[4]'
))
tag.click()
# 3.点击开始验证
tag = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
By.CLASS_NAME,
'geetest_btn_click'
))
tag.click()
# 4.等待验证码出来
time.sleep(5)
#剪切图片
target_tag = driver.find_element(
By.CLASS_NAME,
'geetest_box'
)
target_tag.screenshot("geetest_box.png")
# 5.识别任务图片
rzz = www("geetest_box.png",20).split("|")
print(rzz)
#6.点击
for group in rzz:
time.sleep(1)
x, y = group.split(',')
x = int(x) - int(target_tag.size['width'] / 2)
y = int(y) - int(target_tag.size['height'] / 2) +50 #因为是截取的整张图片所以这里根据图片间距调整+50
ActionChains(driver).move_to_element_with_offset(target_tag, xoffset=x, yoffset=y).click().perform()
time.sleep(10)
driver.close()
标签:webdriver,进阶,service,driver,爬虫,tag,import,new,div
From: https://www.cnblogs.com/wdyjx/p/18054158