标签：bs4 res selenium find bro time import 选择器

1. bs4搜索文档树

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title">asdfasdf<b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# html_doc is the markup to parse (note it has no closing </body></html> tags);
# 'lxml' names the parser to use
soup = BeautifulSoup(html_doc, 'lxml')

# Searching the document tree: find returns one match, find_all returns all matches

# Five kinds of search filter: string, regular expression, list, True, function

1.1 字符串:可以按照标签名，属性名查找

# String filters: match by tag name and/or attribute value.
# Alternative lookups (uncomment one to try it):
# res=soup.find(name='a',id='link2')
# res=soup.find(href='http://example.com/tillie')
# res=soup.find(class_='story')
# res=soup.body.find('p')
# res=soup.body.find(string='Elsie')
# Attributes can also be given as a dict through attrs
res=soup.find(attrs={'class':'sister'})
print(res)

1.2 正则表达式标签名，属性可以使用正则匹配

import re

# Regular-expression filters: a compiled pattern can stand in for a tag name
# or an attribute value.
# res=soup.find_all(name=re.compile('^b'))
# res=soup.find_all(href=re.compile('^http'))
# for item in res:
#     url=item.attrs.get('href')
#     print(url)
# (the requests-html library can also list every link address on a page)

# The pattern is tested against the attribute value. Every href in html_doc
# starts with 'http', so the former '^a' pattern could never match and find()
# always returned None; '^http' actually demonstrates a regex match.
res=soup.find(attrs={'href':re.compile('^http')})

print(res)

1.3 列表标签名，属性名等于列表或条件

# List filters: a tag matches when it matches ANY item of the list (logical OR)
# res=soup.find_all(class_=['story','sister'])  # class is 'story' OR 'sister'
res=soup.find_all(name=['a','p'])  # tag name is 'a' OR 'p'
print(res)

1.4 True 标签名，属性名等于布尔

res = soup.find_all(name=True)  # every tag that has a tag name, i.e. all tags
print(res)

# Extract every image on the page: src=True keeps the tags that carry a src attribute
# res = soup.find_all(src=True)
# for item in res:
#     url = item.attrs.get('src')  # read src (the attribute being filtered on), not href
#     print(url)

1.5 方法标签名或属性名 = 方法

# Function filters: pass a function that takes a tag and returns True/False;
# find_all keeps the tags for which it returns True.
def has_class_but_no_id(tag):
    """Return True for tags that define a class attribute but no id attribute."""
    return tag.has_attr('class') and not tag.has_attr('id')


res = soup.find_all(has_class_but_no_id)
print(res)

# Extract every image on the page (True filter on the src attribute)
# res = soup.find_all(src=True)
# for item in res:
#     url = item.attrs.get('src')
#     print(url)

1.6 find_all的其他属性 limit recursive:False,只找一层

# limit caps how many matches are returned; find() is essentially find_all() with limit=1
res=soup.find_all(name='a',limit=2)
print(res)  # show this result too; it used to be overwritten below without being printed

# recursive=False only inspects direct children of the tag being searched.
# NOTE(review): id=False is expected to select a <p> that has no id attribute - confirm
# against the installed bs4 version.
res=soup.body.find(name='p',id=False).find_all(name='a',recursive=False)

print(res)

2. css选择器

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title">asdfasdf<b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
# select() takes a CSS selector; alternatives to try:
# res=soup.select('a')
# res=soup.select('#link1')
# res=soup.select('.sister')
# res=soup.select('body>p>a')
# Author's note: CSS selector knowledge carries over to other tools - most
# parsers (bs4, lxml, ...) support CSS and/or XPath selection


# Positional pseudo-classes:
# res=soup.select('body>p>a:nth-child(2)')
# res=soup.select('body>p>a:nth-last-child(1)')

# Attribute selector: [attribute=value]
res = soup.select('a[href="http://example.com/tillie"]')
print(res)

'''
记住的：
    1  标签名
    2  .类名
    3  #id号
    4 body a   body下子子孙孙中的a
    5 body>a  body下直接子级的a，不包含孙级
    6 其他的参照css选择器
'''

3. selenium基本使用

1. requests 发送http请求获取数据，获取到的数据是html/xml时使用bs4解析，解析出咱们想要的数据
    使用requests获取回来的数据，跟直接在浏览器中看到的数据，可能不一样
    requests不能执行js
    如果使用requests，需要分析当次访问一共发出了多少请求，每个都要发送一次，才能拼凑出网页完整的数据

2. selenium 操作浏览器，控制浏览器，模拟人的行为
   selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题
   selenium本质是通过驱动浏览器，完全模拟浏览器的操作，比如跳转、输入、点击、下拉等，来拿到网页渲染之后的结果，可支持多种浏览器

3. 使用
   安装模块：pip install selenium
   下载浏览器驱动：selenium操作浏览器，需要有浏览器(谷歌浏览器)，谷歌浏览器驱动
       https://registry.npmmirror.com/binary.html?path=chromedriver/
       浏览器版本对应的驱动
        106.0.5249.119    找到相应的驱动

3.1 快速使用，自动打开浏览器

from selenium import webdriver
import time
# If the driver is on the PATH environment variable, executable_path need not be passed
# NOTE(review): newer Selenium 4 releases replace executable_path with a Service
# object - confirm against the installed version
# Open a browser

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# Navigate to the site, as if typing it into the address bar
bro.get('http://www.baidu.com')

time.sleep(3)
bro.close() # close the current tab
# bro.quit() # quit the whole browser

4. 无界面浏览器

# 做爬虫，不希望有一个浏览器打开，谷歌支持无头浏览器，后台运行，没有浏览器的图形化（GUI）界面

from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options


# If the driver is on the PATH environment variable, executable_path need not be passed
# Open a browser
chrome_options = Options()
# chrome_options.add_argument('window-size=1920x3000')  # set the browser resolution
# chrome_options.add_argument('--disable-gpu')  # per the author, Google's docs suggest this flag to work around a bug
# chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
# chrome_options.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed
chrome_options.add_argument('--headless')  # run without a visible window; per the author, startup fails on Linux systems without a display unless this is set
# chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # manually choose the browser binary

bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)

# Navigate to the site
bro.get('https://www.jd.com/')
print(bro.page_source) # the page content as the browser sees it

time.sleep(3)
bro.close() # close the tab
bro.quit()  # quit the browser

5. selenium其它用法

5.1 自动登录百度

# Opens the Baidu home page, fills in the account and password, and submits the login form
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('http://www.baidu.com')
bro.implicitly_wait(10) # implicit wait: when an element is not loaded yet, keep looking for a while
# By is a class built into selenium that names the element-locating strategies
bro.maximize_window() # maximize the window
# Locate an <a> tag by its link text
# selenium locates elements through the find_element method.
a = bro.find_element(by=By.LINK_TEXT, value='登录')
# Click the element
a.click()

# ids are unique within a page, so prefer locating by id when one exists
input_name = bro.find_element(by=By.ID, value='TANGRAM__PSP_11__userName')
# Type the user name
input_name.send_keys('123@qq.com')

time.sleep(1)

input_password = bro.find_element(by=By.ID,value='TANGRAM__PSP_11__password')
input_password.send_keys('12345')
time.sleep(1)

input_submit = bro.find_element(by=By.ID,value='TANGRAM__PSP_11__submit')
# Click the submit button
input_submit.click()
time.sleep(5)
bro.close()

5.2 获取位置属性大小，文本

# 查找标签
bro.find_element(by=By.ID,value='id号')
bro.find_element(by=By.LINK_TEXT,value='a标签文本内容')
bro.find_element(by=By.PARTIAL_LINK_TEXT,value='a标签文本内容模糊匹配')
bro.find_element(by=By.CLASS_NAME,value='类名')
bro.find_element(by=By.TAG_NAME,value='标签名')
bro.find_element(by=By.NAME,value='属性name')
# -----通用的----
bro.find_element(by=By.CSS_SELECTOR,value='css选择器')
bro.find_element(by=By.XPATH,value='xpath选择器')

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import base64
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    # Give elements up to 10s to appear; the QR image is only shown after the click below
    bro.implicitly_wait(10)
    a = bro.find_element(by=By.LINK_TEXT, value='扫码登录')
    # a = bro.find_element(by=By.CSS_SELECTOR, value='.login-hd-account>a')
    a.click()
    # code = bro.find_element(by=By.ID, value='J-qrImg')
    code = bro.find_element(by=By.CSS_SELECTOR, value='#J-qrImg')

    # Option 1: use the element's position and size to crop it out of a screenshot
    print(code.id)
    print(code.location)
    print(code.tag_name)
    print(code.size)

    # Option 2: read the image from the src attribute
    print(code.location)
    print(code.size)
    print(code.id)  # selenium's internal element id, NOT the tag's id attribute
    print(code.tag_name)  # the tag name
    s = code.get_attribute('src')
    print(s)
    # Only a data URI ("data:image/...;base64,<payload>") can be decoded here;
    # src may be missing or a plain URL if the image has not been filled in yet.
    if s and s.startswith('data:'):
        # Decode first so a decoding error does not leave an empty code.png behind
        res = base64.b64decode(s.split(',')[-1])
        with open('code.png', 'wb') as f:
            f.write(res)
    else:
        print('QR code src is not a base64 data URI; nothing was saved')
finally:
    # The browser used to be left running; always shut it down
    bro.quit()

5.3 等待元素被加载

1. 代码执行很快，有些标签还没加载出来，直接取，取不到
2. 等待
      显式等待：一般不用，需要指定等待哪个标签，如果标签很多，每个都要设置比较麻烦
      隐式等待：
      bro.implicitly_wait(10)
      find找标签的时候，如果找不到，最多等10s

5.4 元素操作

1. 点击 
   标签.click()

2. input写文字
   标签.send_keys('文字')

3. input清空文字
   标签.clear()

4. 模拟键盘操作
   from selenium.webdriver.common.keys import Keys
   input_search.send_keys(Keys.ENTER)

5.5 自动搜索

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')

# Locate the search box by its id and type the query
input_search=bro.find_element(by=By.ID,value='key')
input_search.send_keys('精品内衣')


# Simulate pressing Enter to submit the search
input_search.send_keys(Keys.ENTER)
time.sleep(5)
bro.close()

5.6 执行js代码，滑动屏幕

import time

from selenium import webdriver
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://www.jd.com/')

    # 1. execute_script can run arbitrary JavaScript, e.g. show the cookies in an alert
    bro.execute_script('alert(document.cookie)')

    time.sleep(3)
    # Dismiss the alert; while it is open, further WebDriver commands fail
    bro.switch_to.alert.accept()

    # 2. Scroll down step by step
    # for i in range(10):
    #     y=400*(i+1)
    #     bro.execute_script('scrollTo(0,%s)'%y)
    #     time.sleep(1)

    # 3. Jump straight to the bottom of the page.
    # (The original statement was truncated and was issued after bro.close().)
    bro.execute_script('scrollTo(0,document.documentElement.scrollHeight)')
    time.sleep(3)
finally:
    # Shut the browser down only after every script has run
    bro.quit()

5.7 切换选项卡

# Switching between browser tabs
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.jd.com/')

# Open a new tab with JavaScript
bro.execute_script('window.open()')

# Switch to the tab that was just opened: it is at index 1 of window_handles
bro.switch_to.window(bro.window_handles[1])
bro.get('http://www.taobao.com')
time.sleep(2)
# Switch back to the original tab (index 0)
bro.switch_to.window(bro.window_handles[0])

time.sleep(3)
bro.close()
bro.quit()

5.8 浏览器前进后退

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# Visit three pages in turn so there is history to navigate through
bro.get('https://www.jd.com/')

time.sleep(2)
bro.get('https://www.taobao.com/')

time.sleep(2)
bro.get('https://www.baidu.com/')

# Go back one page in the history
bro.back()
time.sleep(1)
# Go forward one page again
bro.forward()
time.sleep(3)
bro.close()

5.9 异常处理

import time

from selenium import webdriver
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    bro.get('https://www.jd.com/')
    time.sleep(2)
    bro.get('https://www.taobao.com/')

    time.sleep(2)
    bro.get('https://www.baidu.com/')

    # Go back one page in the history
    bro.back()
    time.sleep(1)
    # Go forward one page again
    bro.forward()
    time.sleep(3)
except Exception as e:
    # Report the problem; cleanup happens in finally either way
    print(e)
finally:
    # Single cleanup point. The browser used to be closed inside try as well,
    # so on the success path this second close was issued on an already
    # closed window and raised.
    bro.quit()

6. selenium登录cnblogs获取cookie

# 操作浏览器，登录成功就可以拿到登录成功的cookie，保存到本地
# 如果有很多小号，会有很多cookie，搭建cookie池
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import json


# 登录过程
# bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# bro.get('https://www.cnblogs.com/')
# bro.implicitly_wait(10)
# try:
#     # 找到登录按钮
#     submit_btn = bro.find_element(By.LINK_TEXT, value='登录')
#     submit_btn.click()
#     time.sleep(1)
#     username = bro.find_element(By.ID, value='mat-input-0')
#     password = bro.find_element(By.ID, value='mat-input-1')
#     username.send_keys("616564099@qq.com")
#     password.send_keys('sadfasdfads')
#
#     submit = bro.find_element(By.CSS_SELECTOR,
#                               value='body > app-root > app-sign-in-layout > div > div > app-sign-in > app-content-container > div > div > div > form > div > button')
#
#     time.sleep(20)
#     submit.click()
#     # 会有验证码，滑动，手动操作完了，敲回车，程序继续往下走
#     input()
#     # 已经登录成功了
#
#     cookie = bro.get_cookies()
#     print(cookie)
#     with open('cnblogs.json', 'w', encoding='utf-8') as f:
#         json.dump(cookie, f)
#
#     time.sleep(5)
# except Exception as e:
#     print(e)
# finally:
#     bro.close()




# Open cnblogs and load the saved cookies so the session shows up as logged in
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# NOTE(review): the page is opened before add_cookie is called - presumably because
# cookies can only be set for the domain currently loaded; confirm
bro.get('https://www.cnblogs.com/')
bro.implicitly_wait(10)
time.sleep(3)
# Read the cookies that the login step saved locally
with open('cnblogs.json','r',encoding='utf-8') as f:
    cookie=json.load(f)

# Write every saved cookie into the browser
for item in cookie:
    bro.add_cookie(item)


# Refresh so the page is requested again with the cookies attached
bro.refresh()
time.sleep(10)
bro.close()

7. 抽屉半自动点赞

# 使用selenium登录到抽屉，获取到cookie，再使用requests自动点赞
# 直接使用requests登录非常困难，因为有验证码

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import requests

# Defined before the try block so the voting step still has a (possibly empty)
# list when the selenium part fails; previously the name was only bound inside
# try and print() below raised NameError after any login error.
article_ids = []

bro = webdriver.Chrome(executable_path='./chromedriver.exe')

bro.get('https://dig.chouti.com/')
bro.implicitly_wait(10)
try:
    submit = bro.find_element(by=By.ID, value='login_btn')
    # Some buttons can be located but not clicked directly (click() raises);
    # clicking through JavaScript works around that.
    bro.execute_script("arguments[0].click()", submit)
    # submit.click()
    time.sleep(2)
    # NOTE(review): credentials are hard-coded in source; move them to environment
    # variables or an untracked config file before sharing this script.
    username = bro.find_element(by=By.NAME, value='phone')
    username.send_keys('18953675221')
    password = bro.find_element(by=By.NAME, value='password')
    password.send_keys('lqz123')
    time.sleep(3)
    submit_button = bro.find_element(By.CSS_SELECTOR,
                                     'body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
    submit_button.click()

    # A captcha appears: solve it by hand in the browser, then press Enter here
    input()
    cookie = bro.get_cookies()
    print(cookie)
    with open('chouti.json', 'w', encoding='utf-8') as f:
        json.dump(cookie, f)

    # Collect the id of every article on the page
    for div in bro.find_elements(By.CLASS_NAME, 'link-item'):
        article_id = div.get_attribute('data-id')
        if article_id:  # skip elements without a data-id instead of posting None
            article_ids.append(article_id)

except Exception as e:
    print(e)

finally:
    bro.quit()

# selenium's job is done (log in -> obtain cookies); the votes are sent with requests

print(article_ids)

if article_ids:
    with open('chouti.json', 'r', encoding='utf-8') as f:
        cookie = json.load(f)
    # selenium returns a list of cookie dicts, while requests expects a plain
    # {name: value} mapping, so convert before use
    request_cookies = {item['name']: item['value'] for item in cookie}
    print(request_cookies)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    for article_id in article_ids:
        data = {
            'linkId': article_id
        }
        # timeout keeps one stalled request from hanging the whole run
        res = requests.post('https://dig.chouti.com/link/vote', data=data, headers=header,
                            cookies=request_cookies, timeout=10)
        print(res.text)

标签：bs4,res,selenium,find,bro,time,import,选择器
From： https://www.cnblogs.com/cainiaozhy/p/16926280.html

【爬虫】bs4搜索文档，css选择器，selenium基本使用

1. bs4搜索文档树

1.1 字符串:可以按照标签名，属性名查找

1.2 正则表达式标签名，属性可以使用正则匹配

1.3 列表标签名，属性名等于列表或条件

1.4 True 标签名，属性名等于布尔

1.5 方法标签名或属性名 = 方法

1.6 find_all的其他属性 limit recursive:False,只找一层

2. css选择器

3. selenium基本使用

3.1 快速使用，自动打开浏览器

4. 无界面浏览器

5. selenium其它用法

5.1 自动登录百度

5.2 获取位置属性大小，文本

5.3 等待元素被加载

5.4 元素操作

5.5 自动搜索

5.6 执行js代码，滑动屏幕

5.7 切换选项卡

5.8 浏览器前进后退

5.9 异常处理

6. selenium登录cnblogs获取cookie

7. 抽屉半自动点赞

相关文章

赞助商

阅读排行

【爬虫】bs4搜索文档，css选择器，selenium基本使用

1. bs4搜索文档树

1.1 字符串:可以按照标签名，属性名查找

1.2 正则表达式 标签名，属性可以使用正则匹配

1.3 列表 标签名，属性名 等于列表 或条件

1.4 True 标签名，属性名 等于布尔

1.5 方法 标签名或属性名 = 方法

1.6 find_all的其他属性 limit recursive:False,只找一层

2. css选择器

3. selenium基本使用

3.1 快速使用，自动打开浏览器

4. 无界面浏览器

5. selenium其它用法

5.1 自动登录百度

5.2 获取位置属性大小，文本

5.3 等待元素被加载

5.4 元素操作

5.5 自动搜索

5.6 执行js代码，滑动屏幕

5.7 切换选项卡

5.8 浏览器前进后退

5.9 异常处理

6. selenium登录cnblogs获取cookie

7. 抽屉半自动点赞

相关文章

赞助商

阅读排行

1.2 正则表达式标签名，属性可以使用正则匹配

1.3 列表标签名，属性名等于列表或条件

1.4 True 标签名，属性名等于布尔

1.5 方法标签名或属性名 = 方法