import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pymysql
# Scrape LEI records from leichina.org with Selenium and store them in MySQL.
#
# Flow: open the search page, switch into the search iframe, expand the
# advanced options, let the operator type the captcha on stdin, submit the
# query, then walk every result page. For each row the detail page is opened
# to read one extra field, the row is inserted into the `lei` table, and the
# browser navigates back to the result list.

# Path to the ChromeDriver executable.
webdriver_path = 'C:/chromedriver/chromedriver.exe'

# Page that hosts the search iframe.
START_URL = "https://www.leichina.org/cei/2935720/2935943/index.html"

# Hard-coded number of result pages to iterate over.
total_pages = 10692

# Selectors. The element ids contain ':' which has to be backslash-escaped in
# a CSS selector; raw strings keep that backslash without relying on Python's
# handling of the invalid escape sequence "\:".
VCODE_INPUT_CSS = r"#searchForm\:vCode"
RESULT_TABLE_CSS = r"#resultForm\:cdmLegalEntityListData"
RESULT_TBODY_CSS = r"#resultForm\:cdmLegalEntityListData\:tb"
COMPANY_EN_NAME_CSS = (
    r"#j_id3\:j_id6 > table > tbody > tr > td > table > tbody > tr:nth-child(2) > td.txtLeft"
)
BACK_BUTTON_CSS = (
    "#j_id3 > table > tbody > tr > td > table > tbody > tr:nth-child(1) > td > table > tbody > tr > td > table:nth-child(15) > tbody > tr:nth-child(2) > td > table > tbody > tr > td:nth-child(2) > a > img"
)
NEXT_PAGE_XPATH = (
    '//*[@id="resultForm"]/table/tbody/tr[1]/td/table/tbody/tr/td/table[2]/tbody/tr[2]/td/a[1]'
)

# Parameterized insert statement; values are bound by the DB driver.
INSERT_SQL = "INSERT INTO lei (lei, company_cn_name, status, address, company_en_name) VALUES (%s, %s, %s, %s, %s)"


def wait_for_result_table(driver):
    """Block (up to 30 s) until the result table element is present."""
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, RESULT_TABLE_CSS)))


def find_result_rows(driver):
    """Return the ``<tr>`` elements currently inside the result table body."""
    tbody = driver.find_element(By.CSS_SELECTOR, RESULT_TBODY_CSS)
    return tbody.find_elements(By.TAG_NAME, "tr")


def submit_search(driver):
    """Open the search form, ask the operator for the captcha and run the query."""
    driver.get(START_URL)
    time.sleep(10)
    driver.maximize_window()
    # Wait for the outer page to load.
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # The search form lives inside an iframe; switch the driver context to it.
    iframe = driver.find_element(By.ID, "frame2")
    driver.switch_to.frame(iframe)

    # Click the "more" button, then wait for the <select> to be present.
    driver.find_element(By.ID, "moreBtn").click()
    select_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#btn2 > select")))
    Select(select_element).select_by_index(1)

    # Refresh the captcha image and wait for it to be present.
    driver.find_element(By.ID, "changepic").click()
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#imgcode")))

    # The captcha is solved manually: the operator reads it and types it here.
    user_input = input("请打开验证码图片URL,在浏览器中查看验证码,并输入验证码:")
    driver.find_element(By.CSS_SELECTOR, VCODE_INPUT_CSS).send_keys(user_input)

    # Submit the query and wait for the result table.
    driver.find_element(By.ID, "searchForm:j_id36").click()
    time.sleep(10)
    wait_for_result_table(driver)


def scrape_row(driver, cursor, index):
    """Scrape the result row at *index*, including its detail page, and insert it.

    The row is looked up freshly on every call because each visit to a detail
    page navigates away from the list, so element handles obtained earlier
    cannot be reused.
    """
    row = find_result_rows(driver)[index]
    cells = row.find_elements(By.TAG_NAME, "td")
    lei = cells[0].text
    company_cn_name = cells[1].text
    status = cells[2].text
    address = cells[3].text

    # Open the detail page through the link in the fifth cell.
    cells[4].find_element(By.TAG_NAME, "a").click()
    time.sleep(5)
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # Read the English company name from the detail page.
    company_en_name = driver.find_element(By.CSS_SELECTOR, COMPANY_EN_NAME_CSS).text

    cursor.execute(INSERT_SQL, (lei, company_cn_name, status, address, company_en_name))

    # Return to the result list.
    driver.find_element(By.CSS_SELECTOR, BACK_BUTTON_CSS).click()
    time.sleep(5)


# Create the Service object for ChromeDriver.
service = Service(executable_path=webdriver_path)
# Connect to the database and create a cursor.
db = pymysql.connect(host='127.0.0.1', user='root', password='******', db='lei_db', charset='utf8mb4')
cursor = db.cursor()
driver = None
try:
    # Start the browser.
    driver = webdriver.Chrome(service=service)
    submit_search(driver)

    for page in range(1, total_pages + 1):
        print(f"正在处理第{page}页...")
        time.sleep(10)
        wait_for_result_table(driver)

        # Only used for the row count; the last page may hold fewer than 10 rows.
        r_count = len(find_result_rows(driver))
        for i in range(r_count):
            scrape_row(driver, cursor, i)
            # Commit after every row so progress survives a later failure.
            db.commit()
            print(f"已经爬取第{i+1}条...")

        # Go to the next page. The last page has no successor, so skip the
        # click there (the original `page <= total_pages` test was always true).
        if page < total_pages:
            driver.find_element(By.XPATH, NEXT_PAGE_XPATH).click()
            time.sleep(20)  # give the next page time to load
finally:
    # Release the browser and the database even when scraping fails midway.
    if driver is not None:
        driver.quit()
    cursor.close()
    db.close()
# Tags: name, tr, LEI, driver, element, scraping, table, find, selenium
# From: https://www.cnblogs.com/zhouwp/p/18184217