Related Materials
Disclaimer
I promise that when using a web crawler I do so solely for study and personal skill development, to understand how internet data scraping and analysis work.
I solemnly declare the following:
- Non-commercial use: this crawler serves purely my own learning and will never be used for commercial purposes or any illegal activity.
- Respect for site rules: when running the crawler I will honor the target site's robots.txt and access rules, and respect any restrictions the site owner places on crawlers.
- Data protection and privacy: I will not collect, store, or exploit any private personal information, and I will not place unnecessary load on, or interfere with, the sites I visit.
- Technical sharing and cooperation: I will follow the principles of open technical exchange, will not misuse what I have learned, and am happy to share my notes with others where appropriate.
- Legal compliance: I will obey local laws and regulations and bear sole responsibility for any risks arising from my use of the crawler.
Purpose
This time I wanted to scrape a site whose pages an ordinary crawler can only fetch as static HTML; its anti-scraping measures are fairly solid. So I turned to Selenium, a browser-automation tool that can simulate clicking in a real browser.
From code, it can open a browser and perform clicks, form filling, page navigation, and so on, and it provides built-in methods for locating HTML elements; a minimal sketch of that pattern follows.
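A minimal sketch of the open/click/fill/read cycle (the URL, field name, and element IDs below are made up, purely to show the shape of the Selenium API):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome()                      # Selenium Manager finds a matching driver
driver.get("https://example.com/search")         # hypothetical page
wait = WebDriverWait(driver, timeout=10)
box = driver.find_element(By.NAME, "q")          # locate a form field (hypothetical name)
box.send_keys("selenium")                        # fill it in
driver.find_element(By.CSS_SELECTOR, "button[type=submit]").click()  # click
wait.until(lambda d: d.find_element(By.ID, "results").is_displayed())
print(driver.find_element(By.ID, "results").text)  # read text out of the rendered HTML
driver.quit()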
Code
import traceback
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

# Path to a local Chrome for Testing binary. It is not used below: with
# Selenium 4.6+, Selenium Manager resolves a matching chromedriver automatically.
chrome_driver_path = r'C:\Users\86150\Documents\chrome-win64\chrome-win64\chrome.exe'
starturl = 'url'  # placeholder: put the target page URL here
def findQuestion(html01):
    try:
        # Locate the question text
        question = html01.find_element(By.CLASS_NAME, "com-mnks-question-detail") \
            .find_element(By.CLASS_NAME, 'timu') \
            .find_element(By.CLASS_NAME, 'timu-text')
        return str(question.text)
    except Exception as ex:
        print(f"Exception in findQuestion: {ex}")
        return "NaN"
def findSelect(html01):
    try:
        # Locate the answer options (one <p> element per option)
        selectOpt = html01.find_element(By.CLASS_NAME, "com-mnks-question-detail") \
            .find_element(By.CLASS_NAME, 'answer-w') \
            .find_element(By.CLASS_NAME, 'options-w') \
            .find_elements(By.TAG_NAME, 'p')
        selectlist = [p_element.text for p_element in selectOpt]
        return "\n".join(selectlist)
    except Exception as ex:
        print(f"Exception in findSelect: {ex}")
        return "NaN"
def findImg(html01):
    try:
        # Locate the question image, if there is one
        imageInf = html01.find_element(By.CLASS_NAME, "com-mnks-question-detail") \
            .find_element(By.CLASS_NAME, 'answer-w') \
            .find_element(By.CLASS_NAME, 'media-w') \
            .find_element(By.TAG_NAME, 'img').get_attribute('src')
        return str(imageInf)
    except Exception:
        print("No image for this question")
        return "NaN"
def findAns(html01, wait):
    try:
        # Locate the answer explanation; wait until it is visible before reading it
        ans_element = html01.find_element(By.CLASS_NAME, "com-shiti-xiangjie") \
            .find_element(By.CLASS_NAME, "xiangjie") \
            .find_element(By.CLASS_NAME, "content")
        wait.until(lambda d: ans_element.is_displayed())
        ans = str(ans_element.get_attribute("innerHTML"))
        if ans == "":
            ans = "Void"
        return ans
    except Exception as ex:
        print(f"Exception in findAns: {ex}")
        return "NaN"
def getAns(html01):
    try:
        # Click the button (ref="xiangqing") that reveals the answer explanation
        html01.find_element(By.CLASS_NAME, "tool-bar") \
            .find_element(By.CLASS_NAME, "btn-bar") \
            .find_element(By.CSS_SELECTOR, 'button.right.pt[ref="xiangqing"]').click()
    except Exception as ex:
        print(f"Exception in getAns: {ex}")
def nextOne(html01):
    try:
        # Click the "下一题" (next question) button; the label is the site's own text.
        # NOTE: an XPath starting with // searches the whole page even when called
        # on an element; use .// to scope the search to btn-bar.
        html01.find_element(By.CLASS_NAME, "tool-bar") \
            .find_element(By.CLASS_NAME, "btn-bar") \
            .find_element(By.XPATH, "//button[text()='下一题']").click()
        return True
    except Exception as ex:
        print(f"Exception in nextOne: {ex}")
        return False
def debug(df_train):
    try:
        # Progress report: current shape and the row just appended
        print(df_train.shape)
        print(df_train.tail(1))
        print("***************************")
        print()
    except Exception as ex:
        print(f"Exception in debug: {ex}")
def getData(url):
    startNum = 0
    df_train = pd.DataFrame({'question': [], 'selectOption': [],
                             'imgUrl': [], 'answer': []})
    # Open the start page
    driver = webdriver.Chrome()
    driver.get(url)
    wait = WebDriverWait(driver, timeout=10)
    time.sleep(10)  # give the client-rendered page time to finish loading
    while startNum < 1424:  # upper bound on the number of questions to scrape
        startNum += 1
        try:
            html01 = driver.find_element(By.CLASS_NAME, 'layout-article') \
                .find_element(By.CLASS_NAME, 'news-page') \
                .find_element(By.CLASS_NAME, 'left')
            wait.until(lambda d: html01.is_displayed())
            getAns(html01)
            # Re-locate the container after the click, since the DOM has changed
            html01 = driver.find_element(By.CLASS_NAME, 'layout-article') \
                .find_element(By.CLASS_NAME, 'news-page') \
                .find_element(By.CLASS_NAME, 'left')
            wait.until(lambda d: html01.is_displayed())
            a1 = findQuestion(html01)
            a2 = findSelect(html01)
            a3 = findImg(html01)
            a4 = findAns(html01, wait)
            '''
            print(a1)
            print(a2)
            print(a3)
            print(a4)
            '''
            tdf = pd.DataFrame({'question': [a1], 'selectOption': [a2],
                                'imgUrl': [a3], 'answer': [a4]})
            df_train = pd.concat([df_train, tdf])
            debug(df_train)
            if not nextOne(html01):
                break
        except Exception as ex:
            print(f"Exception in getData: {ex}")
            traceback.print_exc()
    driver.quit()  # close the browser after the loop, not inside it
    return df_train
Data_MONI = getData(starturl)
Data_MONI.to_csv("D:\\moni_four-data")
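One design note on the script above: pd.concat inside the loop copies the whole DataFrame on every iteration, so the cost grows quadratically over 1424 questions. A sketch of the usual alternative, where scraped_items() is a hypothetical stand-in for the per-question scraping loop above:

import pandas as pd

rows = []  # one plain dict per scraped question
for a1, a2, a3, a4 in scraped_items():  # hypothetical generator yielding the four fields
    rows.append({'question': a1, 'selectOption': a2,
                 'imgUrl': a3, 'answer': a4})
# build the DataFrame once at the end instead of concatenating row by row
df_train = pd.DataFrame(rows, columns=['question', 'selectOption', 'imgUrl', 'answer'])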
From: https://www.cnblogs.com/cilinmengye/p/17926739.html