from abc import ABC, abstractmethod
import random
import time
from datetime import datetime
from typing import Any

import pyquery
import selenium
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options

from ConnectionPool import Client_Pool
from cookies.CookieSaver import CookieSaver
class BaseCrawler(ABC):
    """Interface every crawler implements: fetch, extract, persist."""

    @abstractmethod
    def crawl(self, url: str):
        """Fetch the page(s) at *url*."""

    @abstractmethod
    def parse(self):
        """Extract structured data from the fetched content."""

    @abstractmethod
    def save(self):
        """Persist the extracted data."""
class CrawlerData:
    """Tabular scrape buffer: rows are accumulated one at a time into a list.

    Column names are the keys of ``cssDict``; each column maps to the CSS
    selector used to extract its value from a page element.
    """

    dataList: list[dict[str, Any]]  # completed rows
    dataDict: dict[str, Any]        # the row currently being filled
    cssDict: dict[str, str]         # column name -> CSS selector

    def __init__(self, css_dict: dict[str, str]):
        """Create an empty buffer whose columns are *css_dict*'s keys."""
        self.cssDict = css_dict
        self.dataList = []
        self.dataDict = {}

    def write(self, col_name: str, value: Any):
        """Set one column of the current row (extra, non-CSS columns are allowed)."""
        self.dataDict[col_name] = value

    def css(self, name: str) -> str:
        """Return the CSS selector for column *name*, or "" if unknown."""
        return self.cssDict.get(name, "")

    def nextRow(self):
        """Finish the current row: fill missing CSS columns with None, append, reset."""
        for col_name in self.cssDict:
            # setdefault keeps values already written and fills only the gaps.
            self.dataDict.setdefault(col_name, None)
        self.dataList.append(self.dataDict)
        self.dataDict = {}

    def getColName(self):
        """Return the CSS-driven column names as a tuple."""
        return tuple(self.cssDict.keys())
class Crawler(BaseCrawler):
    """Selenium-backed crawler skeleton: owns the Edge driver and the MongoDB sink."""

    driver: webdriver.Edge
    data: CrawlerData
    cookieSaver: CookieSaver

    def __init__(self, url_list: list[str]):
        # Anti-bot hardening: hide the automation flag, ignore TLS errors,
        # and present a regular desktop Edge user-agent string.
        opts = Options()
        for flag in (
            '--disable-blink-features=AutomationControlled',
            '--ignore-certificate-errors',
            '--ignore-ssl-errors',
        ):
            opts.add_argument(flag)
        opts.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0')
        self.driver = webdriver.Edge(options=opts)
        self.urlList = url_list

    def crawl(self, url: str):
        pass

    def parse(self):
        pass

    def save(self):
        """Upsert every buffered row into the Hotel.XIECHENG collection."""
        with MongoClient() as client:
            coll = client["Hotel"]["XIECHENG"]
            for row in self.data.dataList:
                # The full row doubles as the filter, so identical rows
                # are not inserted twice across repeated saves.
                coll.update_one(row, {"$set": row}, upsert=True)
def randomWait(min_time=0.5, max_time=1.5):
    """Sleep for a random duration in [min_time, max_time) to mimic a human pause."""
    pause = random.uniform(min_time, max_time)
    time.sleep(pause)
class HotelCrawler(Crawler):
    """Crawls Ctrip hotel list pages with Selenium and buffers rows into CrawlerData."""

    def __init__(self, url_list: list[str]):
        # CSS selector matching one hotel card in the result list.
        self.goods_css = "div.card-item-wrap"
        # Column name -> CSS selector, resolved relative to each hotel card.
        css_dict = {
            "title": ".list-card-title span",
            "location": "span.ads",
            "price": "span.real-price",
            "tags": "div.list-card-tag",
            "comment": "div.list-card-comment p.count",
            "score": "div.score span"
        }
        self.data = CrawlerData(css_dict)
        super().__init__(url_list)
        self.cookieSaver = CookieSaver(self.driver)

    def randomScroll(self):
        """Scroll down by 70%-100% of the page height to look like a human reader."""
        self.driver.execute_script("window.scrollBy(0,(Math.random()*0.3+0.7)*document.body.scrollHeight);")

    def parse(self):
        """Extract one row per hotel card from the current page source into self.data."""
        doc = pyquery.PyQuery(self.driver.page_source)
        goods = doc(self.goods_css).items()
        for g in goods:
            for col, css in self.data.cssDict.items():
                self.data.write(col, g(css).text())
            # Extra bookkeeping columns not driven by CSS selectors.
            # NOTE(review): cookies.domain semantics come from CookieSaver,
            # which is not visible here — confirm it is the site domain.
            self.data.write("domain", self.cookieSaver.cookies.domain)
            self.data.write("time", datetime.now().date().isoformat())
            self.data.nextRow()

    def findMore(self):
        """Click the "load more" button if present; return True on success, False otherwise."""
        try:
            target = self.driver.find_element(By.CSS_SELECTOR, "div.list-btn-more div")
            # Center the button in the viewport before clicking so the click lands.
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center',inline: 'center'});", target)
            target.click()
            return True
        except Exception as e:
            # Button missing or not clickable yet — caller scrolls and retries.
            return False

    def crawlAllURL(self, times=10):
        """Crawl every URL given at construction, expanding each list up to *times* times."""
        for url in self.urlList:
            self.crawl(url,times)

    def crawl(self, url: str, times=10):
        """Open *url*, ensure a valid login cookie, expand the list, then parse and save."""
        self.driver.get(url)
        # NOTE(review): the page is loaded twice — presumably to get past an
        # initial redirect/anti-bot check; confirm whether one load suffices.
        self.driver.get(url)
        load = self.cookieSaver.load_cookies()
        valid = self.cookieSaver.is_cookie_valid()
        while not load or not valid:
            # Manual login loop: the operator signs in via the browser window,
            # then the fresh cookies are saved and re-validated.
            input("请登录后按回车键继续...")
            self.cookieSaver.save_cookies()
            load = self.cookieSaver.load_cookies()
            valid = self.cookieSaver.is_cookie_valid()
        more_times = 0
        try:
            while True:
                # Expand the list until "load more" has been clicked *times*+1
                # times; when the button is absent, scroll to trigger lazy load.
                if self.findMore():
                    more_times += 1
                    if more_times > times:
                        break
                else:
                    self.randomScroll()
                # NOTE(review): randomWait is called as a bare name, so it must
                # resolve at module level — verify it is not meant to be a method.
                randomWait(2.5, 3)
        except Exception as e:
            # Best-effort: persist whatever has been collected before reporting.
            self.parse()
            self.save()
            print(f'遇到错误:{e}'
                  f'已经当前数据存储')
        self.parse()
        self.save()
if __name__ == '__main__':
    # Entry point: crawl the Zhejiang hotel listing, clicking
    # "load more" up to 100 times per URL.
    start_urls = [
        "https://hotels.ctrip.com/hotels/list?countryId=1&city=-1&optionId=16&optionType=Province&display=%E6%B5%99%E6%B1%9F%2C+%E4%B8%AD%E5%9B%BD",
    ]
    HotelCrawler(start_urls).crawlAllURL(100)
# Tags: __,酒店,携程,self,list,爬虫,str,import,def
# From: https://www.cnblogs.com/wzu-hqj/p/18593758