# main.py
import re
from time import sleep

import pymongo
from pyquery import PyQuery
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)
wait = WebDriverWait(browser, 10)
client = pymongo.MongoClient(MONGO_URL, MONGO_PORT)
db = client[MONGO_DB]
def search_page():
try:
browser.get("https://www.damai.cn/")
input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.dm-header-wrap > div > div.search-header > input")))
submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "body > div.dm-header-wrap > div > div.search-header > div.btn-search")))
input.send_keys("演唱会")
submit.click()
total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.search-box > div.search-box-top > span.search-box-keyword")))
print ("共找到" + total.text + "个结果")
get_products()
return True
except TimeoutError:
search_page()
def next_page(index):
try:
page_css_id = "body > div.search-box > div.search-box-flex > div.search-main > div.search__itemlist > div.pagination > div > ul > li:nth-child(" + str(index)+")"
switch_page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, page_css_id)))
switch_page.click()
now_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.search-box > div.search-box-flex > div.search-main > div.search-sort.search-main-sort > div.pagination-top.search-sort_fr > div > span:nth-child(1)")))
if str(index) == now_page.text:
print("切换到第" + str(index) + "页")
get_products()
else:
next_page(index)
except TimeoutError:
next_page(index)
def get_products():
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.search-box > div.search-box-flex > div.search-main > div.search__itemlist > div.item__main > div")))
html = browser.page_source
doc = PyQuery(html)
items = doc('body > div.search-box > div.search-box-flex > div.search-main > div.search__itemlist > div.item__main > div')
for item in items.children().items():
course_name = item.find("div > div.items__txt__title > a").text()
foo = item.find("div > div:nth-child(2)").text()
if "艺人:" in foo:
people_name = re.sub("艺人:", "", foo)
address = ""
else:
address = foo
people_name = ""
if not address:
address = item.find("div > div:nth-child(3)").text()
course_date = item.find("div > div:nth-child(4)").text()
else:
course_date = item.find("div > div:nth-child(3)").text()
product = {
"演唱会名字:": course_name,
"乐队名字:": people_name,
"演唱地点": address,
"演唱日期": course_date
}
print (product)
save_to_mongo(product)
def save_to_mongo(result):
if db[MONGO_TABLE].insert_one(result):
print("存储到数据库", result)
else:
print("存储数据出错", result)
def main():
search_page()
for num in range(2, 6):
next_page(num)
sleep(2)
if __name__ == "__main__":
main()
# settings.py  (config module — a separate file in the original project)
MONGO_URL = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'damai'
MONGO_TABLE = 'yanchanghui'
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
# 标签:__,box,search,爬取,大麦,div,main,page,演唱会
# Source: https://www.cnblogs.com/z5onk0/p/16753192.html