【爬虫】项目篇-selenium爬取大鱼潮汐网

标签：xpath get selenium 爬虫爬取 print drvier date find

爬取指定日期的潮汐数据

1.创建driver对象，并将浏览器窗口最大化

# Page to scrape.
url = "https://www.chaoxibiao.net/tides/75.html"

# Start Chrome from an explicit binary location, load the page, then maximise
# the window.
option = Options()
option.binary_location = r"C:\Users\txmmy\AppData\Local\Google\Chrome\Application\chrome.exe"
drvier = webdriver.Chrome(options=option)
drvier.get(url)
drvier.maximize_window()

2.设置起始日期和终止日期

# Start and end date of the crawl, as "%Y-%m-%d" strings.
date1, date2 = '2003-1-1', '2003-3-1'

3.由于需要自动切换日期，因此需要设置自动切换日期的方法。首先是定位到年份，其次定位到月份

def locate_date():
    """Open the date picker at ``date1`` and step day by day up to ``date2``.

    Reads the module-level names ``drvier`` (Selenium driver), ``date1`` and
    ``date2`` (``"%Y-%m-%d"`` strings).  For every visited day the page is
    switched to that day and ``GetCondition`` is called.

    Changes compared with the earlier revision of this snippet:
      * indentation is uniformly four spaces (tabs and spaces were mixed,
        which Python 3 rejects);
      * the start year is clicked even when it is already on the first
        visible page of the year list or is that page's first entry;
      * the end date is compared as a ``date`` so ``'2003-03-01'`` and
        ``'2003-3-1'`` are treated alike;
      * the end-date branch waits before reading the page, like the other
        branch does.

    NOTE: ``find_element_by_xpath`` is the Selenium 3 locator API; it was
    removed in Selenium 4.3, where ``find_element(By.XPATH, ...)`` is required.
    """
    # Parse both bounds once so every comparison below is date-to-date.
    date1_date = datetime.strptime(date1, "%Y-%m-%d").date()
    date2_date = datetime.strptime(date2, "%Y-%m-%d").date()
    # Start year.
    y1 = int(date1.split('-')[0])
    # Open the date picker.
    drvier.find_element_by_xpath('//*[@id="changeDate"]').click()
    # Switch the picker to its year list.
    drvier.find_element_by_xpath('//div[@class="laydate-set-ym"]/span[1]').click()

    # Page backwards through the year list until the start year is visible,
    # then click it.
    while True:
        first_item = drvier.find_element_by_xpath('//div[@class="layui-laydate-content"]/ul/li')
        year_items = [first_item] + first_item.find_elements_by_xpath('./following-sibling::*')
        years = [int(item.get_attribute('lay-ym')) for item in year_items]
        if y1 < years[0]:
            # Start year is earlier than this page: go to the previous page.
            drvier.find_element_by_xpath('//*[@id="layui-laydate1"]/div[1]/div[1]/i[1]').click()
            continue
        if y1 in years:
            year_items[years.index(y1)].click()
        # A start year beyond the last visible entry is left unselected,
        # as before (no forward paging is implemented).
        break

    # Select the start month.
    m1 = int(date1.split('-')[1])
    drvier.find_element_by_xpath('//div[@class="laydate-set-ym"]/span[2]').click()
    time.sleep(2)
    init_m1 = drvier.find_element_by_xpath('//ul[@class="layui-laydate-list laydate-month-list"]/li[%d]' % m1)
    init_m1.click()

    def change_date():
        """Walk the 6x7 day grid of the open picker from ``date1`` onwards.

        Days before ``date1`` are skipped.  The end date is clicked, reported
        and stops the walk.  A day whose class is ``''`` or ``'layui-this'``
        is clicked and reported, then the picker is reopened.  The cell for
        day "1" of the next month is clicked and the walk restarts on the new
        month's grid.
        """
        for row in range(1, 7):
            for col in range(1, 8):
                cell = drvier.find_element_by_xpath(
                    '//div[@class="layui-laydate-content"]/table/tbody/tr[%d]/td[%d]' % (row, col))
                ymd = cell.get_attribute('lay-ymd')
                cell_date = datetime.strptime(ymd, "%Y-%m-%d").date()
                print(cell_date, date1_date)
                distance = cell_date - date1_date
                if distance.days < 0:
                    continue
                print(cell.text)
                print(ymd)
                print("distance", distance.days)
                # Reaching the end date finishes the whole walk.
                if cell_date == date2_date:
                    cell.click()
                    time.sleep(2)  # give the page time to refresh before reading it
                    GetCondition()
                    return
                css_class = cell.get_attribute('class')
                if css_class == '' or css_class == 'layui-this':
                    cell.click()
                    time.sleep(2)
                    GetCondition()
                    # Reopen the picker for the next cell.
                    drvier.find_element_by_xpath('//*[@id="changeDate"]').click()
                    time.sleep(2)
                elif css_class == 'laydate-day-next' and cell.text == '1':
                    # First day of the following month: jump there and restart.
                    print("1", ymd)
                    cell.click()
                    time.sleep(2)
                    drvier.find_element_by_xpath('//*[@id="changeDate"]').click()
                    return change_date()

    # NOTE(review): change_date() calls GetCondition, which must already be
    # bound in this function's scope when the call below executes.
    change_date()

定义GetCondition获取公历、农历、潮汐级别（注意：GetCondition必须在调用change_date()之前定义，否则运行时会因找不到GetCondition而报错）

    def GetCondition():
        """Print the solar date, lunar date and tide description of the current page."""
        # Solar date: concatenated text of every <strong> directly under a <p>.
        solar_nodes = drvier.find_elements_by_xpath("//p/strong")
        solar_date = ''.join(node.text for node in solar_nodes)

        # Lunar date: take the first matching paragraph, keep its second
        # space-separated field and drop that field's last two characters.
        lunar_nodes = drvier.find_elements_by_xpath('//p[@style="text-align:center;margin-bottom:0"]')
        lunar_texts = [node.text for node in lunar_nodes]
        lunar_date = lunar_texts[0].split(" ")[1][:-2]

        # Tide description: concatenated text of every span.cnTides under a <p>.
        tide_nodes = drvier.find_elements_by_xpath('//p/span[@class="cnTides"]')
        tide_desc = ''.join(node.text for node in tide_nodes)

        print("公历:", solar_date, " 农历:", lunar_date, " 潮汐情况:", tide_desc)

4.使用动作链ActionChains移动坐标来获取24小时的潮高和时间，这里使用谷歌浏览器插件Page Ruler Redux来获取坐标

    # One {'time': ..., 'high': ...} dict per tooltip that could be parsed.
    # (Previously a single dict named `list` shadowed the builtin and was
    # overwritten on every reading.)
    readings = []
    try:
        # Move the mouse across the chart in 24 steps and read the tooltip
        # after each move.  Offsets are relative to the previous pointer
        # position: the first move goes to the starting anchor, later moves
        # step right.
        for hour in range(1, 25):
            action = ActionChains(drvier)
            if hour == 1:
                action.move_by_offset(373, 650).perform()
            elif hour == 24:
                action.move_by_offset(28, 0).perform()
            else:
                action.move_by_offset(34, 0).perform()
            time.sleep(2)
            for tooltip in drvier.find_elements_by_xpath('//div[@id="tidesLine"]'):
                txt = tooltip.text.replace('\n', '')
                time_match = re.search('时间 : (.*?)潮高', txt)
                high_match = re.search('潮高 : (.+)', txt)
                if time_match is None or high_match is None:
                    # Tooltip text not in the expected form: skip this one
                    # instead of aborting the remaining hours.
                    continue
                record = {'time': time_match.group(1), 'high': high_match.group(1)}
                readings.append(record)
                print(type(txt))
                print(txt)
                print(record)
            time.sleep(2)
    except Exception as e:
        print(e.args)

完整代码：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
from datetime import  datetime
import csv
import os
# Page to scrape.
url = "https://www.chaoxibiao.net/tides/75.html"
def locate_date():
    """Open the date picker at ``date1`` and step day by day up to ``date2``.

    Reads the module-level names ``drvier`` (Selenium driver), ``date1`` and
    ``date2`` (``"%Y-%m-%d"`` strings).  For every visited day the page is
    switched to that day and ``GetCondition`` prints the solar date, lunar
    date and tide description.

    Changes compared with the earlier revision:
      * ``GetCondition`` is defined before ``change_date()`` is invoked; it
        used to be defined after the call, so the first use raised NameError;
      * the start year is clicked even when it is already on the first
        visible page of the year list or is that page's first entry;
      * the end date is compared as a ``date`` so ``'2003-03-01'`` and
        ``'2003-3-1'`` are treated alike;
      * the end-date branch waits before reading the page, like the other
        branch does.

    NOTE: ``find_element_by_xpath`` is the Selenium 3 locator API; it was
    removed in Selenium 4.3, where ``find_element(By.XPATH, ...)`` is required.
    """
    print("date2", date2)
    # Parse both bounds once so every comparison below is date-to-date.
    date1_date = datetime.strptime(date1, "%Y-%m-%d").date()
    date2_date = datetime.strptime(date2, "%Y-%m-%d").date()
    y1 = int(date1.split('-')[0])
    # Open the date picker.
    drvier.find_element_by_xpath('//*[@id="changeDate"]').click()

    # Switch the picker to its year list.
    drvier.find_element_by_xpath('//div[@class="laydate-set-ym"]/span[1]').click()

    # Page backwards through the year list until the start year is visible,
    # then click it.
    while True:
        first_item = drvier.find_element_by_xpath('//div[@class="layui-laydate-content"]/ul/li')
        year_items = [first_item] + first_item.find_elements_by_xpath('./following-sibling::*')
        years = [int(item.get_attribute('lay-ym')) for item in year_items]
        if y1 < years[0]:
            # Start year is earlier than this page: go to the previous page.
            drvier.find_element_by_xpath('//*[@id="layui-laydate1"]/div[1]/div[1]/i[1]').click()
            continue
        if y1 in years:
            year_items[years.index(y1)].click()
        # A start year beyond the last visible entry is left unselected,
        # as before (no forward paging is implemented).
        break
    time.sleep(2)

    # Select the start month.
    m1 = int(date1.split('-')[1])
    drvier.find_element_by_xpath('//div[@class="laydate-set-ym"]/span[2]').click()
    time.sleep(2)
    init_m1 = drvier.find_element_by_xpath('//ul[@class="layui-laydate-list laydate-month-list"]/li[%d]' % m1)
    init_m1.click()

    def GetCondition():
        """Print the solar date, lunar date and tide description of the current page."""
        # Solar date: concatenated text of every <strong> directly under a <p>.
        date_lst = drvier.find_elements_by_xpath("//p/strong")
        date = ''.join([i.text for i in date_lst])

        # Lunar date: first matching paragraph, second space-separated field,
        # minus that field's last two characters.
        lu_date_lst = drvier.find_elements_by_xpath('//p[@style="text-align:center;margin-bottom:0"]')
        lu_date = [i.text for i in lu_date_lst][0].split(" ")[1][:-2]

        # Tide description.
        tide_lst = drvier.find_elements_by_xpath('//p/span[@class="cnTides"]')
        tide = ''.join([i.text for i in tide_lst])
        print("公历:", date, " 农历:", lu_date, " 潮汐情况:", tide)

    def change_date():
        """Walk the 6x7 day grid of the open picker from ``date1`` onwards.

        Days before ``date1`` are skipped.  The end date is clicked, reported
        and stops the walk.  A day whose class is ``''`` or ``'layui-this'``
        is clicked and reported, then the picker is reopened.  The cell for
        day "1" of the next month is clicked and the walk restarts on the new
        month's grid.
        """
        for row in range(1, 7):
            for col in range(1, 8):
                cell = drvier.find_element_by_xpath(
                    '//div[@class="layui-laydate-content"]/table/tbody/tr[%d]/td[%d]' % (row, col))
                ymd = cell.get_attribute('lay-ymd')
                cell_date = datetime.strptime(ymd, "%Y-%m-%d").date()
                print(cell_date, date1_date)
                distance = cell_date - date1_date
                if distance.days < 0:
                    continue
                print(cell.text)
                print(ymd)
                print("distance", distance.days)
                # Reaching the end date finishes the whole walk.
                if cell_date == date2_date:
                    cell.click()
                    time.sleep(2)  # give the page time to refresh before reading it
                    GetCondition()
                    return
                css_class = cell.get_attribute('class')
                if css_class == '' or css_class == 'layui-this':
                    cell.click()
                    time.sleep(2)
                    GetCondition()
                    # Reopen the picker for the next cell.
                    drvier.find_element_by_xpath('//*[@id="changeDate"]').click()
                    time.sleep(2)
                elif css_class == 'laydate-day-next' and cell.text == '1':
                    # First day of the following month: jump there and restart.
                    print("1", ymd)
                    cell.click()
                    time.sleep(2)
                    drvier.find_element_by_xpath('//*[@id="changeDate"]').click()
                    return change_date()

    change_date()

def start():
    """Select the configured dates, then read the hourly tide tooltip 24 times.

    Calls ``locate_date()`` first, then moves the mouse across the chart in
    24 steps and parses the ``#tidesLine`` tooltip text after each move.

    Returns:
        list[dict]: one ``{'time': ..., 'high': ...}`` dict per tooltip that
        matched the expected text (earlier revisions returned ``None`` and
        kept only the last reading in a dict named ``list``).
    """
    # Select the dates.
    locate_date()
    readings = []
    try:
        # Offsets are relative to the previous pointer position: the first
        # move goes to the starting anchor, later moves step right.
        for hour in range(1, 25):
            action = ActionChains(drvier)
            if hour == 1:
                action.move_by_offset(373, 650).perform()
            elif hour == 24:
                action.move_by_offset(28, 0).perform()
            else:
                action.move_by_offset(34, 0).perform()
            time.sleep(2)
            for tooltip in drvier.find_elements_by_xpath('//div[@id="tidesLine"]'):
                txt = tooltip.text.replace('\n', '')
                time_match = re.search('时间 : (.*?)潮高', txt)
                high_match = re.search('潮高 : (.+)', txt)
                if time_match is None or high_match is None:
                    # Tooltip text not in the expected form: skip this one
                    # instead of aborting the remaining hours.
                    continue
                record = {'time': time_match.group(1), 'high': high_match.group(1)}
                readings.append(record)
                print(type(txt))
                print(txt)
                print(record)
            time.sleep(2)
    except Exception as e:
        print(e.args)
    return readings

def save():
    """Placeholder for persisting scraped data; currently does nothing."""
    return None
if __name__ == '__main__':
    # Start and end date of the crawl; read as globals by locate_date().
    date1 = '2003-1-1'
    date2 = '2003-3-1'

    option = Options()
    option.binary_location = r"C:\Users\txmmy\AppData\Local\Google\Chrome\Application\chrome.exe"
    drvier = webdriver.Chrome(options=option)
    try:
        drvier.get(url)
        drvier.maximize_window()
        time.sleep(2)

        # TODO: create the "-潮汐表.csv" output file if it does not exist yet
        # and write the scraped rows to it (see save()).

        start()
    finally:
        # Always shut the browser and driver process down, even when the
        # crawl raises.
        drvier.quit()

标签：xpath,get,selenium,爬虫,爬取,print,drvier,date,find
From： https://www.cnblogs.com/Gimm/p/18116744

【爬虫】项目篇-selenium爬取大鱼潮汐网

相关文章

赞助商

阅读排行