import random
import time
import feapder
from feapder.utils.webdriver import WebDriver
from parsel import Selector
from feapder.db.mysqldb import MysqlDB
from selenium.webdriver.common.by import By
class TestRender(feapder.AirSpider):
    """Browser-rendered spider: crawls a video list site page by page,
    extracting cover-image URLs and following the "next page" link."""

    # Database handle; MysqlDB() reads connection settings from feapder's config.
    db = MysqlDB()

    __custom_setting__ = dict(
        WEBDRIVER=dict(
            pool_size=1,  # number of browser instances in the pool
            load_images=True,  # whether the browser loads images
            user_agent=None,  # string, or zero-arg callable returning a user agent
            proxy=None,  # "xxx.xxx.xxx.xxx:xxxx", or zero-arg callable returning a proxy
            headless=False,  # run the browser headless or not
            driver_type="CHROME",  # CHROME, EDGE, PHANTOMJS, FIREFOX
            timeout=30,  # request timeout in seconds
            window_size=(1024, 800),  # browser window size
            executable_path=None,  # driver binary path; None uses the default
            render_time=0,  # seconds to wait after load before reading page source
            custom_argument=["--ignore-certificate-errors"],  # extra browser launch args
            xhr_url_regexes=[
                "/ad",
            ],  # intercept the http://www.spidertools.cn/spidertools/ad endpoint
        )
    )

    def start_requests(self):
        """Yield the seed request; render=True routes it through the browser pool."""
        yield feapder.Request("https://18je.life/t/1/", render=True)

    def parse(self, request, response):
        """Extract cover-image URLs from the list page, then follow pagination."""
        print(request.url)
        browser: WebDriver = response.browser
        # Random pause so request timing looks less bot-like.
        time.sleep(random.randint(3, 6))
        lis_ = Selector(browser.page_source).xpath('//ul[@class="list"]/li')
        item_list = []
        for li in lis_:
            item = {}
            item["cover"] = li.xpath('.//div[@class="vodlist_img"]/img/@data-original').get('').strip()
            item_list.append(item)
        print(item_list)
        # TODO: persist item_list via self.db (DB write was never implemented).
        time.sleep(round(random.uniform(0.5, 1.5), 1))
        # BUGFIX: extract_first() returns None when no pagination link exists;
        # concatenating None to a str would raise TypeError. Guard first.
        href = Selector(browser.page_source).xpath(
            '//ul[@class="pagelist"]/li[last()]/a/@href'
        ).extract_first()
        if not href:
            return
        next_page = 'https://18je.life' + href
        # On the last page the "next" link points back at the current page; stop.
        if next_page == response.url:
            return
        yield feapder.Request(url=next_page, callback=self.parse, render=True)
# Run the spider only when executed as a script, not on import.
if __name__ == "__main__":
    spider = TestRender()
    spider.start()