爬虫Spider
该爬虫分为两部分,分别为Spider.py和models.py
Spider.py
该文件的主要业务逻辑是调用Selenium,以自动化测试的方式模拟人的操作来请求网页,并将请求到的HTML提取为文本,为后续的数据抽取做铺垫。需要的第三方工具包:Selenium、scrapy;另外还用到了Python标准库time。
spider.py具体实现
"""Crawl the ZOL motherboard listing with Selenium and store product specs.

``Get_Url`` collects the detail-page links from the listing page and
``Get_Info`` visits each collected link, extracts the spec fields with XPath
and saves one ``Info`` row per product.
"""
import time  # sleep helper
from urllib.parse import urljoin  # resolve scraped hrefs against the listing URL

from selenium import webdriver  # required to drive chromedriver
from scrapy import Selector  # XPath extraction over the rendered HTML
from Interests.models import *  # models.py lives in the Interests package
from selenium.common.exceptions import *

domin = "https://detail.zol.com.cn/motherboard"
# Raw string: identical path value, but without the invalid "\M" / "\c"
# escape sequences that a plain string literal triggers warnings for.
# NOTE(review): newer Selenium releases replace ``executable_path`` with a
# Service object -- confirm against the installed Selenium version.
broswer = webdriver.Chrome(executable_path=r"C:\MyApplications\chromedriver.exe")
url_list = []  # product detail-page URLs, filled by Get_Url()

# Info field name -> XPath of the text node holding its value on a detail page.
_FIELD_XPATHS = {
    "name": "//h1/text()",
    "core": "//ul/li[1]/p[1]/text()",
    "radio_core": "//ul/li[1]/p[2]/text()",
    "ram_type": "//ul/li[2]/p[1]/text()",
    "max_ram": "//ul/li[2]/p[2]/text()",
    "appear_size": "//ul/li[3]/p[1]/text()",
    "broad_type": "//ul/li[3]/p[2]/text()",
    "energy_socket": "//ul/li[4]/p[1]/text()",
    "charge_mode": "//ul/li[4]/p[2]/text()",
}


def Get_Url():
    """Load the listing page and append every product link to ``url_list``.

    Each href is resolved against ``domin`` with ``urljoin``, so root-relative
    links yield the same URL as plain concatenation with the site root did,
    while absolute links are kept intact instead of being double-prefixed.
    """
    broswer.get(domin)
    sel = Selector(text=broswer.page_source)
    for href in sel.xpath("//ul[@class='clearfix']/li/a/@href").extract():
        product_url = urljoin(domin, href)
        print(product_url)
        url_list.append(product_url)


def Get_Info():
    """Visit every URL in ``url_list`` and save one ``Info`` row per page.

    A field whose XPath matches nothing is stored as an empty string rather
    than raising ``IndexError`` and aborting the crawl. Pages without an
    ``<h1>`` title are skipped, since no product name can be stored for them.
    """
    for product_url in url_list:
        broswer.get(product_url)
        detail = Selector(text=broswer.page_source)  # rendered detail page
        values = {
            field: detail.xpath(xpath).extract_first(default="")
            for field, xpath in _FIELD_XPATHS.items()
        }
        if not values["name"]:
            print("skipped (no product title found): " + product_url)
            continue
        info = Info(**values)
        # force_insert: always INSERT a new row instead of attempting an UPDATE
        info.save(force_insert=True)


if __name__ == "__main__":
    # The two steps are ordinary functions that run sequentially; they return
    # None, so there is nothing to ``.start()`` afterwards.
    try:
        Get_Url()
        Get_Info()
    finally:
        # Always shut the browser down so no chromedriver process is leaked.
        broswer.quit()
models.py
models.py的主要功能是连接数据库,数据库中的表不必提前建立好。通过peewee第三方工具包创建一个指定数据库的基础模型类,模型字段的数据类型可以为CharField、TextField、DateField等。
models具体代码实现
"""peewee models for the crawler: database handle, base model and ``Info``.

Running this file as a script creates the ``Info`` table.
"""
import os

from peewee import *

# Connection settings default to the original hard-coded values; each one can
# be overridden through an environment variable so that real credentials do
# not have to be committed to the source file.
db = MySQLDatabase(
    os.environ.get("DEMO_DB_NAME", "demo"),
    host=os.environ.get("DEMO_DB_HOST", "127.0.0.1"),
    port=int(os.environ.get("DEMO_DB_PORT", "3306")),
    user=os.environ.get("DEMO_DB_USER", "root"),
    password=os.environ.get("DEMO_DB_PASSWORD", "123456"),
)


class BaseModel(Model):
    """Base model bound to ``db`` so subclasses need not name the database."""

    class Meta:
        database = db


class Info(BaseModel):
    """One scraped product record; every field is stored as text."""

    name = CharField()
    core = CharField()
    radio_core = TextField()
    ram_type = TextField()
    max_ram = CharField()
    appear_size = TextField()
    broad_type = TextField()
    energy_socket = TextField()
    charge_mode = TextField()


if __name__ == "__main__":
    # Open the connection explicitly and always close it again once the
    # table has been created.
    db.connect()
    try:
        db.create_tables([Info])
    finally:
        db.close()
标签:info,xpath,text,detail,Spider,li,爬虫,extract From: https://www.cnblogs.com/liam-sliversucks/p/17315883.html