selenium 用作自动化测试工具,并非爬虫工具,用作爬虫性能没那么好。但既然可以读取网页信息,那还是可以用来爬取数据的。用该工具模拟访问,网站会认为是正常的访问行为。
项目创建几个文件,都在同一个目录中:
setting.cfg :配置文件
mssql.py : 数据库
lagou.py : 拉勾网标签读取和相关操作
mydriver.py :driver 相关操作
main.py : 执行文件
selenium 标签读取设置参考本人之前总结的 : Python selenium自动化模拟登录操作(一) ,或者 selenium 元素定位 。 思路还是和 Python scrapy 爬取拉勾网招聘信息的一样。不过对于翻页的操作,使用selenium 来获取 “下一页” 标签来点击触发跳转。每页读取保存数据后再继续点击下一页,直到达到总页数。比较麻烦的是,要判断页面是否加载完成,如果未加载出来,读取标签太快则会报错。(可参考:等待页面加载完成(Waits) 或者Waits)
本脚本缺点:这是一次模拟登录爬取的,如果爬取另一个岗位,换名称又得重新模拟登录了。(当然可以传递一个列表/集合的岗位,登录之后每次设置搜索爬取即可)
【seleniumlagou】数据库表结构:
-- Target table of the scraper: lagou.py inserts one row per job entry it reads
-- from the search result list, using exactly these column names.
-- NOTE(review): all columns are varchar; whether Chinese text is stored
-- correctly depends on the database collation -- confirm, or consider nvarchar.
USE [Myspider]
GO
CREATE TABLE [dbo].[seleniumlagou](
[companyfullname] [varchar](50) NULL,
[positionname] [varchar](50) NULL,
[salary] [varchar](20) NULL,
[workyear] [varchar](20) NULL,
[education] [varchar](20) NULL,
[city] [varchar](20) NULL,
[district] [varchar](20) NULL,
[financestage] [varchar](50) NULL,
[industryfield] [varchar](100) NULL,
[firsttype] [varchar](50) NULL,
[positionlables] [varchar](100) NULL
) ON [PRIMARY]
GO
【setting.cfg】配置文件
# Values are written inside single quotes; the Python loaders remove the
# quotes after reading each option.

# Database connection settings read by mssql.py.
[mssql]
MSSQL_HOST = 'HZC'
MSSQL_USER = 'kk'
MSSQL_PASSWD = 'kk'
MSSQL_DBNAME = 'Myspider'
# Paths read by mydriver.py (browser driver executable and screenshot file).
[driver]
driverPath = 'D:/Python35/selenium/phantomjs/bin/phantomjs.exe'
imgPath = 'E:/mypy/lagou/img.png'
【mssql.py】数据库执行脚本
# -*- coding: utf-8 -*-
# python 3.5
import sys
import pymssql
import configparser
# Add the project directory to the module search path.
sys.path.append(r'E:/mypy/lagou')

# Read the [mssql] section of setting.cfg (path is relative to the current
# working directory). Each value is stripped of surrounding whitespace, of the
# single quotes used in the config file and of any literal "\n" text.
cf = configparser.ConfigParser()
cf.read("setting.cfg")
MSSQL_HOST, MSSQL_USER, MSSQL_PASSWD, MSSQL_DBNAME = (
    cf.get("mssql", option).strip().replace("\'", "").replace(r"\n", "")
    for option in ("MSSQL_HOST", "MSSQL_USER", "MSSQL_PASSWD", "MSSQL_DBNAME")
)
class MSSQL(object):
    """Small wrapper around one pymssql connection and its cursor.

    Connection settings are taken from the module-level MSSQL_* values.
    Errors while connecting or executing are printed rather than raised, so
    callers have to inspect the return values.
    """

    def __init__(self):
        self.host = MSSQL_HOST
        self.user = MSSQL_USER
        self.pwd = MSSQL_PASSWD
        self.db = MSSQL_DBNAME
        self._conn = self.GetConnect()
        # Always define _cur so that later calls report a readable failure
        # instead of an AttributeError when the connection could not be opened.
        self._cur = self._conn.cursor() if self._conn else None

    def GetConnect(self):
        """Open the database connection.

        Returns:
            The pymssql connection, or None when connecting failed (the error
            is printed).
        """
        try:
            return pymssql.connect(
                host=self.host,
                user=self.user,
                password=self.pwd,
                database=self.db,
            )
        except Exception as err:
            print("连接数据库失败, %s" % err)
            return None

    def _execute(self, sql, params=None):
        """Run one statement on the cursor, with optional bound parameters."""
        if self._cur is None:
            raise RuntimeError("no open database connection")
        if params is None:
            self._cur.execute(sql)
        else:
            # Let the driver quote the values instead of formatting them into
            # the SQL text by hand.
            self._cur.execute(sql, params)

    def ExecQuery(self, sql, params=None):
        """Execute a query and fetch all rows.

        Args:
            sql: Statement text, optionally containing driver placeholders.
            params: Optional values for the placeholders.

        Returns:
            The fetched rows, or None when the query failed (the error is
            printed).
        """
        try:
            self._execute(sql, params)
            return self._cur.fetchall()
        except Exception as err:
            print("查询失败, %s" % err)
            return None

    def ExecNonQuery(self, sql, params=None):
        """Execute a statement that returns no rows and commit it.

        Args:
            sql: Statement text, optionally containing driver placeholders.
            params: Optional values for the placeholders.

        Returns:
            True when the statement was committed, False when it failed (the
            transaction is rolled back and the error is printed).
        """
        try:
            self._execute(sql, params)
            self._conn.commit()
            return True
        except Exception as err:
            # Without a connection there is nothing to roll back; guarding
            # here keeps the original error from being masked.
            if self._conn:
                self._conn.rollback()
            print("执行失败, %s" % err)
            return False

    def GetConnectInfo(self):
        """Print the server, user and database this object connects to."""
        print( "连接信息:" )
        print( "服务器:%s , 用户名:%s , 数据库:%s " % (self.host,self.user,self.db))

    def Close(self):
        """Close the cursor and the connection, if a connection is open.

        Raises:
            RuntimeError: when closing the cursor or the connection fails.
        """
        if not self._conn:
            return
        try:
            if self._cur is not None:
                self._cur.close()
            self._conn.close()
        except Exception as err:
            raise RuntimeError(
                "关闭异常, %s,%s" % (type(self._cur), type(self._conn))
            ) from err
        finally:
            # Drop the references so a second Close() is a no-op.
            self._cur = None
            self._conn = None
【mydriver.py】driver 相关操作
# -*- coding: utf-8 -*-
# python 3.5
import sys
import configparser
from selenium import webdriver
from lagou import Lagou
# Add the project directory to the module search path.
sys.path.append(r'E:/mypy/lagou')

# Read the [driver] section of setting.cfg (path is relative to the current
# working directory). Each value is stripped of surrounding whitespace, of the
# single quotes used in the config file and of any literal "\n" text.
cf = configparser.ConfigParser()
cf.read("setting.cfg")
driverPath, imgPath = (
    cf.get("driver", option).strip().replace("\'", "").replace(r"\n", "")
    for option in ("driverPath", "imgPath")
)
class MyDriver(object):
    """Owns the selenium browser and forwards page actions to a Lagou object.

    The screenshot path and driver path start from the module-level values
    read from setting.cfg and can be changed through the setters.
    """

    def __init__(self):
        self.imgPath = imgPath
        self.driverPath = driverPath
        # NOTE: the browser is started from this fixed chromedriver location;
        # self.driverPath is stored but not used to launch it.
        # Alternative headless browser: webdriver.PhantomJS()
        self.driver = webdriver.Chrome("D:/Python35/selenium/webdriver/chromedriver/chromedriver.exe")
        self.myweb = Lagou(self.driver)

    def setUp(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url)

    # ---- attributes of this class ----
    def setImgPath(self, imgPath):
        """Set the file path used by saveScreenshot()."""
        self.imgPath = imgPath

    def setDriverPath(self, driverPath):
        """Store a driver executable path."""
        self.driverPath = driverPath

    def getImgPath(self):
        """Return the current screenshot file path."""
        return self.imgPath

    def getDriverPath(self):
        """Return the stored driver executable path."""
        return self.driverPath

    def getDriver(self):
        """Return the underlying selenium driver."""
        return self.driver

    # ---- driver operations ----
    def setOptions(self):
        """Maximize the browser window."""
        self.driver.maximize_window()
        # Alternative: self.driver.set_window_size(width, height)

    def saveScreenshot(self):
        """Save a screenshot of the current page to the configured path."""
        # Use the instance attribute so that setImgPath() takes effect; the
        # module-level default is only the initial value.
        self.driver.get_screenshot_as_file(self.imgPath)

    def quitDriver(self):
        """Quit the browser."""
        self.driver.quit()

    # ---- login ----
    def setUserPwd(self, username, password):
        """Type the user name and password into the login form."""
        self.myweb.setUsername(username)
        self.myweb.setPassword(password)

    def doSubmit(self):
        """Submit the login form."""
        self.myweb.doSubmit()

    def getLoginErrMsg(self):
        """Return the login error message shown on the page."""
        return self.myweb.getLoginErrMsg()

    # ---- lagou search and scraping ----
    def doFirstSearch(self, keyword):
        """Type *keyword* into the home page search box and click search."""
        self.myweb.firstSearch(keyword)
        self.myweb.firstSearchClick()

    def doDetailSearch(self, keyword, city, workyear, education, financestage, industryfield, monthsalary):
        """Open the detailed search result list for the given filters."""
        self.myweb.detailSearch(keyword, city, workyear, education, financestage, industryfield, monthsalary)

    def saveDate(self):
        """Save the data of every result page."""
        self.myweb.saveDate()
【lagou.py 】 拉勾网标签读取和相关操作
# -*- coding: utf-8 -*-
# python 3.5
import time
from mssql import MSSQL
class Lagou(object):
    """Page operations for lagou.com: login form, search and result scraping.

    Every job entry read from the result list is written to the
    ``seleniumlagou`` table through an MSSQL object.
    """

    # Upper bound of job entries read from one result page (indexes 0..14).
    PAGE_SIZE = 15

    def __init__(self, driver):
        self.mssql = MSSQL()
        self.driver = driver

    # ---- login ----
    def setUsername(self, username):
        """Type *username* into the login name field."""
        return self.driver.find_element_by_xpath("//input[@placeholder='请输入常用手机号/邮箱']").send_keys(username)

    def setPassword(self, password):
        """Type *password* into the password field."""
        return self.driver.find_element_by_xpath("//input[@placeholder='请输入密码']").send_keys(password)

    def doSubmit(self):
        """Click the submit button of the active login form."""
        return self.driver.find_element_by_xpath("//form[@class='active']/div[5]/input[@type='submit']").click()

    def getLoginErrMsg(self):
        """Return the stripped text of the login error hint."""
        return self.driver.find_element_by_class_name('input_tips').text.strip()

    # ---- home page search (few filters) ----
    def firstSearch(self, keyword):
        """Type *keyword* into the home page search box."""
        return self.driver.find_element_by_id('search_input').send_keys(keyword)

    def firstSearchClick(self):
        """Click the home page search button."""
        return self.driver.find_element_by_id('search_button').click()

    # ---- detailed search ----
    def detailSearch(self, keyword, city, workyear, education, financestage, industryfield, monthsalary):
        """Open the job list for *keyword* with the given optional filters.

        Selecting the filter tags on the page is cumbersome, so the list URL
        is assembled directly. A filter that is empty (or only whitespace) is
        left out of the URL.
        """
        filters = (
            ("gj", workyear),
            ("xl", education),
            ("jd", financestage),
            ("hy", industryfield),
            ("yx", monthsalary),
            ("city", city),
        )
        url = "https://www.lagou.com/jobs/list_%s?px=default" % keyword.strip()
        for name, value in filters:
            value = (value or "").strip()
            if value:
                url += "&%s=%s" % (name, value)
        self.driver.get(url)

    def getTaltalPage(self):
        """Return the total page count shown by the pager (0 when empty)."""
        num = self.driver.find_element_by_xpath("//div[@class='page-number']/span[2]").text.strip()
        if len(num) == 0:
            num = 0
        print("总页数:%s " % num)
        return int(num)

    def NextPage(self):
        """Click the "next page" control."""
        self.driver.find_element_by_xpath("//span[@class='pager_next ']").click()

    def saveDate(self, pageWait=3):
        """Save the entries of every result page.

        Args:
            pageWait: Seconds to sleep before reading each page, giving it
                time to load.
        """
        taltalpage = self.getTaltalPage()
        for currentpage in range(1, taltalpage + 1):
            time.sleep(pageWait)  # wait for the page to load
            print(">> 第 %s 页数据处理中…………………………………………" % currentpage)
            print(self.driver.current_url)
            self.saveOnePageDate()
            # There is nothing to navigate to after the last page.
            if currentpage < taltalpage:
                self.NextPage()

    def saveOnePageDate(self):
        """Save every job entry present on the current page."""
        for index in range(self.PAGE_SIZE):
            xpath = "//li[@data-index='%s']" % index
            # A page (typically the last one) may hold fewer entries than
            # PAGE_SIZE; stop at the first missing index instead of failing.
            if not self.driver.find_elements_by_xpath(xpath):
                break
            print(">> 第 %s 条" % index)
            self.saveliDate(xpath)

    @staticmethod
    def _part(parts, position):
        """Return parts[position] stripped, or "" when it does not exist."""
        if position < len(parts):
            return parts[position].strip()
        return ""

    @staticmethod
    def _sqlQuote(value):
        """Return *value* as a SQL string literal with embedded quotes doubled."""
        return "'%s'" % str(value).replace("'", "''")

    def _text(self, xpath):
        """Return the stripped text of the element located by *xpath*."""
        return self.driver.find_element_by_xpath(xpath).text.strip()

    def saveliDate(self, xpath):
        """Read one job entry (the ``li`` at *xpath*) and insert it as a row."""
        positi = self._text(xpath + "/div[1]/div[1]/div[1]/a/h3")
        citydist = self._text(xpath + "/div[1]/div[1]/div[1]/a/span/em")
        salary = self._text(xpath + "/div[1]/div[1]/div[2]/div/span")
        wy_edu = self._text(xpath + "/div[1]/div[1]/div[2]/div")
        company = self._text(xpath + "/div[1]/div[2]/div[1]/a")
        fina_ind = self._text(xpath + "/div[1]/div[2]/div[2]")
        firsttype = self._text(xpath + "/div[2]/div[1]")
        lables = self._text(xpath + "/div[2]/div[2]")

        # After turning spaces into "/" the text is split on "/"; the second
        # piece is used as work experience and the fifth as education.
        wy_parts = wy_edu.replace(" ", "/").split('/')
        workyear = self._part(wy_parts, 1)
        education = self._part(wy_parts, 4)

        # "city·district"; when there is no district the city is used for both.
        place = citydist.split('·')
        city = place[0].strip()
        district = (place[1] if len(place) > 1 else place[0]).strip()

        fina_parts = fina_ind.split('/')
        industryfield = self._part(fina_parts, 0)
        financestage = self._part(fina_parts, 1)

        firsttype = firsttype.replace(" ", ",").strip()
        positionlables = lables.replace("“", "").replace("”", "").strip()

        values = (company, positi, salary, workyear, education, city,
                  district, industryfield, financestage, firsttype, positionlables)
        # The statement is sent as plain text, so quotes inside scraped values
        # are escaped to keep them from terminating the SQL literal.
        sql = (
            "INSERT INTO seleniumlagou( companyfullname , positionname, salary, workyear, "
            "education,city,district, industryfield, financestage, firsttype, positionlables) "
            "VALUES (%s)" % ", ".join(self._sqlQuote(value) for value in values)
        )
        self.mssql.ExecNonQuery(sql)
【main.py 】 执行文件
# -*- coding: utf-8 -*-
# python 3.5
import time
import unittest
from mydriver import MyDriver
class Main(unittest.TestCase):
    """Log in to lagou.com, run a search and store every result page.

    The whole flow lives in crawl(), so importing this module has no side
    effects; running the file as a script starts the crawl.
    """

    username = "kk"
    password = "kk"
    loginUrl = 'https://passport.lagou.com/login/login.html'
    # Seconds to wait for the browser to leave the login page.
    loginTimeout = 60

    @staticmethod
    def _waitForRedirect(driver, startUrl, timeout, interval=1):
        """Poll until the browser has left *startUrl*.

        A URL different from *startUrl* is taken as the sign that the login
        redirect happened.

        Returns:
            True once the URL changed, False when *timeout* seconds passed
            without a change.
        """
        deadline = time.monotonic() + timeout
        while True:
            current = driver.current_url
            if current != startUrl:
                return True
            if time.monotonic() >= deadline:
                return False
            print("[-] " + current)
            print("loading……")
            time.sleep(interval)

    def crawl(self):
        """Run the login, search and save steps, then quit the browser.

        Raises:
            RuntimeError: when the login page is not left within loginTimeout
                seconds.
        """
        mydriver = MyDriver()
        driver = mydriver.getDriver()
        try:
            # Log in.
            mydriver.setUp(self.loginUrl)
            mydriver.setUserPwd(self.username, self.password)
            mydriver.doSubmit()
            print("[1] " + driver.current_url)
            if not self._waitForRedirect(driver, self.loginUrl, self.loginTimeout):
                raise RuntimeError(
                    "login page did not redirect within %s seconds" % self.loginTimeout
                )
            print("[2] " + driver.current_url)

            # The home page offers few filters; searching for any keyword
            # there leads to the detailed search list.
            mydriver.doFirstSearch("hzc")
            print("[3] " + driver.current_url)

            # Detailed search arguments: (position, city, work experience,
            # education, financing stage, industry, monthly salary), e.g.
            # mydriver.doDetailSearch("dba","深圳","3-5年","本科","未融资","移动互联网","15k-25k")
            mydriver.doDetailSearch("DBA", "", "", "", "", "", "")
            print("[4] " + driver.current_url)
            mydriver.saveDate()
            print("done!")
        finally:
            # Always release the browser, also when a step failed.
            mydriver.quitDriver()


if __name__ == "__main__":
    Main().crawl()