首页 > 编程语言 >Python selenium 拉钩爬虫

Python selenium 拉钩爬虫

时间:2023-01-27 14:05:05浏览次数:57  
标签:Python selenium self driver 拉钩 strip print div def


selenium 用作自动化测试工具,并非爬虫工具,用作爬虫性能没那么好。但既然可以读取网页信息,那还是可以用来爬取数据的。用该工具模拟访问,网站会认为是正常的访问行为。


项目创建几个文件,都在同一个目录中:

setting.cfg :配置文件

mssql.py : 数据库

lagou.py : 拉勾网标签读取和相关操作

mydriver.py :driver 相关操作

main.py : 执行文件


selenium 标签读取设置可参考本人之前总结的《Python selenium自动化模拟登录操作(一)》或者《selenium 元素定位》。思路和《Python scrapy 爬取拉勾网招聘信息》一样。不过对于翻页操作,这里使用 selenium 获取“下一页”标签并点击来触发跳转。每页读取并保存数据后再继续点击下一页,直到达到总页数。比较麻烦的是要判断页面是否加载完成:如果页面还未加载出来就读取标签,会报错。(可参考:《等待页面加载完成(Waits)》或者《Waits》)


本脚本缺点:每次运行只模拟登录一次并爬取一个岗位,如果要爬取另一个岗位,换名称后又得重新模拟登录。(当然也可以传入一个岗位列表/集合,登录之后依次设置搜索条件爬取即可)


【seleniumlagou】数据库表结构:

-- Target table for the rows inserted by Lagou.saveliDate (one row per job
-- list item). Every column is a nullable varchar; the Python code does not
-- truncate values to the declared widths before inserting them.
USE [Myspider]
GO
CREATE TABLE [dbo].[seleniumlagou](
[companyfullname] [varchar](50) NULL, -- text of the company link
[positionname] [varchar](50) NULL, -- text of the job title heading
[salary] [varchar](20) NULL, -- text of the salary span
[workyear] [varchar](20) NULL, -- experience part of the requirement text
[education] [varchar](20) NULL, -- education part of the requirement text
[city] [varchar](20) NULL, -- location text before the first '·'
[district] [varchar](20) NULL, -- location text after the first '·' (the city itself when there is no '·')
[financestage] [varchar](50) NULL, -- company info text after the '/'
[industryfield] [varchar](100) NULL, -- company info text before the '/'
[firsttype] [varchar](50) NULL, -- first tag row, spaces replaced by commas
[positionlables] [varchar](100) NULL -- second tag row with the curly quotes removed
) ON [PRIMARY]
GO


【setting.cfg】配置文件

# Connection settings read by mssql.py.
# Values are written in single quotes here; the loading code strips
# surrounding whitespace and removes the quote characters.
[mssql]
MSSQL_HOST = 'HZC'
MSSQL_USER = 'kk'
MSSQL_PASSWD = 'kk'
MSSQL_DBNAME = 'Myspider'

# Browser settings read by mydriver.py (same quoting rule as above).
[driver]
# Stored on MyDriver as driverPath; note that MyDriver starts Chrome from a
# chromedriver path hard-coded in mydriver.py, not from this value.
driverPath = 'D:/Python35/selenium/phantomjs/bin/phantomjs.exe'
# File that MyDriver.saveScreenshot() writes the screenshot to.
imgPath = 'E:/mypy/lagou/img.png'


【mssql.py】数据库执行脚本

# -*- coding: utf-8 -*-
# python 3.5

import sys
import pymssql
import configparser

# Kept from the original script: extends the import search path with the
# project directory. It has no influence on where setting.cfg is looked up.
sys.path.append(r'E:/mypy/lagou')


def _load_settings(filename="setting.cfg"):
    """Return a ConfigParser loaded from *filename*.

    The file is looked up next to this module first and then in the current
    working directory, so the script no longer has to be started from the
    project directory. When both exist, the working-directory copy is read
    last and therefore wins, which matches the previous behavior.

    Raises:
        IOError: if the file is found in neither location. Previously a
            missing file was silently ignored and only surfaced later as a
            NoSectionError.
    """
    import os  # local import: only this loader needs it

    module_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = [os.path.join(module_dir, filename), filename]
    parser = configparser.ConfigParser()
    if not parser.read(candidates):
        raise IOError("%s not found, looked in: %s" % (filename, ", ".join(candidates)))
    return parser


def _setting(parser, section, option):
    """Return one option with whitespace, single quotes and literal ``\\n`` removed."""
    return parser.get(section, option).strip().replace("\'", "").replace(r"\n", "")


cf = _load_settings()

MSSQL_HOST = _setting(cf, "mssql", "MSSQL_HOST")
MSSQL_USER = _setting(cf, "mssql", "MSSQL_USER")
MSSQL_PASSWD = _setting(cf, "mssql", "MSSQL_PASSWD")
MSSQL_DBNAME = _setting(cf, "mssql", "MSSQL_DBNAME")

class MSSQL(object):
    """Small helper around one pymssql connection and its cursor.

    Connection settings come from the module-level ``MSSQL_*`` values.
    Failures are reported with ``print`` and signalled through the return
    value rather than raised; only :meth:`Close` raises.
    """

    def __init__(self):
        self.host = MSSQL_HOST
        self.user = MSSQL_USER
        self.pwd = MSSQL_PASSWD
        self.db = MSSQL_DBNAME

        # Define the cursor up front so the other methods can test for a
        # missing connection instead of hitting an undefined attribute.
        self._cur = None
        self._conn = self.GetConnect()
        if self._conn:
            self._cur = self._conn.cursor()

    def GetConnect(self):
        """Open the connection.

        Returns:
            The pymssql connection, or None after printing the error when
            the connection cannot be established.
        """
        try:
            return pymssql.connect(
                host=self.host,
                user=self.user,
                password=self.pwd,
                database=self.db,
            )
        except Exception as err:
            print("连接数据库失败, %s" % err)
            return None

    def _execute(self, sql, params):
        """Run *sql* on the cursor, binding *params* only when given."""
        # Without params the statement is passed through untouched, so a
        # literal '%' in pre-built SQL is not treated as a placeholder.
        if params is None:
            self._cur.execute(sql)
        else:
            self._cur.execute(sql, params)

    def ExecQuery(self, sql, params=None):
        """Run a SELECT-style statement.

        Args:
            sql: statement text; may contain driver placeholders.
            params: optional values bound to the placeholders by the driver.

        Returns:
            The fetched rows, or an empty list after printing the error
            when there is no connection or the query fails.
        """
        if self._cur is None:
            print("查询失败, %s" % "no database connection")
            return []
        try:
            self._execute(sql, params)
            return self._cur.fetchall()
        except Exception as err:
            print("查询失败, %s" % err)
            return []

    def ExecNonQuery(self, sql, params=None):
        """Run an INSERT/UPDATE/DELETE-style statement and commit it.

        Args:
            sql: statement text; may contain driver placeholders.
            params: optional values bound to the placeholders by the driver.

        Returns:
            True when the statement was executed and committed, False
            (after printing the error and rolling back) otherwise.
        """
        if self._cur is None:
            print("执行失败, %s" % "no database connection")
            return False
        try:
            self._execute(sql, params)
            self._conn.commit()
        except Exception as err:
            print("执行失败, %s" % err)
            # A failing rollback must not hide the original error or crash
            # the caller, which only expects a boolean.
            try:
                self._conn.rollback()
            except Exception as rollback_err:
                print("执行失败, %s" % rollback_err)
            return False
        return True

    def GetConnectInfo(self):
        """Print host, user and database name of this helper."""
        print("连接信息:")
        print("服务器:%s , 用户名:%s , 数据库:%s " % (self.host, self.user, self.db))

    def Close(self):
        """Close the cursor and the connection; safe to call repeatedly.

        Raises:
            RuntimeError: if closing the cursor or the connection fails.
        """
        cur, conn = self._cur, self._conn
        # Drop the references first so a second call is a no-op.
        self._cur = None
        self._conn = None
        if not conn:
            return
        try:
            try:
                if cur is not None:
                    cur.close()
            finally:
                # Close the connection even when closing the cursor failed.
                conn.close()
        except Exception as err:
            raise RuntimeError("关闭异常, %s,%s" % (type(cur), type(conn))) from err


【mydriver.py】driver 相关操作

# -*- coding: utf-8 -*-
# python 3.5

import sys
import configparser
from selenium import webdriver
from lagou import Lagou

# Kept from the original script: extends the import search path with the
# project directory. It has no influence on where setting.cfg is looked up.
sys.path.append(r'E:/mypy/lagou')


def _load_settings(filename="setting.cfg"):
    """Return a ConfigParser loaded from *filename*.

    The file is looked up next to this module first and then in the current
    working directory, so the script no longer has to be started from the
    project directory. When both exist, the working-directory copy is read
    last and therefore wins, which matches the previous behavior.

    Raises:
        IOError: if the file is found in neither location. Previously a
            missing file was silently ignored and only surfaced later as a
            NoSectionError.
    """
    import os  # local import: only this loader needs it

    module_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = [os.path.join(module_dir, filename), filename]
    parser = configparser.ConfigParser()
    if not parser.read(candidates):
        raise IOError("%s not found, looked in: %s" % (filename, ", ".join(candidates)))
    return parser


def _setting(parser, section, option):
    """Return one option with whitespace, single quotes and literal ``\\n`` removed."""
    return parser.get(section, option).strip().replace("\'", "").replace(r"\n", "")


cf = _load_settings()

driverPath = _setting(cf, "driver", "driverPath")
imgPath = _setting(cf, "driver", "imgPath")

class MyDriver(object):
    """Owns the Selenium browser and forwards page actions to a Lagou helper."""

    def __init__(self):
        self.imgPath = imgPath
        self.driverPath = driverPath
        # NOTE: Chrome is started from this fixed chromedriver location; the
        # configured driverPath is stored on the instance but is not used to
        # launch the browser.
        self.driver = webdriver.Chrome("D:/Python35/selenium/webdriver/chromedriver/chromedriver.exe")
        self.myweb = Lagou(self.driver)

    def setUp(self, url):
        """Navigate the browser to *url*."""
        self.driver.get(url)

    # --- accessors for this object's own settings ---
    def setImgPath(self, imgPath):
        """Set the file path used by saveScreenshot()."""
        self.imgPath = imgPath

    def setDriverPath(self, driverPath):
        """Set the stored driver path (informational only, see __init__)."""
        self.driverPath = driverPath

    def getImgPath(self):
        """Return the screenshot file path."""
        return self.imgPath

    def getDriverPath(self):
        """Return the stored driver path."""
        return self.driverPath

    def getDriver(self):
        """Return the underlying Selenium driver."""
        return self.driver

    # --- browser operations ---
    def setOptions(self):
        """Maximize the browser window."""
        self.driver.maximize_window()
        # alternative: self.driver.set_window_size(width, height)

    def saveScreenshot(self):
        """Save a screenshot of the current page to this object's imgPath."""
        # Use the instance attribute so a path changed through setImgPath()
        # is honored; the module-level default was used here before.
        self.driver.get_screenshot_as_file(self.imgPath)

    def quitDriver(self):
        """Quit the browser."""
        self.driver.quit()

    # --- login form ---
    def setUserPwd(self, username, password):
        """Type the credentials into the login form."""
        self.myweb.setUsername(username)
        self.myweb.setPassword(password)

    def doSubmit(self):
        """Submit the login form."""
        self.myweb.doSubmit()

    def getLoginErrMsg(self):
        """Return the login error text reported by the page helper."""
        return self.myweb.getLoginErrMsg()

    # --- lagou search and scraping ---
    def doFirstSearch(self, keyword):
        """Type *keyword* into the home-page search box and click search."""
        self.myweb.firstSearch(keyword)
        self.myweb.firstSearchClick()

    def doDetailSearch(self, keyword, city, workyear, education, financestage, industryfield, monthsalary):
        """Open the detailed result list for the given filters."""
        self.myweb.detailSearch(keyword, city, workyear, education, financestage, industryfield, monthsalary)

    def saveDate(self):
        """Store every result page through the page helper."""
        self.myweb.saveDate()


【lagou.py 】 拉勾网标签读取和相关操作

# -*- coding: utf-8 -*-
# python 3.5

import time
from mssql import MSSQL

class Lagou(object):
    """Page helper for lagou.com: login form, search and job-list scraping.

    Each scraped list item is inserted into the ``seleniumlagou`` table
    through the MSSQL helper created in the constructor.
    """

    # XPath of one job <li>, selected by its data-index attribute.
    _ITEM_XPATH = "//li[@data-index='%s']"

    def __init__(self, driver):
        self.mssql = MSSQL()
        self.driver = driver

    # --- login form ---
    def setUsername(self, username):
        """Type *username* into the phone/e-mail input."""
        return self.driver.find_element_by_xpath("//input[@placeholder='请输入常用手机号/邮箱']").send_keys(username)

    def setPassword(self, password):
        """Type *password* into the password input."""
        return self.driver.find_element_by_xpath("//input[@placeholder='请输入密码']").send_keys(password)

    def doSubmit(self):
        """Click the submit button of the active login form."""
        return self.driver.find_element_by_xpath("//form[@class='active']/div[5]/input[@type='submit']").click()

    def getLoginErrMsg(self):
        """Return the stripped text of the login hint element."""
        return self.driver.find_element_by_class_name('input_tips').text.strip()

    # --- home-page search (few filters) ---
    def firstSearch(self, keyword):
        """Type *keyword* into the home-page search box."""
        return self.driver.find_element_by_id('search_input').send_keys(keyword)

    def firstSearchClick(self):
        """Click the home-page search button."""
        return self.driver.find_element_by_id('search_button').click()

    # --- detailed search ---
    def detailSearch(self, keyword, city, workyear, education, financestage, industryfield, monthsalary):
        """Open the result list for *keyword* with the given filters.

        Picking the filter tags on the page is awkward, so the list URL is
        assembled directly. Empty, whitespace-only or None filters are left
        out. All values are percent-encoded so characters such as '#', '&'
        or spaces cannot break the URL.
        """
        from urllib.parse import quote  # local import: only needed here

        # (query parameter, value) in the order the URL has always used.
        filters = (
            ("gj", workyear),
            ("xl", education),
            ("jd", financestage),
            ("hy", industryfield),
            ("yx", monthsalary),
            ("city", city),
        )
        query = ""
        for name, value in filters:
            value = (value or "").strip()
            if value:
                query += "&%s=%s" % (name, quote(value, safe=""))

        url = "https://www.lagou.com/jobs/list_%s?px=default%s" % (quote(keyword.strip(), safe=""), query)
        self.driver.get(url)

    def getTaltalPage(self):
        """Return the total page count shown by the pager (0 when not numeric)."""
        text = self.driver.find_element_by_xpath("//div[@class='page-number']/span[2]").text.strip()
        total = int(text) if text.isdigit() else 0
        print("总页数:%s " % total)
        return total

    def NextPage(self):
        """Click the pager's "next" control."""
        self.driver.find_element_by_xpath("//span[@class='pager_next ']").click()

    def saveDate(self, wait=3):
        """Store every result page, moving through the pager.

        Args:
            wait: seconds to sleep before reading each page, giving the
                list time to load (default 3, the previously fixed value).
        """
        total = self.getTaltalPage()
        for page in range(1, total + 1):
            time.sleep(wait)
            print(">> 第 %s 页数据处理中…………………………………………" % page)
            print(self.driver.current_url)
            self.saveOnePageDate()
            # There is nothing to click after the last page.
            if page < total:
                self.NextPage()

    def saveOnePageDate(self):
        """Store every job item present on the current page."""
        # Collect the indices first; the page may hold fewer than a full
        # set of items (e.g. the last page), so nothing is assumed about
        # the count.
        indices = []
        for item in self.driver.find_elements_by_xpath("//li[@data-index]"):
            index = item.get_attribute("data-index")
            if index not in indices:
                indices.append(index)

        for index in indices:
            print(">> 第 %s 条" % index)
            self.saveliDate(self._ITEM_XPATH % index)

    @staticmethod
    def _parseItem(positi, citydist, salary, wy_edu, company, fina_ind, firsttype, lables):
        """Turn the raw texts of one list item into the row to insert.

        Returns a tuple in table-column order: companyfullname,
        positionname, salary, workyear, education, city, district,
        industryfield, financestage, firsttype, positionlables. Parts that
        are missing from a text become empty strings instead of raising.
        """
        # The requirement text starts with the salary, followed by
        # "<experience> / <education>".
        requirement = wy_edu[len(salary):] if wy_edu.startswith(salary) else wy_edu
        workyear, _, education = requirement.partition("/")

        # "<city>·<district>"; without a district the city is used for both.
        location = citydist.split("·")
        city = location[0].strip()
        district = location[1].strip() if len(location) > 1 else city

        # "<industry> / <finance stage>"
        company_info = fina_ind.split("/")
        industryfield = company_info[0].strip()
        financestage = company_info[1].strip() if len(company_info) > 1 else ""

        return (
            company,
            positi,
            salary,
            workyear.strip(),
            education.strip(),
            city,
            district,
            industryfield,
            financestage,
            firsttype.replace(" ", ",").strip(),
            lables.replace("“", "").replace("”", "").strip(),
        )

    def saveliDate(self, xpath):
        """Read the job item located by *xpath* and insert it into the table."""
        def text(relative):
            return self.driver.find_element_by_xpath(xpath + relative).text.strip()

        row = self._parseItem(
            positi=text("/div[1]/div[1]/div[1]/a/h3"),
            citydist=text("/div[1]/div[1]/div[1]/a/span/em"),
            salary=text("/div[1]/div[1]/div[2]/div/span"),
            wy_edu=text("/div[1]/div[1]/div[2]/div"),
            company=text("/div[1]/div[2]/div[1]/a"),
            fina_ind=text("/div[1]/div[2]/div[2]"),
            firsttype=text("/div[2]/div[1]"),
            lables=text("/div[2]/div[2]"),
        )

        # Scraped text is untrusted: double embedded single quotes so a
        # value such as O'Reilly cannot break (or inject into) the statement.
        values = ", ".join("'%s'" % value.replace("'", "''") for value in row)
        sql = ("INSERT INTO seleniumlagou( companyfullname , positionname, salary, workyear, "
               "education,city,district, industryfield, financestage, firsttype, positionlables) "
               "VALUES (%s)" % values)

        self.mssql.ExecNonQuery(sql)


【main.py 】 执行文件

# -*- coding: utf-8 -*-
# python 3.5

import time
import unittest
from mydriver import MyDriver

class Main(unittest.TestCase):
    """Log in to lagou.com, run a search and store every result page.

    The workflow lives in ``runTest`` instead of the class body, so merely
    importing this module no longer starts a browser. The browser is quit
    in ``tearDown`` even when a step fails.
    """

    username = "kk"
    password = "kk"
    loginUrl = 'https://passport.lagou.com/login/login.html'
    # Upper bound, in seconds, for the redirect that follows a successful
    # login; without it a failed login would poll forever.
    loginTimeout = 60

    def setUp(self):
        self.mydriver = MyDriver()
        self.driver = self.mydriver.getDriver()

    def tearDown(self):
        self.mydriver.quitDriver()

    def _waitForLogin(self):
        """Poll once per second until the browser has left the login URL."""
        deadline = time.time() + self.loginTimeout
        while self.driver.current_url == self.loginUrl:
            if time.time() >= deadline:
                self.fail("still on the login page after %s seconds" % self.loginTimeout)
            time.sleep(1)
            print("[-] " + self.driver.current_url)
            print("loading……")

    def runTest(self):
        mydriver = self.mydriver
        driver = self.driver

        # Log in.
        mydriver.setUp(self.loginUrl)
        mydriver.setUserPwd(self.username, self.password)
        mydriver.doSubmit()
        print("[1] " + driver.current_url)

        # A changed URL means the login redirect has happened.
        self._waitForLogin()
        print("[2] " + driver.current_url)

        # The home page offers few filters; searching for any keyword
        # jumps to the detailed result list.
        mydriver.doFirstSearch("hzc")
        print("[3] " + driver.current_url)

        # Detailed search arguments: (position, city, experience, education,
        # finance stage, industry, monthly salary), for example
        # ("dba", "深圳", "3-5年", "本科", "未融资", "移动互联网", "15k-25k").
        mydriver.doDetailSearch("DBA", "", "", "", "", "", "")
        print("[4] " + driver.current_url)

        mydriver.saveDate()

        print("done!")


if __name__ == "__main__":
    unittest.main()




Python selenium 拉钩爬虫_数据库


Python selenium 拉钩爬虫_ci_02


标签:Python,selenium,self,driver,拉钩,strip,print,div,def
From: https://blog.51cto.com/hzc2012/6024063

相关文章

  • 实战案例!用1行Python代码识别身份证信息,准确率超过99%,YYDS
    大家好,这里是程序员晚枫。录入身份证信息是一件繁琐的工作,如果可以自动识别并且录入系统,那可真是太好了。今天我们就来学习一下,如何自动识别身份证信息并且录入系统~识......
  • 【Python基础学习】6.组合数据类型
    主要参考来源:慕课嵩天老师的“Python语言程序设计”[https://www.icourse163.org/course/BIT-268001?tid=1468130447]6.1集合类型及操作集合类型定义集合是多个元素的无......
  • 实战案例!Python批量识别银行卡号码并且写入Excel,小白也可以轻松使用~
    大家好,这里是程序员晚枫,今天我们继续学习Python自动化办公:每次有新员工入职,都要收集大量的工资卡信息,并且生成Excel文档,能不能用Python准确、快速地解决呢?今天我们就来学......
  • Python原型链污染变体(prototype-pollution-in-python)
    简介前些时间看了idekctf2022*的taskmanager,出题人参考了另一位博主Python原型链污染变体的博文,于是打算写一篇文章简单学习下这种攻击方式和题目中的一些解题技巧等内......
  • 监控Python 内存使用情况和代码执行时间
    我的代码的哪些部分运行时间最长、内存最多?我怎样才能找到需要改进的地方?”在开发过程中,我很确定我们大多数人都会想知道这一点,而且通常情况下存在开发空间。在本文中总结......
  • Python 三维绘图问题
    提问: 各位,本人刚刚才接触Python。现在有个问题在于,我有一组数据想要去将变成三维曲面图,网上教程多是曲面上的点Z用XY来表示,但是我这个数据是单纯的测量数据,并没有什么公......
  • Python入门之while练习
    #练习1:在控制台中,获取一个开始值,一个结束值。#将中间的数字打印出来。#例如:开始值3结束值10#打印456789begin=in......
  • 关于Python 面向对象寻值的问题. How the number be found in the OOP in Python
    今天在看Python面向对象的时候看到了一个很有意思的问题Today.WhenilearningtheOOPinpython,IfoundaveryinterestingQuestionthathowanumberbefound......
  • 利用python函数调用ffmpeg批量进行转码
    本人学习python没几天,代码也没记住,写个函数到处查笔记,东拼西凑的。累……但是最终还是搞定了。欢迎高手指导,谢谢!单个文件转码#学会如何在python调用bat文件importos,......
  • Python 中的作用域
    变量的作用域是指变量的作用范围,是程序可以访问该变量的正文区域。例如:g=123deffunction():print(g)print(g)程序总共有6行在第1行,定义了变量g在第4......