# Reference approach
# coding:utf-8
import requests
import pymysql
from bs4 import BeautifulSoup
import time
from lxml import etree
import re
class Bank:
    """Scrape a multiple-choice question bank, clean it and store it in MySQL.

    The three public methods form a pipeline: ``getQuestionBank`` downloads
    the raw paragraph texts, ``parseQuestions`` turns them into
    ``[question, answer]`` rows and ``storeQuestions`` inserts those rows
    into the ``bank`` table.
    """

    # First answer letter (either case) found in a "参考答案" line.
    _ANSWER_LETTER = re.compile(r'[a-eA-E]')
    # An option line starts with one or more upper-case letters A-E.
    _OPTION_LINE = re.compile(r'^[A-E]+')
    # Option text: whatever follows the letter and any "、" separators.
    _OPTION_TEXT = re.compile(r'[a-eA-E]、{0,}(.+)')
    # Runs of non-word characters that are stripped from question stems.
    _NON_WORD = re.compile(r'\W+')

    def __init__(self):
        pass

    # Scrape platform 1
    def getQuestionBank(self):
        """Download the question page and return its paragraph text nodes.

        Returns:
            The list produced by the XPath query for every text node of the
            ``<p>`` children of ``div.clearfix.neirong``.

        Raises:
            requests.exceptions.RequestException: on network failure,
                including a timeout after 10 seconds.
            UnicodeDecodeError: if the body is not valid GBK.
        """
        url = 'http://www.bsmz.net/gongwen/805561.html'
        headers = {
            'Referer': 'http://xfxuezhang.cn/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
        }
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
        html = response.content.decode('gbk')
        tree = etree.HTML(html)
        return tree.xpath('//div[@class="clearfix neirong"]/p/text()')

    # Data cleaning
    def parseQuestions(self, content):
        """Convert raw text lines into ``[question, answer]`` rows.

        Lines are classified in this order:

        * a line starting with ``参考答案`` closes the current question; the
          first letter a-e/A-E in it selects the matching option;
        * a line starting with an upper-case letter A-E is an option;
        * a line containing ``选择词语的正确词形`` starts a question type that
          is skipped up to and including its answer line;
        * anything else is part of the question stem; consecutive stem lines
          are concatenated after removing non-word characters and
          underscores.

        Args:
            content: iterable of strings, one per scraped text node.

        Returns:
            A list of rows. Each row is ``[stem, answer_text]``, or just
            ``[answer_text]`` when no stem line preceded the options.
            Questions whose answer letter has no matching option, or whose
            chosen option has no text after its letter, are dropped instead
            of aborting the whole parse.
        """
        results = []
        stem_parts = []   # cleaned stem fragments of the current question
        options = []      # raw option lines of the current question
        skipping = False  # True while inside the unsupported question type

        for raw in content:
            line = raw.replace('\u3000', '')

            if line.startswith('参考答案'):
                if skipping:
                    # Answer of the skipped question type: just leave skip mode.
                    skipping = False
                    continue
                letter = self._ANSWER_LETTER.search(line)
                if letter is None:
                    # No usable letter; keep collecting for the same question.
                    continue

                # Map the answer letter to an index into the option list.
                index = ord(letter.group().upper()) - ord('A')
                answer_text = None
                if index < len(options):
                    found = self._OPTION_TEXT.search(options[index])
                    if found is not None:
                        answer_text = found.group(1)

                if answer_text is not None:
                    row = [''.join(stem_parts)] if stem_parts else []
                    row.append(answer_text)
                    results.append(row)

                # Reset for the next question, whether or not this one was
                # usable, so a malformed entry cannot leak into the next one.
                stem_parts.clear()
                options.clear()
            elif self._OPTION_LINE.match(line):
                if not skipping:
                    options.append(line)
            elif '选择词语的正确词形' in line:
                # This question type is not handled yet.
                skipping = True
            else:
                stem_parts.append(self._NON_WORD.sub('', line).replace('_', ''))

        return results

    # Data persistence
    def storeQuestions(self, arrays):
        """Insert parsed rows into the ``bank`` table, skipping duplicates.

        Rows with fewer than two items are reported and skipped. A row whose
        question text already exists in the table is not inserted again.
        Each insert is committed on its own; a failed insert is reported and
        rolled back without stopping the loop.

        Args:
            arrays: rows as returned by ``parseQuestions``.
        """
        # NOTE(review): credentials are hard-coded in source; move them to
        # configuration or the environment before sharing this file.
        db = pymysql.connect(host="localhost", port=3306, user="root", passwd="1061700625", db="learn")
        try:
            # DictCursor returns each row as a dictionary.
            cursor = db.cursor(pymysql.cursors.DictCursor)
            try:
                for item in arrays:
                    print('*' * 20)
                    print(item)
                    if len(item) < 2:
                        print(">> 数据错误")
                        continue
                    question = item[0]
                    answer = item[1]
                    # The parser does not keep the option list. The previous
                    # string-formatted statement wrote the text 'None' into
                    # this column, so the same value is kept to stay
                    # compatible with existing rows and constraints.
                    # NOTE(review): store NULL here once the schema is confirmed.
                    options = str(None)

                    # Bound parameters let the driver escape scraped text, so
                    # quotes in a question or answer cannot break the SQL.
                    cursor.execute("select * from bank where question=%s", (question,))
                    if cursor.fetchall():
                        print(">> 数据存在")
                        continue
                    try:
                        cursor.execute(
                            "insert into bank(question,options,answer) values(%s,%s,%s)",
                            (question, options, answer),
                        )
                        db.commit()
                        print(">> 插入完成")
                    except Exception as e:
                        print(">> 插入失败")
                        print('*' * 20 + '\r\n' + str(e) + '\r\n' + '*' * 20)
                        db.rollback()  # undo the failed statement
            finally:
                cursor.close()
        finally:
            # Always release the connection, even if a query raised.
            db.close()
if __name__ == '__main__':
    # Run the whole pipeline when executed as a script:
    # download the page, clean the text, then persist the rows.
    scraper = Bank()
    raw_lines = scraper.getQuestionBank()
    scraper.storeQuestions(scraper.parseQuestions(raw_lines))