题库爬虫方案
# %%
"""Scrape a question-bank web page and export its questions to ``card.xlsx``.

The page keeps every piece of text in a ``<span>`` element.  The flat list of
span texts is cut into sections at headers whose characters 2-4 name a
question type, and each section is cut into questions at short numeric
marker strings.  Every unique question becomes one spreadsheet row:

* column 1: a zero-padded running number, the question type, then the first
  field of the question followed by every field after the second one;
* column 2: the second field of the question (presumably the answer key --
  confirm against the page layout).
"""

# Alternative source pages (swap the active URL to scrape another subject):
# URL = 'http://www.hzwolf.com/dxxlx.htm'   # University Psychology
# URL = 'http://www.hzwolf.com/gdjyfg.htm'  # Higher Education Regulations
# URL = 'http://www.hzwolf.com/gdjyx.htm'   # Higher Education Pedagogy
URL = 'http://www.hzwolf.com/jsllx.htm'  # Teacher Ethics

PAGE_ENCODING = 'GB2312'
OUTPUT_FILE = 'card.xlsx'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs forever

# Question types in the order their cards are written to the sheet.
SECTION_TYPES = ('单选题', '多选题', '判断题')


def fetch_span_texts(url: str, encoding: str = PAGE_ENCODING,
                     timeout: float = REQUEST_TIMEOUT) -> list:
    """Download *url* and return the stripped text of its ``<span>`` elements.

    Spans whose ``.string`` is ``None`` are skipped.  Whitespace-only spans
    are kept as empty strings so that the field layout of a question is not
    altered.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-success HTTP status.
    """
    # Imported here so the pure parsing helpers below stay importable
    # without the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Override the detected encoding before the body is decoded.
    response.encoding = encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        span.string.strip()
        for span in soup.find_all('span')
        if span.string is not None
    ]


def split_by_section(texts: list) -> dict:
    """Group *texts* by the question-type header that precedes them.

    A header is any text whose characters at index 2-4 equal one of
    ``SECTION_TYPES``.  Headers themselves and anything before the first
    header are dropped.  Texts are filed under the type named by the header,
    not by the header's position, so a missing or repeated section cannot
    shift later sections into the wrong category.

    Returns:
        A dict with one (possibly empty) list of texts per section type.
    """
    sections = {kind: [] for kind in SECTION_TYPES}
    current = None
    for text in texts:
        kind = text[2:5]
        if kind in sections:
            current = sections[kind]
        elif current is not None:
            current.append(text)
    return sections


def is_question_marker(text: str) -> bool:
    """Return True if *text* looks like a question number.

    A marker is shorter than five characters and starts with a digit.  The
    slice ``text[:1]`` keeps empty strings from raising ``IndexError``.
    """
    return len(text) < 5 and text[:1].isdigit()


def split_questions(texts: list) -> list:
    """Cut one section's *texts* into questions at each question marker.

    Returns:
        A list of tuples, one per marker, holding the texts that follow the
        marker (the marker itself is excluded).  Texts before the first
        marker are dropped.
    """
    questions = []
    current = None
    for text in texts:
        if is_question_marker(text):
            current = []
            questions.append(current)
        elif current is not None:
            current.append(text)
    return [tuple(fields) for fields in questions]


def build_cards(sections: dict) -> list:
    """Turn grouped section texts into ``(front, back)`` card rows.

    Questions are de-duplicated per section while keeping first-seen order,
    so the numbering is identical on every run.  Fragments with fewer than
    two fields cannot fill both columns and are skipped.

    Returns:
        A list of ``(front, back)`` string pairs in sheet order.
    """
    cards = []
    for kind in SECTION_TYPES:
        questions = split_questions(sections.get(kind, ()))
        # dict.fromkeys removes duplicates but, unlike set(), keeps order.
        for fields in dict.fromkeys(questions):
            if len(fields) < 2:
                continue
            number = len(cards) + 1
            back = fields[1]
            front_fields = fields[:1] + fields[2:]
            front = f'{number:05d}_{kind}\n' + '\n'.join(front_fields)
            cards.append((front, back))
    return cards


def save_cards(cards: list, path: str = OUTPUT_FILE) -> None:
    """Write *cards* to a new workbook at *path*, one card per row."""
    import openpyxl  # local import: see fetch_span_texts

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for row, (front, back) in enumerate(cards, start=1):
        sheet.cell(row=row, column=1).value = front
        sheet.cell(row=row, column=2).value = back
    workbook.save(path)


def main() -> None:
    """Scrape ``URL`` and write its questions to ``OUTPUT_FILE``."""
    texts = fetch_span_texts(URL)
    save_cards(build_cards(split_by_section(texts)))


# %%
if __name__ == '__main__':
    main()
标签:练习题,slice,高校教师,single,len,item,ques,str,浙江省 From: https://www.cnblogs.com/wayne-tao/p/17685489.html