# encoding=utf-8
"""Scrape book listings from read.nlc.cn and save them to a CSV file."""
import json  # kept from the original import list
import time
from random import randint
import requests  # HTTP client used to fetch listing pages
from bs4 import BeautifulSoup  # kept from the original import list (unused here)
from lxml import etree  # XPath-based HTML parsing
import re  # kept from the original import list
import csv

# Global accumulators shared between the crawl loop and the CSV writer.
booknamelist = []  # book titles
authorlist = []    # author names
typelist = []      # book genres/categories
contentlist = []   # short descriptions
novel = []         # fixed category label, one entry per description


def get_data(url):
    """
    Fetch *url* and return the page text.

    :param url: listing-page URL to request
    :return: decoded page text on HTTP 200, otherwise '' (status is printed)
    """
    # Browser-like User-Agent: the site answers 418 to the default requests UA.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'}
    # Random 1-8 s delay throttles the crawl so the site is less likely to
    # notice and ban us after a couple of pages.
    time.sleep(randint(1, 8))
    resp = requests.get(url=url, headers=header)
    resp.encoding = 'utf-8'
    if resp.status_code == 200:
        return resp.text
    print('返回状态码:', resp.status_code)
    return ''


def writeIntoCSVFile(fileName):
    """
    Write the accumulated global lists to *fileName* as CSV.

    :param fileName: path of the CSV file to create (overwritten if present)
    :return: None
    """
    # newline='' stops the csv module from emitting blank lines on Windows.
    # 'with' guarantees the file is closed even if a write fails (the
    # original opened/closed manually and could leak the handle).
    with open(fileName, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['作者', '书名', '类型', '简介', '小说'])
        # zip() stops at the shortest list, so unequal list lengths from a
        # partially-failed scrape no longer raise IndexError mid-write.
        for author, bookname, genre, content, label in zip(
                authorlist, booknamelist, typelist, contentlist, novel):
            # strip() removes the surrounding whitespace the site embeds.
            csv_writer.writerow([str(author).strip(), str(bookname).strip(),
                                 str(genre).strip(), str(content).strip(),
                                 str(label)])


def main():
    """Crawl listing pages 30-99 of category 14500 and save them to CSV."""
    for page in range(30, 100):
        # pageNo / categoryId can be adjusted for other listings.
        url = ('http://read.nlc.cn/yuewen/index?&pageNo=' + str(page)
               + '&categoryId=14500')
        html = get_data(url=url)
        if not html:
            # get_data returned '' (non-200); etree.HTML('') would crash,
            # so skip this page instead.
            continue
        root = etree.HTML(html)
        booknames = root.xpath(
            '//li/a/span[@class="right"]/span[@class="tt"]/text()')   # titles
        authors = root.xpath(
            '//li/a/span[@class="right"]/span[@class="txt1"]/text()')  # authors
        genres = root.xpath(
            '//li/a/span[@class="right"]/span[@class="txt1"]/i/text()')  # types
        contents = root.xpath(
            '//li/a/span[@class="right"]/span[@class="txt2"]/text()')  # blurbs
        for bookname in booknames:
            # Original called strip() and discarded the result; keep it here.
            booknamelist.append(str(bookname).strip())
        for author in authors:
            author = str(author).strip()
            # The raw node list is interleaved with whitespace-only text
            # nodes; strip-and-test is more robust than comparing against
            # one exact '\r\n\t...' literal as the original did.
            if author:
                authorlist.append(author)
        for genre in genres:  # 'genre' avoids shadowing the builtin 'type'
            typelist.append(str(genre).strip())
        for content in contents:
            novel.append("传记")  # fixed label; change per scraped category
            contentlist.append(str(content).strip())
        print("第", page, "页爬取完成")
    writeIntoCSVFile(fileName='data/cultural2.txt')
    print('done')


if __name__ == "__main__":
    # Guard keeps the crawl from running on import (the original ran it at
    # module level); behavior when executed as a script is unchanged.
    main()
# 标签: span, python, 爬取, str, strip, print, import, class, 图书
# From: https://www.cnblogs.com/222wan/p/18253117