Big Data Analysis and Visualization: A Baidu Tieba Spider
import csv
import datetime
import json
from urllib import request, parse
import time
import random
from fake_useragent import UserAgent
from lxml import etree
import requests
# Define a spider class
class TiebaSpider(object):
    # Initialize the base URL template
    def __init__(self):
        self.url = 'http://tieba.baidu.com/f?{}'
    # 1. Request function: fetch the page source (originally the classic three-step urllib approach)
    def get_html(self, url):
        ua = UserAgent()
        response = requests.get(url=url, headers={'User-Agent': ua.random})
        # res = request.urlopen(req)
        # On Windows there can be a mojibake problem; decode with gbk and errors='ignore' to skip bytes it cannot handle
        # On Linux this problem does not occur; decode('utf-8') works directly
        # Strip the HTML comment markers so markup wrapped in comments is visible to the parser
        html = response.content.decode('utf-8').replace("<!--", "").replace("-->", "")
        return html
    # 2. Parse function: extract thread titles and links with lxml XPath
    def parse_html(self, html):
        eroot = etree.HTML(html)
        # Extract the row nodes (one <a> per thread)
        li_list = eroot.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        data = []
        for li in li_list:
            item = {}
            item["title"] = li.xpath('./text()')[0]
            item["link"] = 'https://tieba.baidu.com' + li.xpath('./@href')[0]
            data.append(item)
        return data
    # Save the parsed rows to a CSV file
    def save_csv(self, data, filename):
        with open(filename, 'a', newline='', encoding='utf_8_sig') as csv_file:
            # Column names for the CSV file
            fieldnames = ['title', 'link']
            # Create the CSV writer
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            # Write the header row
            writer.writeheader()
            # Write the data rows
            for row in data:
                writer.writerow(row)
        print(f'Data has been written to {filename}')
    # def save_items(self, items, filename):
    #     for data in items:
    #         # Convert the dict to a string and encode it as a UTF-8 byte stream
    #         data_str = json.dumps(data).encode("utf-8")
    #         data_str = data_str.decode("unicode_escape")
    #         self.write(data_str, filename)
    #
    # def write(self, item, filename):
    #     # Open the file for appending
    #     with open(filename, "a", encoding="utf-8") as file:
    #         # Write the data (the byte stream decoded back to a string)
    #         file.write(item)
    #         file.write("\n")
    # 3. Save-to-file function
    # def save_html(self, filename, html):
    #     with open(filename, 'w') as f:
    #         f.write(html)
    # 4. Entry function
    def run(self):
        # current_datetime = datetime.datetime.now()
        # Build a filename accurate to the hour and minute
        # filename = current_datetime.strftime("%Y-%m-%d-%H-%M.csv")
        name = input('Enter the tieba (forum) name: ')
        begin = int(input('Enter the start page: '))
        stop = int(input('Enter the end page: '))
        # +1 so the end page itself is included (range() excludes the stop value)
        for page in range(begin, stop + 1):
            pn = (page - 1) * 50
            params = {
                'kw': name,
                'pn': str(pn)
            }
            # Build the query string and splice it into the URL
            params = parse.urlencode(params)
            url = self.url.format(params)
            # Send the request
            html = self.get_html(url)
            items = self.parse_html(html)
            # self.save_items(items, filename)
            filename = '{}-page-{}.csv'.format(name, page)
            self.save_csv(items, filename)
            # Path for saving the raw page
            # filename = '{}-page-{}.html'.format(name, page)
            # self.save_html(filename, html)
            # Progress message
            print('Page %d scraped successfully' % page)
            # Sleep a random 1-2 seconds after each page
            time.sleep(random.randint(1, 2))
# Run the spider as a script
if __name__ == '__main__':
    start = time.time()
    spider = TiebaSpider()  # Instantiate a spider object
    spider.run()            # Call the entry function
    end = time.time()
    # Report how long the program took
    print('Execution time: %.2f s' % (end - start))  # spider run time
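
To see how run() composes each page's request URL, here is a minimal standalone sketch of the same pn = (page - 1) * 50 offset and parse.urlencode() call; the forum name 'python' and the three-page range are placeholder values, not part of the original script.

from urllib import parse

BASE = 'http://tieba.baidu.com/f?{}'      # same template as TiebaSpider.url

# Placeholder inputs: forum name 'python', pages 1 through 3
for page in range(1, 4):
    pn = (page - 1) * 50                  # step of 50 matches the pn offset used in run()
    query = parse.urlencode({'kw': 'python', 'pn': str(pn)})
    print(BASE.format(query))

# Prints:
#   http://tieba.baidu.com/f?kw=python&pn=0
#   http://tieba.baidu.com/f?kw=python&pn=50
#   http://tieba.baidu.com/f?kw=python&pn=100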
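The XPath logic in parse_html can also be exercised offline. The sketch below runs the same ./text() and ./@href extraction against a hand-made HTML fragment; the snippet only mimics the shape of a thread list and is not real Tieba markup, which is more deeply nested (hence the longer XPath in parse_html).

from lxml import etree

# Illustrative fragment only; the element names and hrefs are made up
snippet = '''
<ul id="thread_list">
  <li><a href="/p/111">First thread</a></li>
  <li><a href="/p/222">Second thread</a></li>
</ul>
'''
root = etree.HTML(snippet)
for a in root.xpath('//*[@id="thread_list"]/li/a'):
    title = a.xpath('./text()')[0]                       # text node of the <a>
    link = 'https://tieba.baidu.com' + a.xpath('./@href')[0]
    print(title, link)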
From: https://www.cnblogs.com/IvanKK/p/17936786