Scraping Static Web Pages
This example scrapes the Nanning forum data from the static page https://hongdou.gxnews.com.cn/viewforum-21-1.html.
- Database table design:
from peewee import *

db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="123456")

class BaseModel(Model):
    class Meta:
        database = db

class Topic(BaseModel):
    topic_id = IntegerField(primary_key=True)    # topic id, primary key
    title = CharField()                          # topic title
    author = CharField()                         # topic author
    publish_time = DateField()                   # publish date
    click_num = IntegerField(default=0)          # click count
    answer_num = IntegerField(default=0)         # reply count
    final_answer_author = CharField()            # author of the last reply
    final_answer_time = DateTimeField()          # time of the last reply

if __name__ == '__main__':
    # create the table schema
    db.create_tables([Topic])
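As a quick sanity check (not part of the original post), here is a minimal sketch of querying the populated table with peewee, assuming the model code above is saved as models.py (as implied by the `from models import *` in the scraper below) and that the scraper has already inserted some rows:

from models import Topic, db

# open a connection (no-op if one is already open) and inspect the scraped data
db.connect(reuse_if_open=True)
print("total topics:", Topic.select().count())
# ten most-clicked topics
for t in Topic.select().order_by(Topic.click_num.desc()).limit(10):
    print(t.topic_id, t.title, t.click_num, t.answer_num)
db.close()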
- Single-threaded version; the script is as follows:
import re
import time
from datetime import datetime

import requests
from scrapy import Selector

from models import *

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}

def parse_url(url):
    res = requests.get(url, headers=headers)
    res.encoding = 'gb2312'  # the forum's static pages are encoded as gb2312
    html_text = res.text
    sel = Selector(text=html_text)
    items = sel.xpath('//div[@class="threadbit1"]')
    for item in items:
        title_lst = item.xpath(".//div[@class='thread-row openTitle']/a/font/text()").extract()
        if title_lst:
            title = title_lst[0].strip()
        author_lst = item.xpath(".//div[4]/a[1]/text()").extract()
        if author_lst:
            author = author_lst[0]
        publish_time_lst = item.xpath(".//div[4]/a[2]/text()").extract()
        if publish_time_lst:
            publish_time = datetime.strptime(publish_time_lst[0], r'%Y-%m-%d')
        # the reply/click text has the form "replies/clicks"; when the part after "/"
        # is empty, the click count sits in a separate <font> element instead
        click_answer_lst = item.xpath(".//div[@style='float:right;width:90px;']/text()").extract()
        if click_answer_lst:
            click_answer_str = click_answer_lst[0].strip().split('/')
            answer_num = int(click_answer_str[0])
            if click_answer_str[1] == '':
                click_num = int(item.xpath(".//div[3]/font/text()").extract()[0])
            else:
                click_num = int(click_answer_str[1])
        final_answer_author = item.xpath(".//div[2]/a[1]/text()").extract()[0]
        final_answer_time = item.xpath(".//div[2]/a[2]/text()").extract()
        if final_answer_time:
            final_answer_time = datetime.strptime(final_answer_time[0], r'%Y-%m-%d %H:%M')
        id_lst = item.xpath(".//div[@class='thread-row openTitle']/a/@href").extract()
        topic = Topic()
        if id_lst:
            # the topic id is the first number in the thread's href
            topic_id = int(re.search(r'(\d+)', id_lst[0]).group(1))
            topic.topic_id = topic_id
            topic.title = title
            topic.author = author
            topic.publish_time = publish_time
            topic.click_num = click_num
            topic.answer_num = answer_num
            topic.final_answer_author = final_answer_author
            topic.final_answer_time = final_answer_time
            existed_topics = Topic.select().where(Topic.topic_id == topic.topic_id)
            if existed_topics:
                topic.save()                   # row already exists: UPDATE
            else:
                topic.save(force_insert=True)  # non-auto-increment primary key: force an INSERT
            print("start download topic: " + str(topic.topic_id))
    time.sleep(1)  # pause briefly between pages
if __name__ == "__main__":
res = requests.get("https://xxx/viewforum-21-1.html", headers=headers)
res.encoding = 'gb2312'
html_text = res.text
sel = Selector(text=html_text)
# 获取总页数
td_str = sel.xpath("//div[@class='pagenav']//td[@class='pagenav']/text()").extract()[0]
match = re.search(r'(\d+/\d+)', td_str)
if match:
total_page = int(match.group(1).split('/')[1])
# total_page = 1
for i in range(0, total_page):
parse_url("https://xxx/viewforum-21-{0}.html".format(i+1))
From: https://www.cnblogs.com/xiaocer/p/17981394