1. 主题式网络爬虫名称:天气预报爬取数据与可视化数据
2. 主题式网络爬虫爬取的内容与数据特征分析:
- 爬取内容:天气预报网站上的历史天气数据 包括(日期,最高温度,最低温度,天气,风向)等信息
- 数据特征分析:时效性,完整性,结构化,可预测性等特性
3. 主题式网络爬虫设计方案概述
(1) 导航栏位于界面顶部
(2) 右侧热门城市历史天气
(3) 中间是内容区海口气温走势图以及风向统计
(4) 页面底部是网站信息和网站服务
2. HTML 页面解析
3. 节点(标签)查找方法与遍历方法
Part1: 爬取天气网海口历史天气数据并保存为:"海口历史天气【2023年11月】.xls"文件
1 import requests 2 from lxml import etree 3 import xlrd, xlwt, os 4 from xlutils.copy import copy 5
class TianQi():
    """Scrape one month of historical weather from lishi.tianqi.com into an .xls file.

    The workflow is ``spider`` (download and locate the table rows) ->
    ``parase`` (extract the five fields of every row) -> ``chucun_excel``
    (append the rows to the workbook).
    """

    def __init__(self):
        pass

    # Crawler part
    def spider(self, city='海口', year='2023', month='11'):
        """Download the monthly history page and hand its rows to ``parase``.

        Args:
            city: Chinese city name; must be a key of the local ``city_dict``
                (a ``KeyError`` is raised otherwise).
            year: Four-digit year as a string.
            month: Month as a string, exactly as it appears in the page URL
                (e.g. ``'11'``).

        Raises:
            requests.RequestException: on network failure, timeout or a
                non-2xx HTTP status.
            IndexError: if the page does not contain the expected heading.
        """
        city_dict = {
            "海口": "haikou"
        }
        city = city_dict[city]
        start_url = f'https://lishi.tianqi.com/{city}/{year}{month}.html'
        headers = {
            'authority': 'lishi.tianqi.com',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            # Requests sorts cookies= alphabetically
            'cookie': 'Hm_lvt_7c50c7060f1f743bccf8c150a646e90a=1701184759; Hm_lvt_30606b57e40fddacb2c26d2b789efbcb=1701184793; Hm_lpvt_30606b57e40fddacb2c26d2b789efbcb=1701184932; Hm_lpvt_7c50c7060f1f743bccf8c150a646e90a=1701185017',
            'pragma': 'no-cache',
            'referer': 'https://lishi.tianqi.com/ankang/202309.html',
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36',
        }
        # A timeout keeps the script from hanging forever on a stalled
        # connection; raise_for_status stops an error page from being parsed
        # as if it were the weather table.
        response = requests.get(start_url, headers=headers, timeout=10)
        response.raise_for_status()
        tree = etree.HTML(response.text)
        datas = tree.xpath("/html/body/div[@class='main clearfix']/div[@class='main_left inleft']/div[@class='tian_three']/ul[@class='thrui']/li")
        headings = tree.xpath("/html/body/div[@class='main clearfix']/div[@class='main_left inleft']/div[@class='inleft_tian']/div[@class='tian_one']/div[@class='flex'][1]/h3/text()")
        if not headings:
            # Same exception type as the bare [0] lookup, but with a message
            # that points at the real cause.
            raise IndexError(f'page heading not found in {start_url}; the page layout may have changed')
        weizhi = headings[0]
        self.parase(datas, weizhi, year, month)

    @staticmethod
    def _first_text(node, path):
        """Return the first result of ``node.xpath(path)``, or '' if there is none."""
        found = node.xpath(path)
        return found[0] if found else ''

    # Parsing part
    def parase(self, datas, weizhi, year, month):
        """Extract the five weather fields of every row and store them.

        Args:
            datas: Iterable of row nodes exposing an ``xpath`` method.
            weizhi: Page heading; used in the sheet and file name.
            year: Year string used in the sheet and file name.
            month: Month string used in the sheet and file name.

        Returns:
            The extracted rows, each ``[date, high, low, weather, wind]``.
            A cell that is missing from the page becomes ``''``.
        """
        records = []
        for data in datas:
            # 1. date
            date_text = self._first_text(data, "./div[@class='th200']/text()")
            # 2. highest temperature
            max_qiwen = self._first_text(data, "./div[@class='th140'][1]/text()")
            # 3. lowest temperature
            min_qiwen = self._first_text(data, "./div[@class='th140'][2]/text()")
            # 4. weather
            tianqi = self._first_text(data, "./div[@class='th140'][3]/text()")
            # 5. wind direction
            fengxiang = self._first_text(data, "./div[@class='th140'][4]/text()")
            dict_tianqi = {
                '日期': date_text,
                '最高气温': max_qiwen,
                '最低气温': min_qiwen,
                '天气': tianqi,
                '风向': fengxiang
            }
            records.append([date_text, max_qiwen, min_qiwen, tianqi, fengxiang])
            print(dict_tianqi)
        if records:
            # One write for the whole month instead of reopening and
            # re-saving the workbook once per row.
            data_excel = {
                f'{weizhi}【{year}年{month}月】': records
            }
            self.chucun_excel(data_excel, weizhi, year, month)
        return records

    @staticmethod
    def _as_rows(value):
        """Normalise a sheet payload to a list of rows (a flat row becomes one row)."""
        if value and isinstance(value[0], (list, tuple)):
            return list(value)
        return [value]

    @staticmethod
    def _create_workbook(filename, sheet_title):
        """Create ``filename`` with one sheet holding only the styled header row."""
        # 1. create the Excel workbook
        wb = xlwt.Workbook(encoding='utf-8')
        # 2. create the sheet
        sheet = wb.add_sheet(sheet_title, cell_overwrite_ok=True)
        # 3. thin borders on all four sides
        borders = xlwt.Borders()
        borders.left = xlwt.Borders.THIN
        borders.right = xlwt.Borders.THIN
        borders.top = xlwt.Borders.THIN
        borders.bottom = xlwt.Borders.THIN
        borders.left_colour = 0x40
        borders.right_colour = 0x40
        borders.top_colour = 0x40
        borders.bottom_colour = 0x40
        style = xlwt.XFStyle()
        style.borders = borders
        # 4. centre the header cells
        align = xlwt.Alignment()
        align.horz = 0x02  # horizontal centre
        align.vert = 0x01  # vertical centre
        style.alignment = align
        # 5. header row
        header = ('日期', '最高气温', '最低气温', '天气', '风向')
        for i, title in enumerate(header):
            sheet.col(i).width = 2560 * 3
            # row, column, content, style
            sheet.write(0, i, title, style)
        wb.save(filename)

    # Storage part
    def chucun_excel(self, data, weizhi, year, month):
        """Append rows to ``{weizhi}【{year}年{month}月】.xls``, creating it if needed.

        Args:
            data: Mapping of sheet name to either one flat row of cell values
                or a list of such rows. Only sheets that already exist in the
                workbook are written to.
            weizhi: Heading text used in the file and sheet name.
            year: Year string used in the file and sheet name.
            month: Month string used in the file and sheet name.

        Rows are always appended after the last existing row, so running the
        scraper twice against the same file stores the month twice.
        """
        title = f'{weizhi}【{year}年{month}月】'
        filename = f'{title}.xls'
        if not os.path.exists(filename):
            self._create_workbook(filename, title)
        # formatting_info=True lets the xlutils copy keep the header styling;
        # without it the borders and alignment are dropped on the first append.
        wb = xlrd.open_workbook(filename, formatting_info=True)
        # convert the read-only xlrd workbook into a writable xlwt one
        new_workbook = copy(wb)
        changed = False
        for i, sheet_name in enumerate(wb.sheet_names()):
            if sheet_name not in data:
                continue
            # number of rows already present in this sheet
            rows_old = wb.sheet_by_name(sheet_name).nrows
            new_worksheet = new_workbook.get_sheet(i)
            for offset, row in enumerate(self._as_rows(data[sheet_name])):
                for num, value in enumerate(row):
                    new_worksheet.write(rows_old + offset, num, value)
            changed = True
        if changed:
            new_workbook.save(filename)
if __name__ == '__main__':
    # Script entry point: build the scraper and run one crawl.
    crawler = TianQi()
    crawler.spider()
1 import pandas as pd
2 from pyecharts.charts import Pie 3 from pyecharts import options as opts 4 from pyecharts.globals import ThemeType 5
def on(gender_counts):
    """Build the one-line text report for the weather pie chart.

    ``gender_counts`` must provide ``sum()`` and ``items()`` (for example the
    Series returned by ``value_counts()``). Each label contributes one clause
    giving its share of the total as a percentage with two decimals.
    """
    grand_total = gender_counts.sum()
    clauses = [
        f"{label}天气占比为{tally / grand_total * 100:.2f}%,"
        for label, tally in gender_counts.items()
    ]
    return "天气比例饼状图显示," + ''.join(clauses)
# Load the scraped month and count how many days each weather type occurred.
df = pd.read_excel("海口历史天气【2023年11月】.xls")
gender_counts = df['天气'].value_counts()
analysis_text = on(gender_counts)
pie = Pie(init_opts=opts.InitOpts(theme=ThemeType.WESTEROS, bg_color='#e4cf8e'))

pie.add(
    series_name="海口市天气分布",
    data_pair=[list(z) for z in zip(gender_counts.index.tolist(), gender_counts.values.tolist())],
    radius=["40%", "70%"],
    rosetype="radius",
    label_opts=opts.LabelOpts(is_show=True, position="outside", font_size=14,
                              formatter="{a}<br/>{b}: {c} ({d}%)")
)
pie.set_global_opts(
    title_opts=opts.TitleOpts(title="海口市11月份天气分布", pos_right="50%"),
    legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%"),
    toolbox_opts=opts.ToolboxOpts(is_show=True)
)
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c} ({d}%)"))
html_content = pie.render_embed()

# Build the HTML page.
# The wrapper div previously nested single quotes inside a single-quoted
# style attribute, which is malformed; it is now a plain CSS declaration.
# The charset is declared because the file is written as UTF-8 and contains
# Chinese text.
complete_html = f"""
<html>
<head>
<meta charset="utf-8">
<title>天气数据分析</title>
</head>
<body style="background-color: #e87f7f">
<div style="margin-top: 20px; background-color: #e87f7f">
<div>{html_content}</div>
<h3>分析报告:</h3>
<p>{analysis_text}</p>
</div>
</body>
</html>
"""
# Save the page to disk.
with open("海口历史天气【2023年11月】饼图可视化.html", "w", encoding="utf-8") as file:
    file.write(complete_html)
1 import pandas as pd 2 import matplotlib.pyplot as plt 3 from matplotlib import font\_manager 4 import jieba 5
# Font used for all Chinese text in the figure.
# Raw string: the backslashes of the Windows path are not escape sequences.
font_CN = font_manager.FontProperties(fname=r"C:\Windows\Fonts\STKAITI.TTF")

# Read the scraped data.
df = pd.read_excel('海口历史天气【2023年11月】.xls')

# Strip the "℃" unit and convert to float. Cutting the text with jieba and
# joining the pieces back together returned the same string, so that round
# trip is dropped.
df['最高气温'] = df['最高气温'].str.replace('℃', '', regex=False).astype(float)
df['最低气温'] = df['最低气温'].str.replace('℃', '', regex=False).astype(float)

# Start plotting.
plt.figure(figsize=(20, 8), dpi=80)
max_tp = df['最高气温'].tolist()
min_tp = df['最低气温'].tolist()
# One x position per data row. A fixed range(1, 31) makes plt.plot raise
# whenever the sheet does not hold exactly 30 rows.
x_day = range(1, len(df) + 1)
# Daily highest temperature.
plt.plot(x_day, max_tp, label="最高气温", color="red")
# Daily lowest temperature.
plt.plot(x_day, min_tp, label="最低气温", color="skyblue")
# X-axis tick labels.
_xtick_label = ["11月{}日".format(i) for i in x_day]
plt.xticks(x_day, _xtick_label, fontproperties=font_CN, rotation=45)
# Title and axis labels.
plt.title("2023年11月最高气温与最低气温趋势", fontproperties=font_CN)
plt.xlabel("日期", fontproperties=font_CN)
plt.ylabel("温度(单位°C)", fontproperties=font_CN)
plt.legend(prop=font_CN)
plt.show()
1 from pyecharts.charts import WordCloud 2 from pyecharts import options as opts 3 from pyecharts.globals import SymbolType 4 import jieba 5 import pandas as pd 6 from collections import Counter 7
# Read the Excel file produced by the scraper.
df = pd.read_excel('海口历史天气【2023年11月】.xls')
# The wind-direction and weather columns together form the text corpus.
word_names = df["风向"].tolist() + df["天气"].tolist()
# Tokenise every entry, then keep only tokens longer than one character.
seg_list = list(map(jieba.lcut, word_names))
words = []
for tokens in seg_list:
    words.extend(token for token in tokens if len(token) > 1)
word_counts = Counter(words)
word_cloud_data = list(word_counts.items())

# Build the word cloud step by step.
toolbox = opts.ToolboxOpts(
    is_show=True,
    feature={
        "saveAsImage": {},
        "dataView": {},
        "restore": {},
        "refresh": {}
    }
)
wordcloud = WordCloud(init_opts=opts.InitOpts(bg_color='#00FFFF'))
wordcloud.add(
    "",
    word_cloud_data,
    word_size_range=[20, 100],
    shape=SymbolType.DIAMOND,
    word_gap=5,
    rotate_step=45,
    textstyle_opts=opts.TextStyleOpts(font_family='cursive', font_size=15),
)
wordcloud.set_global_opts(
    title_opts=opts.TitleOpts(title="天气预报词云图", pos_top="5%", pos_left="center"),
    toolbox_opts=toolbox,
)

# Render the word cloud to an HTML file.
wordcloud.render("天气预报词云图.html")
129 import pandas as pd
130 import jieba
131 from pyecharts.charts import Scatter
132 from pyecharts import options as opts
134 from scipy import stats
# Read the scraped data.
df = pd.read_excel('海口历史天气【2023年11月】.xls')

# Strip the "℃" unit and convert to float. Cutting the text with jieba and
# joining the pieces back together returned the same string, so that round
# trip is dropped.
df['最高气温'] = df['最高气温'].str.replace('℃', '', regex=False).astype(float)
df['最低气温'] = df['最低气温'].str.replace('℃', '', regex=False).astype(float)

# Build the scatter chart.
scatter = Scatter()
scatter.add_xaxis(df['最低气温'].tolist())
scatter.add_yaxis("最高气温", df['最高气温'].tolist())
scatter.set_global_opts(
    title_opts=opts.TitleOpts(title="最低气温与最高气温的散点图"),
    # The x axis is a category axis unless told otherwise, which lays the
    # temperatures out in row order instead of by value. A value axis gives
    # a real scatter plot; is_scale stops both axes from being forced to
    # start at 0.
    xaxis_opts=opts.AxisOpts(type_="value", is_scale=True),
    yaxis_opts=opts.AxisOpts(type_="value", is_scale=True),
)
html_content = scatter.render_embed()

# Fit the regression line: highest temperature as a function of the lowest.
slope, intercept, r_value, p_value, std_err = stats.linregress(df['最低气温'], df['最高气温'])

# Format the equation once and reuse it for the console and the page.
analysis_text = f"回归方程为:y = {slope}x + {intercept}"
print(analysis_text)

# Build the HTML page.
# The wrapper div previously nested single quotes inside a single-quoted
# style attribute, which is malformed; it is now a plain CSS declaration.
# The charset is declared because the file is written as UTF-8 and contains
# Chinese text.
complete_html = f"""
<html>
<head>
<meta charset="utf-8">
<title>天气数据分析</title>
</head>
<body style="background-color: #e87f7f">
<div style="margin-top: 20px; background-color: #e87f7f">
<div>{html_content}</div>
<p>{analysis_text}</p>
</div>
</body>
</html>
"""
# Save the page to disk.
with open("海口历史天气【2023年11月】散点可视化.html", "w", encoding="utf-8") as file:
    file.write(complete_html)
1. 根据散点图显示的回归方程 y = 0.6988742964352719x + 10.877423389618516 来获取海口市11月份温度趋势
