I. Lab 2
Assignment ①:
Requirement: Scrape the 7-day weather forecasts for a given set of cities from the China Weather site (http://www.weather.com.cn) and save them to a database.
Output:
No. | Region | Date | Weather | Temperature |
---|---|---|---|---|
1 | Beijing | 7th (today) | Cloudy to sunny | 31℃/17℃ |
2 | Beijing | 8th (tomorrow) | Sunny | 34℃/20℃ |
3 | Beijing | 9th (day after tomorrow) | Cloudy to sunny | 32℃/19℃ |
4 | Beijing | 10th (Sat) | Overcast to showers | 34℃/18℃ |
5 | Beijing | 11th (Sun) | Showers | 29℃/17℃ |
Gitee link: Lab 2 assignment link
```python
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    def __init__(self):
        self.cursor = None
        self.con = None

    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16),"
                "wDate varchar(16),"
                "wWeather varchar(64),"
                "wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))")
        except Exception as err:
            # Table already exists: report the error and clear out the old rows
            print(err)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                          "Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + ": city code not found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            # Let UnicodeDammit guess between utf-8 and gbk encodings
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            x = 0
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    if x == 0:  # today's entry only carries one temperature, e.g. <i>14℃</i>
                        x += 1
                        temp = li.select('p[class="tem"] i')[0].text
                    else:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")
```
Takeaways:
Got to know sqlite3 and deepened my understanding of bs4.
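As a quick sanity check, the rows WeatherDB writes can be read back with the same sqlite3 module. A minimal sketch, reusing the weathers.db file and weathers table defined above:

```python
import sqlite3

# Minimal sketch: read back what WeatherDB saved; "weathers.db" and the
# weathers table/columns come from the code above.
con = sqlite3.connect("weathers.db")
cursor = con.cursor()
cursor.execute("select wCity, wDate, wWeather, wTemp from weathers where wCity = ?", ("北京",))
for wCity, wDate, wWeather, wTemp in cursor.fetchall():
    print(wCity, wDate, wWeather, wTemp)
con.close()
```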
Assignment ②:
Requirement: Use the requests and BeautifulSoup libraries to scrape stock information from a chosen site and store it in a database.
Candidate sites: Eastmoney: https://www.eastmoney.com/
Sina Finance: http://finance.sina.com.cn/stock/
Tip: Open Chrome's F12 developer tools to capture network traffic, find the URL used to load the stock list, and analyze the values the API returns. The request parameters visible in the URL (f1, f2, and so on select different fields) can be adjusted or trimmed as needed, as in the sketch below.
Reference: https://zhuanlan.zhihu.com/p/50099084
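For instance, the JSONP unwrapping done in the code below can likely be skipped entirely: in informal testing, this endpoint returns plain JSON when the cb callback parameter is omitted, and fields can be trimmed to just the columns that get stored. A sketch under those assumptions (this is not an officially documented API):

```python
import requests

# Sketch (assumption: omitting the 'cb' callback makes the endpoint return
# plain JSON, so no JSONP unwrapping is needed; 'fields' is trimmed to the
# columns actually stored in the full code below).
url = "http://49.push2.eastmoney.com/api/qt/clist/get"
params = {
    "pn": 1,    # page number
    "pz": 20,   # page size
    "po": 1,
    "np": 1,
    "fltt": 2,
    "invt": 2,
    "fid": "f3",
    "fs": "m:1 t:2,m:1 t:23",  # same market/board filter as the full code
    "fields": "f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18",
}
resp = requests.get(url, params=params, timeout=10)
print(resp.json()["data"]["diff"][0])  # first stock record of the page
```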
Gitee link: Lab 2 assignment link
Output:
```python
import requests
import json
import pymysql

HOSTNAME = '127.0.0.1'
PORT = '3306'
DATABASE = 'spider_ex'
USERNAME = 'root'
PASSWORD = '1234'

# Open the database connection
conn = pymysql.connect(host=HOSTNAME, port=int(PORT), user=USERNAME, password=PASSWORD,
                       database=DATABASE, charset='utf8')
# Create a cursor object with cursor()
cursor = conn.cursor()

sql_table = """
CREATE TABLE ex2_2 (
    代码 VARCHAR(10),
    名称 VARCHAR(50),
    最新价 FLOAT,
    涨跌幅 FLOAT,
    涨跌额 FLOAT,
    成交量 BIGINT,
    成交额 FLOAT,
    振幅 FLOAT,
    最高 FLOAT,
    最低 FLOAT,
    今开 FLOAT,
    昨收 FLOAT
)
"""
try:
    cursor.execute(sql_table)
    conn.commit()
    print("Table created")
except Exception as err:
    conn.rollback()
    print("Table creation failed:", err)


def InsertData(data):
    global conn
    global cursor
    sql = ("INSERT INTO ex2_2(代码, 名称, 最新价, 涨跌幅, 涨跌额, 成交量, 成交额, "
           "振幅, 最高, 最低, 今开, 昨收) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    try:
        values = (
            data['代码'],
            data['名称'],
            data['最新价'],
            data['涨跌幅'],
            data['涨跌额'],
            data['成交量'],
            data['成交额'],
            data['振幅'],
            data['最高'],
            data['最低'],
            data['今开'],
            data['昨收'],
        )
        cursor.execute(sql, values)
        print("Insert succeeded")
    except Exception as err:
        print("Insert failed:", err)


def getHtml(page):
    url = "http://49.push2.eastmoney.com/api/qt/clist/get"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"}
    params = (
        ('cb', 'jQuery112405254755655209056_1696661743317'),  # JSONP callback name
        ('pn', '%d' % page),  # page number
        ('pz', '20'),         # page size
        ('po', '1'),
        ('np', '1'),
        ('ut', 'bd1d9ddb04089700cf9c27f6f7426281'),
        ('fltt', '2'),
        ('invt', '2'),
        ('wbp2u', '|0|0|0|web'),
        ('fid', 'f3'),
        ('fs', 'm:1 t:2,m:1 t:23'),  # market/board filter
        ('fields', 'f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'),
        ('_', '1696661743318'),
    )
    resp = requests.get(url, headers=headers, params=params, verify=False)
    # The body is JSONP: callbackName({...}); strip the wrapper and the trailing ");"
    res = resp.content[len("jQuery112405254755655209056_1696661743317("):len(resp.content) - 2]
    res = str(res, 'utf-8')
    resp_json = json.loads(res)
    data_list = []  # collect the rows extracted from this page
    for k in resp_json['data']['diff']:
        data = {
            "代码": k['f12'],
            "名称": k['f14'],
            "最新价": k['f2'],
            "涨跌幅": k['f3'],
            "涨跌额": k['f4'],
            "成交量": k['f5'],
            "成交额": k['f6'],
            "振幅": k['f7'],
            "最高": k['f15'],
            "最低": k['f16'],
            "今开": k['f17'],
            "昨收": k['f18'],
        }
        data_list.append(data)
        InsertData(data)
    conn.commit()
    return data_list


for page_number in range(1, 10):
    getHtml(page_number)

cursor.close()
conn.close()
```
Takeaways:
Learned how to pull data from an API endpoint and parse it as JSON, how to read and tweak the parameters of an API request URL, and how to use pymysql; quite a lot gained.
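One refinement worth trying: since getHtml collects each page into data_list, the row-by-row InsertData calls could be replaced by one executemany per page. A sketch, assuming the same ex2_2 table as above:

```python
# Sketch: batch-insert one page of rows with executemany (same table and
# column order as ex2_2 above); data_list is the list returned by getHtml.
sql = ("INSERT INTO ex2_2(代码, 名称, 最新价, 涨跌幅, 涨跌额, 成交量, 成交额, "
       "振幅, 最高, 最低, 今开, 昨收) "
       "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
rows = [(d["代码"], d["名称"], d["最新价"], d["涨跌幅"], d["涨跌额"], d["成交量"],
         d["成交额"], d["振幅"], d["最高"], d["最低"], d["今开"], d["昨收"])
        for d in data_list]
cursor.executemany(sql, rows)
conn.commit()
```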
Assignment ③:
Requirement: the 2021 main ranking of Chinese universities.
Scrape the information of every university on the main ranking and store it in a database; also record the browser F12 debugging/analysis process as a GIF and include it in the blog. Tip: analyze the site's network requests to find the API that serves the data.
Gitee link: Lab 2 assignment link
Output:
```python
import requests
import pymysql
import re

HOSTNAME = '127.0.0.1'
PORT = '3306'
DATABASE = 'spider_ex'
USERNAME = 'root'
PASSWORD = '1234'

# Open the database connection
conn = pymysql.connect(host=HOSTNAME, port=int(PORT), user=USERNAME, password=PASSWORD,
                       database=DATABASE, charset='utf8')
# Create a cursor object with cursor()
cursor = conn.cursor()

sql_table = """
CREATE TABLE IF NOT EXISTS ex2_3 (
    排名 INT AUTO_INCREMENT PRIMARY KEY,
    名称 VARCHAR(50),
    省份 VARCHAR(32),
    类型 VARCHAR(32),
    总分 VARCHAR(32)
) AUTO_INCREMENT=1
"""
try:
    cursor.execute(sql_table)
    conn.commit()
    print("Table created")
except Exception as err:
    conn.rollback()
    print("Table creation failed:", err)

# province/univCategory in payload.js are single-letter JS variables; these
# tables translate the letters back to readable names. They have to be filled
# in from the parameter/argument lists of the wrapper function in payload.js
# (left empty here, so .get(x, x) falls back to the raw letter).
p = {}
c = {}


def InsertData(data):
    global conn
    global cursor
    sql = "INSERT INTO ex2_3(名称, 省份, 类型, 总分) VALUES (%s, %s, %s, %s)"
    try:
        values = (
            data["名称"],
            data["省份"],
            data["类型"],
            data["总分"],
        )
        cursor.execute(sql, values)
        print("Insert succeeded")
    except Exception as err:
        print("Insert failed:", err)


def getHtml():
    url = "https://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"
    }
    resp = requests.get(url=url, headers=headers)
    res = resp.content.decode('utf-8')
    # Pull the parallel field lists out of the JS payload with regex
    univNameCn = re.findall('univNameCn:"(.*?)"', res, re.S)
    province = re.findall('province:(.*?),', res, re.S)
    univCategory = re.findall('univCategory:(.*?),', res, re.S)
    score = re.findall('score:(.*?),', res, re.S)
    for i in range(len(univNameCn)):
        # Map the single-letter codes to names, falling back to the raw code
        province_name = p.get(province[i], province[i])
        category_name = c.get(univCategory[i], univCategory[i])
        data = {
            "名称": univNameCn[i],
            "省份": province_name,
            "类型": category_name,
            "总分": score[i],
        }
        InsertData(data)
    conn.commit()


getHtml()
cursor.close()
conn.close()
```
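A note on the p and c lookup tables above: in payload.js, values such as province:k are single-letter JavaScript variables whose real values are bound as arguments of the wrapper function, so the letter-to-name mapping can be rebuilt by pairing the function's parameter list with its argument list. A purely illustrative sketch of that idea (the sample values are invented, not taken from the real file):

```python
# Illustrative only: payload.js roughly has the shape
#   __NUXT_JSONP__("...", (function(a, b, c, ...) { return {...} })(v1, v2, v3, ...))
# so a code like province:k resolves by zipping parameters with arguments.
func_params = ["a", "b", "c"]            # parsed from: function(a,b,c)
func_args = ['"北京"', '"综合"', 'true']   # parsed from the trailing (...) call
mapping = dict(zip(func_params, (s.strip('"') for s in func_args)))
print(mapping["a"])  # -> 北京
```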
Takeaways:
Became more familiar with using F12 to find where the data actually comes from, and with converting and extracting that data.
Deepened my understanding of regular expressions; they really are an all-purpose tool.
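As a small self-contained demo of those non-greedy patterns, here they are run on a payload.js-style fragment (the fragment is invented for illustration):

```python
import re

# Demo of the non-greedy groups used above, on a made-up fragment.
sample = 'univNameCn:"清华大学",province:k,univCategory:f,score:969.2,'
print(re.findall('univNameCn:"(.*?)"', sample))  # ['清华大学']
print(re.findall('province:(.*?),', sample))     # ['k']
print(re.findall('score:(.*?),', sample))        # ['969.2']
```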