最近在看图神经网络的视频，里面使用了北京、上海的地铁信息，这里整理了一下郑州的地铁信息，直接上代码：
# 引包
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the Zhengzhou Metro operating-hours page and parse it into `soup`.
url = 'http://www.zzmetro.com/lines/query/operating_hours'
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently scraping an error page.
response.raise_for_status()
html = response.text
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree could differ between
# machines. 'html.parser' ships with Python and needs no extra dependency.
soup = BeautifulSoup(html, 'html.parser')
# Metro line names: the text of every <p> inside the first element matched by
# '.lines_div .lines_ul_div'.
line_list = soup.select('.lines_div .lines_ul_div')[0]
name_arr = [p.text for p in line_list.find_all('p')]
# For every metro line, collect its stations in table order and estimate the
# travel time between neighbouring stations from the timetable (the original
# author's note says these are last-train arrival times).

# Cell texts the timetable uses when there is no time for a direction.
_NO_TIME = ('——', '---')
# Stand-in gap for a direction without a time; large enough that min() picks
# the other direction's real gap instead.
_UNKNOWN_GAP = 100


def _to_minutes(cell_text):
    """Convert an 'H:MM' timetable cell to minutes past midnight.

    Returns None when the cell holds one of the "no time" placeholders.
    Hour 0 (written as '0' or '00') is mapped to 24 so that an arrival just
    after midnight lands after the 23:xx arrivals that precede it.
    """
    parts = cell_text.split(':')
    if parts[0] in _NO_TIME:
        return None
    hour = int(parts[0])
    if hour == 0:
        hour = 24
    return hour * 60 + int(parts[1])


zong_arr = []
# zip() pairs each line name with its table by position and stops at the
# shorter of the two sequences.
for name, x in zip(name_arr, soup.select('.line_site .table_lx_div')):
    ditie_name = []
    ditie_time = []
    # Most recent valid time seen in each direction, in minutes.
    ll_time = 0
    rr_time = 0
    # The first two <tr> rows are skipped (presumably table headers - confirm
    # against the page markup).
    for row in x.find_all('tr')[2:]:
        nn = row.find_all('td')
        # Columns 0, 2 and 4 are read below; skip rows that are too short
        # instead of failing with IndexError.
        if len(nn) < 5:
            continue
        station = nn[0].text.strip()
        l_text = nn[2].text.strip()
        r_text = nn[4].text.strip()
        l_now = _to_minutes(l_text)
        r_now = _to_minutes(r_text)
        # A row without a usable time in either direction carries no
        # information; skipping it avoids recording the sentinel as a real
        # travel time.
        if l_now is None and r_now is None:
            continue
        a1 = a2 = _UNKNOWN_GAP
        if l_now is not None:
            a1 = abs(l_now - ll_time)
            ll_time = l_now
        if r_now is not None:
            a2 = abs(r_now - rr_time)
            rr_time = r_now
        # Progress output: station, both raw times, line name, chosen gap.
        print(station, l_text, r_text, name, min(a1, a2))
        ditie_name.append(station)
        # The first entry per line has no predecessor; it is dropped below.
        ditie_time.append(min(a1, a2))
    # One row per pair of consecutive stations on this line. Nothing is
    # written to disk here; the table is only kept in memory.
    df = pd.DataFrame()
    df['前一站'] = ditie_name[:-1]
    df['后一站'] = ditie_name[1:]
    df['地铁线'] = name
    df['时间分钟'] = ditie_time[1:]
    zong_arr.append(df)

# Stack the per-line tables into one edge list with a fresh 0..n-1 index.
if zong_arr:
    df = pd.concat(zong_arr, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # table with the same columns when nothing was scraped.
    df = pd.DataFrame(columns=['前一站', '后一站', '地铁线', '时间分钟'])
标签:name,nn,int,text,strip,地铁,time,数据挖掘
From: https://www.cnblogs.com/heKaiii/p/17407909.html