#http://www.fz-bus.cn/index.asp
#1)在MongoDB中创建一个数据库和一个集合。
#2)在程序执行过程中可输入线路名称查询公交线路,
# 每查询到一条线路的信息后,查询MongoDB数据库中是否存在该线路。若存在,则不做任何操作,否则执行第3步。
#3)将线路名称、起点和终点、途径站点、
# 冬季首末班车时间、非空调车价格和空调车价格以六个键值对保存为步骤1)中集合的一个文档。
import requests
import cchardet
from lxml import etree,html
from fake_useragent import UserAgent
import pymongo
def parse_url():
    """Pull the bus-line fields out of the parsed result page and store them.

    Reads the module-level ``source`` tree. For every highlighted row
    (``bgcolor="#ECF4F9"``) of the first 520-wide table it collects the line
    name, start/terminal stops, stop lists, winter timetable entries and both
    ticket prices, prints each piece, and hands the assembled document to
    ``save_to_mongoDB``.
    """
    rows = source.xpath('//table[@width="520"][1]/tr[@bgcolor="#ECF4F9"]')
    for row in rows:
        # Line name
        name_cells = row.xpath('td[1]/div/text()')
        print(name_cells)

        # Start and terminal stops, with CRLF and tab characters removed
        terminals = []
        for text in row.xpath('td[2]/div/text()'):
            terminals.append(text.replace('\r\n', "").replace("\t", ""))
        print(terminals)

        # Stops along the route: the second and third 520-wide tables each
        # contribute one '->'-joined string.
        table2_stops = source.xpath('//table[@width="520"][2]/tr[@bgcolor="#ECF4F9"]/td[2]/div/text()')
        table3_stops = source.xpath('//table[@width="520"][3]/tr[@bgcolor="#ECF4F9"]/td[2]/div/text()')
        stop_lists = [
            '->'.join(table2_stops).replace("\r\n", ""),
            '->'.join(table3_stops).replace("\r\n", ""),
        ]
        print(stop_lists)

        # Winter first/last departure times: keep only entries mentioning '冬季'
        winter_entries = list(
            filter(lambda text: '冬季' in text, row.xpath('td[3]/div/text()'))
        )
        print(winter_entries)

        # Fare for non-air-conditioned buses
        plain_fare = row.xpath('td[4]/div/text()')
        print(plain_fare)

        # Fare for air-conditioned buses
        ac_fare = row.xpath('td[5]/div/text()')
        print(ac_fare)

        document = dict([
            ('路线名称', name_cells[0]),
            ('起止站点', '\n'.join(terminals)),
            ('途经站点', '\n'.join(stop_lists)),
            ('冬季首末班车时间', '\n'.join(winter_entries)),
            ('非空调车价格', plain_fare[0]),
            ('空调车价格', ac_fare[0]),
        ])
        print(document)
        save_to_mongoDB(document)
def save_to_mongoDB(data):
    """Store *data* in ``Fuzhou_db.Fuzhou_bus`` unless that line is already saved.

    The existence check is keyed on the document's own ``'路线名称'`` value,
    i.e. the same value that gets written, so a line that was inserted once is
    found again on the next run.

    Args:
        data: Document to insert; must contain the key ``'路线名称'``.

    Raises:
        KeyError: If *data* has no ``'路线名称'`` entry.
    """
    # NOTE(review): the connection string embeds a user name and password.
    # Move it to configuration / an environment variable and rotate the secret.
    myclient = pymongo.MongoClient(
        "mongodb+srv://HJY:hong12345@cluster0.nhhtz.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")
    try:
        # Select the database and the collection
        mydb = myclient['Fuzhou_db']
        mycollection = mydb.Fuzhou_bus

        # Look the line up once and reuse the result for both the diagnostic
        # print and the insert decision.
        existing = mycollection.find_one({'路线名称': data['路线名称']})
        print(existing)
        if existing is None:
            mycollection.insert_one(data)

        print(myclient.list_database_names())
    finally:
        # Release the connection even if a query or the insert fails.
        myclient.close()
if __name__ == '__main__':
    # Line to look up on the search page.
    route = 7
    url = "http://www.fz-bus.cn/line_Search.asp"

    # Send a randomly chosen user-agent string with the request.
    headers = {'user-agent': UserAgent().random}
    query = {'Xianl': route, 'Ftype': 2}
    req = requests.post(url, params=query, headers=headers)

    # Decode the body with the encoding detected from the raw bytes.
    detected = cchardet.detect(req.content)
    req.encoding = detected['encoding']

    # parse_url() reads this module-level tree.
    source = html.fromstring(req.text)
    if "找不到你要查询的线路" in req.text:
        print("找不到你要查询的线路!")
    parse_url()
import requests
import cchardet
from lxml import html
import pymongo
# Request headers passed to requests.post in get_data: a fixed desktop Chrome user-agent string.
head = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
def get_data(route):
    """Query the line-search page for *route* and return its details.

    Args:
        route: Line name posted as the ``Xianl`` form field.

    Returns:
        ``None`` (after printing a notice) when the response contains the
        "line not found" message; otherwise a dict with the keys
        ``'路线名称'``, ``'起止站点'``, ``'途径站点'``, ``'冬季首末班车时间'``,
        ``'非空调车价格'`` and ``'空调车价格'``.
    """
    # Only 'Xianl' is sent; per the original author the extra form fields
    # ('imageField3.x': 27, 'imageField3.y': 10, 'Ftype': 2) are optional.
    form = {'Xianl': route}
    resp = requests.post('http://www.fz-bus.cn/line_Search.asp', data=form, headers=head)
    resp.encoding = cchardet.detect(resp.content)['encoding']

    if '找不到你要查询的线路' in resp.text:
        print(f'不存在{route}公交线路')
        return None

    tables = html.fromstring(resp.text).xpath('//table[@width="520"]')
    summary = tables[0]

    # The second row of the first table carries the summary cells.
    name = summary.xpath('tr[2]/td[1]/div/text()')[0]

    # Flatten the whole cell to text, then put each whitespace-separated
    # token on its own line.
    terminals_text = summary.xpath('tr[2]/td[2]/div')[0].xpath('string(.)')
    terminals = '\n'.join(terminals_text.split())

    # Text nodes 1 and 3 of the timetable cell are kept
    # (presumably the winter entries - TODO confirm against the page layout).
    time_nodes = summary.xpath('tr[2]/td[3]/div/text()')
    winter_times = time_nodes[1] + '\n' + time_nodes[3]

    fare_plain = summary.xpath('tr[2]/td[4]/div/text()')[0]
    fare_ac = summary.xpath('tr[2]/td[5]/div/text()')[0]

    # Second and third tables: one '->'-joined stop list each, skipping the
    # first two rows of every table.
    per_table_stops = [
        '->'.join(
            [row.xpath('td[2]/div/text()')[0] for row in table.xpath('tr')[2:]]
        )
        for table in tables[1:3]
    ]
    stops = '\n'.join(per_table_stops)

    return {
        '路线名称': name,
        '起止站点': terminals,
        '途径站点': stops,
        '冬季首末班车时间': winter_times,
        '非空调车价格': fare_plain,
        '空调车价格': fare_ac,
    }
def save_to_mongodb(data):
    """Insert *data* into ``bus_route.bus_route_info`` if the line is not stored yet.

    Does nothing when *data* is falsy (e.g. ``None`` from a failed lookup).
    When a document with the same ``'路线名称'`` already exists, a notice is
    printed and nothing is written.

    Args:
        data: Route document containing the key ``'路线名称'``, or a falsy
            value to skip saving.
    """
    if not data:
        return

    client = pymongo.MongoClient()
    try:
        bus_route_collection = client['bus_route']['bus_route_info']
        existing = bus_route_collection.find_one({'路线名称': data['路线名称']})
        if existing is not None:
            print('数据库中已存在该线路')
        else:
            bus_route_collection.insert_one(data)
    finally:
        # Close the connection even when the lookup or insert raises.
        client.close()
def show_routes():
    """Print every document stored in the ``bus_route.bus_route_info`` collection."""
    client = pymongo.MongoClient()
    try:
        bus_route_collection = client['bus_route']['bus_route_info']
        for route in bus_route_collection.find({}):
            print(route)
    finally:
        # Close the connection even when iteration or printing raises.
        client.close()
if __name__ == '__main__':
    # Ask for a line name, fetch and store it, then list everything stored.
    route_name = input('请输入查询的公交车路线名称')
    route_info = get_data(route_name)
    save_to_mongodb(route_info)
    show_routes()
标签:xpath,MongoDB,route,stop,爬虫,爬取,text,bus,data
From: https://www.cnblogs.com/Gimm/p/18116349