【爬虫】项目篇-爬取福州公交线路并保存至MongoDB

标签：xpath MongoDB route stop 爬虫爬取 text bus data

#http://www.fz-bus.cn/index.asp
#1）在MongoDB中创建一个数据库和一个集合。
#2）在程序执行过程中可输入线路名称查询公交线路，
# 每查询到一条线路的信息后，查询MongoDB数据库中是否存在该线路。若存在，则不做任何操作，否则执行第3步。
#3）将线路名称、起点和终点、途经站点、
# 冬季首末班车时间、非空调车价格和空调车价格以六个键值对保存为步骤1）中集合的一个文档。

import requests
import cchardet
from lxml import etree,html
from fake_useragent import UserAgent
import pymongo


def parse_url():
    """Parse the fetched route page and store each route row in MongoDB.

    Reads the module-level ``source`` tree (built by the script entry
    point), builds one document per highlighted row of the first
    520-wide table, prints it and hands it to ``save_to_mongoDB``.
    Rows that lack a route name or either price are reported and skipped
    instead of raising ``IndexError``.
    """
    items = source.xpath('//table[@width="520"][1]/tr[@bgcolor="#ECF4F9"]')

    # Stops along the route: the second and third tables hold one stop list
    # each. They do not depend on the current row, so query them once.
    first_stops = source.xpath('//table[@width="520"][2]/tr[@bgcolor="#ECF4F9"]/td[2]/div/text()')
    second_stops = source.xpath('//table[@width="520"][3]/tr[@bgcolor="#ECF4F9"]/td[2]/div/text()')
    passing_site = [
        '->'.join(first_stops).replace("\r\n", ""),
        '->'.join(second_stops).replace("\r\n", ""),
    ]

    for item in items:
        # Route name
        route_name = item.xpath('td[1]/div/text()')
        print(route_name)

        # First and last stop, with layout whitespace stripped
        origin_site = [
            text.replace('\r\n', "").replace("\t", "")
            for text in item.xpath('td[2]/div/text()')
        ]
        print(origin_site)

        print(passing_site)

        # Winter first/last departure times: keep only the lines marked as winter
        winter_time = [text for text in item.xpath('td[3]/div/text()') if '冬季' in text]
        print(winter_time)

        # Fare for buses without air conditioning
        non_air_price = item.xpath('td[4]/div/text()')
        print(non_air_price)

        # Fare for air-conditioned buses
        air_price = item.xpath('td[5]/div/text()')
        print(air_price)

        # A row with an empty name or price cell cannot form a valid document.
        if not (route_name and non_air_price and air_price):
            print('线路信息不完整，已跳过')
            continue

        data = {
            '路线名称': route_name[0],
            '起止站点': '\n'.join(origin_site),
            '途经站点': '\n'.join(passing_site),
            '冬季首末班车时间': '\n'.join(winter_time),
            '非空调车价格': non_air_price[0],
            '空调车价格': air_price[0],
        }
        print(data)
        save_to_mongoDB(data)

def save_to_mongoDB(data):
    """Insert a route document into MongoDB unless that route is already stored.

    Args:
        data: Route document; its '路线名称' value is used as the
            duplicate-detection key.
    """
    # NOTE(review): the connection string embeds a username and password.
    # Move it out of the source (e.g. into an environment variable) and
    # rotate the credentials.
    uri = "mongodb+srv://HJY:hong12345@cluster0.nhhtz.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
    # The context manager closes the client even if a query fails.
    with pymongo.MongoClient(uri) as myclient:
        # Database and collection are created lazily on first insert.
        mycollection = myclient['Fuzhou_db'].Fuzhou_bus

        # Look the route up by the name stored in the document itself, not by
        # the search term typed by the user, so the check matches what
        # insert_one() actually writes.
        existing = mycollection.find_one({'路线名称': data['路线名称']})
        print(existing)
        if existing is None:
            mycollection.insert_one(data)
        print(myclient.list_database_names())
if __name__ == '__main__':
    # Query one route from the Fuzhou bus site and store the parsed result.
    url = "http://www.fz-bus.cn/line_Search.asp"
    route = 7
    headers = {
        'user-agent': UserAgent().random
    }
    req = requests.post(url, params={'Xianl': route, 'Ftype': 2}, headers=headers)
    # The response does not reliably declare its charset, so detect it from the bytes.
    req.encoding = cchardet.detect(req.content)['encoding']
    if "找不到你要查询的线路" in req.text:
        print("找不到你要查询的线路!")
    else:
        # Only parse when the site actually returned a route page.
        source = html.fromstring(req.text)
        parse_url()

import requests
import cchardet
from lxml import html
import pymongo
# Request headers passed to requests.post() in get_data(); sets a fixed desktop Chrome user-agent string.
head = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}


def get_data(route):
    """Fetch one bus route from the Fuzhou bus site and return it as a dict.

    Args:
        route: Route name to search for; sent as the ``Xianl`` form field.

    Returns:
        A dict with six keys (route name, first/last stop, stops along the
        route, first/last departure times, and the two fares), or ``None``
        when the site reports that the route does not exist.
    """
    # The extra form fields the site's search form sends are optional.
    resp = requests.post(
        'http://www.fz-bus.cn/line_Search.asp',
        data={'Xianl': route},
        headers=head,
    )
    resp.encoding = cchardet.detect(resp.content)['encoding']
    if '找不到你要查询的线路' in resp.text:
        print(f'不存在{route}公交线路')
        return None

    tables = html.fromstring(resp.text).xpath('//table[@width="520"]')
    info_table = tables[0]

    def cell_texts(column):
        # Text nodes of the given column in the second row of the first table.
        return info_table.xpath(f'tr[2]/td[{column}]/div/text()')

    name = cell_texts(1)[0]

    # Collapse the whole cell to text and put each whitespace-separated part on its own line.
    terminals_cell = info_table.xpath('tr[2]/td[2]/div')[0]
    terminals = '\n'.join(terminals_cell.xpath('string(.)').split())

    # Text nodes 1 and 3 of the time cell are the two values that get stored.
    time_texts = cell_texts(3)
    departure_times = time_texts[1] + '\n' + time_texts[3]

    fare_plain = cell_texts(4)[0]
    fare_air_conditioned = cell_texts(5)[0]

    # The second and third tables each list the stops of one direction;
    # their first two rows are skipped.
    directions = [
        '->'.join(row.xpath('td[2]/div/text()')[0] for row in stop_table.xpath('tr')[2:])
        for stop_table in tables[1:3]
    ]

    return {
        '路线名称': name,
        '起止站点': terminals,
        '途径站点': '\n'.join(directions),
        '冬季首末班车时间': departure_times,
        '非空调车价格': fare_plain,
        '空调车价格': fare_air_conditioned,
    }


def save_to_mongodb(data):
    """Store a route document in the local MongoDB unless it already exists.

    Args:
        data: Route dict as returned by ``get_data``; a falsy value
            (``None`` or an empty dict) is ignored.
    """
    if not data:
        return

    # The context manager closes the client even when a query raises.
    with pymongo.MongoClient() as client:
        collection = client['bus_route']['bus_route_info']
        # The route name is the duplicate-detection key.
        if collection.find_one({'路线名称': data['路线名称']}):
            print('数据库中已存在该线路')
        else:
            collection.insert_one(data)


def show_routes():
    """Print every document stored in the ``bus_route.bus_route_info`` collection."""
    # The context manager closes the client even if iteration fails midway.
    with pymongo.MongoClient() as client:
        for route in client['bus_route']['bus_route_info'].find({}):
            print(route)


if __name__ == '__main__':
    # Ask for a route, store it if it is new, then list everything stored.
    route_name = input('请输入查询的公交车路线名称')
    route_info = get_data(route_name)
    save_to_mongodb(route_info)
    show_routes()

标签：xpath,MongoDB,route,stop,爬虫,爬取,text,bus,data
From： https://www.cnblogs.com/Gimm/p/18116349

【爬虫】项目篇-爬取福州公交线路并保存至MongoDB

相关文章

赞助商

阅读排行