
[Crawler] Project: Scraping second-hand housing listings from Sofang (搜房网) with XPath


# Scrape second-hand housing listings for the Fuzhou area from Sofang (sofang.com)
# using requests and XPath. Requirement: collect listings from every results page;
# each listing includes title, estate, address, longitude/latitude, area, layout,
# floor, orientation, construction year, unit price, total price, agent, and
# contact phone (leave a field blank when the data is missing).
import requests
from lxml import etree
from fake_useragent import UserAgent
import cchardet
import re
import csv
import logging
import time
import os
# Build request headers with a random User-Agent
headers = {
    'user-agent': UserAgent().random,  # .random is more reliable than browser-specific attributes such as .ie
    #'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
    #'cookie':'cityNum=28; cityNum=28; UM_distinctid=17c8cd027e5626-084f43eefc826-513c1e45-144000-17c8cd027e6706; uniqueName=688cf35d22519598d68971bf65fdde9b; Hm_lvt_d2801fc638056c1aac7e8008f41cf828=1634450995; cityNum=28; Hm_lvt_bc2dcba98591e113d6329631910c965b=1634450835,1634486994,1634569816; CNZZDATA1262285598=2125395324-1634440330-|1634564759; codeNum=eyJpdiI6IjVXR1wvaWYxbFNFeUhiRFd6cUFvVFBnPT0iLCJ2YWx1ZSI6IitzZVp5cmw3MlA1aXRTdXpDeDBhT0Yyd3dTOHhqeHNwbzFzWmtKYndrU2E4VlY2QVpuVFc4Y0hKemNrN0FHR1giLCJtYWMiOiJhNmU2Yzk5YmRjYTliNjRlZmYyM2UzMzhhZDgzYzI5OWM1NDRhMDhkMjE5ZjAwZGIyM2IyMjI2ODhhZjY4MGM3In0=; Hm_lpvt_bc2dcba98591e113d6329631910c965b=1634569881; XSRF-TOKEN=eyJpdiI6IndzdEdtS3NPbGdUeVwvSk5JOXgzWVdBPT0iLCJ2YWx1ZSI6Ikk2UE8yTzhJUjFMclI4K1RJU3pzdTMzMWRkRVk4VE04d1ZWOTJTVDNVdlwvM2Y2U3RJeXlrV09hdGFRN3dKNVNtQm84VXNRb0tjQ3o4Z2VEbHZlQmpwdz09IiwibWFjIjoiNzA2NGUyNDM0NWU2OGIyZjRlMTljZmU4M2ZjNzY4ZTA5ZTUwZjIxNWQyMWFhM2UzZDU5M2JmNWU3ZDc4YTM2ZCJ9; www_sofang_session=eyJpdiI6IlwvQzRpejFBKzdaNnVUQXV6OUFxN0d3PT0iLCJ2YWx1ZSI6Im54dVI2WDFySHlIRVBKTHdTc1RZNzZEd2hha3ltTE1GYVU3T2ZYdGc1bktTbW13MUFiU2dSeXl2Wk5saU5JVmlpTFZibit5QkxYSWoxTFl0ZzNUVmxBPT0iLCJtYWMiOiIyMGEyMTJlZTA0NWZjMDc4ZTFmYmI5MzAyYjc2NmRlNzE3Yjc4NDVlOWM5MWY5YzdhMzg3NjAwZmMyMDhkYjc3In0=; cityid=eyJpdiI6IkJDZ1B0VG1oeGQ4R28xUjFLRFdja3c9PSIsInZhbHVlIjoiQTd2V2RcL2VyTlkzRXJsdTRMREp3aFE9PSIsIm1hYyI6ImJlZTA2MjBhN2RhNTUyOWFiNTI0ZjYxYzE5ZmFiZjFiNzUxNGFhYzdmODQ0Njk4NTY4ZDE2ZjUxNTE1NWJmM2YifQ==; city=eyJpdiI6InBMTWhBelBUaW1VQWdvWXlma0hQU2c9PSIsInZhbHVlIjoicFhkdHQ5K1YyTzBZUkZUVklrZE1JTW9ueE1NVWVFQlRCTk5DQkJIdm5xZUtkNERzTHRXWjhKK0VJNkpFUnhqMDFrdmdCTWZIc1JBU1YxSWtzZUZlSWprMGtkZ0FPb1hDTjR5NlNKOWM5ZzJETktBcE1Gb091MGp0TjRqcEZTRmVoZmRVMUV0ZlhEdWwzeTB6WWNXbndTN3FRWXR6dHlsTGM0Um9YM0VzTE5QWGFqRnRIOWp4SG16YkVESU9YbHNXWDdsamdyY1NSNVJMN1k3cFRqZlBuQT09IiwibWFjIjoiZGRkOTAyODUxODllZjBjNTIzMThlMTA3YzhkOTM1ZDRhNGM2YTRkYjUyY2U2Y2NhYjg5Y2VlNGY0YWIwYWUyNSJ9; citypy=eyJpdiI6IlRROTdoN0ZCUVEyU2t6Mk5Ua0JVZFE9PSIsInZhbHVlIjoiZVRleVdFZXlBN2dxdEJkeUErZGp2QT09IiwibWFjIjoiOTk4ZjIwZGI5MGUzYjJmN2E1YTA5MjhlMmY1YzNlZTU2ODIyZjIwZGRmZmYzYmY0ZGFjY2FmYmI5MDc4OGIxZiJ9; cityId=eyJpdiI6Inkxc1l0SkNaN3ZZeFJMMTAySjFWanc9PSIsInZhbHVlIjoibkpyZkhOVDdmZkJZa3B1WEoram5pdz09IiwibWFjIjoiYTg0NGI5ZDNhMmExMzg3Mjg0YmY3MzAxMGVlMDcyNzk5NTNhOGE5NmMzNWViN2Q5NWVjYTVjODhmZDgwYjU1MyJ9'
}
def getProxies():
    # Placeholder for pulling a proxy from a free proxy list. A requests proxy
    # mapping must be {scheme: proxy_url}, and the function must return it.
    proxy_url = "http://www.66ip.cn/"  # free proxy list (not fetched here)
    proxy = {
        'https': 'http://223.241.77.45:3256'
    }
    return proxy
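
getProxies is defined but never called in the original script. If proxy routing is actually wanted, the mapping above plugs straight into requests; a minimal sketch (request_with_proxy is a new name, and the proxy address is the one from the source, very likely stale by now):

# Sketch only: route a request through the mapping returned by getProxies().
def request_with_proxy(url):
    proxies = getProxies()  # e.g. {'https': 'http://223.241.77.45:3256'}
    # Schemes missing from the mapping are fetched directly.
    return requests.get(url, headers=headers, proxies=proxies, timeout=10)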

# Fetch a page, detect its encoding, and parse it into an lxml element tree
def request_url(url):
    req = requests.get(url, headers=headers, timeout=10)  # timeout so a stalled connection cannot hang the crawl
    req.encoding = cchardet.detect(req.content)['encoding']
    source = etree.HTML(req.text)
    return source
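
A long crawl otherwise dies on the first network hiccup, so a small retry wrapper around request_url can help; a hedged sketch (fetch_with_retry is a new name, not part of the original):

# Sketch: retry a flaky fetch with exponential backoff before giving up.
def fetch_with_retry(url, attempts=3):
    for n in range(attempts):
        try:
            return request_url(url)
        except requests.RequestException:
            time.sleep(2 ** n)  # 1s, 2s, 4s ...
    raise RuntimeError(f'failed to fetch {url} after {attempts} attempts')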

def get_condition(source):
    # 1. Title
    title_list = source.xpath('//p[@class="name"]/a/text()')
    # 2. Estate name
    building_list = source.xpath('//strong/text()')
    # 3. Address
    add_init = source.xpath("//span[@title]")
    add_list = [add.xpath("text()")[0] for add in add_init]
    # Strip whitespace and newlines, and drop empty entries
    add_list = [i.strip() for i in add_list if i.strip() != '']
    # (Optional) resolve each address to coordinates with the Baidu Maps API
    # for add in add_list:
    #     locate = getlocation_from_api(add)
    #     print(locate)

    # Map link that sits next to each address
    map_url = [''.join(i.xpath("following-sibling::*[1]/@href")) for i in add_init]
    # 4. Longitude and latitude, parsed out of the map link
    locate_list = [",".join([re.search("longitude=(.+)&latitude=(.+)", i, re.S).group(1),
                             re.search("longitude=(.+)&latitude=(.+)", i, re.S).group(2)])
                   if i != '' else "" for i in map_url]
    # 5. Area
    area_list = source.xpath('//p[@class="type clearfix"]/span[1]/text()')
    # 6. Layout
    layout_list = source.xpath('//p[@class="type clearfix"]/span[3]/text()')
    layout_list = [i.replace("\r\n", "").replace(" ", "") for i in layout_list]
    # 7. Floor
    stories_list = source.xpath('//p[@class="type clearfix"]/span[5]/text()')
    # 8. Orientation
    direction_list = source.xpath('//p[@class="type clearfix"]/span[7]/text()')
    # 9. Construction year
    year_list = source.xpath('//p[@class="type clearfix"]/span[10]/text()')
    # 10. Unit price
    uprice_list = source.xpath('//dd[@class="house_price"]/p[2]/text()')
    uprice_list = [i.replace("\r\n", "").replace(" ", "") for i in uprice_list]
    # 11. Total price
    tprice_list = source.xpath('//dd[@class="house_price"]/p[1]/span/text()')
    tprice_list = [i + "万" for i in tprice_list]
    # 12. Agent
    agent_list = source.xpath('//a[@class="broker_name"]/text()')
    agent_list = [i.strip() for i in agent_list if i.strip() != '']

    # 13. Contact phone: follow each agent's profile page and read the number there
    tel_url = source.xpath('//a[@class="broker_name"]/@href')
    tel_list = []
    for url in tel_url:
        url = "https://fz.sofang.com/" + url
        content = request_url(url)
        try:
            tel = content.xpath('//div[@class="broker_tel"]/text()')
            tel = [i.strip() for i in tel if i.strip() != '']
            tel_list.append(''.join(tel))
        except Exception:
            tel_list.append("")
        time.sleep(2)  # throttle the extra per-agent requests
    # print(tel_list)

    # 14. Tags
    tag = source.xpath('//p[@class="tag clearfix"]')
    tag_list = []
    for i in tag:
        i = i.xpath('span/text()')
        i = "/".join(i)
        tag_list.append(i)
    # print(tag_list)

    # Caution: zip() stops at the shortest column, so one listing with a missing
    # field silently drops or misaligns rows; the second version below parses
    # per listing instead of per column to avoid this.
    for title, building, address, locate, area, layout, floor, direction, year, uprice, \
            tprice, agent, tel, tag in \
            zip(title_list, building_list, add_list, locate_list, area_list,
                layout_list, stories_list, direction_list, year_list, uprice_list,
                tprice_list, agent_list, tel_list, tag_list):
        data = [title, building, address, locate, area, layout, floor, direction,
                year, uprice, tprice, agent, tel, tag]
        print(data)
        save(data)
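
The assignment says missing data should be left blank, but the zip() above truncates to the shortest column instead. itertools.zip_longest at least pads short columns with empty strings; a minimal sketch of a drop-in replacement for that final loop (padding only happens at the end, so per-listing parsing, as in the second version below, remains the robust fix):

from itertools import zip_longest

# Sketch: pad short columns with '' instead of letting zip() drop rows.
# Column order matches the CSV header written in __main__.
columns = [title_list, building_list, add_list, locate_list, area_list,
           layout_list, stories_list, direction_list, year_list, uprice_list,
           tprice_list, agent_list, tel_list, tag_list]
for row in zip_longest(*columns, fillvalue=''):
    save(list(row))
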
def save(data):
    with open("搜房网-福州房价.csv","a",encoding="utf-8-sig",newline="") as f:
        a=csv.writer(f)
        a.writerow(data)
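
save() reopens the CSV file once per row, which gets slow as the file grows; a hedged batch variant (save_rows is a new helper, not in the original) keeps one file handle:

# Sketch: write a batch of rows with a single file handle.
def save_rows(rows):
    with open("搜房网-福州房价.csv", "a", encoding="utf-8-sig", newline="") as f:
        csv.writer(f).writerows(rows)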

# Resolve an address to coordinates with the Baidu Maps geocoding API
def getlocation_from_api(address):
    bdurl = "https://api.map.baidu.com/geocoding/v3/?"
    params = {
        'address': address,
        'output': 'json',
        'ak': "UYKdveDmML50ykiqRIFfLekfCWcgeB4r",
        # 'callback': 'showlocation',
        'city': '福州'
    }
    req = requests.get(bdurl, headers=headers, params=params)
    # longitude
    lgn = req.json()['result']['location']['lng']
    # latitude
    lat = req.json()['result']['location']['lat']
    return ','.join([str(lgn), str(lat)])
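
When geocoding fails (unparseable address, exhausted quota), the API responds with a status code and message instead of a result, so req.json()['result'] raises a KeyError. A guarded variant, sketched on the assumption that status 0 means success as documented for the Geocoding v3 API:

def getlocation_safe(address):
    # Returns 'lng,lat', or '' on failure so the caller can leave the field blank.
    bdurl = "https://api.map.baidu.com/geocoding/v3/?"
    params = {'address': address, 'output': 'json',
              'ak': "UYKdveDmML50ykiqRIFfLekfCWcgeB4r", 'city': '福州'}
    try:
        resp = requests.get(bdurl, headers=headers, params=params, timeout=10).json()
        if resp.get('status') == 0:  # 0 = success per the Geocoding v3 docs
            loc = resp['result']['location']
            return '{},{}'.format(loc['lng'], loc['lat'])
    except (requests.RequestException, ValueError, KeyError):
        pass
    return ''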

if __name__ == '__main__':
    base_url = "https://fz.sofang.com/esfsale/area/"
    source = request_url(base_url)
    # The third-from-last pager link text is used as the total page count
    page = source.xpath('//li/a[@alt]/text()')[-3]
    # Write the CSV header once, on the first run
    if not os.path.exists("搜房网-福州房价.csv"):
        with open("搜房网-福州房价.csv", "w", encoding="utf-8-sig", newline="") as f:
            a = csv.writer(f)
            a.writerow(['标题','楼盘','地址','经纬度','面积','房型','楼层','朝向','建筑年代','单价','总价','经纪人','联系电话','标签'])

    for i in range(1, int(page) + 1):
        url = base_url + "bl" + str(i) + "?"
        content = request_url(url)
        print(url)
        get_condition(content)
        time.sleep(10)  # throttle between result pages

A second version of the same scraper follows. Instead of computing the page count up front it follows the 「下一页」 (next page) link until none is left, parses each listing node as a unit so a missing field cannot shift columns, reuses each broker's phone number once fetched, and writes the results to an Excel workbook with xlsxwriter.

import requests
import cchardet
from lxml import html
import re
import xlsxwriter
import time
import random
head = {
'cookie': 'UM_distinctid=17c7f7b1d8b7c1-018dda850a8de5-b7a1b38-1fa400-17c7f7b1d8ccca; uniqueName=e1aac984e2899a3acdb3ec3f75190084; cityNum=28; Hm_lvt_bc2dcba98591e113d6329631910c965b=1634734120,1634791780,1634901813,1634905507; Hm_lvt_d2801fc638056c1aac7e8008f41cf828=1634280948,1634737369,1634798622,1634905516; Hm_lpvt_d2801fc638056c1aac7e8008f41cf828=1634905551; remember_82e5d2c56bdd0811318f0cf078b78bfc=eyJpdiI6Ild4S0I2ZVJPMjFLQkw3V1l1UTF6Mmc9PSIsInZhbHVlIjoiKytpb3NNTVRlV3Ntcjh2Vk9rSHB4QzFZNVRPTjNZWmF3UUNYdENESFwvWTA1dEw3SHlkaHYzXC9jb2lBZEVSelowaFNUQkdqS3Q1MWVsWTlnVVIxV29rY04rWThpdW1ZeUh6SnRYZzE3ZjNEZz0iLCJtYWMiOiJiOGFlZDZiNWQwNWMyNjBkMjhlMmQwZDMyMTU3YzQwYzM4YzdhYjcxM2Q5ODAyYzgyZWM3MzYzNWJlNjg2ZjhhIn0%3D; codeNum=eyJpdiI6IjVqSUJSNDlQVE1jaGRoN3NqemtnSlE9PSIsInZhbHVlIjoiS25vdzVGNjNLcVVNKzlQTVN2YSt0aW12VjR5K1A5U0lTRFUya2hndnZGOFlKZmdlaVFlTW1BTndoYTE0TncwRCIsIm1hYyI6IjJjYzBjYWU0YmYxNjJkOGFmZDUwOTg1OWNjNzBiNDY0ZjczNTBlMTY5ZTlhMGVlMDc0YWZmY2Q2MzI0NjkxYzEifQ%3D%3D; CNZZDATA1262285598=1501788919-1634225051-https%253A%252F%252Fwww.baidu.com%252F%7C1634907133; Hm_lpvt_bc2dcba98591e113d6329631910c965b=1634912068; XSRF-TOKEN=eyJpdiI6IlpMMVZ0VWFGYXUrUVMwclJ1a0phQXc9PSIsInZhbHVlIjoiM0dtWGZrSGJLN2o1bitZdWVINnZJWTNWWUhjc1N4Yk5pUjd1UkpaSnM3d2xLMTErTUZJVzcrZkdUdUprdFFmTVNwQloyZEZZSGdsYzkrK3hzV1RcL1pnPT0iLCJtYWMiOiIwNDRkNmQ3MGU3MGY4NzQ0NDFjMjUyNWVlYjgxZGIwNThkNWU3YTdlZDhhMWU0MzkxZTA1YWQ4MTYxMDAzNjkzIn0%3D; www_sofang_session=eyJpdiI6Ik5UclBnbnBWbnE1OWxQV3FlXC94dU53PT0iLCJ2YWx1ZSI6IjFlMVUwUGZsMDFHRUl3V0p5VlZcL2hSakVnSjFqTllPa0RYZE95Wk95NElFcW53TDk3eldCVUVcL2hZMUgxaEhUaVwvRGI0eHc1MW5FcW8zaUEzdFhiVDBRPT0iLCJtYWMiOiI4NGRhZWZkZGEwZmViYmQ1NTE3MTc4MzJiYTNlNmFhYmY2NGZjMTFhYjkyZGIyNTFjY2Q3MzU2YzgyM2Y4ZDNiIn0%3D; cityid=eyJpdiI6InF3VlV2WXY0YXowVmh5OUNCQnRMQUE9PSIsInZhbHVlIjoiVmtLWTRiXC9QMk9OWWZJUFpJbHJ4NHc9PSIsIm1hYyI6IjQ3NGQ3YzY4ZjQ3ZjVhNTMyNTJmODQ2ZTljYTA0NTY3YTRhYTNiODc0NWNlNTgyZmViMmMyZWRiYjM5OTYwY2IifQ%3D%3D; city=eyJpdiI6IlM4ZUtzcWt4QnY2RHppaTllaFFFTmc9PSIsInZhbHVlIjoiUndaXC92OFdycENKNW1lU01SUUZEdDZLQm1zMjJieVdJOWhJb25vN3FpeWZoV0xpWXBXTDdkejQ3b05cL1pTNUk5ME5HbXl3TWZkUWF6cW9icEZaS05kUkgrOFJlMTdaQmFxZ1lwdWx0elcxXC9OcTNFRGtGZm5zcnBlaFBCXC91ZzJ1RjNDY29WaHNGZWJMUGJHUDkzSGJiYnNoWWFXVXhaN09kOEZVVFVkVm50SjZLK0hVMVhYOGVweXY1NkxHbXhrXC9uQktIc3FJUnpvZFwvXC9JWTBcL3Y2ZlJBPT0iLCJtYWMiOiI0NTM3MDU4ODA2YzAzYWRhOTQ3YjhlOTZlOGU4ZWIyMGU4NTRkMTk4ZWM4MTEzODZhNmJmZmNlNDNmYTgxOGRjIn0%3D; citypy=eyJpdiI6ImJrZHhySjNTKzhONmpaeEpHK0RHRHc9PSIsInZhbHVlIjoiZmd2Q051VGpTZEhNTFBiM1FpbHRCZz09IiwibWFjIjoiYmUxNTc3NWMyYTg5ZTA3M2U3NWQ3OTIyZTkwNmE3MzJmNDQ3OTJhMzM1MjY4N2Q0NTc0NzEyMTExY2YzOGQwZSJ9; cityId=eyJpdiI6IkZaNjU4QnU4TXNibldnYWEwenpZSEE9PSIsInZhbHVlIjoidVZBVFhnVE1QamZrY1o0ZWNqTnVnUT09IiwibWFjIjoiYzk2NjRmNjZmZjY5MWVkZTY0OGVjOWM2ZGI4YzJkYjExMjJmMzE5Y2NhMmJjYmE3YmVkYzQ5Njc2MGRiZDdhNCJ9',
# 'upgrade-insecure-requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
}

def get_source(url):
    resp = requests.get(url, headers=head, timeout=10)  # timeout so one stalled request cannot hang the crawl
    resp.encoding = cchardet.detect(resp.content)['encoding']
    return resp.text

def is_url_in_list(url, house_info_list):
    # Linear scan for the record whose broker URL (field 12) matches, so an
    # already-fetched broker phone number can be reused; returns -1 if absent.
    i = 0
    while i < len(house_info_list):
        if url == house_info_list[i][12]:
            return i
        i += 1
    return -1
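
The scan is O(n) per listing and couples callers to the tuple layout (field 12). Since the broker URL is the lookup key anyway, a plain dict cache does the same job in constant time; a hedged sketch (broker_tel_cache and get_broker_tel are new names, not in the original):

# Sketch: cache broker phone numbers by profile URL instead of scanning the list.
broker_tel_cache = {}

def get_broker_tel(broker_url):
    if broker_url not in broker_tel_cache:
        time.sleep(random.randint(1, 3))
        root = html.fromstring(get_source(broker_url))
        broker_tel_cache[broker_url] = root.xpath('//div[@class="broker_tel"]')[0].xpath('string(.)').strip()
    return broker_tel_cache[broker_url]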

def get_data_secondhand_house_onepage(source, house_info_list):
    root = html.fromstring(source)
    house_resources_list = root.xpath('//div[@class="list_l"]/div/dl')
    for house_resource in house_resources_list:
        title = house_resource.xpath('dd[@class="house_msg"]/p[@class="name"]/a/text()')[0]
        estate = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]//strong/text()')[0]
        address = house_resource.xpath('dd[@class="house_msg"]//span[@class="address"]/text()')[0].strip()
        longitude_latitude = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"area")]/a[@target="_blank"]/@href')
        if longitude_latitude:
            longitude_latitude = re.search(r'longitude=(.*?)&latitude=(.*)', longitude_latitude[0], re.S)
            longitude_latitude = '{},{}'.format(longitude_latitude.group(1), longitude_latitude.group(2))
        else:
            longitude_latitude = ''
        house_type = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"type")]')[0]
        area = house_type.xpath('span[1]/text()')[0]
        layout = re.sub(r'[\s\r\n]', '', house_type.xpath('span[3]/text()')[0])
        storey = house_type.xpath('span[5]/text()')[0]
        orientations = house_type.xpath('span[7]/text()')[0]
        construction_time = house_type.xpath('span[10]')
        if construction_time:
            construction_time = construction_time[0].xpath('text()')[0]
        else:
            construction_time = ''
        price = house_resource.xpath('dd[@class="house_price"]/p')[0].xpath('string(.)').strip()
        unit_price = house_resource.xpath('dd[@class="house_price"]/p[@class="junjia"]/text()')[0].strip()
        broker_a_tag = house_resource.xpath('dd[@class="house_msg"]/div[starts-with(@class,"house_info")]/p[starts-with(@class,"tag")]/a[@class="broker_name"]')[0]
        broker_url = 'https://fz.sofang.com' + broker_a_tag.xpath('@href')[0]
        broker_name = broker_a_tag.xpath('string(.)').strip()
        pos_in_house_info_list = is_url_in_list(broker_url, house_info_list)
        if pos_in_house_info_list != -1:
            broker_tel = house_info_list[pos_in_house_info_list][13]
        else:
            time.sleep(random.randint(1, 3))
            broker_root = html.fromstring(get_source(broker_url))
            broker_tel = broker_root.xpath('//div[@class="broker_tel"]')[0].xpath('string(.)').strip()
        print('Title: {}\nEstate: {}\nAddress: {}\nCoordinates: {}\nArea: {}\nLayout: {}\nFloor: {}\nOrientation: {}\nYear built: {}\nTotal price: {}\nUnit price: {}\nAgent: {}\nAgent URL: {}\nPhone: {}\n'
              .format(title, estate, address, longitude_latitude, area, layout, storey, orientations, construction_time, price, unit_price, broker_name, broker_url, broker_tel))
        house_info_list.append((title, estate, address, longitude_latitude, area, layout, storey, orientations,
                                construction_time, price, unit_price, broker_name, broker_url, broker_tel))


def get_next_url(source):
    # Read the pager's 「下一页」 (next page) link; returns an empty list on the last page.
    root = html.fromstring(source)
    url_next_page = root.xpath('//a[contains(text(),"下一页")]/@href')
    if url_next_page:
        url_next_page = 'https://fz.sofang.com' + url_next_page[0]
    return url_next_page

def save_data(house_info_list):
    workbook = xlsxwriter.Workbook('soufang.xlsx')
    worksheet = workbook.add_worksheet()
    # worksheet.set_column('A:A', 20)
    # worksheet.set_column('B:B', 10)
    worksheet.write('A1', '标题')
    worksheet.write('B1', '楼盘')
    worksheet.write('C1', '地点')
    worksheet.write('D1', '经纬度')
    worksheet.write('E1', '面积')
    worksheet.write('F1', '房型')
    worksheet.write('G1', '楼层')
    worksheet.write('H1', '朝向')
    worksheet.write('I1', '建筑年份')
    worksheet.write('J1', '总价')
    worksheet.write('K1', '单价')
    worksheet.write('L1', '经纪人')
    worksheet.write('M1', '经纪人url')
    worksheet.write('N1', '电话')
    i = 2
    for house_info in house_info_list:
        j = 0
        for data in house_info:
            worksheet.write('{}{}'.format(chr(ord('A')+j), i), data)
            j += 1
        i += 1
    workbook.close()
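
The cell-by-cell loops work, but xlsxwriter also accepts numeric (row, col) addresses, and its write_row method emits a whole record at once; a hedged equivalent of the two nested loops above:

# Sketch: same output as the nested loops, using write_row with numeric indices.
for row_idx, house_info in enumerate(house_info_list, start=1):  # row 0 is the header
    worksheet.write_row(row_idx, 0, house_info)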


if __name__ == '__main__':
    house_info_list = []
    i = 1
    url = 'https://fz.sofang.com/esfsale/area'
    while True:
        # Request one Sofang Fuzhou second-hand listings page and get its source
        head['referer'] = 'https://fz.sofang.com/esfsale/area/bl{}?'.format(i)  # Important: the site's anti-scraping check looks at the referer header
        source = get_source(url)
        # Extract the data we need from the page source
        get_data_secondhand_house_onepage(source, house_info_list)
        time.sleep(random.randint(1, 3))
        print(f'Extracted data from page {i}\n')
        url = get_next_url(source)
        if not url:
            break
        i += 1

    # # Print the collected data
    # for house_info in house_info_list:
    #     print('Title: {}\nEstate: {}\nAddress: {}\nCoordinates: {}\nArea: {}\nLayout: {}\nFloor: {}\nOrientation: {}\nYear built: {}\nTotal price: {}\nUnit price: {}\nAgent: {}\nAgent URL: {}\nPhone: {}\n'
    #           .format(house_info[0], house_info[1], house_info[2], house_info[3], house_info[4],
    #                   house_info[5], house_info[6], house_info[7], house_info[8], house_info[9], house_info[10], house_info[11], house_info[12], house_info[13]))
    
    save_data(house_info_list)

From: https://www.cnblogs.com/Gimm/p/18116245
