解析_1_xpath基本使用
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"/>
<title>Title</title>
</head>
<body>
<ul>
<li id="l1" class="c1">北京</li>
<li id="2l">上海</li>
<li >深圳</li>
<li >武汉</li>
</ul>
<ul>
<li id="i1" class="c1">大连</li>
<li class="c2">锦州</li>
<li>沈阳</li>
</ul>
</body>
</html>
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/4/3 21:38.
@Author: haifei
"""
import time
from lxml import etree
'''
xpath解析两种方式
1、本地文件
etree.parse
2、服务器响应的数据(response.read().decode('utf-8')) <---主用
etree.HTML()
xpath基本语法:
1.路径查询
//:查找所有子孙节点,不考虑层级关系
/ :找直接子节点
2.谓词查询
//div[@id]
//div[@id="maincontent"]
3.属性查询
//@class
4.模糊查询
//div[contains(@id, "he")]
//div[starts-with(@id, "he")]
5.内容查询
//div/h1/text()
6.逻辑运算(用的情况比较少)
//div[@id="head" and @class="s_down"]
//title | //price
'''
# Demo: xpath basics against a local HTML file.
# etree.parse is the "local file" flavour; etree.HTML handles server responses.
tree = etree.parse('./解析_1_xpath基本使用.html')
print(tree)  # e.g. <lxml.etree._ElementTree object at 0x1028bf100>

# Path query -- li elements reached via body > ul (direct children only).
direct_lis = tree.xpath('//body/ul/li')
print(direct_lis)
print(len(direct_lis))

# '//' matches descendants at any depth; same result set as the query above.
all_lis = tree.xpath('//body//li')
print(all_lis)
print(len(all_lis))

# Predicate query -- only li elements that carry an id attribute.
with_id = tree.xpath('//li[@id]')
print(with_id)
print(len(with_id))

# text() extracts the element's text content.
with_id_text = tree.xpath('//li[@id]/text()')
print(with_id_text)  # ['北京', '上海', '大连']

# Match a specific attribute value (mind the nested quotes).
l1_text = tree.xpath('//li[@id="l1"]/text()')
print(l1_text)  # ['北京']

# Attribute query -- read the class attribute of the li whose id is "l1".
l1_class = tree.xpath('//li[@id="l1"]/@class')
print(l1_class)  # ['c1']

# Fuzzy query -- ids that contain the letter 'l'.
contains_l = tree.xpath('//li[contains(@id,"l")]/text()')
print(contains_l)  # ['北京', '上海']

# Fuzzy query -- ids that start with 'i'.
starts_i = tree.xpath('//li[starts-with(@id,"i")]/text()')
print(starts_i)  # ['大连']

# Logical AND inside a single predicate.
both = tree.xpath('//li[@id="l1" and @class="c1"]/text()')
print(both)  # ['北京']

# Union of two full paths ('|' needs the whole path repeated on each side).
either = tree.xpath('//li[@id="l1"]/text() | //li[@class="c2"]/text()')
print(either)  # ['北京', '锦州']
if __name__ == '__main__':
    # NOTE(review): `start` is captured immediately before the print, so the
    # reported duration is always ~0 -- this timing template wraps no real work.
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
解析_2_获取百度首页百度一下四个大字
"""
.-''-.
.--. _..._ .' .-. )
|__| .' '. / .' / /
.--..-,.--. . .-. . (_/ / /
| || .-. | | ' ' | / /
| || | | | _ _ | | | | / / _ _
| || | | || ' / | | | | | . ' | ' / |
| || | '-.' | .' | | | | | / / _.-').' | .' |
|__|| | / | / | | | | | .' ' _.'.-'' / | / |
| | | `'. | | | | | / /.-'_.' | `'. |
|_| ' .'| '/| | | | / _.' ' .'| '/
`-' `--' '--' '--'( _.-' `-' `--'
Created on 2023/4/3 22:40.
@Author: haifei
"""
import time
from lxml import etree
from urllib import request
# Steps: (1) fetch the page source, (2) parse it with etree.HTML
# (the "server response" flavour of lxml parsing), (3) print the result.
url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# Build a customised request object so the UA header is sent along.
req = request.Request(url=url, headers=headers)
# Act like a browser and fetch the page from the server.
resp = request.urlopen(req)
# Decode the raw response body into the page source.
page_source = resp.read().decode('utf-8')

# Parse the server response.
dom = etree.HTML(page_source)

# xpath always returns a list; grab the search button's value attribute.
print(dom.xpath('//input[@id="su"]/@value'))  # ['百度一下']
button_label = dom.xpath('//input[@id="su"]/@value')[0]
print(button_label)  # 百度一下
if __name__ == '__main__':
    # NOTE(review): as in the first script, `start` is taken right before the
    # print, so the elapsed time shown is always ~0 seconds.
    start = time.time()
    print('It takes', time.time() - start, "seconds.")
标签:xpath,tree,list,li,print,解析,id
From: https://www.cnblogs.com/yppah/p/17284846.html