目录
bs4模块
beautifulsoup4 是一个从 HTML 或 XML 文件中提取数据的 Python 库,用它来解析爬取回来的 HTML 或 XML 文档。
1.安装
pip install beautifulsoup4 # 下载bs4模块
pip install lxml #解析库
2. 用法
soup=BeautifulSoup('要解析的内容str类型','html.parser/lxml')
bs4遍历文档树
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
kimi
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" name='kiki'>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# 实例化对象
soup = BeautifulSoup(html_doc, 'lxml')
# 1 美化:即使文档不是标准的 xml/html(例如缺少闭合标签),也会自动补全并格式化输出
# print(soup)
# print(soup.prettify())
# 2 遍历文档树---》通过 . 来遍历
# print(soup.html.body.p.b) # 一层一层找 <b>The Dormouse's story</b>
# print(soup.p) # # 跨层 只找第一个p标签
# 3、获取标签的名称
# print(soup.a.name) # a
# 4、获取标签的属性 ---》属性字典
# print(soup.a.attrs['href']) # http://example.com/elsie
# print(soup.a.attrs['class']) # ['sister']
# print(soup.a.attrs.get('class')) # ['sister']
# print(soup.a.attrs.get('name')) # kiki
# 5、获取标签的内容
# print(soup.p.text) # 获取第一个p标签内的所有文本(包含其子孙标签的文本)
"""string p下的文本只有一个时,取到,否则为None"""
# print(soup.p.string) # None
""" strings拿到的是生成器对象"""
# print(list(soup.p.strings)) # generator生成器对象
# 6、嵌套选择
# print(soup.html.body)
# ---- 了解
# 7.子节点、子孙节点
# print(soup.body.p.contents) #p下所有子节点,只取一层
# print(list(soup.p.children)) #list_iterator得到一个迭代器,包含p下所有子节点 只取一层
# print(list(soup.body.descendants)) # generator 拿到body标签下的所有的子孙节点
# 8.父节点、祖先节点
# print(soup.a.parent) #获取a标签的父节点
# print(list(soup.a.parents) ) #找到a标签所有的祖先节点,
# 9、兄弟节点
# print(soup.a.next_sibling) #下一个兄弟
# print(soup.a.previous_sibling) #上一个兄弟
# print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象
# print(list(soup.a.previous_siblings)) #上面的兄弟们=>生成器对象
bs4搜索文档树
find 和 find_all:
# find参数
name
class_
id
text(新版 bs4 中推荐改用 string)
attrs
limit:限制返回的条数,find_all 用的;find 本质上就是 find_all 加 limit=1
recursive:查找的时候,是只找第一层还是子子孙孙都找,默认是True,子子孙孙都找
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
kimi
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" name='kiki'>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# 实例化对象
soup = BeautifulSoup(html_doc, 'lxml')
"""五种过滤器: 字符串、正则表达式、列表、True、方法 """
# 1 字符串--->查询的条件是字符串
# res = soup.find(name='p') # 返回p标签
# res = soup.find_all('p') # 返回p标签的列表
# res = soup.find_all(class_='sister') # 类名叫sister的所有标签
# res = soup.find_all(attrs={'class':'sister'})
# res = soup.find_all(id='link1') # 查找link1的标签
# res = soup.find_all(attrs={'id':'link1'})
# res = soup.find(text='Elsie').parent # 文本内容叫Elsie的父标签
# 2 正则表达式
import re
# res = soup.find_all(id=re.compile('^l')) # 所有id以l开头的标签
# res = soup.find_all(class_=re.compile('^s')) # 所有class以s开头的标签
# 3 列表
# res = soup.find_all(id=['link1','link2']) # 拿到id=link1和id=link2的标签列表
# res = soup.find_all(name=['a','b']) # 拿到所有的a,b标签列表
# res = soup.find_all(['a','b']) # 不写默认是name=['a','b']
# 4 True
# res = soup.find_all(id=True) # 所有带有id属性的标签
# res = soup.find_all(href=True) # 所有带有href属性的标签
# res = soup.find_all(class_=True) # 所有带有class属性的标签
# 5 方法
def has_class_but_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id`` attribute.

    Used as a function filter: it is passed to ``soup.find_all`` and is
    expected to receive one tag object exposing ``has_attr``.
    """
    return tag.has_attr('class') and not tag.has_attr('id')
print(soup.find_all(name=has_class_but_no_id))
# limit 参数
# res=soup.find_all(href=True,limit=2) # 查找href属性的标签,只拿两个
# recursive 查找的时候,子子孙孙都找
# res=soup.find_all(name='p',recursive=True)
# res=soup.find_all(name='a')
# 建议遍历和搜索一起用
res=soup.html.body.p.find_all(name='b',recursive=False)
print(res) # [<b>The Dormouse's story</b>]
CSS筛选器
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
<b>The Dormouse's story</b>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div class='panel-1'>
<ul class='list' id='list-1'>
<li class='element'>Foo</li>
<li class='element'>Bar</li>
<li class='element'>Jay</li>
</ul>
<ul class='list list-small' id='list-2'>
<li class='element'><h1 class='yyyy'>Foo</h1></li>
<li class='element xxx'>Bar</li>
<li class='element'>Jay</li>
</ul>
</div>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
# soup = BeautifulSoup(html_doc,'lxml')
# select内写css选择器
# print(soup.select('.sister'))
# print(soup.select('#link1'))
# print(soup.select('#link1 span'))
# 终极大招---》如果不会写css选择器,可以在浏览器开发者工具中右键元素,选择 Copy → Copy selector 直接复制
import requests

# Fetch a live page and pull one heading out of it with a CSS selector.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
res = requests.get('https://www.w3school.com.cn/css/css_selector_attribute.asp', timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'lxml')

# The selector was copied from the browser dev tools (Copy -> Copy selector).
# select_one returns None when nothing matches, so a changed page layout
# produces a clear message rather than a bare IndexError.
heading = soup.select_one('#maincontent > div:nth-child(4) > h2')
if heading is not None:
    print(heading.text)
else:
    print('selector matched nothing; the page layout may have changed')
标签:bs4,res,id,soup,模块,标签,print,find
From: https://www.cnblogs.com/zhanglanhua/p/17227987.html