bs4模块

bs4模块

beautifulsoup4 是一个从HTML或XML文件中提取数据的Python库，可以用它来解析爬取回来的HTML/XML内容。

1.安装
    pip install beautifulsoup4  # 下载bs4模块
    pip install lxml  # 解析库
2. 用法
    soup = BeautifulSoup('要解析的内容(str类型)', 'html.parser')  # 解析器也可以换成 'lxml'

bs4遍历文档树

from bs4 import BeautifulSoup

# Sample HTML document that the traversal examples below operate on.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
kimi
<b>The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" name='kiki'>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Instantiate the BeautifulSoup object, parsing html_doc with the lxml parser.
soup = BeautifulSoup(html_doc, 'lxml')
# 1 美化：即使不是标准的xml/html，也会自动补全标签并美化输出
# print(soup)
# print(soup.prettify())

# 2 遍历文档树---》通过 . 来遍历
# print(soup.html.body.p.b)  # 一层一层找 <b>The Dormouse's story</b>
# print(soup.p)  # 跨层  只找第一个p标签


# 3、获取标签的名称
# print(soup.a.name) # a

# 4、获取标签的属性  ---》属性字典
# print(soup.a.attrs['href']) # http://example.com/elsie
# print(soup.a.attrs['class']) # ['sister']
# print(soup.a.attrs.get('class'))  # ['sister']
# print(soup.a.attrs.get('name'))  # kiki


# 5、获取标签的内容
# print(soup.p.text)  # 获取第一个p标签内的所有文本（包含其子孙标签的文本）
"""string p下的文本只有一个时，取到，否则为None"""
# print(soup.p.string)  # None
"""  strings拿到的是生成器对象"""
# print(list(soup.p.strings))  # generator生成器对象

# 6、嵌套选择
# print(soup.html.body)

# ---- 了解
# 7.子节点、子孙节点
# print(soup.body.p.contents)  #p下所有子节点，只取一层
# print(list(soup.p.children))  #list_iterator得到一个迭代器,包含p下所有子节点  只取一层
# print(list(soup.body.descendants))  # generator   拿到body标签下的所有子孙节点

# 8.父节点、祖先节点
# print(soup.a.parent) #获取a标签的父节点
# print(list(soup.a.parents) ) #找到a标签所有的祖先节点，

# 9、兄弟节点
# print(soup.a.next_sibling)  #下一个兄弟
# print(soup.a.previous_sibling) #上一个兄弟
# print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象
# print(list(soup.a.previous_siblings)) #上面的兄弟们=>生成器对象

bs4搜索文档树

find 和find_all,

# find参数
  name
  class_
  id
  text
  attrs
  limit：限制返回的条数，find_all用的；find本质上是find_all加上limit=1
  recursive：查找的时候，是只找第一层还是子子孙孙都找，默认是True，子子孙孙都找

from bs4 import BeautifulSoup

# Sample HTML document that the find / find_all examples below search through.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
kimi
<b>The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" name='kiki'>Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Instantiate the BeautifulSoup object, parsing html_doc with the lxml parser.
soup = BeautifulSoup(html_doc, 'lxml')

"""五种过滤器: 字符串、正则表达式、列表、True、方法 """

# 1 字符串--->查询的条件是字符串
# res = soup.find(name='p')  # 返回p标签
# res = soup.find_all('p')  # 返回p标签的列表

# res = soup.find_all(class_='sister')  # 类名叫sister的所有标签
# res = soup.find_all(attrs={'class':'sister'})

# res = soup.find_all(id='link1')  # 查找link1的标签
# res = soup.find_all(attrs={'id':'link1'})

# res = soup.find(text='Elsie').parent # 文本内容为Elsie的节点的父标签（bs4 4.4.0起text参数更名为string，推荐写string='Elsie'）


# 2 正则表达式
import re
# res = soup.find_all(id=re.compile('^l'))  # 所有id以l开头的标签
# res = soup.find_all(class_=re.compile('^s')) # 所有class以s开头的标签


# 3 列表
# res = soup.find_all(id=['link1','link2']) # 拿到id=link1和id=link2的标签列表
# res = soup.find_all(name=['a','b']) # 拿到所有的a,b标签列表
# res = soup.find_all(['a','b']) # 不写默认是name=['a','b']


# 4 True
# res = soup.find_all(id=True)  # 所有带有id属性的标签
# res = soup.find_all(href=True)  # 所有带有href属性的标签
# res = soup.find_all(class_=True)  # 所有带有class属性的标签


# 5 方法
def has_class_but_no_id(tag):
    """Filter predicate: accept tags that have a ``class`` attribute but no ``id``.

    :param tag: object exposing ``has_attr(name)``; the file passes this
        function as the ``name`` filter of ``soup.find_all``.
    :return: falsy when ``class`` is missing, otherwise ``True`` only if
        ``id`` is missing as well.
    """
    # Without a class attribute the tag can never qualify.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')
# Pass the predicate as the filter: find_all keeps the tags for which it returns True.
print(soup.find_all(name=has_class_but_no_id))


# limit 参数
# res=soup.find_all(href=True,limit=2) # 查找href属性的标签，只拿两个

# recursive  查找的时候，子子孙孙都找
# res=soup.find_all(name='p',recursive=True)
# res=soup.find_all(name='a')


# Recommended: combine attribute traversal with searching.
# First walk down to the first <p> under <body> ...
first_paragraph = soup.html.body.p
# ... then search only its direct children (recursive=False) for <b> tags.
res = first_paragraph.find_all(name='b', recursive=False)
print(res)  # [<b>The Dormouse's story</b>]

CSS筛选器

# Sample HTML for the CSS selector (select) examples.
# NOTE: the BeautifulSoup(html_doc, 'lxml') call after this literal is commented out,
# so enable it before running the select() examples against this document.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
    <b>The Dormouse's story</b>
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
        <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    <div class='panel-1'>
        <ul class='list' id='list-1'>
            <li class='element'>Foo</li>
            <li class='element'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
        <ul class='list list-small' id='list-2'>
            <li class='element'><h1 class='yyyy'>Foo</h1></li>
            <li class='element xxx'>Bar</li>
            <li class='element'>Jay</li>
        </ul>
    </div>
    and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
# soup = BeautifulSoup(html_doc,'lxml')

# select内写css选择器
# print(soup.select('.sister'))
# print(soup.select('#link1'))
# print(soup.select('#link1 span'))

# 终极大招---》如果不会写css选择器，可以复制
import requests

# Fetch a live page. The timeout stops the script from hanging forever if the
# server stalls; requests applies no timeout by default.
res = requests.get('https://www.w3school.com.cn/css/css_selector_attribute.asp', timeout=10)
# Raise on 4xx/5xx responses instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text, 'lxml')

# Selector copied from the browser dev tools: right-click the element -> Copy -> Copy selector.
# select_one returns None when nothing matches, so a changed page layout does
# not surface as an opaque IndexError.
heading = soup.select_one('#maincontent > div:nth-child(4) > h2')
print(heading.text if heading is not None else 'no element matched the selector')

标签：bs4,res,id,soup,模块,标签,print,find
From： https://www.cnblogs.com/zhanglanhua/p/17227987.html

bs4模块

bs4模块

bs4遍历文档树

bs4搜索文档树

CSS筛选器

相关文章

赞助商

阅读排行