正则
元字符
元字符:具有固定含义的特殊符号
常用元字符:
. 匹配除换行符以外的任意字符;注意:在Python的re模块中 . 默认不匹配换行符,需要re.S(re.DOTALL)才能匹配
\w 匹配字母或数字或下划线
\d 匹配数字
\n 匹配一个换行符
\t 匹配一个制表符
^ 匹配字符串的开始
$ 匹配字符串的结尾
\W 匹配非字母或数字或下划线
\D 匹配非数字
\S 匹配非空白符
a|b 匹配字符a或字符b
() 匹配括号内的表达式,同时表示一个分组
[...] 匹配字符组中的字符
[^...] 匹配除了字符组中的字符的所有字符
量词
* 重复零次或更多次
+ 重复一次或更多次
? 重复零次或一次
{n} 重复n次
{n,} 重复n次或更多次
{n,m} 重复n次到m次
贪婪匹配和惰性匹配
.* 贪婪匹配
.*? 惰性匹配
re模块
# The re module ships with the Python standard library.
import re

# findall: return every non-overlapping match as a list of strings.
result = re.findall("a", "abcdefghij")
print(result)

result = re.findall(r"\d+", "我是一个1234哈324")
print(result)

# finditer: returns an iterator of Match objects (important — practice this one).
result = re.finditer(r"\d+", "我今年18岁,我有20000元")
for item in result:          # pull each Match out of the iterator
    print(item.group())      # .group() extracts the matched text
# NOTE: the iterator is exhausted after the loop, so this prints the
# (now empty) iterator object itself, not the matches.
print(result)

# search: returns only the FIRST match found anywhere in the string.
result = re.search(r"\d+", "我叫周杰伦,我今年32岁,5年4班")
print(result.group())

# match: anchors at the start of the string (as if the pattern began with ^).
# Here the string does not start with digits, so this prints None.
result = re.match(r"\d+", "我叫周杰伦,我今年32岁,5年4班")
print(result)

# compile: pre-build the pattern object once so it can be reused.
obj = re.compile(r"\d+")
# use the pre-compiled pattern directly
result = obj.findall("我叫周杰伦,今年44岁,3年2班")
print(result)
import re

s = """
<div class='西游记'><span id='10010'>中国联通</span></div>
<div class='西游记'><span id='10086'>中国移动</span></div>
"""
# To extract data, the wanted part must be wrapped in parentheses;
# (?P<名字>正则) gives the group a name, and the value is later
# retrieved with match.group("名字").
obj = re.compile(r"<span id='(?P<id>\d+)'>(?P<name>.*?)</span>")
result = obj.finditer(s)
for item in result:
    # renamed local from `id` to `span_id`: `id` shadows the builtin id()
    span_id = item.group("id")
    print(span_id)
    name = item.group("name")
    print(name)
bs4
# pip install bs4
# fixed typo: original said "BeautitfulSoup", which raises ImportError
from bs4 import BeautifulSoup

html = """
<div class="container">
<h1 class="test">
<a href="t" id="home-link">我的小站-阿里云盘资源共享站</a>
</h1>
<h1 class="test">
<a href="ts" id="home-link">我的小站-</a>
</h1>
<h1 class="test">
<a href="tss" id="home-link">-阿里云盘资源共享站</a>
</h1>
<a></a>
<div id="header-primary" class="Header-primary"></div>
<div id="header-secondary" class="Header-secondary"></div>
</div>
"""
# 1. Build the BeautifulSoup parse tree with the stdlib html.parser backend.
page = BeautifulSoup(html, "html.parser")
# 2. find() returns the first hit; findAll() returns all hits.
h1 = page.find("h1", attrs={"class":"test"})
a = h1.find("a")
print(a.text)          # text inside the tag
print(a.get("href"))   # attribute value via .get("属性名")
案例 1 提取文字
import requests
from bs4 import BeautifulSoup

# Case 1: walk listing pages 0..400, collect every h4.title link text and
# append it to a local file, one page section at a time.
# NOTE(review): "hsck.scv" looks like a typo for "hsck.csv" — kept as-is
# to preserve the original output path; confirm before renaming.
for i in range(0, 401):
    url = "http://2bhsck.cc/vodtype/1-" + str(i) + ".html"
    resp = requests.get(url)
    html = resp.text
    page = BeautifulSoup(html, "html.parser")
    h4 = page.findAll("h4", attrs={"class": "title"})
    # with-block closes the file handle; the original called open() per
    # write and never closed anything (leaked file handles).
    with open("hsck.scv", mode="a", encoding="utf-8") as f:
        f.write(f"--------------第{i}页---------------\n")
        for item in h4:
            a = item.find("a").text
            f.write(f"{a}\n")
    print(f"第{i}页 完成")
案例 2 图片提取
import requests
from bs4 import BeautifulSoup

# Case 2: from the gallery index page, follow each album link, find the
# image inside div.ImageBody on the child page and save it to disk.
url = "http://www.umeituku.com/bizhitupian/meinvbizhi/"
resp = requests.get(url)
resp.encoding = "utf-8"

n = 1  # running counter used to name the saved files 1.jpg, 2.jpg, ...
main_page = BeautifulSoup(resp.text, "html.parser")
min_page = main_page.findAll("a", attrs={"class": "TypeBigPics"})
for item in min_page:
    href = item.get("href")          # link to the album detail page
    resp2 = requests.get(href)
    resp2.encoding = "utf-8"
    child_page = BeautifulSoup(resp2.text, "html.parser")
    img = child_page.find("div", attrs={"class": "ImageBody"})
    img_src = img.find("img").get("src")   # the actual image URL
    img_resp = requests.get(img_src)
    print(img_src)
    # binary write; the directory 12_图片提取 must already exist
    with open(f"12_图片提取/{n}.jpg", mode="wb") as f:
        f.write(img_resp.content)
    n += 1
xpath 解析
# pip install lxml
from lxml import etree
# If the import above fails, try this spelling instead:
# from lxml import html
# etree = html.etree

xml = """
<book>
<id>111</id>
<name>Lorem, ipsum.</name>
<sprice>1.23</sprice>
<nick>臭豆腐</nick>
<author>
<nick id="10085">大飞</nick>
<nick id="111">小飞</nick>
<nick id="jay">周杰伦</nick>
</author>
<div>
<nick>Lorem.</nick>
</div>
<partner>
<nick id="ppc">Lorem, ipsum dolor.</nick>
<nick id="pcc">Lorem ipsum dolor sit.
</nick>
</partner>
</book>
"""
# For this exercise the text is an XML fragment, so parse with etree.XML.
et = etree.XML(xml)
# a leading / selects from the root node
result = et.xpath("/book")
# / between steps selects direct children
result = et.xpath("/book/name")
# text() extracts the node's text content
result = et.xpath("/book/name/text()")[0]
# // selects all descendants, not just children
result = et.xpath("/book//nick")
# * is a wildcard matching any single element
result = et.xpath("/book/*/nick/text()")
# [] filters on an attribute: [@属性名='值']
# fixed: the sample XML uses id="jay", not class="jay" — the original
# query matched nothing and returned an empty list
result = et.xpath("/book/author/nick[@id='jay']/text()")
# a trailing @id returns the attribute's value itself
result = et.xpath("/book/partner/nick/@id")
print(result)
from lxml import etree
import requests

# Scrape the zbj.com service-search results: price, service name and
# company name are pulled with three parallel xpath queries.
html = "https://zhengzhou.zbj.com/search/service/?l=0&kw=%E6%95%B0%E6%8D%AE%E6%8A%93%E5%8F%96&r=2"
resp = requests.get(html)
et = etree.HTML(resp.text)

# price
jxge = et.xpath("//div[@class='price']/span/text()")
# service / business name
yewu = et.xpath("//div[@class='name-pic-box']/a/text()")
# company name
gssi = et.xpath("//div[@class='shop-info text-overflow-line']/text()")

# zip pairs the three lists directly instead of indexing with
# range(len(...)); it also stops at the shortest list instead of
# raising IndexError when the page yields unequal counts.
for business, price, company in zip(yewu, jxge, gssi):
    print(f"业务:{business}\n价格:{price} \n公司:{company}")
    print("")
pyquery
# pip install pyquery
from pyquery import PyQuery
import requests


def get_page_source(url):
    """Download *url* and return its UTF-8-decoded HTML text."""
    resp = requests.get(url)
    resp.encoding = "utf-8"
    return resp.text


def parser_page_source(html):
    """Parse the review list with CSS selectors and print each review time."""
    doc = PyQuery(html)
    # NOTE(review): `user` is collected but never used in the original —
    # kept to preserve the selector example.
    user = doc("article>aside>header>div>h2>a").items()
    time = doc("article>aside>section>div>div>p:nth-child(1)").items()
    for item in time:
        print(item.text())


def main():
    url = "https://www.dongchedi.com/auto/series/score/4363-x-x-x-x-x"
    html = get_page_source(url)
    parser_page_source(html)


if __name__ == '__main__':
    main()
标签:匹配,text,item,html,result,print,解析,数据
From: https://www.cnblogs.com/sroot/p/17414870.html