1. Scraping a wallpaper site with bs4
import requests
from bs4 import BeautifulSoup  # import BeautifulSoup
from urllib.parse import urljoin  # dedicated helper for joining URL paths
import time

header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
}
url = "https://desk.zol.com.cn/pc/"
resp = requests.get(url, headers=header)
resp.encoding = "gbk"  # set the encoding
main_page_source = resp.text
print(type(main_page_source))
# Parse the page source and pull the href out of each <a> tag.
# Feed the page source straight into BeautifulSoup.
main_page = BeautifulSoup(main_page_source, "html.parser")
# Extract what you need from the BeautifulSoup object.
a_list = main_page.find("ul", attrs={"class": "pic-list2"}).find_all("a")
for a in a_list:
    # What we need is the href of the <a> tag.
    # To read an attribute of a tag in bs4:
    href = a.get("href")  # get(attribute)
    if href.endswith(".exe"):  # skip links whose href ends with .exe
        continue
    text = a.find("em").text  # the link text
    # The href is relative, so it has to be joined
    # with the URL of the page it was found on.
    # Worth memorizing -- very handy.
    href = urljoin(url, href)
    # Visit the detail page to get the image download address.
    child_resp = requests.get(href, headers=header)
    child_resp.encoding = "gbk"
    child_page_source = child_resp.text
    child_page = BeautifulSoup(child_page_source, "html.parser")
    # find() can return None here, so guard against it.
    big_img = child_page.find("img", attrs={"id": "bigImg"})
    if big_img is None:
        continue
    src = big_img.get("src")
    print(src)
    # Download the image.
    img_resp = requests.get(src)
    # 1. If the <a> text is unique, it can be used as the file name.
    # 2. If the <a> text repeats, use the name from the URL path instead.
    file_name = src.split("/")[-1]
    with open(file_name, mode="wb") as f:
        f.write(img_resp.content)
    break  # for testing only
    time.sleep(1)
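As a quick sanity check of the urljoin step above, here is a tiny standalone illustration; the relative path is made up for the example, the real hrefs come from the page:

# Tiny illustration of urljoin (the relative path below is made up):
from urllib.parse import urljoin
print(urljoin("https://desk.zol.com.cn/pc/", "/bizhi/9109_111583_2.html"))
# -> https://desk.zol.com.cn/bizhi/9109_111583_2.html
print(urljoin("https://desk.zol.com.cn/pc/", "https://example.com/a.html"))
# -> an already-absolute href is returned unchanged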
2. Scraping an animation review site with re
# Goal: article title, source, author, publish time, body text
# http://www.animationcritics.com/chinese_aniamtion.html
# 1. From the index page, collect the URLs of the 10 detail pages,
#    then loop over them one by one.
# 2. Request each detail-page URL to get the detail-page content.
# 3. From the detail-page content, extract the fields you actually need.
# First question: is the data you want in the page source at all?
import requests
import re
import time

url = "http://www.animationcritics.com/chinese_aniamtion.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
}
resp = requests.get(url, headers=headers)  # send the request
main_page_source = resp.text  # string: the page source
# re.S lets . match newlines as well
main_obj = re.compile(r'<li style="margin-bottom:10px;">.*?href="(?P<url>.*?)" title="(?P<title>.*?)"', re.S)
# regex for the source (来源) field
laiyuan_obj = re.compile(r"来源:</span>(?P<laiyuan>.*?)</span>", re.S)
zuozhe_obj = re.compile(r"作者:</span>(?P<zuozhe>.*?)</span>", re.S)
pub_data_obj = re.compile(r"发布时间: </span>(?P<pub_date>.*?)</span>", re.S)
section_obj = re.compile(r"<section.*?>(?P<content>.*?)</section>", re.S)
p_obj = re.compile(r"<p data-track=.*?>(?P<content>.*?)</p>", re.S)
content_filter_obj = re.compile(r"<.*?>", re.S)
# run the match (the re.S flag already lives in the compiled pattern,
# so finditer only takes the string)
result = main_obj.finditer(main_page_source)
for item in result:  # each iteration yields one match
    child_url = item.group("url")  # detail-page url
    child_title = item.group("title")  # title
    # print(child_title, child_url)
    # request the detail page
    child_resp = requests.get(child_url, headers=headers)
    child_page_source = child_resp.text
    # print(child_page_source)
    # extract the data from the detail page
    lyr = laiyuan_obj.search(child_page_source)  # search returns a Match object
    if lyr:  # None check
        laiyuan = lyr.group("laiyuan")  # group() pulls out the result
    else:
        laiyuan = ""
    # print(laiyuan)
    # author
    zz_r = zuozhe_obj.search(child_page_source)
    if zz_r:
        zuozhe = zz_r.group("zuozhe")
    else:
        zuozhe = ""
    pub_data_r = pub_data_obj.search(child_page_source)
    if pub_data_r:
        pub_data = pub_data_r.group("pub_date")
    else:
        pub_data = ""
    print(child_title, laiyuan, zuozhe, pub_data)
    # Now the body text:
    # collect the content of every <section>.
    sec_list = []
    section_results = section_obj.finditer(child_page_source)
    for section in section_results:
        content = section.group("content")
        sec_list.append(content)  # collect each section's content in the list
    all_content = "".join(sec_list)  # join all sections into one string
    if not all_content:  # if it is empty, fall back to p_obj
        section_results = p_obj.finditer(child_page_source)
        for section in section_results:
            content = section.group("content")
            sec_list.append(content)
        all_content = "".join(sec_list)
    # Strip the leftover tags with a regex: re.sub()
    # result = re.sub(pattern, replacement, whole_string)
    # all_content = re.sub(r"<.*?>", "", all_content)
    all_content = content_filter_obj.sub("", all_content)
    print(all_content)
    time.sleep(1)  # sleep between requests to stay safe
    # break  # for testing
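As a standalone look at the tag-stripping step: sub replaces every match, and because <.*?> is non-greedy each tag is removed while the text between tags survives. The HTML snippet below is made up for the example:

# Standalone demo of the tag-stripping regex (the HTML snippet is made up).
import re
demo = "<section><p>hello</p> <p>world</p></section>"
print(re.sub(r"<.*?>", "", demo))  # -> hello world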
3. Scraping China box-office data with xpath
import requests
from lxml import etree
# 1. Get the page source.
# 2. Extract the data with xpath:
#    //table/tbody/tr
url = "http://www.boxofficecn.com/boxoffice2022"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
}
resp = requests.get(url, headers=headers)
page = etree.HTML(resp.text)
trs = page.xpath("//table/tbody/tr")[1:-1]
# every remaining tr is a data row
for tr in trs:
    num = tr.xpath("./td[1]/text()")
    year = tr.xpath("./td[2]//text()")
    name = tr.xpath("./td[3]//text()")
    # basic Python: xpath returns a list of text fragments,
    # so join them back into one string -- this is the sensible approach
    name = "".join(name)
    money = tr.xpath("./td[4]/text()")
    print(num, year, name, money)

# What the td[3] text list can look like:
# abnormal: the title split into several fragments
movie1 = ['京北的我们(', '重映', ')']  # 京北的我们(重映)
len(movie1)  # 3 elements, not 1
# normal: a single element
movie2 = ['不要忘记我爱你']
# and this one is empty
movie3 = []
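Joining each of the three shapes above shows why "".join is the right normalization: the fragments collapse back into one title, and the empty list simply becomes an empty string:

# Joining each of the three cases listed above:
print("".join(movie1))  # -> 京北的我们(重映)
print("".join(movie2))  # -> 不要忘记我爱你
print("".join(movie3))  # -> "" (empty string)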
4. Scraping Pear Video (pearvideo.com)
import requests
# url = "https://www.pearvideo.com/videoStatus.jsp?contId=1756814&mrd=0.8773583037760648"
while 1:
    main_url = input("Enter the pearvideo page URL to scrape: ")  # e.g. "https://www.pearvideo.com/video_1756814"
    contId = main_url.split("_")[-1]
    print(contId)
    url = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}"
    headers = {
        "Referer": main_url,  # handle the anti-hotlinking (Referer) check
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    dic = resp.json()
    # print(dic)
    src_url = dic['videoInfo']['videos']['srcUrl']
    systemTime = dic["systemTime"]
    src_url = src_url.replace(systemTime, f"cont-{contId}")
    # print(src_url)
    # download the video
    print("Video found, downloading....")
    resp = requests.get(src_url, headers=headers)
    with open(f"{contId}.mp4", mode="wb") as f:
        f.write(resp.content)
    print("Download finished")

# Compare:
# https://video.pearvideo.com/mp4/third/20220330/cont-1756814-15454898-100434-hd.mp4   # the working URL
# https://video.pearvideo.com/mp4/third/20220330/1648910860599-15454898-100434-hd.mp4  # the broken one returned by the API
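The replace step reduced to a standalone sketch, using the two example URLs above (here 1648910860599 plays the role of systemTime and 1756814 the contId):

# Rebuilding the working URL from the broken one, using the comparison above.
broken = "https://video.pearvideo.com/mp4/third/20220330/1648910860599-15454898-100434-hd.mp4"
fixed = broken.replace("1648910860599", "cont-" + "1756814")
print(fixed)
# -> https://video.pearvideo.com/mp4/third/20220330/cont-1756814-15454898-100434-hd.mp4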
5. Scraping movie box-office data with a ThreadPoolExecutor thread pool
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import time


def str_tools(lst):
    if lst:
        s = "".join(lst)
        return s.strip()
    else:
        return ""


def get_movie_info(year):
    # scrape the box-office table for one year
    f = open(f"{year}.csv", mode="w", encoding="utf-8")
    url = f"http://www.boxofficecn.com/boxoffice{year}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    trs = tree.xpath("//table/tbody/tr")[1:]
    for tr in trs:
        num = tr.xpath("./td[1]//text()")
        year = tr.xpath("./td[2]//text()")
        name = tr.xpath("./td[3]//text()")
        money = tr.xpath("./td[4]//text()")
        num = str_tools(num)      # join the list and strip whitespace
        year = str_tools(year)
        name = str_tools(name)
        money = str_tools(money)
        f.write(f"{num},{year},{name},{money}\n")
    f.close()


if __name__ == '__main__':
    # s1 = time.time()  # timestamp before starting
    # for y in range(1994, 2023):
    #     get_movie_info(y)
    # s2 = time.time()  # timestamp after finishing
    # print(s2 - s1)  # 16.23
    # The single-threaded run above is slow, but note that with the pool this site
    # shows data loss, so in practice it needs result validation or a single thread.
    s1 = time.time()
    with ThreadPoolExecutor(20) as t:
        for y in range(1994, 2023):
            t.submit(get_movie_info, y)  # hand the task to the pool
    s2 = time.time()
    print(s2 - s1)  # for downloading images or videos the speed-up is far bigger
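Since each task takes a single argument, the submission loop could also be written with the executor's map method; a sketch of the equivalent call:

# Equivalent submission using map instead of an explicit submit loop (a sketch):
with ThreadPoolExecutor(20) as t:
    t.map(get_movie_info, range(1994, 2023))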
6. Scraping pkdoutu.com with a Queue and a thread pool
import requests
from lxml import etree
import time
import os
from multiprocessing import Queue  # queue
from multiprocessing import Process  # process
from concurrent.futures import ThreadPoolExecutor  # thread pool

# The images really are in the page source,
# but the address is not in src -- it sits in data-original.
# 1. Get the page source.
# 2. Extract data-original.
# 3. Download the images.
# Knowledge point: processes cannot talk to each other directly (at the OS level),
# which is why a Queue is passed between them.

# First process: only responsible for extracting the image urls.
def get_img_url(q):
    for page in range(1, 5):
        # work out one page first
        url = f"https://www.pkdoutu.com/photo/list/?page={page}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
        }
        resp = requests.get(url, headers=headers)
        tree = etree.HTML(resp.text)
        img_urls = tree.xpath("//li[@class='list-group-item']//img/@data-original")
        for img_url in img_urls:
            print(img_url)
            # push the img_url into the queue
            q.put(img_url)  # the standard way
    q.put("no more images")  # sentinel message: nothing else is coming


# Second process: only responsible for downloading the images.
def img_process(q):  # pull urls from the queue and download them
    with ThreadPoolExecutor(10) as t:
        while 1:  # we don't know how many there will be, so keep pulling
            img_url = q.get()  # fine here: get() blocks until something arrives
            if img_url == "no more images":
                break
            # fan the downloads out over threads inside this process
            t.submit(download_img, img_url)


def download_img(url):
    # how to download a single image
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    # file name
    file_name = url.split("/")[-1]
    with open("./img/" + file_name, mode="wb") as f:
        f.write(resp.content)
    print("one image downloaded")


if __name__ == '__main__':
    # prepare the queue
    os.makedirs("img", exist_ok=True)  # download_img writes into ./img
    s1 = time.time()
    q = Queue()  # created in the main process, shared with both children
    p1 = Process(target=get_img_url, args=(q,))  # gets its own memory space
    p2 = Process(target=img_process, args=(q,))  # gets its own memory space
    p1.start()
    p2.start()
    p1.join()  # main process waits for the child processes to finish
    p2.join()
    s2 = time.time()
    print(s2 - s1)
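The "knowledge point" above, reduced to its bare minimum: two processes cannot see each other's variables, so a multiprocessing.Queue is the channel between them. A minimal sketch, separate from the scraper:

# Minimal sketch of passing data between two processes through a Queue.
from multiprocessing import Process, Queue

def producer(q):
    q.put("hello from the child process")

if __name__ == '__main__':
    q = Queue()
    p = Process(target=producer, args=(q,))
    p.start()
    print(q.get())  # blocks until the child has put something
    p.join()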
7. Scraping an image site with asyncio coroutines plus the async packages aiohttp and aiofiles
import asyncio
import aiohttp   # pip install aiohttp  => the async counterpart of requests
import aiofiles  # pip install aiofiles => the async counterpart of open


async def download(url):
    print("starting download of", url)
    file_name = url.split("/")[-1]
    # send the request
    # if the object after `with` comes from an async package, the `with` almost always needs `async` in front
    async with aiohttp.ClientSession() as session:  # think of it as: session = requests.session()
        async with session.get(url) as resp:  # think of it as: resp = session.get()
            # wait for the server's response
            # page source:
            # page_source = await resp.text(encoding="utf-8")
            # json:
            # dic = await resp.json()
            # raw bytes:
            content = await resp.content.read()
            # now that we have the result, write it to disk
            # plain synchronous code also works inside a coroutine,
            # but open() blocks, which is slow:
            # with open(file_name, mode="wb") as f:
            #     f.write(content)
            async with aiofiles.open(file_name, mode="wb") as f:
                await f.write(content)
    print("one image downloaded!")


async def main():
    urls = [
        "https://www.xiurenji.vip/uploadfile/202110/20/1F214426892.jpg",
        "https://www.xiurenji.vip/uploadfile/202110/20/91214426753.jpg"
    ]
    tasks = []
    for url in urls:
        tasks.append(asyncio.ensure_future(download(url)))
    await asyncio.wait(tasks)


if __name__ == '__main__':
    # asyncio.run(main())
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())
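asyncio.gather is a common alternative to the ensure_future plus wait pair used in main(); a sketch of the same fan-out written that way:

# The same fan-out written with asyncio.gather (a sketch); gather schedules
# the coroutines itself, so no explicit ensure_future is needed.
async def main():
    urls = [
        "https://www.xiurenji.vip/uploadfile/202110/20/1F214426892.jpg",
        "https://www.xiurenji.vip/uploadfile/202110/20/91214426753.jpg"
    ]
    await asyncio.gather(*(download(url) for url in urls))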
From: https://blog.51cto.com/u_15920572/8614435