首页 > 其他分享 > wallhaven.cc 网页爬取图片练习

wallhaven.cc网页爬取图片练习

时间:2022-12-17 21:57:51 浏览次数:45
标签:img filepath cc wallhaven filename 爬取 result print

import os
import re

import requests

# Search-results page whose wallpapers will be downloaded.
url = "https://wallhaven.cc/search?q=id:12757&sorting=random&ref=fp"
# Anti-scraping countermeasures: none yet.


# Fetch the search page. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops early on an HTTP error
# instead of silently scanning an error page that contains no wallpaper ids.
response = requests.get(url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
text = response.text


# Regular expressions. Raw strings are used so that backslashes reach the
# regex engine unchanged and no invalid string escapes (such as "\=") occur.
# zhengze: the data-wallpaper-id="<id>" attribute of each search result.
zhengze = r'(data-wallpaper-id=").*?(")'
# zhengze1 / zhengze2 are not used by the code below; kept from earlier
# experiments. zhengze2 previously read {1-6}, which is not a quantifier and
# matched those characters literally; {1,6} is the repetition that was meant.
zhengze1 = r'"([0-9a-zA-Z].*)"'
zhengze2 = r'".{1,6}"'
# zhengze3: the quoted full-size image URL (.png or .jpg) on a detail page.
# Dots are escaped so they match a literal "." rather than any character.
zhengze3 = r'"https://w\.wallhaven\.cc/full/.*?(\.png|\.jpg)"'
moshi = re.compile(zhengze)
moshi2 = re.compile(zhengze3)
pipeijieguos = moshi.finditer(text)
# NOTE: an earlier version of this loop built the full-size image URL directly
# from the wallpaper id and always assumed a ".jpg" extension, so PNG
# wallpapers could not be downloaded. The loop below instead opens each
# wallpaper's detail page and reads the real full-size URL from it.

# Every match of `moshi` has the form data-wallpaper-id="<id>"; the id is what
# remains after dropping this prefix and the closing quote.
ID_PREFIX = 'data-wallpaper-id="'
# Full-size files are named wallhaven-<id>.<ext>; they are saved as <id>.<ext>.
FILE_PREFIX = 'wallhaven-'
IMG_DIR = 'img'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled request blocks forever

# open() does not create missing directories, so make sure the target exists.
os.makedirs(IMG_DIR, exist_ok=True)

for pipeijieguo in pipeijieguos:
    result = pipeijieguo.group(0)
    filename = result[len(ID_PREFIX):-1]

    # Fetch the wallpaper's detail page; skip this wallpaper if that fails so
    # one bad page does not abort the whole run.
    url2 = "https://wallhaven.cc/w/" + filename
    try:
        response2 = requests.get(url2, timeout=REQUEST_TIMEOUT)
        response2.raise_for_status()
    except requests.RequestException as exc:
        print(f"{url2}获取失败: {exc}")
        continue
    response2.encoding = 'utf-8'
    text2 = response2.text

    for pipeijieguo2 in moshi2.finditer(text2):
        # The match includes the surrounding double quotes; strip them.
        src = pipeijieguo2.group(0).replace('"', '')

        # Derive the local file name from the last URL path segment instead of
        # fixed character offsets, so ids of any length and any extension work.
        name = src.rsplit('/', 1)[-1]
        if name.startswith(FILE_PREFIX):
            name = name[len(FILE_PREFIX):]
        print(name)

        filepath = f'{IMG_DIR}/{name}'
        if os.path.exists(filepath):
            print(f"{filepath}已经存在了,不需要下载")
            continue

        # Check the HTTP status before writing, so an error response body is
        # never saved to disk as if it were an image.
        try:
            image_response = requests.get(src, timeout=REQUEST_TIMEOUT)
            image_response.raise_for_status()
        except requests.RequestException as exc:
            print(f"{src}下载失败: {exc}")
            continue

        # `with` closes the file even if the write raises.
        with open(filepath, 'wb') as img:
            img.write(image_response.content)
        print(f"{filepath}下载完成")

标签:img,filepath,cc,wallhaven,filename,爬取,result,print
From: https://www.cnblogs.com/creeperrr/p/16989589.html

相关文章