# 大数据分析与可视化 之 百度图片爬虫
import requests
import re
from urllib import parse
import os
import time # Import the time module
class BaiduImageSpider(object):
    """Download the images referenced by a Baidu image search result page.

    The spider fetches the result page for a search word, pulls every
    ``"hoverURL"`` value out of the returned text and saves each one as
    ``<save_root><word>/<word>_<n>.jpg``.

    Args:
        save_root: Directory under which one sub-directory per search word
            is created. Defaults to the original hard-coded location.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause after each image download request.
    """

    # Captures the value of every "hoverURL" field found in the page text.
    _HOVER_URL_PATTERN = re.compile(r'"hoverURL":"(.*?)"', re.S)

    def __init__(self, save_root='e:/a/image/', timeout=10, delay=1):
        self.url = 'https://image.baidu.com/search/flip?tn=baiduimage&word={}'
        self.headers = {'User-Agent': 'Mozilla/4.0'}
        # Normalise to exactly one trailing slash so the path formatting in
        # get_image() works whether or not the caller supplied one.
        self.save_root = save_root.rstrip('/\\') + '/'
        self.timeout = timeout
        self.delay = delay

    @staticmethod
    def _extract_links(html):
        """Return the non-empty "hoverURL" values found in *html*, in order."""
        # The pattern also matches `"hoverURL":""`; an empty URL cannot be
        # requested, so such matches are dropped here.
        return [
            link
            for link in BaiduImageSpider._HOVER_URL_PATTERN.findall(html)
            if link
        ]

    # Fetch the result page and download every image it references.
    def get_image(self, url, word):
        """Download all images linked from the result page at *url*.

        Args:
            url: Fully formatted search URL.
            word: Search word; used for the directory and file names.

        Raises:
            requests.RequestException: If the result page itself cannot be
                fetched. Failures of individual images are reported and
                skipped instead of aborting the batch.
        """
        # Without a timeout requests may wait forever on a stalled server.
        res = requests.get(url, headers=self.headers, timeout=self.timeout)
        res.raise_for_status()
        # Force the text decoding used for the page.
        res.encoding = "utf-8"
        img_link_list = self._extract_links(res.text)
        # Show the image URLs that are about to be downloaded.
        print(img_link_list)

        # Create the target directory; exist_ok avoids the check-then-create
        # race of a separate os.path.exists() test.
        directory = '{}{}/'.format(self.save_root, word)
        os.makedirs(directory, exist_ok=True)

        for i, img_link in enumerate(img_link_list, start=1):
            filename = '{}{}_{}.jpg'.format(directory, word, i)
            try:
                self.save_image(img_link, filename)
            except requests.RequestException as err:
                # One broken link must not stop the remaining downloads.
                print(filename, '下载失败', err)
            # Pause between image download requests.
            time.sleep(self.delay)

    # Download a single image.
    def save_image(self, img_link, filename):
        """Fetch *img_link* and write the response body to *filename*.

        Raises:
            requests.RequestException: On network errors, timeouts or a
                non-success HTTP status (so error pages are not saved as
                image files).
        """
        resp = requests.get(
            url=img_link, headers=self.headers, timeout=self.timeout
        )
        resp.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(resp.content)
        print(filename, '下载成功')

    # Entry point.
    def run(self):
        """Prompt for a search word and download the matching images."""
        word = input("您想要谁的照片?")
        # Percent-encode the word so it is safe inside the query string.
        word_parse = parse.quote(word)
        url = self.url.format(word_parse)
        self.get_image(url, word)
# Script entry point: build a spider and start the interactive download.
if __name__ == '__main__':
    BaiduImageSpider().run()
# 标签:数据分析,word,img,url,self,爬虫,可视化,link,image
# From: https://www.cnblogs.com/IvanKK/p/17936784