import os
import re
import time
import uuid

import requests
from fake_useragent import UserAgent

# Request headers with a randomized User-Agent to reduce the chance of
# being rejected by Baidu's anti-scraping checks.
headers = {
    "User-agent": UserAgent().random,  # randomly generated User-Agent
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
}

# Extracts thumbnail image URLs from Baidu's acjson search response body.
img_re = re.compile('"thumbURL":"(.*?)"')
# NOTE(review): unused in this script; kept so external callers (if any) still find it.
img_format = re.compile("f=(.*).*?w")


def file_op(img, file_path):
    """Write raw image bytes to a uniquely named .jpeg file under *file_path*.

    img: bytes of the downloaded image.
    file_path: existing directory to write into.
    A failed write is reported but does not abort the crawl (was a bare
    ``except: pass`` that also left failures on ``open`` unguarded).
    """
    tmp_file_name = os.path.join(file_path, '%s.jpeg' % uuid.uuid4().hex)
    try:
        with open(file=tmp_file_name, mode="wb") as file:
            file.write(img)
    except OSError as exc:
        # Best-effort: skip this image but surface the reason.
        print('写入文件失败 {}: {}'.format(tmp_file_name, exc))


def xhr_url(url_xhr, start_num=0, page=5, file_path=''):
    """Crawl Baidu image-search result pages and save every thumbnail.

    url_xhr: acjson search URL ending in ``pn=``; the result offset is appended.
    start_num: first result offset (Baidu serves 30 results per page).
    page: number of pages to fetch (offsets advance in steps of 30).
    file_path: directory images are written to (must already exist).
    Stops early on the first non-200 search response.
    """
    end_num = page * 30
    curre_page = 1
    for page_num in range(start_num, end_num, 30):
        print('正在爬取第{}页'.format(curre_page))
        # timeout added: requests defaults to no timeout and can hang forever.
        resp = requests.get(url=url_xhr + str(page_num), headers=headers,
                            timeout=30)
        if resp.status_code != 200:
            break
        for img_url in img_re.findall(resp.text):
            try:
                img_rsp = requests.get(url=img_url, headers=headers,
                                       timeout=30)
            except requests.RequestException as exc:
                # One dead thumbnail URL must not abort the whole page.
                print('下载失败 {}: {}'.format(img_url, exc))
                continue
            file_op(img=img_rsp.content, file_path=file_path)
        time.sleep(5)  # throttle between pages to avoid rate limiting
        print('第{}页爬取完成!!!'.format(curre_page))
        curre_page += 1
    print("内容已经全部爬取")


if __name__ == "__main__":
    basic_path = r'E:\spider_leaning\Images_Data_Dog'
    serch_list = ['巴哥犬', '法国斗牛犬', '博美犬', '吉娃娃', '约克夏', '雪纳瑞',
                  '比熊', '贵宾犬', '马尔济斯犬', '西高地白梗犬', '哈士奇', '萨摩耶',
                  '阿拉斯加犬', '金毛犬', '拉布拉多犬', '德牧', '柯基犬', '边境牧羊犬',
                  '喜乐蒂', '腊肠犬', '松狮犬', '秋田犬', '罗威纳犬', '杜宾犬',
                  '比格犬', '柴犬', '中华田园犬']
    # NOTE(review): offset 1 (not 0) skips the very first result — presumably
    # intentional in the original; confirm before changing to 0.
    start_page = 1
    page_number = 50
    for keyword in serch_list:
        filePath = os.path.join(basic_path, keyword)
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(filePath, exist_ok=True)
        org_url = ("https://image.baidu.com/search/acjson?tn=resultjson_com"
                   "&word={text}&pn=".format(text=keyword))
        xhr_url(url_xhr=org_url, start_num=start_page, page=page_number,
                file_path=filePath)
# Source: https://www.cnblogs.com/moon3496694/p/17209433.html (tags: img, url, list, 爬取, file, path, page, 百度, 图片)