import os
import urllib.request

from lxml import etree

# First listing page: https://sc.chinaz.com/tupian/siwameinvtupian.html
# NOTE: this module-level value is not read by getTenGirlPhote, which builds
# its own page URL; it is kept so the module namespace stays unchanged.
url = 'https://sc.chinaz.com/tupian/siwameinvtupian_2.html'


def getTenGirlPhote(page):
    """Download every listed image from one page of the gallery into ./imgs/.

    The listing page for ``page`` is fetched with a browser-like User-Agent,
    parsed with lxml, and each ``<img>`` found under the
    ``tupian-list com-img-txt-list`` container is saved as
    ``./imgs/<alt text>.jpg``.

    Args:
        page: 1-based page number. Page 1 has no numeric suffix in its URL;
            every other page uses ``siwameinvtupian_<page>.html``.

    Raises:
        urllib.error.URLError: if the listing page or an image cannot be
            fetched (propagated from ``urllib.request``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62',
    }
    if page == 1:
        page_url = 'https://sc.chinaz.com/tupian/siwameinvtupian.html'
    else:
        page_url = 'https://sc.chinaz.com/tupian/siwameinvtupian_' + str(page) + '.html'

    request = urllib.request.Request(url=page_url, headers=headers)
    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(request) as res:
        content = res.read().decode('utf-8')

    tree = etree.HTML(content)
    # Read both attributes from the same <img> element so a missing alt can
    # never shift names onto the wrong image (two separate attribute queries
    # paired by index would misalign or raise IndexError).
    images = tree.xpath('//div[@class="tupian-list com-img-txt-list"]/div/img')
    entries = [
        (img.get('data-original'), img.get('alt'))
        for img in images
        if img.get('data-original')
    ]
    print(len(entries))

    # urlretrieve does not create parent directories.
    os.makedirs('./imgs', exist_ok=True)

    for index, (src, alt) in enumerate(entries):
        # data-original is scheme-relative ("//host/path"), so add the scheme.
        imgUrl = 'https:' + src
        print(imgUrl)
        # Fall back to a positional name when alt is absent, and neutralise
        # path separators so the alt text cannot escape ./imgs/.
        file_name = alt if alt else 'page' + str(page) + '_' + str(index)
        file_name = file_name.replace('/', '_').replace('\\', '_')
        urllib.request.urlretrieve(imgUrl, './imgs/' + file_name + '.jpg')


if __name__ == '__main__':
    # Crawl listing pages 1 through 10.
    for i in range(1, 11):
        getTenGirlPhote(i)
# Tags: tupian, pagination, batch, url, request, crawling, https, print, com
# From: https://www.cnblogs.com/sgj191024/p/17739007.html