import requests
from lxml import etree
import re
import random
import traceback
from time import sleep
# url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=8700291432374701138&ipn=rj&ct=201326592&is=&fp=result&fr=ala&word=%E8%A1%A8%E6%83%85%E5%8C%85&queryWord=%E8%A1%A8%E6%83%85%E5%8C%85&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&expermode=&nojc=&isAsync=&pn=390&rn=30&gsm=186'
headers = {
    'Cookie': 'winWH=%5E6_1920x963; BDIMGISLOGIN=0; BDqhfp=%E8%A1%A8%E6%83%85%E5%8C%85%26%26NaN-1undefined%26%268772%26%2614; BIDUPSID=47D1A97F74FE4D84D9C060A7E9D9623C; PSTM=1688450494; BAIDUID=64354928A148308F322D02D378FB19A4:FG=1; BAIDUID_BFESS=64354928A148308F322D02D378FB19A4:FG=1; ZFY=r0Ch4DZ4vzKkjKsCTr20yTyvBoJZR:BJjX3:AbIpxAvCs:C; BA_HECTOR=05812la52la52l80008505891ieo2c31p; PSINO=1; H_PS_PSSID=36548_39226_39223_39193_39199_39240_39233_26350_39238_39138_39224_39137_22157_39100; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[A24tJn4Wkd_]=mk3SLVN4HKm',
    'Host': 'image.baidu.com',
    'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%B1%ED%C7%E9%B0%FC&fr=ala&ala=1&alatpl=normal&pos=0&',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}
for i in range(1350, 1440, 30):
    sleep(random.uniform(1, 2))
    # gsm is the page offset (pn) written in hex, without the '0x' prefix
    num = hex(i)[2:]
    url = 'https://image.baidu.com/search/acjson'
    params = {
        'tn': 'resultjson_com',
        'logid': '8700291432374701138',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'fr': 'ala',
        'word': '表情包',
        'queryWord': '表情包',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '',
        'istype': '',
        'qc': '',
        'nc': '',
        'expermode': '',
        'nojc': '',
        'isAsync': '',
        'pn': str(i),
        'rn': '30',
        'gsm': num,
    }
    # a GET request should not carry a body, so the query goes in params=
    resp = requests.get(url, headers=headers, params=params)
    resp_json = resp.json()
    # the 'data' list often ends with an empty dict, which the try/except below skips
    resp_urls = resp_json['data']
    for resp_url in resp_urls:
        sleep(random.uniform(0, 1))
        try:
            fromPageTitle = resp_url['fromPageTitle']
            # strip characters that are illegal in Windows filenames;
            # the '-' must be escaped inside the class or it creates a character range
            fromPageTitle = re.sub(r'[/*?<>|\n\-_ ]', '', fromPageTitle)
            fromPageTitle = fromPageTitle[0:15]
            middleURL = resp_url['middleURL']
            # crude token split of the URL; name[-10] happens to land on the
            # image format field in the middleURL layout this API returns
            name = re.split(r'(\w+)', middleURL)
            info = requests.get(middleURL)
            with open(str(i) + fromPageTitle + '.' + name[-10], 'wb') as f:
                f.write(info.content)
            print(str(i) + fromPageTitle + ' downloaded')
        except Exception:
            traceback.print_exc()
            continue
This is the Baidu sticker-pack downloader I wrote yesterday.
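One caveat: the name[-10] indexing above only works as long as the token layout of middleURL never changes. Below is a more defensive sketch of the filename step; safe_filename and the sample URL are made up for illustration, and it assumes (as in the URLs I saw) that the image format shows up as an f= field somewhere in the URL.

import re

def safe_filename(title: str, url: str, default_ext: str = 'jpg') -> str:
    # drop characters Windows forbids in filenames, then cap the length
    title = re.sub(r'[/*?<>|\n\-_ ]', '', title)[:15]
    # look for the format token (e.g. f=JPEG) in the URL; assumption about
    # how Baidu lays out middleURL, so fall back to a default if missing
    m = re.search(r'[?&]f=(\w+)', url)
    ext = (m.group(1) if m else default_ext).lower()
    return f'{title}.{ext}'

# hypothetical URL shaped like the ones the acjson API returns
print(safe_filename('搞笑/表情包 合集', 'https://img2.baidu.com/it/u=1,2&f=JPEG?w=500'))
# -> 搞笑表情包合集.jpeg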
import traceback
# for i in range(30, 600, 30):
#     num = hex(i)
#     num = num[2:]
#     print(num)
#
# from time import sleep
# import random
# for i in range(0, 10):
#     # x = random.uniform(1, 2)
#     sleep(random.uniform(0, 1))
#     print(random.uniform(0, 1))
#
# for i in range(0, 5):
#     try:
#         n = 1 / i
#         print(n)
#     except Exception as e:
#         traceback.print_exc()
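A companion to traceback.print_exc() from the practice snippet above: traceback.format_exc() returns the same text as a string, which is handy when you want to log errors instead of printing them. A minimal sketch:

import traceback

errors = []
for i in range(0, 5):
    try:
        n = 1 / i  # raises ZeroDivisionError when i == 0
        print(n)
    except Exception:
        # format_exc() captures the traceback as a string instead of printing it
        errors.append(traceback.format_exc())

print(f'caught {len(errors)} error(s)')  # -> caught 1 error(s)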
import requests
import csv
from lxml import etree

data = []
for i in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={i}&filter='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    html_info = html.xpath('//ol[@class="grid_view"]/li')
    for info in html_info:
        dic = {}
        dic['href'] = info.xpath('.//div[@class="hd"]/a/@href')[0]
        dic['title'] = info.xpath('.//div[@class="pic"]/a/img/@alt')[0]
        dic['pingjiashu'] = info.xpath('.//div[@class="star"]/span[4]/text()')[0]  # number of ratings
        dic['content'] = info.xpath('.//div[@class="star"]/span[2]/text()')[0]     # rating score
        data.append(dic)
    # start=0,25,50,... so the page number is i // 25 + 1
    print(f'Downloading page {i // 25 + 1}')

# write once, after all pages are collected, so the header is not repeated
with open('dbmovie.csv', 'w', encoding='utf-8', newline='') as f:
    w = csv.DictWriter(f, fieldnames=['title', 'href', 'pingjiashu', 'content'])
    w.writeheader()
    w.writerows(data)
XPath practice.
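The './/' queries above are relative: each one searches only inside the current li, not the whole document. Here is a self-contained sketch against a made-up HTML fragment shaped like Douban's markup (the class names are copied from the script; the rest is invented), so the XPath logic can be tested without any network access:

from lxml import etree

# invented fragment mimicking the structure the script queries
snippet = '''
<ol class="grid_view">
  <li>
    <div class="pic"><a href="#"><img alt="肖申克的救赎"/></a></div>
    <div class="hd"><a href="https://movie.douban.com/subject/1292052/">link</a></div>
    <div class="star">
      <span class="rating5-t"></span>
      <span class="rating_num">9.7</span>
      <span></span>
      <span>2900000人评价</span>
    </div>
  </li>
</ol>
'''

html = etree.HTML(snippet)
for info in html.xpath('//ol[@class="grid_view"]/li'):
    # './/' makes each query relative to the current <li>
    title = info.xpath('.//div[@class="pic"]/a/img/@alt')[0]
    score = info.xpath('.//div[@class="star"]/span[2]/text()')[0]
    votes = info.xpath('.//div[@class="star"]/span[4]/text()')[0]
    print(title, score, votes)  # -> 肖申克的救赎 9.7 2900000人评价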