环境python3.9版本及以上,开发工具pycharm
requests的进阶使用:
案例一 模拟用户登录-处理cookie:
# Case 1: simulated user login — cookie handling.
#
# Flow: log in -> the server sets an auth cookie -> request the bookshelf
# URL carrying that cookie -> bookshelf contents come back.
# The two steps must share state, so we use a requests Session: a session
# is a chain of requests in which cookies are kept automatically.
#
# (An earlier draft instead pasted a browser-copied "Cookie" header into a
# plain requests.get() — that token is account-specific and expires, so the
# session-based flow below is the reliable way.)
import requests

# A session carries cookies from one request to the next.
session = requests.session()

# Login form fields for 17k.com (tutorial demo account).
data = {
    "loginName": "18975575097",
    "password": "hy.1211",
}

# 1. Log in; the response stores the auth cookie on the session.
login_url = "https://passport.17k.com/ck/user/login"
session.post(login_url, data=data)
# session.cookies now holds the login cookie.

# 2. Fetch the bookshelf data — the session re-sends the cookie for us.
resp = session.get("https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919")
print(resp.text)
案例二 防盗链的处理-爬取梨视频:
# Case 2: defeating anti-hotlinking (Referer check) — scrape a pearvideo clip.
#
# Steps:
#   1. take contId from the page URL
#   2. call videoStatus.jsp -> JSON containing srcUrl
#   3. patch srcUrl (swap the systemTime fragment for "cont-<contId>")
#   4. download the video
import requests


def _fix_src_url(src_url, system_time, cont_id):
    """Return srcUrl with the fake systemTime token replaced by the real cont-id."""
    return src_url.replace(system_time, f"cont-{cont_id}")


def main():
    # Page URL of the video to pull.
    url = "https://pearvideo.com/video_1721911"
    cont_id = url.split("_")[1]
    video_status = f"https://pearvideo.com/videoStatus.jsp?contId={cont_id}&mrd=0.12818526288984744"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
        # Anti-hotlink check: Referer tells the server which page this
        # request came from — it must be the video page itself.
        "Referer": url,
    }

    resp = requests.get(video_status, headers=headers)
    dic = resp.json()

    src_url = _fix_src_url(
        dic["videoInfo"]["videos"]["srcUrl"], dic["systemTime"], cont_id
    )
    # print(src_url)

    # Download the video. Send the same headers here too — the original
    # fetched srcUrl bare, which the Referer-checking CDN can reject.
    with open("a.mp4", mode="wb") as f:
        f.write(requests.get(src_url, headers=headers).content)


if __name__ == "__main__":
    main()
案例三 代理:
# Case 3: proxies — route the request through a third-party machine.
# Downsides of proxies:
#   1. slow
#   2. good proxy IPs are hard to find
import requests

# Free proxy lists, e.g.: https://www.kuaidaili.com/free/intr/
url = "https://www.baidu.com"

# Map each scheme to the proxy endpoint that should carry it.
proxies = {
    "http": "http://114.233.70.231:9000",
    "https": "https://114.233.70.231:9000",
}

# Pass the mapping via the `proxies` keyword so requests tunnels through it.
resp = requests.get(url, proxies=proxies)
resp.encoding = "utf-8"
print(resp.text)
案例四 接入第三方代理:
# Case 4: plugging in a third-party proxy provider.
import requests

# NOTE(review): the provider URL below is a placeholder ("……") — fill in a
# real proxy-API endpoint before running.


def get_ip():
    """Yield proxy addresses ("host:port") fetched from the provider's API."""
    url = "……"  # proxy-provider API endpoint (placeholder)
    resp = requests.get(url)
    ips = resp.json()
    for ip in ips['data']['proxy_list']:  # one entry per proxy IP
        yield ip  # hand proxies back one at a time, on demand


def spider():
    """Fetch the target page through successive proxies until one works.

    Returns the page text on success, or None once the proxy generator
    is exhausted.
    """
    url = "https://www.baidu.com"
    while 1:
        try:
            proxy_ip = next(gen)  # pull the next proxy IP
        except StopIteration:
            # The original bare `except:` swallowed this too, so running out
            # of proxies spun forever printing the error message. Stop instead.
            return None
        proxy = {
            "http": "http://" + proxy_ip,
            "https": "https://" + proxy_ip,
        }
        try:
            resp = requests.get(url, proxies=proxy)
            resp.encoding = 'utf-8'
            return resp.text
        except requests.RequestException:
            # Narrowed from bare `except:` — only network/proxy failures
            # should trigger a retry with the next proxy.
            print("报错了。")


if __name__ == "__main__":
    gen = get_ip()  # gen is the shared generator of proxy IPs
    for i in range(10):
        spider()
OK,基础过完,还得好好夯实夯实,后面烧脑的来了!
标签:Python,day7,22%,爬虫,url,https,ip,requests,resp From: https://www.cnblogs.com/Hyun79/p/17321388.html