Python web scraping
Series articles: a series of posts found online, recorded here because they may be useful later: https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzI3NzI1MzY4Mw==&action=getalbum&album_id=1786298272630816773#wechat_redirect
1. Sending requests with requests  2. User-Agent spoofing
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Cookie': ''
}
url = 'http://www.aaa.com/admin/'
# POST with form data and a spoofed User-Agent; requests (not selenium) handles plain HTTP calls
response = requests.post(url, data={"param1": "value1"}, headers=headers)
print(response)
3. Parsing data with re / XPath
import re
import requests

response = requests.get("https://fanyi.baidu.com/")
# Extract the page title and the first <p> block with regular expressions;
# guard against a missing match instead of indexing blindly
titles = re.findall(r"<title>(.*?)</title>", response.text)
paragraphs = re.findall(r"<p>(.*?)</p>", response.text, re.S)
print("Title:", titles[0] if titles else "")
print("Body:", paragraphs[0] if paragraphs else "")
4. Selenium+Chrome/Firefox
from selenium import webdriver
from selenium.webdriver.common.by import By

# Selenium 3 style: pass the driver path directly (Selenium 4 uses a Service object instead)
browser = webdriver.Chrome("E:\\googleDriver\\chromedriver.exe")
browser.implicitly_wait(10)   # wait up to 10 s for elements to appear
browser.get("IP")             # URL of the login page

# The login form exposes two inputs with the same class; take them in order
text = browser.find_elements(By.CLASS_NAME, "el-input__inner")
username = text[0]
password = text[1]
username.send_keys('username')
password.send_keys('password')
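Filling the inputs does not submit the form. A follow-up sketch for clicking the login button is shown below; the XPath selector is an assumption about the page and will need to be adapted to the real markup:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Hypothetical button locator; adjust to the actual login page
login_btn = WebDriverWait(browser, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//button[@type='button']"))
)
login_btn.click()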
5. Multiprocessing / multithreading
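No code accompanied this item. A minimal sketch of thread-based concurrent fetching with concurrent.futures (the URL list below is a placeholder) might look like this:

import requests
from concurrent.futures import ThreadPoolExecutor

urls = ["https://www.example.com/page/%d" % i for i in range(1, 6)]  # placeholder URLs

def fetch(url):
    # Each worker thread downloads one page and returns its status code
    return url, requests.get(url, timeout=10).status_code

with ThreadPoolExecutor(max_workers=5) as pool:
    for url, status in pool.map(fetch, urls):
        print(url, status)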
6. Proxies
import requests

proxies = {
    "http": "http://user:password@proxy-ip:proxy-port",   # proxy for plain-HTTP requests
    "https": "http://user:password@proxy-ip:proxy-port",  # proxy for HTTPS requests
}
response = requests.get("https://www.example.com", proxies=proxies)
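To confirm that traffic really goes through the proxy (once a real proxy is filled in), you can reuse the proxies dict from the snippet above and query an IP echo service; httpbin.org is used here purely as an example endpoint:

# Should report the proxy's public IP, not your own, if the proxy is working
print(requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10).json())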
7. Cracking AJAX requests
import requests

# AJAX (XHR) endpoint that the 12306 left-ticket page calls in the background
url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2022-06-29&leftTicketDTO.from_station=HZH&leftTicketDTO.to_station=SHH&purpose_codes=ADULT'
headers = {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Cookie": "_uab_collina=165606030144749982200458; JSESSIONID=E6E0AEED78C2D7C1F570B546D4EF1E54; highContrastMode=defaltMode; guidesStatus=off; cursorStatus=off; _jc_save_wfdc_flag=dc; RAIL_EXPIRATION=1656691421557; RAIL_DEVICEID=ri6nXn_Z4JvuTfJ_dKkesj62yt7o45BG6BTx7xmjwqzCkpc2n9XwDN03Jwe1zmbFvGtn3wq4kpkyCcfk8ffhwOZHh7Fj9QQZRXxt-3Wd54OC-InIOkYoe06yk8pAKK0LLBcbzOKj8MPwB51_xh8WDHQO09qjmooQ; BIGipServerpassport=770179338.50215.0000; route=6f50b51faa11b987e576cdb301e545c4; BIGipServerotn=1089470986.24610.0000; _jc_save_toStation=%u4E0A%u6D77%2CSHH; _jc_save_toDate=2022-06-29; BIGipServerpool_passport=182714890.50215.0000; _jc_save_fromDate=2022-06-29; _jc_save_fromStation=%u676D%u5DDE%2CHZH",
"Host": "kyfw.12306.cn",
"If-Modified-Since": "0",
"Referer": "https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc&fs=%E6%88%90%E9%83%BD,CDW&ts=%E4%B8%8A%E6%B5%B7,SHH&date=2022-06-29&flag=N,N,Y",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36",
"X-Requested-With": "XMLHttpRequest",
"sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"'
}
res = requests.get(url=url, headers=headers)
print(res.status_code)
print(res)
print(res.json())
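The useful data sits inside the JSON body. A hedged sketch of pulling it out follows; the key names (data -> result) are an assumption about the 12306 payload and should be verified against an actual response before relying on them:

payload = res.json()
# Assumed structure: {"data": {"result": ["field1|field2|...", ...]}} -- verify first
rows = (payload.get("data") or {}).get("result", [])
for row in rows[:5]:
    print(row.split("|")[:10])  # inspect the first few "|"-separated fields of each row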
From: https://www.cnblogs.com/wuxiaolong4/p/17290972.html