目录
使用selenium
from selenium.webdriver import Chrome,ChromeOptions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pymongo
import time
import random
class GetCookies():
    """Log in to Taobao with the module-level Selenium ``driver``.

    The class does not create a browser itself; it uses the global ``driver``
    that the script's ``__main__`` section builds before calling it.
    """

    def GetCookies(self, username="ho0", password="hon5"):
        """Fill in the login form, open a product page and return the cookies.

        Args:
            username: Account name typed into the login form.
            password: Password typed into the login form.
                NOTE(review): the defaults are the literals that were
                hard-coded here; prefer passing real credentials in.

        Returns:
            The list of cookie dicts reported by ``driver.get_cookies()``.
        """
        #login_url="https://login.tmall.com/"
        login_url = "https://login.taobao.com/member/login.jhtml?redirectURL=http%3A%2F%2Flist.tmall.com%2Fsearch_product.htm%3Fq%3D%25E8%258B%25B9%25E6%259E%259C%26type%3Dp%26vmarket%3D%26spm%3D875.7931836%252FB.a2227oh.d100%26from%3Dmallfp..pc_1_searchbutton&uuid=9b1b940679de3c3820589302ff75920b"
        driver.get(login_url)

        # Wait (up to 10 s) until the account field exists before touching it.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="fm-login-id"]')))
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide.
        login_name = driver.find_element(By.XPATH, '//*[@id="fm-login-id"]')
        login_name.click()
        login_name.send_keys(username)
        # Pause 5 or 6 seconds between steps (presumably to look less automated).
        time.sleep(random.randrange(5, 7))

        login_passwd = driver.find_element(By.XPATH, '//*[@id="fm-login-password"]')
        login_passwd.click()
        login_passwd.send_keys(password)
        time.sleep(random.randrange(5, 7))

        driver.find_element(By.XPATH, '//*[@id="login-form"]/div[4]/button').click()

        search_url = "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.193a74a7Gjoc16&id=656168531109&skuId=4902176242110&user_id=1917047079&cat_id=2&is_b=1&rn=96d6ce4c6e59b759d99176e5933c5e1f"
        driver.get(search_url)
        # The method previously returned nothing, although its caller assigns
        # the result to ``cookies``.
        return driver.get_cookies()
class TamllComment():
    """Tmall product-page helper that drives the module-level Selenium ``driver``."""

    def GetCommentData(self):
        """Open the product page and submit the login form.

        The method waits for the login fields right after loading the product
        URL, so it presumably expects the site to present a login page first.
        Nothing is returned.
        """
        goods_url = "https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.193a74a7Gjoc16&id=656168531109&skuId=4902176242110&user_id=1917047079&cat_id=2&is_b=1&rn=96d6ce4c6e59b759d99176e5933c5e1f"
        driver.get(goods_url)
        username = "ho"
        password = "hon"

        # Wait (up to 10 s) until the account field exists before touching it.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="fm-login-id"]')))
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide.
        login_name = driver.find_element(By.XPATH, '//*[@id="fm-login-id"]')
        login_name.click()
        login_name.send_keys(username)
        # Pause 5 or 6 seconds between steps.
        time.sleep(random.randrange(5, 7))

        login_passwd = driver.find_element(By.XPATH, '//*[@id="fm-login-password"]')
        login_passwd.click()
        login_passwd.send_keys(password)
        time.sleep(random.randrange(5, 7))

        driver.find_element(By.XPATH, '//*[@id="login-form"]/div[4]/button').click()

    def SaveAsMongo(self):
        """Placeholder; currently does nothing."""
        pass
if __name__ == '__main__':
    options = ChromeOptions()
    # Arguments are handed to Chrome as-is (no shell strips quoting), so the
    # value must not be wrapped in double quotes or the quotes become part of
    # the User-Agent string.
    options.add_argument(
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36')
    driver = Chrome(options=options)
    # Register a script that runs in every new document and makes
    # navigator.webdriver read as undefined (presumably to avoid automation
    # detection).
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        """
    })
    # Fetch cookies
    #cookies=GetCookies().GetCookies()
cookies=[{'domain': '.taobao.com', 'expiry': 1653840540, 'httpOnly': False, 'name': 'tfstk', 'path': '/', 'secure': False, 'value': 'cQnABVw_LQA0_Bw-LqLoflTjbiphayBYstNOXLwWsq2FZdsOfs2mxDCKIEwaTSpR.'}, {'domain': '.taobao.com', 'expiry': 2269008541, 'httpOnly': False, 'name': 'cna', 'path': '/', 'sameSite': 'None', 's'}]
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import re
import json
import random
def extract_auctions(page_source):
    """Return the auction list embedded in a Taobao search result page.

    The data is cut out of the text between ``g_page_config =`` and the next
    ``g_srp_loadCss`` token, stripped of trailing whitespace and ``;``, and
    decoded as JSON.

    Args:
        page_source: HTML text of the search result page.

    Returns:
        The value stored under ``mods -> itemlist -> data -> auctions``.

    Raises:
        ValueError: If the ``g_page_config`` assignment is not in the page.
    """
    match_result = re.search(r'g_page_config =(.*?)g_srp_loadCss', page_source, flags=re.S)
    if match_result is None:
        # Fail with a clear message instead of AttributeError on None.group().
        raise ValueError('g_page_config not found in page source')
    item_json = json.loads(match_result.group(1).rstrip().rstrip(';'))
    return item_json['mods']['itemlist']['data']['auctions']


if __name__ == '__main__':
    driver = webdriver.Chrome()
    # Register a script that runs in every new document and makes
    # navigator.webdriver read as undefined.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})""",
    })
    driver.maximize_window()
    driver.get('https://s.taobao.com/')
    key = '华为手机'
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//input[@class="search-combobox-input"]')))
        # find_element(By.…) replaces the find_element_by_* helpers, which
        # newer Selenium releases no longer provide.
        input_box = driver.find_element(By.CLASS_NAME, 'search-combobox-input')
        print(type(input_box))
        input_box.send_keys(key)
        driver.find_element(By.CLASS_NAME, 'btn-search').click()
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//div[@ID="container"]')))
            # Log in; the file holds the user name and password separated by
            # whitespace.
            with open('taobao_user_info') as fp:
                user_info = fp.read().split()
            user_name = user_info[0]
            password = user_info[1]
            driver.find_element(By.ID, 'fm-login-id').send_keys(user_name)
            driver.find_element(By.ID, 'fm-login-password').send_keys(password)
            driver.find_element(By.CLASS_NAME, 'fm-btn').click()
        finally:
            # NOTE(review): the original indentation was lost, so it is unclear
            # how much of the following code belonged to this ``finally``.
            # Only the pause is kept here; confirm against the original script.
            time.sleep(random.randint(5, 10))

        # Extract the product information of the first result page.
        print(extract_auctions(driver.page_source))

        # ``s`` is the result offset in the search URL; it advances by 44 per
        # page (presumably 44 items per page).
        offset = 44
        while True:
            try:
                driver.find_element(By.LINK_TEXT, '下一页').click()
            except NoSuchElementException:
                # No "next page" link: the last page has been reached.
                break
            time.sleep(random.randint(5, 10))
            driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(random.randint(5, 10))
            # Load the page for the current offset *before* advancing it;
            # incrementing first skipped the second page (s=44).
            driver.get(f"https://s.taobao.com/search?q={key}&s={offset}")
            print(extract_auctions(driver.page_source))
            offset += 44
    except Exception as e:
        # Top-level boundary: report what failed instead of hiding it.
        print('等待时间不够?还是其它异常', repr(e))
    finally:
        driver.quit()
使用requests
import requests
from fake_useragent import UserAgent
import time
import random
import json
import redis
import openpyxl
def get_comment_data(start_page, end_page, timeout=10):
    """Download Tmall review pages and hand each response to the parser.

    Pages ``start_page`` through ``end_page`` (inclusive) are requested one by
    one; each response body is passed to ``parse_comment_data``.

    Args:
        start_page: First page number to request.
        end_page: Last page number to request (inclusive).
        timeout: Seconds to wait for the server before giving up on a request,
            so a stalled connection cannot hang the crawl forever.
    """
    url = "https://rate.tmall.com/list_detail_rate.htm?"
    headers = {
        'user-agent': UserAgent().ie,
        'cookie': 'miid=4159704271564039423; cna=PMcSGlbGqDMCAXBvBSW1loSM; lid=hon; t=d33712a517f185cc4bc07f7e794e1c6a; tracknick=hone; lgc=ho0; enc=VIpLpF8/bb0v918fNc3kxn3fkXLRs7l0lJBEagrAtKl3BvVLNJ11PWm9yq/rtp1xwjNeORedsaJ0Ydu2F8OSDw==; _tb_token_=e3be35e037e4d; cookie2=1f5ad1c8e7ef467d07965d7cf008424d; xlly_s=1; _m_h5_tk=6b2f51112de120902a775b6f6c071c9c_1638285939290; ',
        'referer': 'https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.47d174a7SwNZMX&id=656168531109&skuId=4902176242110&areaId=350100&user_id=1917047079&cat_id=2&is_b=1&rn=2a13cc7d543f8f0ff8e7d9492fc4d3b9',
    }
    for page in range(start_page, end_page + 1):
        params = {
            'itemId': '656168531109',
            'sellerId': '1917047079',
            'order': '3',
            'currentPage': page,
        }
        source = requests.get(url, headers=headers, params=params, timeout=timeout).text
        # Parse the response
        parse_comment_data(source)
        # Random 5-8 s pause between requests.
        time.sleep(random.randint(5, 8))
def parse_comment_data(source):
    """Decode one review response and append its rows to ``datalist``.

    ``source`` is either plain JSON or JSON wrapped in a JSONP call such as
    ``jsonp128({...})``. Only the outermost call wrapper is removed, so
    parentheses that occur inside review text are preserved.

    For every entry of ``rateDetail.rateList`` a row
    ``[user name, product type, review, date, follow-up review, follow-up date]``
    is printed and appended to the module-level ``datalist``; the two
    follow-up fields are empty strings when the entry has no usable
    ``appendComment``.

    Args:
        source: Response body text.
    """
    payload = source.strip()
    if not payload.startswith(("{", "[")):
        # JSONP: keep what lies between the first "(" and the last ")".
        open_idx = payload.find("(")
        close_idx = payload.rfind(")")
        if open_idx != -1 and close_idx > open_idx:
            payload = payload[open_idx + 1:close_idx]
    comment_data = json.loads(payload.replace("\n", ""))

    for data in comment_data["rateDetail"]["rateList"]:
        # User name
        username = data['displayUserNick']
        # Product type
        goods_type = data['auctionSku']
        # Review text
        content = data['rateContent']
        # Date
        date = data['rateDate']
        # Follow-up review and its date; the key may be absent (KeyError) or
        # hold a non-dict value such as null (TypeError).
        try:
            add_content = data['appendComment']['content']
            add_content_date = data['appendComment']['commentTime']
        except (KeyError, TypeError):
            add_content = ""
            add_content_date = ""
        print(username, goods_type, content, date, add_content, add_content_date)
        datalist.append([username, goods_type, content, date, add_content, add_content_date])
def save_as_redis(datalist):
    """Push every row of ``datalist`` onto the Redis list ``Tmall_iphone``.

    Each row is paired with the module-level ``colnames`` to form a dict and
    stored as a JSON string, because redis-py only accepts bytes, strings and
    numbers as values and rejects a raw ``dict``.

    Args:
        datalist: Iterable of rows whose items line up with ``colnames``.
    """
    client = redis.Redis(host="localhost", port=6379, decode_responses=True, db=0)
    try:
        for data in datalist:
            data_dict = dict(zip(colnames, data))
            # ensure_ascii=False keeps the Chinese text readable in Redis.
            client.rpush('Tmall_iphone', json.dumps(data_dict, ensure_ascii=False))
    finally:
        # Release the connection even if a push fails.
        client.close()
def save_as_excel():
    """Write ``colnames`` and all rows of ``datalist`` to ``Tmall_iphone.xlsx``.

    Both names are read from module scope; ``colnames`` becomes the first row
    of the active worksheet and every item of ``datalist`` follows as one row.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # Header first, then the data rows, in their original order.
    for row in (colnames, *datalist):
        sheet.append(row)
    workbook.save('Tmall_iphone.xlsx')
    workbook.close()
if __name__ == '__main__':
    # Module-level state used by the functions above: parse_comment_data
    # appends rows to ``datalist``; save_as_excel reads ``colnames`` and
    # ``datalist``.
    colnames = ['用户名', '商品类型', '评论内容', '日期', '追评', '追评日期']
    datalist = []

    # Crawl review pages 1-7 for the iPhone listing.
    first_page, last_page = 1, 7
    get_comment_data(first_page, last_page)

    print(datalist)
    save_as_excel()
保存结果: