import time

from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Author: 小辉
# Scrape Xiaomi phone information from JD.com (京东).
class jingdo():
    """Selenium-driven scraper that prints Xiaomi phone data from JD.com.

    The scraper opens 20 search-result pages in Chrome, collects the
    ``data-sku`` id of every listed product, opens each product page and
    prints the product name, price and comment-count text found there.

    Attributes:
        sertd: The chromedriver ``Service`` used to launch the browser.
        mko: The ``webdriver.Chrome`` instance that loads every page.
        html_top: URL prefix of a product page.
        html_huo: URL suffix of a product page.
    """

    # Number of search-result pages visited by mko111().
    _PAGE_COUNT = 20
    # Seconds to wait after each navigation / scroll before reading the page.
    _WAIT_SECONDS = 5

    # The first two result pages use fixed URLs; later pages append the
    # "page", "s" and "click" query values to _SEARCH_URL_PAGED.
    _SEARCH_URL_FIRST = 'https://search.jd.com/Search?keyword=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=98f95d796dc64a139bb211652a371657'
    _SEARCH_URL_SECOND = 'https://search.jd.com/Search?keyword=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%9C%BA&qrst=1&ev=exbrand_%E5%B0%8F%E7%B1%B3%EF%BC%88MI%EF%BC%89%5E&pvid=98f95d796dc64a139bb211652a371657&page=3&s=57&click=0'
    _SEARCH_URL_PAGED = 'https://search.jd.com/Search?keyword=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%9C%BA&qrst=1&ev=exbrand_%E5%B0%8F%E7%B1%B3%EF%BC%88MI%EF%BC%89%5E&pvid=98f95d796dc64a139bb211652a371657&page='

    def __init__(self):
        """Start Chrome through the chromedriver executable."""
        # Raw string: same path value as before, but without relying on the
        # invalid escape sequences "\自" and "\c" of a normal string literal.
        self.sertd = Service(r"\自动\chromedriver.exe")  # chromedriver service
        self.mko = webdriver.Chrome(service=self.sertd)  # browser controlled by the service
        self.html_top = 'https://item.jd.com/'
        self.html_huo = '.html'

    def mko111(self):
        """Visit every search-result page and print each product's details.

        For each product the cleaned name, price and comment-count text lists
        are printed. A failure on one product is reported and the crawl moves
        on to the next product instead of abandoning the rest of the page.
        """
        for page_no in range(1, self._PAGE_COUNT + 1):
            self.mko.get(self._search_url(page_no))
            time.sleep(self._WAIT_SECONDS)

            # Scroll the document down (scrollTop = 9000) before reading the
            # page source; the original note says this is meant to reach the
            # bottom of the page.
            self.mko.execute_script("var q=document.documentElement.scrollTop=9000")
            time.sleep(self._WAIT_SECONDS)

            for sku in self._listed_skus():
                try:
                    self._print_product(sku)
                except Exception as naoc:
                    # Best-effort crawl: report which product failed and why,
                    # then keep going.
                    print('错误', sku, repr(naoc))

    def _search_url(self, page_no):
        """Return the search URL for the 1-based result page *page_no*."""
        if page_no == 1:
            return self._SEARCH_URL_FIRST
        if page_no == 2:
            return self._SEARCH_URL_SECOND
        # From the third result page on, "page" starts at 5 and grows by 2,
        # "s" starts at 116 and grows by 60, and "click" is always 1.
        steps = page_no - 3
        return f"{self._SEARCH_URL_PAGED}{5 + 2 * steps}&s={116 + 60 * steps}&click=1"

    def _listed_skus(self):
        """Return the data-sku ids of the products on the loaded result page."""
        # Narrow the page to the ".ml-wrap" element(s) with BeautifulSoup,
        # then re-parse that fragment with lxml so XPath can be used.
        soup = BeautifulSoup(self.mko.page_source, 'lxml')
        wrappers = soup.select('.ml-wrap')
        tree = etree.HTML(str(wrappers))
        return tree.xpath('//*[@id="J_goodsList"]/ul/li/@data-sku')

    def _print_product(self, sku):
        """Open the product page for *sku* and print name, price, comments."""
        self.mko.get(self.html_top + str(sku) + self.html_huo)
        time.sleep(self._WAIT_SECONDS)
        tree = etree.HTML(self.mko.page_source)

        name = tree.xpath('//*[@class="sku-name"]/text()')  # phone name
        moung = tree.xpath('//*[@class="summary-price J-summary-price"]/div[2]/span[1]/span[2]/text()')  # phone price
        pj = tree.xpath('//*[@id="comment-count"]/a/text()')  # phone comments

        print(self._clean(name), self._clean(moung), self._clean(pj))

    @staticmethod
    def _clean(texts):
        """Strip spaces and surrounding whitespace; drop empty results."""
        stripped = (str(text).replace(' ', '').strip() for text in texts)
        return [text for text in stripped if text != '']


htmkl = jingdo()
try:
    htmkl.mko111()
finally:
    # Always shut down the browser and chromedriver, even if the crawl fails.
    htmkl.mko.quit()
# Tags: 20, name, self, selenium, E6%, 爬取 (scraping), str, import
# From: https://www.cnblogs.com/xxh12/p/16726546.html