import requests,re,json
import xmltodict
from lxml import etree
# HTTP headers passed to requests.get() for each search page below.
# NOTE(review): the 'xxxxxx' values look like redacted placeholders - fill in
# real values (in particular a valid Cookie) before running.
headers = {
    'User-Agent': 'xxxxxxx',
    'Accept-Language': 'xxxxxx',
    'Host': 'xxxxxx',
    'Cookie': 'xxxxxx',
}
# Fetch Sogou WeChat search result pages 1-9 for the query "银川" and print,
# for each result slot on a page, a dict with its title, summary text and
# account name.
for c in range(1, 10):
    url = 'https://weixin.sogou.com/weixin?query=银川&_sug_type_=&s_from=input&_sug_=y&type=2&page=%s&ie=utf8' % c
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url=url, headers=headers, timeout=10)
    page_text = r.text
    tree = etree.HTML(page_text)
    # Result elements are looked up by id suffix 0..9 on each page.
    for x in range(0, 10):
        # The XPath expressions are absolute (//*), so they are evaluated
        # against the whole document. Query the tree once per slot; looping
        # over the root's children re-ran the same queries and printed every
        # record once per child.
        f = {}
        title = ''.join(tree.xpath('//*[@id="sogou_vr_11002601_title_%s"]//text()' % x))
        content = ''.join(tree.xpath('//*[@id="sogou_vr_11002601_summary_%s"]//text()' % x))
        channl = ''.join(tree.xpath('//*[@id="sogou_vr_11002601_account_%s"]//text()' % x))
        f['title'] = title
        f['content'] = content
        f['channl'] = channl
        print(f)
营销类网站反爬不会很难,隐私性较强的网站也别怕。道高一尺,魔高一丈啊~兄弟
标签:搜狗,title,channl,text,抓取,sogou,vr,微信,td From: https://www.cnblogs.com/djl-0628/p/17014250.html