python-爬虫-自带库抓取网页内容
版本:Python 2.7.10
python-爬虫-自带库抓取网页内容
版本:Python 2.7.10
# -*- coding: utf-8 -*-
import sys
import lxml
import requests
import codecs
import time
from lxml import etree
from lxml.html.clean import Cleaner
# Python 2 only: ``sys.setdefaultencoding`` is removed from ``sys`` during
# interpreter start-up, so the module has to be reloaded before the default
# encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Keep ``print`` able to emit Chinese text: every standard stream whose
# reported encoding is not exactly 'UTF-8' gets wrapped in a strict UTF-8
# writer.
for _stream_name in ('stdout', 'stderr'):
    _stream = getattr(sys, _stream_name)
    if _stream.encoding != 'UTF-8':
        setattr(sys, _stream_name,
                codecs.getwriter('utf-8')(_stream, 'strict'))
# Drop the loop temporaries so the module namespace stays unchanged.
del _stream_name, _stream

# File that the scraped link texts are appended to.
demo_file = "/Library/temp/demo.txt"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def save_tags(url):
print '请求',url
resp = requests.get(url) #请求
print '请求完成'
if not resp:
print '无响应内容'
return
txt = resp.text
print 'txt = ',type(txt)
dom = etree.HTML(txt)
print 'dom type = ',type(dom)
xpath = '//div[@class=\'myClassName\']/a' //xpath
result = dom.xpath(xpath)
if not result or len(result)<1:
print 'xpath = ',xpath,' 无内容'
return
n=0
print '系统默认编码:',sys.getdefaultencoding()
print '准备写入文件:',demo_file
f = codecs.open(demo_file,'a+','utf-8')
f.write('###{}\n'.format(url))
for t in result:
n=n+1
txt = t.text.strip()
print n,txt
f.write('{}\n'.format(txt))
f.close()
print '写入文件结束:{}'.format(f.name)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def run():
url_base = 'http://www.xxx.net/list?page='
page_index=1
end = 863
print '爬取开始'
for i in range(page_index,end+1):
print '*'*30
url = '{}{}'.format(url_base,i)
save_tags(url)
print 'sleep 中...'
time.sleep(3)
print '*'*30
print '爬取结束'
# Script entry point: start the crawl as soon as this file is executed.
# NOTE(review): the call is unguarded, so importing this module also starts
# the crawl -- consider an ``if __name__ == '__main__':`` guard.
run()