首页 > 编程语言 >python-爬虫-自带库抓取网页内容

python-爬虫-自带库抓取网页内容

时间:2022-10-28 14:34:53浏览次数:45  
标签:xpath utf python 爬虫 sys print url 网页内容 txt


python-爬虫-自带库抓取网页内容

版本:Python 2.7.10(以下代码为 Python 2 语法;除标准库外还依赖第三方库 requests 和 lxml,需先用 pip 安装)

python-爬虫-自带库抓取网页内容

版本:Python 2.7.10

# -*- coding: utf-8 -*-
import sys
import lxml
import requests
import codecs
import time
from lxml import etree
from lxml.html.clean import Cleaner

# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str <-> unicode conversions of Chinese text do not fail.
# NOTE(review): reload(sys) is presumably needed because Python 2 removes
# sys.setdefaultencoding at startup; neither call exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')  # set the default encoding to 'utf-8'

# Make sure print can output Chinese text: when a standard stream does not
# report UTF-8 as its encoding, wrap it in a UTF-8 writer.
if sys.stdout.encoding != 'UTF-8':
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout, 'strict')
if sys.stderr.encoding != 'UTF-8':
    sys.stderr = codecs.getwriter('utf-8')(sys.stderr, 'strict')


# Output file that save_tags() appends the extracted link texts to.
demo_file = "/Library/temp/demo.txt"
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def save_tags(url):
    """Download ``url`` and append the text of every matching link to ``demo_file``.

    The page is parsed with lxml and the XPath
    ``//div[@class='myClassName']/a`` selects the ``<a>`` elements whose
    text is saved.  A header line ``###<url>`` is written first, followed
    by one line per link.  Progress is reported with ``print``.

    Args:
        url: Address of the page to fetch.

    Returns:
        None.  The function returns early (writing nothing) when the
        response is falsy or when the XPath matches no element.

    Raises:
        Whatever ``requests.get``, ``etree.HTML`` or the file operations
        raise; no exception is handled here.
    """
    print('{} {}'.format('请求', url))
    resp = requests.get(url)  # issue the HTTP request
    print('请求完成')
    # A requests Response evaluates to False for 4xx/5xx status codes.
    if not resp:
        print('无响应内容')
        return

    html_text = resp.text
    print('{} {}'.format('txt = ', type(html_text)))

    dom = etree.HTML(html_text)
    print('{} {}'.format('dom type = ', type(dom)))

    # XPath of the link elements whose text should be saved.
    xpath = '//div[@class=\'myClassName\']/a'
    result = dom.xpath(xpath)
    if not result:
        print('{} {} {}'.format('xpath = ', xpath, ' 无内容'))
        return

    print('{} {}'.format('系统默认编码:', sys.getdefaultencoding()))
    print('{} {}'.format('准备写入文件:', demo_file))
    # The with-block closes the file even if a write fails part-way through.
    with codecs.open(demo_file, 'a+', 'utf-8') as out:
        out.write(u'###{}\n'.format(url))
        for n, node in enumerate(result, 1):
            # node.text is None when the <a> has no direct text (for example
            # it only wraps an <img>); treat that as an empty string.
            link_text = (node.text or u'').strip()
            # Unicode templates keep non-ASCII text intact without relying on
            # the process-wide default encoding.
            print(u'{} {}'.format(n, link_text))
            out.write(u'{}\n'.format(link_text))
    print('写入文件结束:{}'.format(demo_file))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def run(url_base='http://www.xxx.net/list?page=', page_index=1, end=863, delay=3):
    """Crawl the list pages ``page_index`` .. ``end`` (inclusive) one by one.

    For every page number the URL ``url_base + str(number)`` is passed to
    ``save_tags`` and the crawler then sleeps ``delay`` seconds before the
    next request.  Calling ``run()`` without arguments behaves exactly like
    the original hard-coded version.

    Args:
        url_base: URL prefix to which the page number is appended.
        page_index: First page number to fetch.
        end: Last page number to fetch (inclusive).
        delay: Seconds to sleep after each page.
    """
    print('爬取开始')
    for i in range(page_index, end + 1):
        print('*' * 30)
        url = '{}{}'.format(url_base, i)
        save_tags(url)
        # Pause between requests so the target site is not hit continuously.
        print('sleep 中...')
        time.sleep(delay)
    print('*' * 30)
    print('爬取结束')

# Script entry point: start the crawl as soon as this file is executed.
run()



标签:xpath,utf,python,爬虫,sys,print,url,网页内容,txt
From: https://blog.51cto.com/u_4518216/5804855

相关文章