- Basic version
Workflow:
1. Visit the URL
2. Convert the page format and extract the data
3. Format the output and save it to a local file
```python
import requests
from lxml import etree

url = 'http://www.51testing.com/html/90/category-catid-90.html'

# Request the URL
rsp = requests.get(url)

# Convert the page character set (the site serves GBK-encoded pages)
cod = rsp.apparent_encoding
rsp.encoding = 'gbk'
content = rsp.text

# Parse the page into a DOM tree
doc = etree.HTML(content)

# Inspect the raw response if needed
# print(content)
```
```python
# Create the output file
file = open('data.txt', 'w')

for j in range(2, 5):
    print(j - 1)
    file.write('Page {}'.format(j - 1) + '\n')

    # Extract the data from the current page
    for i in range(1, 11):
        ele = doc.xpath('/html/body/div[2]/div[3]/div[2]/div[' + str(i) + ']/div/p/text()')[0]
        print("Row {} -- {}".format(i, ele))
        # ele may contain characters that cannot be written to the file as-is,
        # so strip all whitespace/control characters before writing
        new_ele = ''.join(ele.split())
        file.write("Row {}: {}".format(i, new_ele) + '\n' + '\n')

    # Fetch the next page
    url = 'http://www.51testing.com/html/90/category-catid-90-page-' + str(j) + '.html'
    rsp = requests.get(url)
    cod = rsp.apparent_encoding
    rsp.encoding = 'gbk'
    content = rsp.text
    doc = etree.HTML(content)

# Close the file so buffered output is flushed to disk
file.close()
```
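For reference, the same three-page crawl could be wrapped in a `with` block so the output file is closed even if an XPath lookup fails partway through. This is only a sketch of one possible restructuring: the `fetch_dom` helper name and the guard against empty XPath results are assumptions, while the URL pattern and XPath come from the code above.

```python
import requests
from lxml import etree

def fetch_dom(url):
    # Hypothetical helper: fetch one listing page and return its parsed DOM.
    rsp = requests.get(url)
    rsp.encoding = rsp.apparent_encoding or 'gbk'
    return etree.HTML(rsp.text)

base = 'http://www.51testing.com/html/90/category-catid-90'
page_urls = [base + '.html'] + [base + '-page-{}.html'.format(j) for j in range(2, 4)]

with open('data.txt', 'w') as f:  # the file is closed automatically on exit
    for page_no, page_url in enumerate(page_urls, start=1):
        doc = fetch_dom(page_url)
        f.write('Page {}\n'.format(page_no))
        for i in range(1, 11):
            texts = doc.xpath('/html/body/div[2]/div[3]/div[2]/div[{}]/div/p/text()'.format(i))
            if not texts:  # skip rows the XPath does not match
                continue
            # Strip whitespace/control characters before writing, as in the original
            f.write('Row {}: {}\n\n'.format(i, ''.join(texts[0].split())))
```

Building the page URLs up front also avoids the extra, unused fetch of page 4 that the original loop performs on its last iteration.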