小白爬虫综合项目1

标签：get url text 爬虫 content headers 小白 page 综合

项目1:搜狗知乎,爬取指定存储上一级目录的名字,以及存储爬取的页码

项目2:爬取某用户的豆瓣网的个人主页页面数据

项目3:goubanjia应用爬取ip这个关键词的页面

项目4: 爬取糗事百科首页的图片&&升级

项目5:爬取段子网

项目6:爬取诗词名句网(shicimingju.com)的《三国演义》这部小说

项目7:selenium

项目8:豆瓣电影分类排行榜==>喜剧片

项目1:搜狗知乎,爬取指定存储上一级目录的名字,以及存储爬取的页码

#爬取前三页数据
import requests
import os
# Ask for the name of the output folder; it is created relative to the
# current working directory.
folder_name = input('请输入一个文件名:')
save_dir = f'./{folder_name}'
# Create the folder only when it does not exist yet.
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

keyword = input('enter a search word:')
# The page range is chosen at run time; both ends are inclusive.
first_page = int(input('enter a start pageNum'))
last_page = int(input('enter a end pageNum'))

# Custom request headers carrying a desktop Chrome User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}
url = 'https://www.sogou.com/sogou'

for page in range(first_page, last_page + 1):
    # Query-string parameters: the search word, the page number, the
    # encoding, and insite='zhihu.com'.
    params = dict(query=keyword, page=page, ie='utf-8', insite='zhihu.com')
    response = requests.get(url=url, params=params, headers=headers)
    # HTML of the requested result page.
    page_text = response.text

    # Persist the page as "<keyword><page>.html" inside the output folder.
    target_path = f'{save_dir}/{keyword}{page}.html'
    with open(target_path, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
        print('第%d页数据写入成功' % page)

项目2:爬取某用户的豆瓣网的个人主页页面数据

import requests

# 1. Login request payload. The cookies returned by the login are stored on
#    the session object and reused by the later request.
# NOTE(review): real-looking credentials are hard-coded here; they should be
# supplied at run time (environment variable / prompt) rather than committed.
login_url = 'https://accounts.douban.com/j/mobile/login/basic'
data = {
    'ck': '',
    'name': '17862982790',
    'password': 'spc121314',
    'remember': 'false',
    'ticket': '',
}

# 2. Custom request headers carrying a desktop Chrome User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

# Use the Session class as a context manager so its pooled connections are
# released when the block ends.
with requests.Session() as session:
    # 3. POST the login form through the session to obtain the cookies. The
    #    response body is not needed, but an HTTP error must not go unnoticed.
    login_response = session.post(url=login_url, data=data, headers=headers)
    login_response.raise_for_status()

    # 4. Request the personal home page with the same session (and therefore
    #    the same cookies) and read the returned page.
    url = 'https://www.douban.com/people/198409299/'
    response = session.get(url=url, headers=headers)
    response.raise_for_status()
    page_text = response.text

# Persist the profile page.
with open('./douban110.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

项目3:goubanjia应用爬取ip这个关键词的页面

import requests
url = 'https://www.baidu.com/s?wd=ip&ie=utf-8'

# Custom request headers carrying a desktop Chrome User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

# Proxy settings, keyed by the scheme of the URL being requested. The target
# URL is https://, so an 'https' entry is required -- with only an 'http'
# entry the request would bypass the proxy and go out directly. Proxy
# addresses also carry an explicit scheme.
proxy = {
    'http': 'http://39.137.77.66:8080',
    'https': 'http://39.137.77.66:8080',
}
# Send the request through the proxy so it leaves from the proxy's IP.
response = requests.get(url=url, proxies=proxy, headers=headers)

# Persist the returned page.
with open('./daili.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)

项目4: 爬取糗事百科首页的图片

import os
import requests
import re
# 1. Page whose pictures are downloaded.
url = 'https://www.qiushibaike.com/pic/'

# 2. Custom request headers carrying a desktop Chrome User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}

# 3-4. Send the request and take the page source.
page_text = requests.get(url=url, headers=headers).text

# 5. Parse: collect the src of the <img> inside every <div class="thumb">.
#    re.S lets '.' match newlines, so a match can span several lines.
thumb_pattern = re.compile('<div class="thumb">.*?<img src="(.*?)".*?>.*?</div>', re.S)
img_list = thumb_pattern.findall(page_text)

# Make sure the folder that receives the pictures exists.
if not os.path.exists('./imgs'):
    os.mkdir('imgs')

for src in img_list:
    # The file name is the last path segment of the picture link.
    img_name = src.rsplit('/', 1)[-1]
    # The scraped links have no scheme, so "https:" is prepended before the
    # picture is requested.
    img_bytes = requests.get(url='https:' + src, headers=headers).content
    with open('imgs/' + img_name, 'wb') as fp:
        fp.write(img_bytes)
        print(img_name + '写入成功')

升级指定爬取:思考可不可以将爬取到的图片进行分成第几页处理

import os
import requests
import re

# Ask for the folder that receives the pictures and create it if needed.
file_name = input('请输入一个文件名:')
if not os.path.exists(f'./{file_name}'):
    os.mkdir(f'./{file_name}')
# The page range is chosen at run time; both ends are inclusive.
start_pageNum = int(input('enter a start pageNum'))
end_pageNum = int(input('enter a end pageNum'))

# Custom request headers carrying a desktop Chrome User-Agent. They never
# change, so they are built once instead of on every loop iteration.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

for page in range(start_pageNum, end_pageNum + 1):
    # 1. URL of the current listing page, e.g.
    #    https://www.qiushibaike.com/pic/page/6/?s=5204493
    page_url = f'https://www.qiushibaike.com/pic/page/{page}/?s=5204493'
    # 2-3. Send the request and take the page source.
    response = requests.get(url=page_url, headers=headers)
    page_text = response.text
    # 4. Parse: collect the src of the <img> inside every <div class="thumb">
    #    on this page (re.S lets '.' match newlines).
    img_list = re.findall('<div class="thumb">.*?<img src="(.*?)".*?>.*?</div>', page_text, re.S)

    # A separate name for the picture link keeps the page URL above intact
    # (the loop variable used to overwrite it).
    for img_src in img_list:
        # The scraped links have no scheme, so "https:" is prepended.
        img_url = 'https:' + img_src
        img_data = requests.get(url=img_url, headers=headers).content
        # The file name is the last path segment of the picture link.
        imgName = img_src.split('/')[-1]
        imgPath = f'./{file_name}/' + imgName
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
            print(imgName + '写入成功')

项目5:爬取段子网

with方式:

import requests
from lxml import etree

# 1. Page to crawl.
url = 'https://ishuo.cn/'
# 2. Send the request with a desktop Chrome User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}
# 3. Page source.
page_text = requests.get(url=url, headers=headers).text
# 4. Parse: every <li> under div#list holds both a joke's text and title.
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="list"]/ul/li')
# An Element returned by xpath() can itself be queried with xpath(); the
# relative expressions below are evaluated against that single <li>.
with open('./段子2.txt', 'w', encoding='utf-8') as fp:
    for item in li_list:
        body = item.xpath('./div[@class="content"]/text()')[0]
        heading = item.xpath('./div[@class="info"]/a/text()')[0]
        # 5. Persist one "title:content" entry followed by a blank line.
        fp.write(f"{heading}:{body}\n\n")
print("数据写入成功")

方式2:fp

import requests
from lxml import etree

# 1. Page to crawl.
url = 'https://ishuo.cn/'
# 2. Send the request with a desktop Chrome User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# 3. Page source.
page_text = response.text
# 4. Parse: every <li> under div#list holds both a joke's text and title.
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="list"]/ul/li')

# An Element returned by xpath() can itself be queried with xpath(); the
# relative expressions below are evaluated against that single <li>.
fp = open('./段子3.txt', 'w', encoding='utf-8')
try:
    for li in li_list:
        content = li.xpath('./div[@class="content"]/text()')[0]
        title = li.xpath('./div[@class="info"]/a/text()')[0]
        # 5. Persist one "title:content" entry followed by a blank line.
        fp.write(title + ":" + content + "\n\n")
finally:
    # Without "with", the handle must be closed explicitly -- also when an
    # item fails to parse -- otherwise the file is leaked.
    fp.close()
print("数据写入成功")

项目6:爬取诗词名句网(shicimingju.com)的《三国演义》这部小说

import requests
from bs4 import BeautifulSoup
# 1. Table-of-contents page of the book to download.
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
# 2. Headers sent with every request: a desktop Chrome User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.86 Safari/537.36'
    ),
}
def get_content(url):
    """Download one chapter page and return the text of its content block.

    Args:
        url: Address of the chapter page to fetch.

    Returns:
        The text of the first ``<div class="chapter_content">`` found on
        the page.
    """
    chapter_html = requests.get(url=url, headers=headers).text
    chapter_soup = BeautifulSoup(chapter_html, 'lxml')
    chapter_div = chapter_soup.find('div', class_='chapter_content')
    return chapter_div.text

# Fetch the table-of-contents page.
page_text = requests.get(url=url, headers=headers).text
# Parse it: each chapter is an <a> inside the ".book-mulu" list.
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')
# "with" guarantees the output file is flushed and closed, even when a
# chapter download fails part-way (the handle used to be left open).
with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
    for a in a_list:
        # Chapter title shown in the table of contents.
        title = a.string
        # The href is site-relative, so the host is prepended.
        content_url = 'http://www.shicimingju.com' + a['href']
        print(content_url)
        # Download the chapter body and append "title:content" to the file.
        content = get_content(content_url)
        fp.write(title + ':' + content + "\n\n\n")
        print('写完1条')
print("写完")

项目7:selenium

我的google浏览器版本号是86,因此我选择最新的驱动2.46

anaconda下安装selenium以及驱动安装: javascript:void(0)

将驱动和爬虫文件放在同一文件夹下面

用浏览器打开百度首页:

#编码流程
from selenium import webdriver
# Path of the ChromeDriver executable (kept next to this script).
driver_path = './chromedriver'
# Create the browser object that is driven through that executable.
bro = webdriver.Chrome(executable_path=driver_path)
# get() makes the browser request the given URL.
bro.get('https://www.baidu.com')

让百度执行一个词条的搜索:

#编程流程:
from selenium import webdriver
from time import sleep
# Create the browser object; executable_path points at the driver.
bro = webdriver.Chrome(executable_path='./chromedriver')
# get() makes the browser request the given URL.
bro.get('https://www.baidu.com')
sleep(1)
# Search for a term: locate the text box by its id and type the term in.
search_box = bro.find_element_by_id('kw')
search_box.send_keys('美元')
sleep(1)
# Locate the element with id "su" and click it.
bro.find_element_by_id('su').click()
sleep(3)
# Close the browser.
bro.quit()

无头浏览器的处理:

#编程流程:
from selenium import webdriver
# Location of the PhantomJS (headless browser) executable.
phantomjs_path = r'C:\Users\Administrator\Downloads\phantomjs-2.1.1-windows\bin\phantomjs.exe'
bro = webdriver.PhantomJS(executable_path=phantomjs_path)

# Load the page.
bro.get('https://www.baidu.com')
# Screenshot of the page as loaded.
bro.save_screenshot('./1.png')
# Locate the text box by its id and type the term into it.
search_box = bro.find_element_by_id('kw')
search_box.send_keys('日元')

# Second screenshot, taken after the text was entered.
bro.save_screenshot('./2.png')
bro.quit()

项目8:豆瓣电影分类排行榜==>喜剧片

#编程流程:
from selenium import webdriver
from time import sleep
bro = webdriver.PhantomJS(executable_path=r'C:\Users\Administrator\Downloads\phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = 'https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7%E7%89%87&type=24&interval_id=100:90&action='
try:
    bro.get(url)
    sleep(1)
    # Screenshot before scrolling.
    bro.save_screenshot('./98.png')
    # JavaScript that scrolls the page down to the bottom.
    js = 'window.scrollTo(0,document.body.scrollHeight)'
    # Have the browser object execute the JavaScript.
    bro.execute_script(js)
    sleep(1)
    # Screenshot after scrolling.
    bro.save_screenshot('./99.png')
    # page_source holds the page as the browser currently has it, i.e. after
    # the scroll above.
    page_text = bro.page_source
    print(page_text)
finally:
    # Always shut the headless browser down; otherwise the PhantomJS process
    # keeps running after the script ends.
    bro.quit()

标签：get,url,text,爬虫,content,headers,小白,page,综合
From： https://blog.51cto.com/u_11182673/5848921

相关文章

赞助商

阅读排行