
102102128汪伟杰


Assignment ①

Requirement: use the requests and BeautifulSoup libraries to perform a directed crawl of the given URL (http://www.shanghairanking.cn/rankings/bcur/2020) and print the scraped university-ranking information to the screen.

Expected output:

Rank  School Name  Province/City  Type  Total Score
1  Tsinghua University  Beijing  Comprehensive  852.5
2  ...  ...  ...  ...

Code

import requests
import bs4
from bs4 import BeautifulSoup

def crawl():
    url = "https://www.shanghairanking.cn/rankings/bcur/2020"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"}
    ulist = []  # one [rank, name, province, type, score] entry per school
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        res.encoding = res.apparent_encoding
        html = res.text
    except requests.RequestException as err:
        print("request failed:", err)
        return

    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):
            a = tr('a')    # calling a Tag is shorthand for tr.find_all('a')
            tds = tr('td')
            ulist.append([tds[0].text.strip(), a[0].string.strip(), tds[2].text.strip(),
                          tds[3].text.strip(), tds[4].text.strip()])

    form = "{0:<15}\t{1:<15}\t{2:<15}\t{3:<15}\t{4:<15}"
    print(form.format("Rank", "School Name", "Province/City", "Type", "Score"))
    for row in ulist[:28]:  # print the first 28 entries
        print(form.format(*row))

if __name__ == '__main__':
    crawl()

Run screenshot

Reflections

  • Went smoothly: the page is static, so requests plus BeautifulSoup were all it took.
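
As a side note, the same rows can be located with CSS selectors instead of walking tbody.children. A minimal sketch, assuming (unverified against the live site) that every ranking row is a tbody tr whose school name sits in a td a link:

# Hypothetical alternative parse step; the selector strings are assumptions
# about the 2020 page structure, not taken from the original post.
from bs4 import BeautifulSoup

def parse_rows(html: str):
    soup = BeautifulSoup(html, "html.parser")
    rows = []
    for tr in soup.select("tbody tr"):               # one tr per school
        tds = [td.get_text(strip=True) for td in tr.select("td")]
        name = tr.select_one("td a")                 # school-name link
        if name and len(tds) >= 5:
            rows.append([tds[0], name.get_text(strip=True),
                         tds[2], tds[3], tds[4]])
    return rows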

Assignment ②

Requirement: use the requests and re libraries to design a directed price-comparison crawler for an online store of your own choice; search the store for the keyword "书包" (schoolbag) and scrape the product names and prices from the results page.

Expected output:

No.  Price  Product Name
1  65.00  xxx
2  ...  ...

Code

from asyncio import sleep, get_event_loop
from pyppeteer import launch
from random import random
import bs4


class TaoBaoSpider:
    def __init__(self):
        self.width, self.height = 1500, 800
        get_event_loop().run_until_complete(self.init())
        get_event_loop().run_until_complete(self.login())
        get_event_loop().run_until_complete(self.search())
        get_event_loop().run_until_complete(self.parsePage())

    async def init(self):
        # noinspection PyAttributeOutsideInit
        self.browser = await launch(headless=False,
                                    args=['--disable-infobars', f'--window-size={self.width},{self.height}'])
        # noinspection PyAttributeOutsideInit
        self.page = await self.browser.newPage()
        await self.page.setViewport({'width': self.width, 'height': self.height})
        await self.page.goto('https://login.taobao.com/member/login.jhtml?redirectURL=https://www.taobao.com/')
        # Hide the webdriver flag that browser automation normally exposes,
        # so Taobao's bot detection is less likely to reject the session.
        await self.page.evaluate('()=>{Object.defineProperties(navigator,{webdriver:{get:()=>false}})}')

    @staticmethod
    async def login():
        # Give the user 20 s to scan the QR code on their phone and log in.
        await sleep(20)

    @property
    def sleep_time(self):
        # Random 1-4 s delay so clicks and keystrokes have a human cadence.
        return 1 + random() * 3

    async def search(self):
        await self.page.click('#q')  # focus the search box
        await sleep(self.sleep_time)
        input_text = '书包'
        for char in input_text:  # type one character at a time, like a person
            await self.page.keyboard.type(char)
            await sleep(self.sleep_time)
        await sleep(self.sleep_time)
        await self.page.click('#J_TSearchForm > div.search-button > button')
        await sleep(self.sleep_time)

    async def parsePage(self):
        uinfo = []
        await sleep(self.sleep_time)
        data = await self.page.content()
        await sleep(self.sleep_time)
        bs = bs4.BeautifulSoup(data, "html.parser")
        try:
            # The hash-suffixed class names below were read off the rendered
            # search page; they change whenever Taobao redeploys its front end.
            tlt = bs.find_all("div", class_='Title--descWrapper--HqxzYq0')
            plt = bs.find_all("div", class_='Price--priceWrapper--Q0Dn7pN')
            tlt1 = []
            plt1 = []
            for div in tlt:
                tlt1.append(div.find('span').get_text())
            for div in plt:
                # Integer and decimal parts of a price sit in separate spans.
                price = div.find('span', class_='Price--priceInt--ZlsSi_M').get_text()
                decimal = div.find('span', class_='Price--priceFloat--h2RR0RK').get_text()
                plt1.append(price + decimal)
            for i in range(len(tlt1)):
                uinfo.append([i + 1, tlt1[i], plt1[i]])
        except Exception as err:
            print(err)

        tplt = "{0:^5}\t{1:^10}\t{2:^20}"
        print(tplt.format("No.", "Price", "Product Name"))
        for i in uinfo:
            print(tplt.format(i[0], i[2], i[1]))

        await sleep(self.sleep_time)

if __name__ == '__main__':
    TaoBaoSpider()

Run screenshot

Reflections

Taobao has anti-crawling measures; the workaround follows a reference linked in the original post.

You have to log in by scanning a QR code with your phone, and the script has to mimic human behavior. Even then it fails with some probability, and once your account has been flagged many times it becomes very hard to get through.
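
The fragile step is the manual QR scan. One common mitigation, which is my addition and not part of the original workflow, is to save the session cookies after the first successful login and re-inject them on later runs. A minimal pyppeteer sketch (the cache path is hypothetical):

import json
from pathlib import Path

COOKIE_FILE = Path('taobao_cookies.json')  # hypothetical cache location

async def save_cookies(page):
    # Dump the session cookies right after a successful QR login.
    COOKIE_FILE.write_text(json.dumps(await page.cookies()))

async def restore_cookies(page):
    # Re-inject saved cookies so later runs can skip the QR scan,
    # at least until Taobao expires the session server-side.
    if not COOKIE_FILE.exists():
        return False
    for cookie in json.loads(COOKIE_FILE.read_text()):
        await page.setCookie(cookie)
    return True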

Assignment ③

Requirement: crawl all JPEG- and JPG-format files from a given page (https://xcb.fzu.edu.cn/info/1071/4481.htm) or a page of your own choice.

Expected output: all JPEG and JPG files from the chosen page, saved into a single folder.

Code

import requests
from lxml import etree
import os

def crawl():
    url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"}
    response = requests.get(url=url, headers=headers)
    page_text = response.text
    tree = etree.HTML(page_text)
    # Each image in the article body sits inside <p class="vsbcontent_img">.
    src_list = tree.xpath('//div//p[@class="vsbcontent_img"]')
    if not os.path.exists("./img3"):
        os.mkdir("./img3")

    for src in src_list:
        src = src.xpath('./img/@src')[0]
        # Crude filename recovery: split the relative src on '.' and '/',
        # then stitch an identifier plus the real extension back together.
        string1 = src.replace(".", " ").replace("/", " ").split()
        img_name = f"{string1[-3]}.{string1[-1]}"
        src = "https://xcb.fzu.edu.cn" + src  # the src is site-relative
        img_data = requests.get(url=src, headers=headers).content
        img_path = './img3/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(img_name, 'downloaded successfully!')

if __name__ == '__main__':
    crawl()

Run screenshot


Reflections

XPath made this very convenient. The images carry no explicit name, so I split the scraped src into several parts: one part serves as the file name, and another identifies whether the format is jpg, png, or something else.
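
For reference, the standard library can do that split more robustly than manual replace/split string surgery. A small sketch (the helper name and fallback counter are mine):

import os
from urllib.parse import urlparse

def filename_from_src(src: str, index: int = 0) -> str:
    # Hypothetical helper: urlparse drops any query string, basename keeps
    # the last path segment, and splitext separates name from extension.
    path = urlparse(src).path
    name, ext = os.path.splitext(os.path.basename(path))
    if not name:                  # odd path: fall back to a counter
        name = f"img_{index}"
    return name + ext.lower()     # the extension tells jpg from png

For example, filename_from_src('/__local/A/B/photo.JPG') returns 'photo.jpg'.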
