Assignment ①
Requirement: use the requests and BeautifulSoup libraries to crawl the given site (http://www.shanghairanking.cn/rankings/bcur/2020) and print the scraped university-ranking information to the screen.
Output:
Rank | School | Province/City | Type | Total Score |
---|---|---|---|---|
1 | Tsinghua University | Beijing | Comprehensive | 852.5 |
2 | ... | ... | ... | ... |
Code
```python
import requests
import bs4
from bs4 import BeautifulSoup


def crawl():
    url = "https://www.shanghairanking.cn/rankings/bcur/2020"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"}
    rows = []
    try:
        # Send the UA header with the request (it was defined but unused before).
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        res.encoding = res.apparent_encoding
        html = res.text
    except requests.RequestException as err:
        print(f"request failed: {err}")
        return
    soup = BeautifulSoup(html, "html.parser")
    # Each <tr> under <tbody> is one university; skip bare text nodes.
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):
            a = tr('a')     # the school name sits inside the first <a>
            tds = tr('td')  # rank, name, province, type, score cells
            rows.append([tds[0].text.strip(), a[0].string.strip(), tds[2].text.strip(),
                         tds[3].text.strip(), tds[4].text.strip()])
    form = "{0:<15}\t{1:<15}\t{2:<15}\t{3:<15}\t{4:<15}"
    print(form.format("排名", "学校名称", "省份", "学校类型", "总分"))
    for row in rows[:28]:  # print the first 28 entries, as in the original run
        print(form.format(*row))


if __name__ == '__main__':
    crawl()
```
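As an aside, the same rows can be pulled out with BeautifulSoup's CSS-selector interface instead of walking `tbody.children`; a minimal sketch, assuming the `soup` object built in the listing above:

```python
# select() returns only element nodes, so no isinstance() check is needed.
for tr in soup.select('tbody tr'):
    cells = [td.get_text(strip=True) for td in tr.select('td')]
    name = tr.select_one('td a').get_text(strip=True)  # school name in first <a>
    print(cells[0], name, cells[2], cells[3], cells[4])
```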
Run screenshot
Reflections
- This one went smoothly.
Assignment ②
Requirement: use the requests and re libraries to design a directed price-comparison crawler for an online shop of your own choosing; crawl the shop's search results for the keyword “书包” (schoolbag) and extract each product's name and price.
Output:
No. | Price | Product Name |
---|---|---|
1 | 65.00 | xxx |
2 | ... | ... |
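Note: the listing below drives a real browser with pyppeteer and parses the rendered page with BeautifulSoup rather than using requests + re directly, because Taobao hides its search results behind a login wall and anti-bot checks (see the reflections below). On a shop that returns plain HTML, the requests + re pattern named in the requirement would look roughly like this sketch; the URL and both regular expressions are placeholders, not a real shop's field names:

```python
import re
import requests

# Hypothetical shop that embeds price/title fields in the search-page HTML.
html = requests.get('https://example-shop.com/search?q=书包', timeout=10).text
prices = re.findall(r'"view_price":"([\d.]+)"', html)  # placeholder field name
titles = re.findall(r'"raw_title":"(.*?)"', html)      # placeholder field name
for i, (price, title) in enumerate(zip(prices, titles), start=1):
    print(i, price, title)
```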
Code
```python
from asyncio import sleep, get_event_loop
from random import random

import bs4
from pyppeteer import launch


class TaoBaoSpider:
    def __init__(self):
        self.width, self.height = 1500, 800
        # Run the whole crawl in one event loop: open the browser, log in,
        # search, then parse the result page.
        get_event_loop().run_until_complete(self.run())

    async def run(self):
        await self.init()
        await self.login()
        await self.search()
        await self.parse_page()

    async def init(self):
        # A visible (non-headless) browser is less likely to be flagged.
        self.browser = await launch(
            headless=False,
            args=['--disable-infobars', f'--window-size={self.width},{self.height}'])
        self.page = await self.browser.newPage()
        await self.page.setViewport({'width': self.width, 'height': self.height})
        await self.page.goto(
            'https://login.taobao.com/member/login.jhtml?redirectURL=https://www.taobao.com/')
        # Hide the webdriver flag that Taobao's anti-bot script checks.
        await self.page.evaluate(
            '()=>{Object.defineProperties(navigator,{webdriver:{get:()=>false}})}')

    @staticmethod
    async def login():
        # Leave 20 seconds to scan the QR code by hand.
        await sleep(20)

    @property
    def sleep_time(self):
        # Randomized 1-4 s pause to mimic human pacing.
        return 1 + random() * 3

    async def search(self):
        await self.page.click('#q')
        await sleep(self.sleep_time)
        # Type the keyword one character at a time, like a person would.
        for char in '书包':
            await self.page.keyboard.type(char)
            await sleep(self.sleep_time)
        await sleep(self.sleep_time)
        await self.page.click('#J_TSearchForm > div.search-button > button')
        await sleep(self.sleep_time)

    async def parse_page(self):
        uinfo = []
        await sleep(self.sleep_time)
        data = await self.page.content()
        await sleep(self.sleep_time)
        bs = bs4.BeautifulSoup(data, "html.parser")
        try:
            # These class names are generated by Taobao's frontend build and
            # change over time; they matched the page at the time of writing.
            tlt = bs.find_all("div", class_='Title--descWrapper--HqxzYq0')
            plt = bs.find_all("div", class_='Price--priceWrapper--Q0Dn7pN')
            names = [div.find('span').get_text() for div in tlt]
            prices = []
            for div in plt:
                integer = div.find('span', class_='Price--priceInt--ZlsSi_M').get_text()
                decimal = div.find('span', class_='Price--priceFloat--h2RR0RK').get_text()
                prices.append(integer + decimal)
            for i, (name, price) in enumerate(zip(names, prices), start=1):
                uinfo.append([i, price, name])
        except Exception as err:
            print(err)
        tplt = "{0:^5}\t{1:^10}\t{2:^20}"
        print(tplt.format("序号", "价格", "商品名称"))
        for item in uinfo:
            print(tplt.format(*item))
        await sleep(self.sleep_time)


if __name__ == '__main__':
    TaoBaoSpider()
```
Run screenshot
Reflections
Taobao has anti-crawling protection; I worked from a reference link.
Login requires scanning a QR code on your phone; even while mimicking human behaviour the run can fail, and once the account has been flagged many times it may become very hard to get through.
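One way to avoid re-scanning the QR code on every run is to reuse a browser profile between runs so the login cookie persists; a sketch, using pyppeteer's `userDataDir` launch option with an arbitrary local path (`./taobao_profile` is an assumption):

```python
from asyncio import get_event_loop
from pyppeteer import launch


async def open_browser():
    # Reusing a Chrome profile directory keeps the Taobao login cookie
    # between runs; './taobao_profile' is an arbitrary local path (assumption).
    return await launch(headless=False,
                        userDataDir='./taobao_profile',
                        args=['--disable-infobars'])


browser = get_event_loop().run_until_complete(open_browser())
```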
Assignment ③
Requirement: crawl all JPEG- and JPG-format files from a given page (https://xcb.fzu.edu.cn/info/1071/4481.htm) or a page of your choice.
Output: save all JPEG and JPG files from the chosen page into a single folder.
Code
```python
import os
from urllib import parse

import requests
from lxml import etree


def crawl():
    url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.36"}
    response = requests.get(url=url, headers=headers)
    tree = etree.HTML(response.text)
    # Each image in the article body sits in <p class="vsbcontent_img"><img src=...>.
    src_list = tree.xpath('//div//p[@class="vsbcontent_img"]/img/@src')
    os.makedirs("./img3", exist_ok=True)
    for src in src_list:
        # Keep only JPEG/JPG files, as the assignment asks.
        if not src.lower().endswith(('.jpg', '.jpeg')):
            continue
        img_name = os.path.basename(src)   # filename part of the path
        img_url = parse.urljoin(url, src)  # resolve the relative src against the page URL
        img_data = requests.get(url=img_url, headers=headers).content
        with open(os.path.join("./img3", img_name), 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!')


if __name__ == '__main__':
    crawl()
```
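For larger images, streaming the response avoids holding the whole file in memory; a sketch of the same download step using requests' `stream`/`iter_content` options, with the `img_url`, `img_path`, and `headers` names taken from the listing above:

```python
import requests


def download(img_url: str, img_path: str, headers: dict) -> None:
    # Stream the image to disk in 8 KB chunks instead of buffering it whole.
    with requests.get(img_url, headers=headers, stream=True, timeout=10) as r:
        r.raise_for_status()
        with open(img_path, 'wb') as fp:
            for chunk in r.iter_content(chunk_size=8192):
                fp.write(chunk)
```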
Run screenshot