(1) Use the get() function of the requests library to access the website below 20 times. Print the returned status code and the text content, and compute the lengths of the page content returned by the text attribute and the content attribute.
import requests
from bs4 import BeautifulSoup

url = 'https://baidu.com'

# Access the site 20 times and print the status code of each response
for i in range(20):
    try:
        r = requests.get(url)
        print(r.status_code)
    except:
        print(404)

# Fetch the page once more for the content analysis
r = requests.get(url)
r.encoding = 'utf-8'
print("Content returned by the text attribute: ", end="")
print(r.text)
print("-" * 20)
print("Content returned by the content attribute: ", end="")
print(r.content)
print("-" * 20)
# Lengths required by the task: text is the decoded str, content is the raw bytes
print("Length of r.text: {}".format(len(r.text)))
print("Length of r.content: {}".format(len(r.content)))
print("-" * 20)

soup = BeautifulSoup(r.text, "html.parser")
print("Type of soup: {}".format(type(soup)))
print("-" * 20)
str1 = soup.find_all('title')
str2 = soup.find_all('input')
str3 = soup.find_all('a')
str4 = str(str1) + str(str2) + str(str3)
print("Contents of the title tags: {}".format(str1))
print("Contents of the input tags: {}".format(str2))
print("Contents of the a tags: {}".format(str3))
print("Length of the extracted content: {}".format(len(str4)))
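If the goal is only the length comparison the task asks for, the loop itself can collect the numbers. A minimal sketch (assuming the same URL and that the responses are small enough to hold in memory):

import requests

url = 'https://baidu.com'
for i in range(20):
    try:
        r = requests.get(url, timeout=10)
        r.encoding = 'utf-8'
        # r.text is the decoded str, r.content is the raw bytes, so the two
        # lengths usually differ when the page contains multi-byte characters
        print(i + 1, r.status_code, len(r.text), len(r.content))
    except requests.RequestException as e:
        print(i + 1, "request failed:", e)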
(3) Below is a simple HTML page. Store it as a string and complete the tasks that follow.
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>菜鸟教程(runoob.com)</title>
</head>
<body>
    <h1>我的第一个标题</h1>
    <p id="first">我的第一个段落。</p>
</body>
<table border="1">
    <tr>
        <td>row 1, cell 1</td>
        <td>row 1, cell 2</td>
    </tr>
    <tr>
        <td>row 2, cell 1</td>
        <td>row 2, cell 2</td>
    </tr>
</table>
</html>
Requirements:
a. Print the contents of the head tag and the last two digits of your student ID.
b. Get the contents of the body tag.
c. Get the tag object whose id is "first".
d. Get and print the Chinese characters in the HTML page.
import re
from bs4 import BeautifulSoup

text = '''<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>菜鸟教程(runoob.com)</title>
</head>
<body>
    <h1>我的第一个标题</h1>
    <p id="first">我的第一个段落。</p>
    <table border="1">
        <tr>
            <td>row 1, cell 1</td>
            <td>row 1, cell 2</td>
        </tr>
        <tr>
            <td>row 2, cell 1</td>
            <td>row 2, cell 2</td>
        </tr>
    </table>
</body>
</html>'''

soup = BeautifulSoup(text, "html.parser")
print(type(soup))
print("-" * 30)
print("Contents of the head tag: {}".format(soup.head))
print("-" * 30)
print("Contents of the body tag: {}".format(soup.body))
print("-" * 30)
print("Tag object with id 'first': {}".format(soup.find(id="first")))
print("-" * 30)
print("All Chinese characters in the page: {}".format(re.findall('[\u4e00-\u9fff]+', soup.text)))
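The same lookups can also be written with CSS selectors, which bs4 exposes through select() and select_one(). A minimal sketch that reuses the soup object and the re import from the snippet above:

# CSS-selector versions of requirements b and c (continues from the code above)
body_tag = soup.select_one("body")       # same result as soup.body
first_p = soup.select_one("#first")      # tag whose id is "first"
print(first_p.get_text())                # prints the paragraph text

# Requirement d again, joining the matched fragments into one string
chinese = ''.join(re.findall('[\u4e00-\u9fff]+', soup.get_text()))
print(chinese)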
(4) Crawl the contents of the Chinese university ranking site:
https://www.shanghairanking.cn/rankings/bcur/201811
Requirements: crawl the university ranking for the year determined by the last digit of your student ID: ending in 1 or 2, crawl the 2020 ranking; 3 or 4, the 2016 ranking; 5 or 6, the 2017 ranking; 7 or 8, the 2018 ranking; 9 or 0, the 2019 ranking.
# e23.1CrawUnivRanking.py
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

allUniv = []

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return ""

def fillUnivList(soup):
    # Walk every table row and collect the cell texts for one university
    data = soup.find_all('tr')
    for tr in data:
        ltd = tr.find_all('td')
        if len(ltd) == 0:
            continue
        singleUniv = []
        for td in ltd:
            temp = re.findall('[\u4e00-\u9fff]+', str(td))
            if td.string is not None and td.string != "[]":
                singleUniv.append(td.string)
            if temp:
                # Join all Chinese fragments found inside the cell
                singleUniv.append(''.join(temp))
        allUniv.append(singleUniv)
    return allUniv

def printUnivList(num):
    print("{:^5}{:^4}{:^5}{:^10}{:^10}".format("排名", "学校名称", "省市", "类型", "总分"))
    for i in range(num):
        u = allUniv[i]
        # The slices below strip surrounding whitespace/markup fragments;
        # they are tied to the current page layout and are fragile
        u[0] = u[0][29:31]
        u[1] = u[1][:4]
        u[4] = u[4][25:31]
        print("{:^5} {:^4}{:^5}{:^10}{:^10}".format(u[0], u[1], u[2], u[3], u[4]))

def main(flag):
    url = 'https://www.shanghairanking.cn/rankings/bcur/201711'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    list1 = fillUnivList(soup)
    if flag == 0:
        printUnivList(10)
    else:
        return list1

# Pull out the element at position `count` from every nested list
def combination(list1, count):
    list2 = []
    for i in list1:
        list2.append(i[count])
    return list2

main(0)
list1 = main(1)

# Reshape the scraped rows into a DataFrame and save them to CSV
def deal_data(list1):
    data = pd.DataFrame({
        "排名": combination(list1, 0),
        "学校名称": combination(list1, 1),
        "省市": combination(list1, 2),
        "类型": combination(list1, 3),
        "总分": combination(list1, 4)
    })
    return data

data = deal_data(list1)
data.to_csv('University_grade.csv', index=False)
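The requirement keys the ranking year to the last digit of the student ID, while main() hard-codes one URL. A small helper could build the URL from the ID; note that the "<year>11" suffix in the path (as in the sample URL 201811 above) is an assumption about how the site encodes the year, so verify it against the actual pages before relying on it:

# Hypothetical helper: pick the ranking year from the last digit of the
# student ID, per the assignment table, and build the URL.
# The "<year>11" path suffix is assumed from the sample URL given above.
def ranking_url(student_id):
    year_by_digit = {
        '1': 2020, '2': 2020,
        '3': 2016, '4': 2016,
        '5': 2017, '6': 2017,
        '7': 2018, '8': 2018,
        '9': 2019, '0': 2019,
    }
    year = year_by_digit[student_id[-1]]
    return 'https://www.shanghairanking.cn/rankings/bcur/{}11'.format(year)

# Example: a student ID ending in 7 maps to the 2018 ranking page
print(ranking_url('2021007'))

As a side note on the CSV step: passing encoding='utf-8-sig' to to_csv() helps Excel display the Chinese column values correctly.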