# Crawler: fetch Google 20 times
import requests
url = 'https://www.google.com'
for i in range(20):
    response = requests.get(url)
    print(f"Visit {i + 1}")
    print(f'Response status: {response.status_code}')
    print(f'Text content length: {len(response.text)}')
    print(f'Content length: {len(response.content)}')
    print(response.text)
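# A politer variant of the loop above is sketched here: it sends a browser-like
# User-Agent and pauses between requests so the server is not hammered. The
# header string, the timeout, and the one-second pause are illustrative
# assumptions, not values required by requests or by Google.
import time

def polite_get(target_url, pause=1.0):
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; practice-crawler)'}  # assumed UA string
    resp = requests.get(target_url, headers=headers, timeout=10)
    time.sleep(pause)  # brief pause between successive requests
    return resp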
# bs4 library practice
from bs4 import BeautifulSoup
import re
text = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
<h1>我的第一个标题</h1>
<p id="first">我的第一个段落。</p>
<table border="1">
<tr>
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr>
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
</body>
</html>
"""
# Create the BeautifulSoup object
soup = BeautifulSoup(text, features="html.parser")
# Print the head tag, plus the last two digits of the student ID
print(soup.head.prettify())
print("Student ID: 45\n")
# Get the body tag object
print(soup.body.prettify())
# Get the element with id "first"
first_p = soup.find(id="first")
print(first_p)
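# The returned Tag's attributes and text are also directly accessible; both
# calls below are standard bs4 Tag accessors.
print(first_p['id'])       # -> first
print(first_p.get_text())  # the paragraph's text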
# Extract and print the Chinese characters in the document
pattern = re.compile('[\u4e00-\u9fff]+')
chinese_chars = pattern.findall(text)
print(chinese_chars)
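# Two more lookups on the same soup, to round out the bs4 practice; find_all
# and select_one are both standard BeautifulSoup calls.
for td in soup.find_all('td'):
    print(td.string)               # text of each table cell
print(soup.select_one('p#first'))  # CSS-selector equivalent of find(id="first")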
# Crawl the 2017 Chinese university rankings
import requests
from bs4 import BeautifulSoup
import csv
all_univ = []  # collected rows: [rank, name, province/city, total score]
def get_html_text(url):
    # Fetch the page; return "" on any request failure.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ""
def fill_univ_list(soup):
    # Each ranking row is a <tr>; skip header/malformed rows with fewer than 5 cells.
    data = soup.find_all('tr')
    for tr in data:
        ltd = tr.find_all('td')
        if len(ltd) < 5:
            continue
        # Columns: rank, name (inside <a class="name-cn">), province/city, total score.
        single_univ = [ltd[0].string.strip(),
                       ltd[1].find('a', 'name-cn').string.strip(),
                       ltd[2].text.strip(),
                       ltd[4].string.strip()]
        all_univ.append(single_univ)
def print_univ_list(num):
file_name = "大学排行.csv"
print("{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}".format("排名", "学校名称", "省市", "总分", chr(12288)))
with open(file_name, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(["排名", "学校名称", "省市", "总分"])
for i in range(num):
u = all_univ[i]
writer.writerow(u)
print("{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}".format(u[0], u[1], u[2], u[3], chr(12288)))
def main(num):
url = "https://www.shanghairanking.cn/rankings/bcur/201711.html"
html = get_html_text(url)
soup = BeautifulSoup(html, features="html.parser")
fill_univ_list(soup)
print_univ_list(num)
main(20)
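# Optional sanity check, assuming the crawl ran and the CSV above was written
# (print_univ_list always creates the file): read it back and print each row.
with open("university_ranking.csv", encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)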