crawling
crawling1.x
crawling1.0
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_html_text(url):
    # Fetch a page and decode it with the encoding guessed from the content itself.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        return e

def parse_news_page(html):
    # Join the <title> text and the text of every <p> tag into one string.
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e

def main():
    url = "https://military.cnr.cn/jdt/20230203/t20230203_526143652.shtml"
    html = get_html_text(url)
    news = parse_news_page(html)
    print(news)
    news = pd.Series(news)
    news.to_csv("news.csv", header=False, index=False)

main()
Project: given the URL of a single news article, crawl the article text.
Approach: use the BeautifulSoup library to find the page's title tag and all p tags, store their text in a list, and finally save the result as a CSV file.
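As a quick sanity check (a minimal sketch, assuming news.csv was just written by the script above), the saved file can be read straight back with pandas:

import pandas as pd

# The file was written without a header row, so name the single column here.
saved = pd.read_csv("news.csv", header=None, names=["text"])
print(saved.shape)             # expected (1, 1): one article in one cell
print(saved["text"][0][:100])  # first 100 characters of the crawled text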
crawling2.x
crawling2.0
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        return e

def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e

def parse_href_page(html, hrefs):
    # Collect every <a href> on the listing page that ends with "shtml" (an article page).
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:]:
            hrefs.append(href)
    return hrefs

def main():
    hrefs = []
    newses = []
    url = "http://military.cnr.cn/"
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        news = parse_news_page(html)
        print(news)
        newses.append(news)
    newses = pd.Series(newses)
    newses.to_csv("newses.csv", header=False, index=False)

main()
Project: crawl the news links on one channel page, then fetch the news text behind each of those links.
Approach: use BeautifulSoup to read the href of every a tag and keep only links for which "shtml" == href[-5:].
To be solved: in the crawled output there are several places where two consecutive entries are the same news text.
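A likely cause of the duplicates is that the listing page links to the same article more than once (for example a thumbnail link and a headline link). A small diagnostic along these lines would confirm it; count_duplicate_hrefs is a hypothetical helper, not part of the script above:

from collections import Counter
from bs4 import BeautifulSoup

def count_duplicate_hrefs(html):
    # Return the article links that appear more than once on the listing page.
    soup = BeautifulSoup(html, "html.parser")
    hrefs = [tag.attrs["href"] for tag in soup.find_all("a") if "href" in tag.attrs]
    hrefs = [h for h in hrefs if h.endswith("shtml")]
    return {h: n for h, n in Counter(hrefs).items() if n > 1}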
crawling2.1
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        return e

def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e

def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs

def main():
    hrefs = []
    newses = []
    url = "http://military.cnr.cn/"
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        news = parse_news_page(html)
        print(news)
        newses.append(news)
    newses = pd.Series(newses)
    newses.to_csv("newses.csv", header=False, index=False)

main()
Improvement: do not collect news links that are already in hrefs.

if "shtml" == href[-5:] and href not in hrefs:
    hrefs.append(href)
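For a few dozen links the "href not in hrefs" membership test is perfectly adequate; if the list ever grows large, an order-preserving de-duplication via a dict is a common alternative (a sketch, not what the script above uses):

def dedupe_keep_order(hrefs):
    # dict keys are unique and keep insertion order (Python 3.7+), so this has the
    # same effect as the "href not in hrefs" check but with O(1) lookups.
    return list(dict.fromkeys(hrefs))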
crawling3.x
crawling3.0
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        return e

def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e

def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs

def get_newses(url, newses):
    # Crawl one channel page: collect its article links, then fetch each article's text.
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        news = parse_news_page(html)
        print(news)
        newses.append(news)

def main():
    newses = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    for url in urls:
        print(url)
        get_newses(url, newses)
    newses = pd.Series(newses)
    newses.to_csv("newses.csv", header=False, index=False)

main()
Project: crawl the news links from 10 channel pages, then fetch the news text behind each link.
Approach: loop over the 10 page URLs.
To be solved: some of the crawled news texts are garbled; some news links are incomplete; some news texts contain spaces, tabs, newlines and other whitespace.
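The "incomplete link" problem usually means the href is relative rather than absolute; urllib.parse.urljoin can resolve such a link against the channel page it came from. A sketch under that assumption (not applied in the scripts here):

from urllib.parse import urljoin

def absolutize(base_url, href):
    # A relative href such as "/jdt/20230203/t20230203_526143652.shtml" is resolved
    # against the channel URL; an absolute href passes through unchanged.
    return urljoin(base_url, href)

# absolutize("http://military.cnr.cn/", "/jdt/20230203/t20230203_526143652.shtml")
# -> "http://military.cnr.cn/jdt/20230203/t20230203_526143652.shtml"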
crawling3.1
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        print(url)
        return url

def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e

def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs

def get_newses(url, newses):
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue
        news = parse_news_page(html)
        # print(news)
        newses.append(news)

def main():
    newses = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    for url in urls:
        print(url)
        get_newses(url, newses)
    newses = pd.Series(newses)
    newses.to_csv("newses.csv", header=False, index=False)

main()
Improvements: tried adding encoding="utf-8" (this did not solve the problem; the garbled text is still there); given the workload, abnormal news links are simply dropped (everything saved is now actual news text).

def get_html_text(url):
    ...
    except Exception as e:
        ...
        return url

def get_newses(url, newses):
    ...
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue
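Another way to attack the remaining mojibake, untested against cnr.cn, is to hand the raw bytes to BeautifulSoup and let its own encoding detection pick the charset instead of relying on r.apparent_encoding:

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # Sketch only: passing bytes (r.content) lets BeautifulSoup detect the encoding itself.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return BeautifulSoup(r.content, "html.parser")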
crawling3.2
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        print(url)
        return url

def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e

def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs

def get_newses(url, newses, labels, count):
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue
        news = parse_news_page(html)
        # print(news)
        newses.append(news)
        labels.append(count)

def main():
    newses = []
    labels = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    count = 0
    for url in urls:
        print(url)
        get_newses(url, newses, labels, count)
        count += 1
    newses = pd.DataFrame({"label": labels, "text": newses})
    newses.to_csv("newses.csv", index=False)

main()
Project: to support the machine-learning task of news text classification, attach a label to each crawled news item.
Question: without labels, when the list was converted to a Series and saved as CSV, some news items had newlines in the middle and therefore occupied 3 lines in the stored file, yet after reading it back with read_csv() the order of the news items was not disturbed.
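What pandas is doing here is CSV quoting: with the default QUOTE_MINIMAL behaviour, a field that contains a newline is wrapped in double quotes, so it can span several physical lines in the file while read_csv() still parses it back as a single cell, which is why the order survives. A small round-trip demo, independent of the crawler:

import pandas as pd

s = pd.Series(["first item", "second\nitem\nwith newlines", "third item"])
s.to_csv("demo.csv", header=False, index=False)   # the middle field is written inside quotes
back = pd.read_csv("demo.csv", header=None)[0]
print(list(back))   # ['first item', 'second\nitem\nwith newlines', 'third item']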
crawling3.3
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        print(url)
        return url

def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            s = "".join(s.split("\n"))
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e

def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs

def get_newses(url, newses, labels, count):
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue
        news = parse_news_page(html)
        # print(news)
        newses.append(news)
        labels.append(count)

def main():
    newses = []
    labels = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    count = 0
    for url in urls:
        print(url)
        get_newses(url, newses, labels, count)
        count += 1
    newses = pd.DataFrame({"label": labels, "text": newses})
    newses.to_csv("newses.csv", index=False)

main()
Improvement: remove the newline characters from the news text, using s = "".join(s.split("\n")).
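A slightly broader clean-up, not what crawling3.3 does, would remove every kind of whitespace listed under crawling3.0 at once: str.split() with no argument splits on any run of spaces, tabs, \r and \n, so joining the pieces drops them all (including ordinary spaces inside the text):

def squeeze_whitespace(s):
    # "".join(s.split()) removes spaces, tabs and newlines alike.
    return "".join(s.split())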