
News Text Crawling: 央广网 (cnr.cn) as an Example



crawling

crawling1.x

crawling1.0

import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_html_text(url):
    # Fetch the page and return its decoded HTML text; on failure, return the exception object.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # guess the charset from the page content
        return r.text
    except Exception as e:
        return e


def parse_news_page(html):
    # Join the <title> text and the text of every <p> tag into one news string.
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e


def main():
    # Crawl a single article and save its text as one row of a CSV file.
    url = "https://military.cnr.cn/jdt/20230203/t20230203_526143652.shtml"
    html = get_html_text(url)
    news = parse_news_page(html)
    print(news)
    news = pd.Series(news)
    news.to_csv("news.csv", header=False, index=False)


main()

Project: given the URL of one news article, crawl the news text.
Approach: use the BeautifulSoup library to find the page's title tag and all p tags, collect their text into a list, and finally save it as a CSV file.
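
As a quick sanity check, the saved file can be read back with pandas; a minimal usage sketch, assuming news.csv was just written by the script above:

import pandas as pd

# The CSV was written without a header row, so read it back the same way.
saved = pd.read_csv("news.csv", header=None)
print(saved.iloc[0, 0][:100])  # first 100 characters of the stored news text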

crawling2.x

crawling2.0

import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        return e


def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e


def parse_href_page(html, hrefs):
    # Collect every <a> href that ends with "shtml", i.e. the news article pages.
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:]:
            hrefs.append(href)
    return hrefs


def main():
    hrefs = []
    newses = []
    url = "http://military.cnr.cn/"
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        news = parse_news_page(html)
        print(news)
        newses.append(news)
    newses = pd.Series(newses)
    newses.to_csv("newses.csv", header=False, index=False)


main()

Project: crawl the news links from one channel page, then fetch the news text behind each link.
Approach: use the BeautifulSoup library to read the href of every a tag and keep only links satisfying "shtml" == href[-5:].
To be resolved: in the crawled output, the same news text appears twice in a row in several places.
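
As a side note, the suffix test can also be written with str.endswith, which avoids the fixed-length slice and reads a little more clearly; is_news_href below is a hypothetical helper, not part of the original script:

def is_news_href(href):
    # Same filter as "shtml" == href[-5:]: keep only links ending in shtml.
    return href.endswith("shtml")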

crawling2.1

import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        return e


def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e


def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs


def main():
    hrefs = []
    newses = []
    url = "http://military.cnr.cn/"
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        news = parse_news_page(html)
        print(news)
        newses.append(news)
    newses = pd.Series(newses)
    newses.to_csv("newses.csv", header=False, index=False)


main()

Improvement: skip news links that are already in hrefs.

if "shtml" == href[-5:] and href not in hrefs:
    hrefs.append(href)
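
With more links, the "href not in hrefs" membership test scans the whole list each time; a set gives constant-time lookups while the list keeps the original order. tag.attrs["href"] also raises a KeyError for an a tag without an href, which tag.get("href") avoids. A hedged sketch of parse_href_page along those lines (the seen set is an addition, not in the original code):

from bs4 import BeautifulSoup


def parse_href_page(html, hrefs):
    # Collect .shtml links without duplicates; a set mirrors hrefs for fast membership tests.
    seen = set(hrefs)
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("a"):
        href = tag.get("href")  # None instead of KeyError when the tag has no href
        if href and href.endswith("shtml") and href not in seen:
            seen.add(href)
            hrefs.append(href)
    return hrefs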

crawling3.x

crawling3.0

import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        return e


def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e


def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs


def get_newses(url, newses):
    # Collect the article links on one channel page, then fetch and parse each article.
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        news = parse_news_page(html)
        print(news)
        newses.append(news)


def main():
    newses = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    for url in urls:
        print(url)
        get_newses(url, newses)
    newses = pd.Series(newses)
    newses.to_csv("newses.csv", header=False, index=False)


main()

Project: crawl the news links from 10 channel pages, then fetch the news text behind each link.
Approach: loop over the 10 channel URLs.
To be resolved: some of the crawled news text is garbled (wrong encoding); some news links are incomplete; some news text contains spaces, tabs, newlines and other whitespace.
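
The incomplete links are most likely relative URLs on the channel page; urllib.parse.urljoin can resolve them against the channel URL before fetching. A minimal sketch, not part of the original script:

from urllib.parse import urljoin

# Resolve a possibly relative href against the channel page it was found on, e.g.
# absolutize("http://military.cnr.cn/", "/jdt/20230203/t20230203_526143652.shtml")
# -> "http://military.cnr.cn/jdt/20230203/t20230203_526143652.shtml"
def absolutize(base_url, href):
    return urljoin(base_url, href)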

crawling3.1

import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        print(url)
        return url


def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e


def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs


def get_newses(url, newses):
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue  # get_html_text returned the url itself, i.e. the fetch failed; skip this link
        news = parse_news_page(html)
        # print(news)
        newses.append(news)


def main():
    newses = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    for url in urls:
        print(url)
        get_newses(url, newses)
    newses = pd.Series(newses)
    newses.to_csv("newses.csv", header=False, index=False)


main()

Improvement: tried adding encoding="utf-8" (this did not solve the problem; the garbled text remains); to keep the workload manageable, broken news links are now skipped, so the saved file contains only news text.

def get_html_text(url):
    except Exception as e:
        return url
def get_newses(url, newses):
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue
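
For the remaining garbled text, one option worth trying is to hand the raw response bytes to BeautifulSoup and let its own encoding detection (UnicodeDammit) pick the charset, instead of relying on r.apparent_encoding. A hedged sketch; get_html_bytes is a hypothetical variant of get_html_text:

import requests
from bs4 import BeautifulSoup


def get_html_bytes(url):
    # Return the raw bytes; on failure, return the url itself, matching the skip logic above.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.content
    except Exception as e:
        print(e)
        print(url)
        return url


# BeautifulSoup accepts bytes and decodes them itself:
# soup = BeautifulSoup(get_html_bytes(href), "html.parser")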

crawling3.2

import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        print(url)
        return url


def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e


def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs


def get_newses(url, newses, labels, count):
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue
        news = parse_news_page(html)
        # print(news)
        newses.append(news)
        labels.append(count)  # the label is the index of the channel URL in urls


def main():
    newses = []
    labels = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    count = 0
    for url in urls:
        print(url)
        get_newses(url, newses, labels, count)
        count += 1
    newses = pd.DataFrame({"label": labels, "text": newses})
    newses.to_csv("newses.csv", index=False)


main()

Project: to support the machine-learning task of news text classification, attach a label to each crawled news item.
Question: before labels were added, when the list was converted to a Series and saved as CSV, some news items contained newlines and therefore occupied 3 lines in the file, yet after reading with read_csv() the order of the news items was still correct.
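
That behaviour is expected: to_csv wraps any field containing a newline in quotes, and read_csv treats the quoted field as a single value, so the embedded line breaks never shift the row order. A tiny round-trip demo:

import pandas as pd

s = pd.Series(["line one\nline two", "second item"])
s.to_csv("demo.csv", header=False, index=False)  # the first field is written quoted, spanning two physical lines
back = pd.read_csv("demo.csv", header=None)
print(len(back))        # 2 rows, not 3
print(back.iloc[0, 0])  # "line one\nline two" comes back as one value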

crawling3.3

import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        print(url)
        return url


def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            s = "".join(s.split("\n"))  # drop newline characters inside the paragraph text
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e


def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs


def get_newses(url, newses, labels, count):
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue
        news = parse_news_page(html)
        # print(news)
        newses.append(news)
        labels.append(count)


def main():
    newses = []
    labels = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    count = 0
    for url in urls:
        print(url)
        get_newses(url, newses, labels, count)
        count += 1
    newses = pd.DataFrame({"label": labels, "text": newses})
    newses.to_csv("newses.csv", index=False)


main()

Improvement: remove newline characters from the news text with s = "".join(s.split("\n")).
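
If spaces and tabs should go as well (part of the issue noted under crawling3.0), splitting on any whitespace handles all of them at once; a one-line alternative to the version above:

s = "".join(s.split())  # str.split() with no argument splits on any run of whitespace (spaces, tabs, newlines)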

From: https://www.cnblogs.com/yymqdu/p/17094059.html
