
News Text Crawling and Classification (Code)


Contents

1 News text classification with scikit-learn (machine learning)
2 News text classification with an MLP (deep learning)
3 News text crawling

1 News text classification with scikit-learn (machine learning)

import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import time
 
 
def load_data(filepath):
    train_df = pd.read_csv(filepath)
    x_train, y_train = train_df['text'], train_df['label']
    x_train, x_test, y_train, y_test = \
        train_test_split(x_train, y_train, test_size=0.2)
    return x_train, x_test, y_train, y_test
 
 
def data_prep(x_train, y_train, x_test):
    tf_idf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    x_train = [" ".join(jieba.lcut(i)) for i in x_train]
    x_test = [" ".join(jieba.lcut(i)) for i in x_test]
    x_train = tf_idf.fit_transform(x_train)
    x_test = tf_idf.transform(x_test)
    selector = SelectKBest(f_classif, k=min(20000, x_train.shape[1]))
    selector.fit(x_train, y_train)
    x_train = selector.transform(x_train)  # keep the k best features (sparse float64)
    x_test = selector.transform(x_test)
    return x_train, x_test
 
 
def main():
    x_train, x_test, y_train, y_test = load_data("newses.csv")
    x_train, x_test = data_prep(x_train, y_train, x_test)
 
    start = time.time()
    gnb = GaussianNB()  # Gaussian naive Bayes (needs a dense array as input)
    print(f'1:{cross_val_score(gnb, x_train.toarray(), y_train, cv=10)}')
    gnb.fit(x_train.toarray(), y_train)
    answer_gnb = pd.Series(gnb.predict(x_test.toarray()))
    answer_gnb.to_csv("answer_gnb.csv", header=False, index=False)
    score_gnb = f1_score(y_test, answer_gnb, average='macro')
    print(f'F1_score_gnb:{score_gnb}')
    end = time.time()
    print(f'Time: {end - start}s')
 
    start = time.time()
    rc = RidgeClassifier()  # ridge regression classifier
    print(f'\n2:{cross_val_score(rc, x_train, y_train, cv=10)}')
    rc.fit(x_train, y_train)
    answer_rc = pd.Series(rc.predict(x_test))
    answer_rc.to_csv("answer_rc.csv", header=False, index=False)
    score_rc = f1_score(y_test, answer_rc, average='macro')
    print(f'F1_score_rc:{score_rc}')
    end = time.time()
    print(f'Time: {end - start}s')
 
    start = time.time()
    sv = svm.SVC()  # support vector machine
    print(f'\n3:{cross_val_score(sv, x_train, y_train, cv=10)}')
    sv.fit(x_train, y_train)
    answer_sv = pd.Series(sv.predict(x_test))
    answer_sv.to_csv("answer_sv.csv", header=False, index=False)
    score_sv = f1_score(y_test, answer_sv, average='macro')
    print(f'F1_score_sv:{score_sv}')
    end = time.time()
    print(f'Time: {end - start}s')
 
 
if __name__ == '__main__':
    main()
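
The script expects a newses.csv with a label column and a text column, the same schema the crawler in section 3 writes out. A minimal sketch of what such a file contains, with two made-up articles purely for illustration (running it would overwrite a real crawled file):

import pandas as pd

# Two hypothetical rows: an integer class label per article plus the raw,
# unsegmented Chinese text. The sample strings are invented placeholders.
sample = pd.DataFrame({
    "label": [0, 1],
    "text": ["央行发布最新货币政策报告", "新款芯片性能实测出炉"],
})
sample.to_csv("newses.csv", index=False)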

2 News text classification with an MLP (deep learning)

import numpy as np
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tensorflow.keras import models
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
 
 
def load_data(filepath):
    train_df = pd.read_csv(filepath)
    x_train, y_train = train_df['text'], train_df['label']
    x_train, x_test, y_train, y_test = \
        train_test_split(x_train, y_train, test_size=0.2)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)
    return x_train, x_val, x_test, y_train, y_val, y_test
 
 
def data_prep(x_train, y_train, x_val, x_test):
    tf_idf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    x_train = [" ".join(jieba.lcut(i)) for i in x_train]
    x_val = [" ".join(jieba.lcut(i)) for i in x_val]  # the validation set must be segmented too
    x_test = [" ".join(jieba.lcut(i)) for i in x_test]
    x_train = tf_idf.fit_transform(x_train)
    x_val = tf_idf.transform(x_val)
    x_test = tf_idf.transform(x_test)
    selector = SelectKBest(f_classif, k=min(20000, x_train.shape[1]))
    selector.fit(x_train, y_train)
    x_train = selector.transform(x_train)  # keep the k best features (sparse float64)
    x_val = selector.transform(x_val)
    x_test = selector.transform(x_test)
    return x_train, x_val, x_test
 
 
def main():
    x_train, x_val, x_test, y_train, y_val, y_test = load_data("newses.csv")
    x_train, x_val, x_test = data_prep(x_train, y_train, x_val, x_test)
    model = models.Sequential([
        Dropout(rate=0.2, input_shape=x_train.shape[1:]),  # x_train.shape[1:]:(20000,)
        Dense(units=64, activation='relu'),
        Dropout(rate=0.2),
        Dense(10, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
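    # sparse_categorical_crossentropy expects integer class labels (0-9 here),
    # matching the 10-unit softmax output layer.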
    history = model.fit(x_train.toarray(), y_train, epochs=100, verbose=0,
                        validation_data=(x_val.toarray(), y_val),
                        batch_size=128)
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(acc=history['val_accuracy'][-1],
                                                            loss=history['val_loss'][-1]))
    model.evaluate(x_test.toarray(), y_test)
    y_predict = model.predict(x_test.toarray())
    predicts = np.argmax(y_predict, axis=1)  # pick the class with the highest softmax probability
    print(f'Predicts:{predicts}')
    score = f1_score(y_test, predicts, average='macro')
    print(f'F1_score:{score}')
    model.save('News_mlp_model.h5')
 
 
if __name__ == '__main__':
    main()
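
Note that model.save persists only the network; the fitted TfidfVectorizer and SelectKBest vanish when the process exits, so the saved .h5 file cannot classify new raw text by itself. A sketch of one way to keep the whole pipeline, assuming data_prep is changed to also return tf_idf and selector, and using joblib for the pickling (the tfidf.pkl and selector.pkl filenames are invented here):

import jieba
import joblib
from tensorflow.keras import models

# At training time, after data_prep, persist the fitted objects:
#     joblib.dump(tf_idf, "tfidf.pkl")
#     joblib.dump(selector, "selector.pkl")

# Later, rebuild the full pipeline and classify one new article.
model = models.load_model("News_mlp_model.h5")
tf_idf = joblib.load("tfidf.pkl")
selector = joblib.load("selector.pkl")

raw_text = "一篇待分类的新闻正文"  # placeholder article text
features = selector.transform(tf_idf.transform([" ".join(jieba.lcut(raw_text))]))
print(model.predict(features.toarray()).argmax(axis=1)[0])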

3 News text crawling

import requests
from bs4 import BeautifulSoup
import pandas as pd
 
 
def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        print(url)
        return url  # return the URL itself so the caller can detect the failure
 
 
def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            s = "".join(s.split("\n"))
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        return e
 
 
def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.attrs["href"]
        if "shtml" == href[-5:] and href not in hrefs:
            hrefs.append(href)
    return hrefs
 
 
def get_newses(url, newses, labels, count):
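    # Collect article links from the channel front page, then fetch and parse each one.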
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        if html == href:
            continue
        news = parse_news_page(html)
        # print(news)
        if not news:  # parsing failed, skip this article
            continue
        newses.append(news)
        labels.append(count)
 
 
def main():
    newses = []
    labels = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    for count, url in enumerate(urls):  # the channel's index doubles as its class label
        print(url)
        get_newses(url, newses, labels, count)
    newses = pd.DataFrame({"label": labels, "text": newses})
    newses.to_csv("newses.csv", index=False)
 
 
if __name__ == '__main__':
    main()
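
One limitation worth noting: parse_href_page keeps a link only if the raw href string ends in shtml, so relative links on the listing pages are silently dropped. If you also want to follow those, urllib.parse.urljoin from the standard library can resolve them against the listing URL; a sketch with an extra base_url parameter, which is an adaptation rather than part of the original code:

from urllib.parse import urljoin

from bs4 import BeautifulSoup


def parse_href_page(html, hrefs, base_url):
    # base_url is new here: the listing page the links were found on.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all("a"):
        href = urljoin(base_url, tag.get("href", ""))  # resolves relative links
        if href.endswith("shtml") and href not in hrefs:
            hrefs.append(href)
    return hrefs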
