1 Machine learning: news text classification with sklearn
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import time
def load_data(filepath):
    train_df = pd.read_csv(filepath)
    x_train, y_train = train_df['text'], train_df['label']
    x_train, x_test, y_train, y_test = \
        train_test_split(x_train, y_train, test_size=0.2)
    return x_train, x_test, y_train, y_test
def data_prep(x_train, y_train, x_test):
    tf_idf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    x_train = [" ".join(jieba.lcut(i)) for i in x_train]  # segment Chinese text with jieba
    x_test = [" ".join(jieba.lcut(i)) for i in x_test]
    x_train = tf_idf.fit_transform(x_train)
    x_test = tf_idf.transform(x_test)
    selector = SelectKBest(f_classif, k=min(20000, x_train.shape[1]))
    selector.fit(x_train, y_train)
    x_train = selector.transform(x_train)  # sparse matrix of numpy.float64
    x_test = selector.transform(x_test)
    return x_train, x_test
def main():
    x_train, x_test, y_train, y_test = load_data("newses.csv")
    x_train, x_test = data_prep(x_train, y_train, x_test)

    start = time.time()
    gnb = GaussianNB()  # Naive Bayes (needs dense input, hence .toarray())
    print(f'1:{cross_val_score(gnb, x_train.toarray(), y_train, cv=10)}')
    gnb.fit(x_train.toarray(), y_train)
    answer_gnb = pd.Series(gnb.predict(x_test.toarray()))
    answer_gnb.to_csv("answer_gnb.csv", header=False, index=False)
    score_gnb = f1_score(y_test, answer_gnb, average='macro')
    print(f'F1_score_gnb:{score_gnb}')
    end = time.time()
    print(f'Time: {end - start}s')

    start = time.time()
    rc = RidgeClassifier()  # ridge regression classifier
    print(f'\n2:{cross_val_score(rc, x_train, y_train, cv=10)}')
    rc.fit(x_train, y_train)
    answer_rc = pd.Series(rc.predict(x_test))
    answer_rc.to_csv("answer_rc.csv", header=False, index=False)
    score_rc = f1_score(y_test, answer_rc, average='macro')
    print(f'F1_score_rc:{score_rc}')
    end = time.time()
    print(f'Time: {end - start}s')

    start = time.time()
    sv = svm.SVC()  # support vector machine
    print(f'\n3:{cross_val_score(sv, x_train, y_train, cv=10)}')
    sv.fit(x_train, y_train)
    answer_sv = pd.Series(sv.predict(x_test))
    answer_sv.to_csv("answer_sv.csv", header=False, index=False)
    score_sv = f1_score(y_test, answer_sv, average='macro')
    print(f'F1_score_sv:{score_sv}')
    end = time.time()
    print(f'Time: {end - start}s')
if __name__ == "__main__":
    main()
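One caveat in the script above: GaussianNB cannot consume scipy sparse matrices, which is why every call passes x_train.toarray(); with up to 20,000 TF-IDF features that dense copy is memory-hungry. A common alternative for text data is MultinomialNB, which accepts the sparse matrix directly. A minimal sketch (my substitution, not part of the original post; it reuses the variables from main()):

from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()  # suited to term-frequency style features
mnb.fit(x_train, y_train)  # sparse input accepted, no .toarray() needed
answer_mnb = pd.Series(mnb.predict(x_test))
print(f'F1_score_mnb:{f1_score(y_test, answer_mnb, average="macro")}')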
2 Deep learning: news text classification with an MLP
import numpy as np
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tensorflow.keras import models
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
def load_data(filepath):
    train_df = pd.read_csv(filepath)
    x_train, y_train = train_df['text'], train_df['label']
    x_train, x_test, y_train, y_test = \
        train_test_split(x_train, y_train, test_size=0.2)
    x_train, x_val, y_train, y_val = \
        train_test_split(x_train, y_train, test_size=0.2)
    return x_train, x_val, x_test, y_train, y_val, y_test
def data_prep(x_train, y_train, x_val, x_test):
    tf_idf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    x_train = [" ".join(jieba.lcut(i)) for i in x_train]
    x_val = [" ".join(jieba.lcut(i)) for i in x_val]  # the validation set must be segmented too
    x_test = [" ".join(jieba.lcut(i)) for i in x_test]
    x_train = tf_idf.fit_transform(x_train)
    x_val = tf_idf.transform(x_val)
    x_test = tf_idf.transform(x_test)
    selector = SelectKBest(f_classif, k=min(20000, x_train.shape[1]))
    selector.fit(x_train, y_train)
    x_train = selector.transform(x_train)  # sparse matrix of numpy.float64
    x_val = selector.transform(x_val)
    x_test = selector.transform(x_test)
    return x_train, x_val, x_test
def main():
    x_train, x_val, x_test, y_train, y_val, y_test = load_data("newses.csv")
    x_train, x_val, x_test = data_prep(x_train, y_train, x_val, x_test)
    model = models.Sequential([
        Dropout(rate=0.2, input_shape=x_train.shape[1:]),  # x_train.shape[1:] == (20000,)
        Dense(units=64, activation='relu'),
        Dropout(rate=0.2),
        Dense(10, activation='softmax')  # 10 news categories
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(x_train.toarray(), y_train, epochs=100, verbose=0,
                        validation_data=(x_val.toarray(), y_val),
                        batch_size=128)
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(acc=history['val_accuracy'][-1],
                                                            loss=history['val_loss'][-1]))
    model.evaluate(x_test.toarray(), y_test)
    y_predict = model.predict(x_test.toarray())
    predicts = [np.argmax(i) for i in y_predict]  # pick the highest-probability class per sample
    print(f'Predicts:{predicts}')
    score = f1_score(y_test, predicts, average='macro')
    print(f'F1_score:{score}')
    model.save('News_mlp_model.h5')
if __name__ == "__main__":
    main()
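Note that model.save only persists the network; classifying new articles later also requires the fitted TfidfVectorizer and SelectKBest. A minimal sketch of persisting and reloading the whole pipeline with joblib (the filenames are hypothetical, and it assumes data_prep is modified to also return tf_idf and selector):

import joblib
import jieba
import numpy as np
from tensorflow.keras.models import load_model

# Persist the preprocessing objects alongside the network (hypothetical filenames).
joblib.dump(tf_idf, "tf_idf.joblib")
joblib.dump(selector, "selector.joblib")

# Later, in a fresh process: restore everything and classify one new article.
model = load_model("News_mlp_model.h5")
tf_idf = joblib.load("tf_idf.joblib")
selector = joblib.load("selector.joblib")
segmented = " ".join(jieba.lcut("这里是一段新的新闻正文"))  # a new (Chinese) article to classify
x_new = selector.transform(tf_idf.transform([segmented]))
print(np.argmax(model.predict(x_new.toarray()), axis=1))  # predicted label, 0-9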
3 News text scraping
import requests
from bs4 import BeautifulSoup
import pandas as pd
def get_html_text(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)
        print(url)
        return url  # return the url itself as a failure sentinel (checked by the caller)
def parse_news_page(html):
    try:
        ilt = []
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find("title").string
        ilt.append(title)
        content = soup.find_all("p")
        for p in content:
            s = p.text.strip()
            s = "".join(s.split("\n"))
            ilt.append(s)
        news = "".join(ilt)
        return news
    except Exception as e:
        print(e)
        return ""  # empty string on parse failure so the caller can skip it
def parse_href_page(html, hrefs):
    soup = BeautifulSoup(html, "html.parser")
    tags = soup.find_all("a")
    for tag in tags:
        href = tag.get("href", "")  # some <a> tags carry no href attribute
        if href.endswith("shtml") and href not in hrefs:
            hrefs.append(href)
    return hrefs
def get_newses(url, newses, labels, count):
    hrefs = []
    html = get_html_text(url)
    parse_href_page(html, hrefs)
    for href in hrefs:
        html = get_html_text(href)
        if html == href:  # fetch failed; skip this link
            continue
        news = parse_news_page(html)
        if not news:  # parse failed; skip
            continue
        newses.append(news)
        labels.append(count)
def main():
    newses = []
    labels = []
    urls = ["http://finance.cnr.cn/", "http://tech.cnr.cn/", "http://food.cnr.cn/",
            "http://health.cnr.cn/", "http://edu.cnr.cn/", "http://travel.cnr.cn/",
            "http://military.cnr.cn/", "http://auto.cnr.cn/", "http://house.cnr.cn/",
            "http://gongyi.cnr.cn/"]
    for count, url in enumerate(urls):  # the list index doubles as the class label
        print(url)
        get_newses(url, newses, labels, count)
    newses = pd.DataFrame({"label": labels, "text": newses})
    newses.to_csv("newses.csv", index=False)


if __name__ == "__main__":
    main()
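Before feeding newses.csv to the classifiers in sections 1 and 2, it is worth a quick sanity check. A minimal sketch (the column names match what main() writes):

import pandas as pd

df = pd.read_csv("newses.csv")
print(df.shape)                          # (number of articles, 2)
print(df["label"].value_counts())        # article count per category, labels 0-9
print(df["text"].str.len().describe())   # rough distribution of article lengths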