代码如下:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
# Root directory of the corpus
data_folder = "E:/news"
# News categories to classify
categories = [
    "军事",
    "经济",
    "文化",
    "科学",
    "娱乐",
]
# Data-loading helper
def load_data(data_folder, categories, train_range, test_range, *, file_ids=None):
    """Read numbered text files per category and split them into train/test sets.

    For every category, files named ``<data_folder>/<category>/<i>.txt`` are
    read as UTF-8. A file whose number ``i`` is in ``train_range`` goes to the
    training set; otherwise, if ``i`` is in ``test_range``, it goes to the test
    set. The category name is stored as the label of each document.

    Args:
        data_folder: Root directory containing one sub-folder per category.
        categories: Iterable of category names (also the sub-folder names).
        train_range: Container of file numbers assigned to the training set.
        test_range: Container of file numbers assigned to the test set.
        file_ids: Iterable of file numbers to scan. Defaults to ``range(1, 26)``
            (files ``1.txt`` .. ``25.txt``), which was previously hard-coded.

    Returns:
        A tuple ``(train_texts, train_labels, test_texts, test_labels)`` of
        lists, ordered by category and then by file number.

    Raises:
        OSError: If a file that belongs to either split cannot be opened
            (e.g. ``FileNotFoundError``).
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    if file_ids is None:
        file_ids = range(1, 26)

    train_texts, train_labels = [], []
    test_texts, test_labels = [], []
    for category in categories:
        folder_path = os.path.join(data_folder, category)
        for i in file_ids:
            # Decide the destination first so that files belonging to
            # neither split are not read at all.
            if i in train_range:
                texts, labels = train_texts, train_labels
            elif i in test_range:
                texts, labels = test_texts, test_labels
            else:
                continue
            file_path = os.path.join(folder_path, f"{i}.txt")
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(category)
    return train_texts, train_labels, test_texts, test_labels
# Load the data: files 1-20 of each category for training, 21-25 for testing
train_range = range(1, 21)
test_range = range(21, 26)
train_texts, train_labels, test_texts, test_labels = load_data(data_folder, categories, train_range, test_range)

# Feature extraction: TF-IDF with the vocabulary capped at 5000 terms
# NOTE(review): the default TfidfVectorizer tokenizer splits on punctuation and
# whitespace only, so unsegmented Chinese text presumably yields phrase-level
# tokens. Consider pre-segmenting the text or using character n-grams
# (analyzer="char") -- verify the impact on accuracy before changing.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Map each category name to its index in `categories`
label_mapping = {category: i for i, category in enumerate(categories)}
y_train = np.array([label_mapping[label] for label in train_labels])
y_test = np.array([label_mapping[label] for label in test_labels])

# Train a one-vs-rest logistic regression model.
# The `multi_class='ovr'` argument of LogisticRegression is deprecated since
# scikit-learn 1.5; wrapping the estimator in OneVsRestClassifier keeps the
# same one-vs-rest scheme without relying on the deprecated parameter.
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='lbfgs'))
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Report accuracy, followed by predicted and true label indices
accuracy = accuracy_score(y_test, y_pred)
print("Logistic回归模型测试集准确率:", accuracy)
print(y_pred)
print(y_test)
结果如下:
测试集正确率达到80%。准确率不算高,初步估计是因为数据太少:这里只选用了125份数据(5个类别各25篇,其中100篇用于训练、25篇用于测试)。采用更多数据,学习效果会更好。
标签:texts,test,labels,label,小课,range,train,Logistic,IDF From: https://blog.csdn.net/2302_76500884/article/details/144476545