本文的做法和按标题给新闻分类差不多:使用朴素贝叶斯分类器对文本进行分类。
#导入必要的包
import random
import sys
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB
import joblib
import re,string
import pandas as pd
import numpy as np
def text_to_words(file_path):
    """Load a labelled CSV and tokenise its text column.

    The file is read with pandas. The text is taken from the column at
    position 3 and the label from the column at position 4.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(sentences_arr, lab_arr)`` tuple. ``sentences_arr`` holds one
        list of tokens per row; ``lab_arr`` holds the label of each row in
        the same order.
    """
    # Raw string: the pattern relies on regex escapes such as \s and \. that
    # are invalid escape sequences inside an ordinary string literal.
    punctuation = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()《》:]+"
    )
    # Read the underlying array once; `.values` can be rebuilt on every access.
    rows = pd.read_csv(file_path).values
    lab_arr = list(rows[:, 4])
    sentences_arr = []
    for text in rows[:, 3]:
        # Tokenise the whole text. The previous `split(' ')[-1]` kept only the
        # last space-separated token of each row and discarded the rest.
        cleaned = punctuation.sub(" ", text.strip())  # punctuation -> space
        sentences_arr.append(cleaned.split(' '))
    return sentences_arr, lab_arr
def load_stopwords(file_path):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace removed from each entry.
    """
    # The context manager closes the handle; the bare open() call leaked it.
    with open(file_path, encoding='UTF-8') as handle:
        return [line.strip() for line in handle]
def get_dict(sentences_arr, stopswords):
    """Count word occurrences and rank the words by frequency.

    Only tokens for which ``str.isalpha()`` is true and that are not listed
    in ``stopswords`` are counted.

    Args:
        sentences_arr: Iterable of token lists.
        stopswords: Collection of words to leave out of the count.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. Words
        with equal counts keep the order in which they were first seen.
    """
    excluded = set(stopswords)  # constant-time membership tests
    counts = {}
    for sentence in sentences_arr:
        for word in sentence:
            # isalpha() is already False for empty and whitespace-only tokens
            if word.isalpha() and word not in excluded:
                # Start from 0: a default of 1 made every count one too high.
                counts[word] = counts.get(word, 0) + 1
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)
def get_feature_words(word_dic, word_num):
    """Pick the leading ``word_num`` words of a ranked vocabulary.

    Args:
        word_dic: Iterable of entries whose first item is the word, e.g.
            ``(word, count)`` pairs already ordered by rank.
        word_num: Maximum number of words to keep.

    Returns:
        A list with the first item of each of the first ``word_num`` entries.
    """
    return [
        entry[0]
        for position, entry in enumerate(word_dic)
        if position < word_num
    ]
# Text features
def get_text_features(train_data_list, test_data_list, feature_words):
    """Turn tokenised texts into binary presence vectors.

    Args:
        train_data_list: Iterable of token collections for the training set.
        test_data_list: Iterable of token collections for the test set.
        feature_words: Ordered words that define the vector positions.

    Returns:
        A ``(train_feature_list, test_feature_list)`` tuple. Each vector has
        one entry per feature word: 1 if the word occurs in the text, else 0.
    """
    def vectorise(tokens):
        present = set(tokens)
        return [int(term in present) for term in feature_words]

    train_feature_list = list(map(vectorise, train_data_list))
    test_feature_list = list(map(vectorise, test_data_list))
    return train_feature_list, test_feature_list
# Tokenised texts and their labels from the training CSV
sentences_arr, lab_arr = text_to_words('../train.csv')
print(sentences_arr[0])
# Load the stop-word list
stopwords = load_stopwords('../stopwords.txt')
# Build the frequency-ranked vocabulary.
# NOTE(review): it is built from all rows, before the train/test split below.
word_dic = get_dict(sentences_arr,stopwords)
# Split the data; test_size=0.1 holds out 10% of the rows for evaluation
train_data_list, test_data_list, train_class_list, test_class_list = model_selection.train_test_split(sentences_arr,lab_arr,test_size=0.1)
# Keep the 1000 top-ranked vocabulary words as feature words
feature_words = get_feature_words(word_dic,1000)
# Convert both splits into feature vectors
train_feature_list,test_feature_list = get_text_features(train_data_list,test_data_list,feature_words)
from sklearn.metrics import accuracy_score,classification_report
# Build the naive Bayes classifier
classifier = MultinomialNB(
    alpha=1.0,         # additive (Laplace) smoothing
    fit_prior=True,    # learn class prior probabilities from the data
    class_prior=None,  # no explicitly supplied priors
)
print(type(train_feature_list))
print(type(train_class_list))
classifier.fit(train_feature_list, train_class_list)  # train
predict = classifier.predict(test_feature_list)  # predict the held-out split
# accuracy_score is documented as (y_true, y_pred); this uses the same
# argument order as the classification_report call below.
test_accuracy = accuracy_score(test_class_list, predict)
print("准确率 accuracy_score: %.4lf"%(test_accuracy))
print("模型评估报告 Classification report for classifier:\n",classification_report(test_class_list, predict))
# Persist the trained model, then reload it for the predictions that follow
joblib.dump(classifier, "NewsClassification.model")
myModel = joblib.load("NewsClassification.model")
def load_sentence(sentence):
    """Split a raw sentence into tokens.

    Each run of whitespace or listed punctuation is replaced by a single
    space, and the result is split on single spaces. Empty strings can
    therefore appear in the output, e.g. for trailing punctuation.

    Args:
        sentence: The text to tokenise.

    Returns:
        The list of tokens.
    """
    # Raw string: the pattern relies on regex escapes such as \s and \. that
    # are invalid escape sequences inside an ordinary string literal.
    pattern = r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()《》:]+"
    cleaned = re.sub(pattern, " ", sentence)  # punctuation -> space
    return cleaned.split(' ')
# Classify one hand-written sentence with the reloaded model
p_data = 'We had a big earthquake here and many houses collapsed'
sentence = load_sentence(p_data)
sentence= [sentence]  # wrap as a one-item batch
print('分词结果:', sentence)
p_words = get_text_features(sentence,sentence,feature_words)  # build the feature vector
res = myModel.predict(p_words[0])
# predict() returns an array with one label per input row; index it instead
# of calling int() on the whole array, which NumPy has deprecated.
print("所属类型:",int(res[0]))
# Predict a label for every row of the test CSV and write them to myAns.csv
cnt = 0
id = []
target = []
myTest = pd.read_csv('../test.csv')
test_rows = myTest.values  # read the array once, not on every iteration
for row in test_rows:
    # Tokenise the whole text (column 3). The previous `split(' ')[-1]` kept
    # only the last space-separated token of each row.
    sentence = [load_sentence(row[3].strip())]
    print('分词结果:', sentence)
    # Only the first returned list is used, so pass an empty second batch
    p_words = get_text_features(sentence, [], feature_words)
    res = myModel.predict(p_words[0])
    # predict() returns an array with one label per row; take its only
    # element rather than calling int() on the array itself.
    label = int(res[0])
    print("所属类型:", label)
    id.append(row[0])
    target.append(label)
    cnt += 1
    if cnt % 1000 == 0:
        print(cnt)  # progress indicator
myAns = pd.DataFrame({'id': id, 'target': target})
myAns.to_csv("myAns.csv", index=False, sep=',')
标签:推特,Natural,sentence,text,list,Kaggle,feature,words,word
From: https://www.cnblogs.com/wljss/p/18129880