1.导包
import pandas as pd
import numpy as np
2. 定义朴素贝叶斯分类器类 NaiveBayes
class NaiveBayes:
def __init__(self):
    # Trained model, filled in by fit()/buildNaiveBayes():
    #   class label -> {'PClass': smoothed prior probability,
    #                   'PFeature': {feature name -> {feature value -> P(value | class)}}}
    self.model = {}
def calEntropy(self, y):
    """Return the Shannon entropy (base 2) of the label series *y*."""
    # Relative frequency of each distinct value in y.
    probs = y.value_counts() / y.size
    # H(y) = -sum(p * log2(p)) over all observed values.
    return -np.inner(probs, np.log2(probs))
def fit(self, xTrain, yTrain = pd.Series()):
if not yTrain.empty:#如果不传,自动选择最后一列作为分类标签
xTrain = pd.concat([xTrain, yTrain], axis=1)
self.model = self.buildNaiveBayes(xTrain)
return self.model
def buildNaiveBayes(self, xTrain):
    """Estimate class priors and per-feature conditional probabilities.

    The LAST column of *xTrain* is treated as the class label.  Every
    probability uses Laplace (add-one) smoothing, so feature values never
    seen within a class still get a non-zero probability.

    Returns:
        dict: {class label: {'PClass': prior,
                             'PFeature': {feature: {value: P(value | class)}}}}
    """
    yTrain = xTrain.iloc[:, -1]
    # Smoothed class priors: (count + 1) / (N + number of classes).
    yTrainCounts = yTrain.value_counts()
    yTrainCounts = yTrainCounts.apply(
        lambda x: (x + 1) / (yTrain.size + yTrainCounts.size))
    retModel = {}
    for nameClass, val in yTrainCounts.items():
        retModel[nameClass] = {'PClass': val, 'PFeature': {}}
    propNamesAll = xTrain.columns[:-1]
    # Every distinct value observed for each feature across the WHOLE
    # training set, so per-class tables cover unseen values too.
    allPropByFeature = {
        nameFeature: list(xTrain[nameFeature].value_counts().index)
        for nameFeature in propNamesAll
    }
    for nameClass, group in xTrain.groupby(xTrain.columns[-1]):
        for nameFeature in propNamesAll:
            propDatas = group[nameFeature]
            # IDIOM: reindex fills values absent from this class with
            # count 0 in one step (the original grew the Series item by
            # item with `propClassSummary[propName] = 0`).
            propClassSummary = propDatas.value_counts().reindex(
                allPropByFeature[nameFeature], fill_value=0)
            Ni = len(allPropByFeature[nameFeature])
            # Laplace smoothing: (count + 1) / (class size + Ni).
            propClassSummary = propClassSummary.apply(
                lambda x: (x + 1) / (propDatas.size + Ni))
            retModel[nameClass]['PFeature'][nameFeature] = dict(
                propClassSummary.items())
    return retModel
def predictBySeries(self, data):
curMaxRate = None
curClassSelect = None
for nameClass, infoModel in self.model.items():
rate = 0
rate += np.log(infoModel['PClass'])
PFeature = infoModel['PFeature']
for nameFeature, val in data.items():
propsRate = PFeature.get(nameFeature)
if not propsRate:
(注:上方 predictBySeries 方法的代码在此处被截断,不完整)
3. 不使用sklearn包编写朴素贝叶斯算法程序,对输入数据进行预测
# Watermelon dataset (categorical features).  Columns follow `labels`
# below; the last column is the class label ('是' = good melon, '否' = not).
dataset = [['青绿','蜷缩','浊响','清晰','凹陷','碍滑','是'],
['乌黑','蜷缩','沉闷','清晰','凹陷','碍滑','是'],
['乌黑','蜷缩','浊响','清晰','凹陷','碍滑','是'],
['青绿','蜷缩','沉闷','清晰','凹陷','碍滑','是'],
['浅白','蜷缩','浊响','清晰','凹陷','碍滑','是'],
['青绿','稍蜷','浊响','清晰','稍凹','软粘','是'],
['乌黑','稍蜷','浊响','稍糊','稍凹','软粘','是'],
['乌黑','稍蜷','浊响','清晰','稍凹','硬滑','是'],
['乌黑','稍蜷','沉闷','稍糊','稍凹','硬滑','否'],
['青绿','硬挺','清脆','清晰','平坦','软粘','否'],
['浅白','硬挺','清脆','模糊','平坦','硬滑','否'],
['浅白','蜷缩','浊响','模糊','平坦','软粘','否'],
['青绿','稍蜷','浊响','稍糊','凹陷','硬滑','否'],
['浅白','稍蜷','沉闷','稍糊','凹陷','硬滑','否'],
['乌黑','稍蜷','浊响','清晰','稍凹','软粘','否'],
['浅白','蜷缩','浊响','模糊','平坦','硬滑','否'],
['青绿','蜷缩','沉闷','稍糊','稍凹','硬滑','否']
]
# Column names: color, root, knock sound, texture, navel, touch, good-melon label.
labels=['色泽','根蒂','敲声','纹理','脐部','触感','好瓜']
dataTrain=pd.DataFrame(dataset,columns=labels)
naiveBayes = NaiveBayes()
# Train on dataTrain; fit() uses the last column ('好瓜') as the class label.
treeData = naiveBayes.fit(dataTrain)
4.使用sklearn包编写朴素贝叶斯算法程序,对输入数据进行预测
# Input dataset: the same watermelon data re-coded as numeric strings
# (see the legend below); the last column is the label.
datasets1=[['0','0','0','0','0','0','1'],
['1','0','1','0','0','0','1'],
['1','0','0','0','0','0','1'],
['0','0','1','0','0','0','1'],
['2','0','0','0','0','0','1'],
['0','1','0','0','1','1','1'],
['1','1','0','1','1','1','1'],
['1','1','0','0','1','2','1'],
['1','1','1','1','1','2','0'],
['0','2','2','0','2','1','0'],
['2','2','2','2','2','2','0'],
['2','0','0','2','2','1','0'],
['0','1','0','1','0','2','0'],
['2','1','1','1','0','2','0'],
['1','1','0','0','1','1','0'],
['2','0','0','2','2','2','0'],
['0','0','1','1','1','2','0']
]
# Coding legend (original value -> numeric code):
# 色泽 (color):   青绿 0, 乌黑 1, 浅白 2
# 根蒂 (root):    蜷缩 0, 稍蜷 1, 硬挺 2
# 敲声 (knock):   浊响 0, 沉闷 1, 清脆 2
# 纹理 (texture): 清晰 0, 稍糊 1, 模糊 2
# 脐部 (navel):   凹陷 0, 稍凹 1, 平坦 2
# 触感 (touch):   碍滑 0, 软粘 1, 硬滑 2
# 好瓜 (label):   是 1, 否 0
# Column names — rebinds the earlier `labels` with the same value.
labels=['色泽','根蒂','敲声','纹理','脐部','触感','好瓜']
5. 将数据集转换为DataFrame并显示
# Sample data
import pandas as pd  # redundant re-import (notebook-cell style), harmless
# Convert the coded dataset into a DataFrame
data1=pd.DataFrame(datasets1,columns=labels)
# Bare expression: displays the DataFrame when run as a notebook cell
data1
6. 将数据转换为NumPy数组
# (Notebook cell) Build float arrays for the sklearn demo below.
import pandas as pd  # redundant re-import kept for cell independence
# One target value per original feature column (see the `data.T` in step 7).
target = np.array([0,1,2,3,4,5,6],dtype='float32')
# BUG FIX: the original converted `dataset`, whose Chinese string values
# cannot be cast to float32 (raises ValueError at runtime); the numerically
# coded `datasets1` defined above is clearly what was intended.
data = np.array(datasets1,dtype='float32')
7.
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
# NOTE(review): `data.T` turns the 17x7 sample matrix into 7 rows (one per
# original column) so its length matches the 7-element `target` — this treats
# each feature column as a "sample", which looks unintended; confirm.
x_train,x_test,y_train,y_test = train_test_split(data.T,target,random_state=1) # split with the default 75/25 ratio
nb_clf = GaussianNB() # instantiate the model
nb_clf.fit(x_train,y_train) # train the model
a=nb_clf.predict(x_test) # predict on the held-out samples
acc_score = nb_clf.score(x_test,y_test) # mean accuracy on the test set
# Bare expressions: display the splits when run as notebook cells
x_train
x_test
8.
from sklearn.model_selection import train_test_split # split the original data into train and test parts
from sklearn.naive_bayes import BernoulliNB # NOTE(review): imported but never used in the visible code — confirm a fit/predict step follows
X=data1.iloc[:,:-1] # feature columns
y=data1.iloc[:,-1] # label column ('好瓜')
# X_train/y_train: training samples and their labels; X_test/y_test: test samples and their labels
# test_size=3 holds out exactly 3 samples; random_state=None gives a different split on every run
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=3,random_state=None)
From: https://www.cnblogs.com/1759945451qq/p/16885625.html