各函数包与函数模块之间的所属关系如图:
注意,所有函数包以及Notebook文件都是所属父文件夹的同级别文件,只有这样才能顺利调用所需函数
各函数包如下:
kNN
import numpy as np
from math import sqrt
from collections import Counter


class KNNClassifier:
    """k-nearest-neighbors classifier: majority vote over the k closest training samples."""

    def __init__(self, k):
        """Store the neighbor count k (>= 1); training data is supplied later via fit()."""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None  # training feature matrix, set by fit()
        self._y_train = None  # training labels, set by fit()

    def fit(self, X_train, y_train):
        """Memorize the training set (X_train, y_train) and return self."""
        assert X_train.shape[0] == y_train.shape[0],\
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0],\
            "the size of X_train must be at least k ."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of X_predict; returns a numpy array of labels."""
        assert self._X_train is not None and self._y_train is not None,\
            " must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1],\
            " the feature number of X_predict must be equal to X_train"

        return np.array([self._predict(sample) for sample in X_predict])

    def _predict(self, x):
        """Predict the label of a single feature vector x."""
        assert x.shape[0] == self._X_train.shape[1],\
            "the feature number of x must be equal to x_train"
        # Euclidean distance from x to every stored training sample.
        distances = []
        for row in self._X_train:
            distances.append(sqrt(np.sum((row - x) ** 2)))
        order = np.argsort(distances)

        # Majority vote among the labels of the k nearest samples.
        top_labels = [self._y_train[idx] for idx in order[:self.k]]
        tally = Counter(top_labels)
        return tally.most_common(1)[0][0]

    def __repr__(self):
        return "KNN(k=%d)" % self.k
KNN_function
# KNN_classify()
import numpy as np
from math import sqrt
from collections import Counter


def KNN_classify(k, X_trian, y_trian, x):
    """Classify sample x by majority vote among its k nearest neighbors in X_trian.

    NOTE(review): parameter names keep the original `trian` spelling for
    backward compatibility with keyword callers.
    """
    assert 1 <= k <= X_trian.shape[0], ' k must be valid'
    assert X_trian.shape[0] == y_trian.shape[0], \
        'the size of X_trian must equal to the size of y_trian '
    assert X_trian.shape[1] == x.shape[0], \
        "the feature number of must be equal to X_trian "

    # BUG FIX: the original computed sqrt(np.sum(diff) ** 2) == |sum(diff)|,
    # which is not the Euclidean distance; the square must be applied
    # element-wise *inside* the sum.
    distances = [sqrt(np.sum((x_trian - x) ** 2)) for x_trian in X_trian]
    nearest = np.argsort(distances)

    # Majority vote among the k nearest labels.
    topk_y = [y_trian[i] for i in nearest[:k]]
    votes = Counter(topk_y)
    return votes.most_common(1)[0][0]


print(" KNN_classify 已加载.")
playML:
kNN.py
import numpy as np
from math import sqrt
from collections import Counter
from .metrics import accuracy_score  # relative import: the file must be imported as part of the playML package


class KNNClassifier:
    """k-nearest-neighbors classifier with accuracy scoring (playML package version)."""

    def __init__(self, k):
        """Store the neighbor count k (>= 1); training data is supplied later via fit()."""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None  # training feature matrix, set by fit()
        self._y_train = None  # training labels, set by fit()

    def fit(self, X_train, y_train):
        """Memorize the training set (X_train, y_train) and return self."""
        assert X_train.shape[0] == y_train.shape[0],\
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0],\
            "the size of X_train must be at least k ."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of X_predict; returns a numpy array of labels."""
        assert self._X_train is not None and self._y_train is not None,\
            " must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1],\
            " the feature number of X_predict must be equal to X_train"

        return np.array([self._predict(sample) for sample in X_predict])

    def _predict(self, x):
        """Predict the label of a single feature vector x."""
        assert x.shape[0] == self._X_train.shape[1],\
            "the feature number of x must be equal to x_train"
        # Euclidean distance from x to every stored training sample.
        distances = []
        for row in self._X_train:
            distances.append(sqrt(np.sum((row - x) ** 2)))
        order = np.argsort(distances)

        # Majority vote among the labels of the k nearest samples.
        top_labels = [self._y_train[idx] for idx in order[:self.k]]
        tally = Counter(top_labels)
        return tally.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Accuracy of the classifier on the test set (X_test, y_test)."""
        return accuracy_score(y_test, self.predict(X_test))

    def __repr__(self):
        return "KNN(k=%d)" % self.k
LinearRegression.py
import numpy as np
from .metrics import r2_score  # relative import: the file must be imported as part of its package


class LinearRegression:
    """Multivariate linear regression trained by the normal equation, batch GD, or SGD."""

    def __init__(self):
        """Initialize the Linear Regression model.

        BUG FIX: the method was misspelled ``__int__`` so the attributes were
        never initialized and predict() raised AttributeError instead of its
        intended assertion message.
        """
        self.coef_ = None          # feature coefficients theta[1:]
        self.interception_ = None  # intercept theta[0]; name kept for caller compatibility
        self._theta = None         # full parameter vector [intercept, coef...]

    def fit_normal(self, X_train, y_train):
        """Fit with the closed-form normal equation theta = (X^T X)^-1 X^T y."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_gd(self, X_train, y_train, eta=0.001, n_iters=1e4):
        """Fit with batch gradient descent (learning rate eta, at most n_iters steps)."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must equal to the size of y_train"

        def J(theta, X_b, y):
            # MSE cost; report inf on numerical blow-up so descent just continues.
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
            except Exception:
                return float('inf')

        def dJ(theta, X_b, y):
            # Vectorized gradient of the MSE cost.
            return X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            # Descend until the cost change falls below epsilon or n_iters is hit.
            theta = initial_theta
            i_iters = 0
            while i_iters < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                i_iters += 1
            return theta

        # BUG FIX: the original referenced the undefined name `X_trian` here
        # (NameError) and silently dropped the caller's n_iters argument.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
        """Fit with stochastic gradient descent.

        n_iters is the number of full passes over the shuffled training set;
        the learning rate decays as t0 / (t + t1).
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must equal to the size of y_train"
        assert n_iters >= 1, \
            "the size of n_iters must >= 1"

        def dJ_sgd(theta, X_b_i, y_i):
            # Gradient contribution of a single sample.
            return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2

        def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
            def learning_rate(t):
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)
            for cur_iter in range(n_iters):
                # Reshuffle every epoch so each pass visits samples in a new order.
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    theta = theta - learning_rate(cur_iter * m + i) * gradient
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Return the predicted values for each row of X_predict."""
        assert self.interception_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """R^2 of the model on the test set (X_test, y_test)."""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
LogistcRegression.py
import numpy as np
from .metrics import accuracy_score  # relative import: the file must be imported as part of its package


class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent."""

    def __init__(self):
        """Initialize the LogisticRegression model.

        BUG FIX: the method was misspelled ``__int__`` so the attributes were
        never initialized and predict() raised AttributeError instead of its
        intended assertion message.
        """
        self.coef_ = None          # feature coefficients theta[1:]
        self.interception_ = None  # intercept theta[0]; name kept for caller compatibility
        self._theta = None         # full parameter vector [intercept, coef...]

    def _sigmoid(self, t):
        """Logistic function 1 / (1 + e^-t), applied element-wise."""
        return 1 / (1 + np.exp(-t))

    def fit(self, X_train, y_train, eta=0.001, n_iters=1e4):
        """Fit with batch gradient descent (learning rate eta, at most n_iters steps)."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must equal to the size of y_train"

        def J(theta, X_b, y):
            # Mean cross-entropy loss; inf on numerical blow-up (log of 0).
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return - np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
            except Exception:
                return float('inf')

        def dJ(theta, X_b, y):
            # BUG FIX: the gradient of the cross-entropy J above is
            # X_b^T (sigmoid(X_b theta) - y) / m — the stray factor 2 was a
            # leftover from the linear-regression MSE gradient.
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            # Descend until the cost change falls below epsilon or n_iters is hit.
            theta = initial_theta
            i_iters = 0
            while i_iters < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                i_iters += 1
            return theta

        # BUG FIX: the caller's n_iters was silently dropped in the original.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict_proba(self, X_predict):
        """Return the predicted probability of class 1 for each row of X_predict."""
        assert self.interception_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return self._sigmoid(X_b.dot(self._theta))

    def predict(self, X_predict):
        """Return the predicted class labels (0/1, threshold 0.5) for X_predict."""
        assert self.interception_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        proba = self.predict_proba(X_predict)
        return np.array(proba >= 0.5, dtype='int')

    def score(self, X_test, y_test):
        """Accuracy of the model on the test set (X_test, y_test)."""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "LogisticRegression()"
metrics.py
import numpy as np
from math import sqrt

# NOTE(review): the first group of functions keeps the original misspelled
# parameter name `y_ture` for backward compatibility with keyword callers.


def accuracy_score(y_ture, y_predict):
    """Fraction of positions where y_ture and y_predict agree."""
    # BUG FIX: the message used to be a bare string statement on its own line,
    # so the assert fired without any message.
    assert y_ture.shape[0] == y_predict.shape[0], \
        "the size of y_ture must be equal to the size of y_predict"

    return sum(y_ture == y_predict) / len(y_ture)


def mean_squared_error(y_ture, y_predict):
    """Mean squared error between y_ture and y_predict."""
    assert len(y_ture) == len(y_predict), \
        "the size of y_ture must be equal to the size of y_predict "
    return np.sum((y_ture - y_predict) ** 2) / len(y_ture)


def root_mean_squared_error(y_ture, y_predict):
    """Root mean squared error between y_ture and y_predict."""
    assert len(y_ture) == len(y_predict), \
        "the size of y_ture must be equal to the size of y_predict "
    return sqrt(mean_squared_error(y_ture, y_predict))


def mean_absolute_error(y_ture, y_predict):
    """Mean absolute error between y_ture and y_predict."""
    assert len(y_ture) == len(y_predict), \
        "the size of y_ture must be equal to the size of y_predict "
    return np.sum(np.absolute(y_ture - y_predict)) / len(y_predict)


def r2_score(y_ture, y_predict):
    """R-squared: 1 - MSE / Var(y_ture)."""
    return 1 - mean_squared_error(y_ture, y_predict) / np.var(y_ture)


def TN(y_ture, y_predict):
    """Count of true negatives (both 0)."""
    assert len(y_ture) == len(y_predict)
    return np.sum((y_ture == 0) & (y_predict == 0))


def FP(y_ture, y_predict):
    """Count of false positives (actual 0, predicted 1)."""
    assert len(y_ture) == len(y_predict)
    return np.sum((y_ture == 0) & (y_predict == 1))


def FN(y_ture, y_predict):
    """Count of false negatives (actual 1, predicted 0)."""
    assert len(y_ture) == len(y_predict)
    return np.sum((y_ture == 1) & (y_predict == 0))


def TP(y_ture, y_predict):
    """Count of true positives (both 1)."""
    assert len(y_ture) == len(y_predict)
    return np.sum((y_ture == 1) & (y_predict == 1))


def confusion_matrix(y_true, y_predict):
    """2x2 confusion matrix [[TN, FP], [FN, TP]]."""
    return np.array([
        [TN(y_true, y_predict), FP(y_true, y_predict)],
        [FN(y_true, y_predict), TP(y_true, y_predict)]
    ])


def precision_score(y_true, y_predict):
    """Precision TP / (TP + FP); 0.0 when nothing was predicted positive."""
    # BUG FIX: int() is required — dividing numpy integer scalars 0/0 yields
    # nan with a warning instead of raising, so the fallback never fired.
    tp = int(TP(y_true, y_predict))
    fp = int(FP(y_true, y_predict))
    try:
        return tp / (tp + fp)
    except ZeroDivisionError:
        return 0.0


def recall_score(y_true, y_predict):
    """Recall TP / (TP + FN); 0.0 when there are no actual positives."""
    tp = int(TP(y_true, y_predict))
    fn = int(FN(y_true, y_predict))
    try:
        return tp / (tp + fn)
    except ZeroDivisionError:
        return 0.0


def f1_score(precision, recall):
    """Harmonic mean of precision and recall; 0.0 when both are 0."""
    try:
        return 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        return 0.0


def TPR(y_true, y_predict):
    """True positive rate TP / (TP + FN) (same as recall)."""
    tp = int(TP(y_true, y_predict))
    fn = int(FN(y_true, y_predict))
    try:
        return tp / (tp + fn)
    except ZeroDivisionError:
        return 0.0


def FPR(y_true, y_predict):
    """False positive rate FP / (FP + TN).

    BUG FIX: the original computed TP / (TP + TN), which is not the FPR.
    """
    fp = int(FP(y_true, y_predict))
    tn = int(TN(y_true, y_predict))
    try:
        return fp / (fp + tn)
    except ZeroDivisionError:
        return 0.0
model_selection.py
import numpy as np


def train_test_split(X, y, test_radio=0.2, seed=None):
    """Shuffle X and y together and split off a test_radio fraction as the test set.

    Returns (X_train, X_test, y_train, y_test). Pass seed for a reproducible
    split. (Parameter name `test_radio` kept for caller compatibility.)
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_radio <= 1.0, \
        "test_radio must be valid"

    # BUG FIX: `if seed:` silently ignored seed=0; test against None instead.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(len(X))

    test_size = int(test_radio * len(X))
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test
PCA.py
import numpy as np


class PCA:
    """Principal component analysis via gradient ascent on the projected variance."""

    def __init__(self, n_components):
        """Store the number of components to extract (must be >= 1)."""
        assert n_components >= 1, "n_components must be valid"
        self.n_components = n_components
        # BUG FIX: was `self.components`, while every other method reads
        # `self.components_` — the attribute was never the one initialized.
        self.components_ = None

    def fit(self, X, eta=0.01, n_iters=1e4):
        """Find the first n_components principal components of X by gradient ascent."""
        assert self.n_components <= X.shape[1], \
            "n_components must not be greater than feature number of X"

        def demean(X):
            # Center each feature at zero mean.
            return X - np.mean(X, axis=0)

        def f(w, X):
            # Objective: variance of X projected onto direction w.
            return np.sum((X.dot(w) ** 2)) / len(X)

        def df(w, X):
            # Gradient of the projected variance with respect to w.
            return X.T.dot(X.dot(w)) * 2.0 / len(X)

        def direction(w):
            return w / np.linalg.norm(w)

        def first_component(X, initial_w, eta, n_iters=1e4, epsilon=1e-8):
            # Gradient ascent; re-normalize after every step so w stays a unit
            # direction vector.
            w = direction(initial_w)
            i_iters = 0
            while i_iters < n_iters:
                gradient = df(w, X)
                last_w = w
                w = direction(w + eta * gradient)
                if abs(f(w, X) - f(last_w, X)) < epsilon:
                    break
                i_iters += 1
            return w

        X_pca = demean(X)
        self.components_ = np.empty(shape=(self.n_components, X.shape[1]))
        for i in range(self.n_components):
            # Random (non-zero) starting direction for the ascent.
            initial_w = np.random.random(X_pca.shape[1])
            w = first_component(X_pca, initial_w, eta, n_iters)
            self.components_[i, :] = w

            # Remove the component just found before searching for the next one.
            X_pca = X_pca - X_pca.dot(w).reshape(-1, 1) * w

        return self

    def transform(self, X):
        """Project X onto the fitted principal components."""
        assert X.shape[1] == self.components_.shape[1]

        return X.dot(self.components_.T)

    def inverse_transform(self, X):
        """Map component-space data X back to the original feature space."""
        assert X.shape[1] == self.components_.shape[0]

        return X.dot(self.components_)

    def __repr__(self):
        return "PCA(n_components=%d)" % self.n_components
preprocessing.py
import numpy as np


class StandardScaler:
    """Per-feature standardization: subtract the mean and divide by the std."""

    def __init__(self):
        """Initialize the scaler.

        BUG FIX: the method was misspelled ``__int__`` and never ran.
        """
        self.mean_ = None   # per-column means, set by fit()
        self.scale_ = None  # per-column standard deviations, set by fit()

    def fit(self, X):
        """Learn the per-feature mean and standard deviation of X."""
        assert X.ndim == 2, "The dimension of X must be 2"

        # BUG FIX: np.array(<generator>) builds a useless 0-d object array;
        # the per-column statistics must be computed over axis 0.
        self.mean_ = np.mean(X, axis=0)
        self.scale_ = np.std(X, axis=0)

        return self

    def transform(self, X):
        """Return X standardized column-by-column with the fitted mean/scale."""
        assert X.ndim == 2, "The dimension of X must be 2"
        assert self.mean_ is not None and self.scale_ is not None,\
            "must fit before transform!"
        assert X.shape[1] == len(self.mean_), \
            "The feature number of X must be equal to mean_ and std_"

        resX = np.empty(shape=X.shape, dtype=float)
        for col in range(X.shape[1]):
            resX[:, col] = (X[:, col] - self.mean_[col]) / self.scale_[col]
        return resX
SimpleLinearRegression.py
import numpy as np
from .metrics import r2_score  # relative import: the file must be imported as part of its package


class SimpleLinearRegression1:
    """Single-feature linear regression, fitted with an explicit Python loop."""

    def __init__(self):
        """Initialize the Simple Linear Regression model.

        BUG FIX: the method was misspelled ``__int__`` so a_/b_ were never
        initialized and predict() raised AttributeError instead of its
        intended assertion message.
        """
        self.a_ = None  # slope
        self.b_ = None  # intercept

    def fit(self, x_train, y_train):
        """Fit y = a*x + b by least squares over the 1-D training data."""
        assert x_train.ndim == 1, \
            " Simple Linear Regression can only solve single feature training data"
        assert len(x_train) == len(y_train), \
            "the size of x_train must be equal to the size of y_train"

        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)

        # Accumulate covariance (num) and variance (d) term by term.
        num = 0.0
        d = 0.0
        for x, y in zip(x_train, y_train):
            num += (x - x_mean) * (y - y_mean)
            d += (x - x_mean) ** 2

        self.a_ = num / d
        self.b_ = y_mean - self.a_ * x_mean

        return self

    def predict(self, x_predict):
        """Return predictions for each value in the 1-D array x_predict."""
        assert x_predict.ndim == 1, \
            "Simple Linear Regression can only solve single feature training data"
        assert self.a_ is not None and self.b_ is not None, \
            "must fit before predict!"

        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x_single):
        """Predict the value for a single input x_single."""
        return self.a_ * x_single + self.b_

    def score(self, x_test, y_test):
        """R^2 on the test set (added for parity with SimpleLinearRegression2)."""
        return r2_score(y_test, self.predict(x_test))

    def __repr__(self):
        return "Simple Linear Regression1()"


class SimpleLinearRegression2:
    """Single-feature linear regression, fitted with vectorized dot products."""

    def __init__(self):
        """Initialize the Simple Linear Regression model (see BUG FIX note above
        on the misspelled ``__int__``; the same defect existed here)."""
        self.a_ = None  # slope
        self.b_ = None  # intercept

    def fit(self, x_train, y_train):
        """Fit y = a*x + b by least squares using vectorized operations."""
        assert x_train.ndim == 1, \
            " Simple Linear Regression can only solve single feature training data"
        assert len(x_train) == len(y_train), \
            "the size of x_train must be equal to the size of y_train"

        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)

        num = (x_train - x_mean).dot(y_train - y_mean)
        d = (x_train - x_mean).dot(x_train - x_mean)
        self.a_ = num / d
        self.b_ = y_mean - self.a_ * x_mean

        return self

    def predict(self, x_predict):
        """Return predictions for each value in the 1-D array x_predict."""
        assert x_predict.ndim == 1, \
            "Simple Linear Regression can only solve single feature training data"
        assert self.a_ is not None and self.b_ is not None, \
            "must fit before predict!"

        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x_single):
        """Predict the value for a single input x_single."""
        return self.a_ * x_single + self.b_

    def score(self, x_test, y_test):
        """R^2 of the model on the test set (x_test, y_test)."""
        y_predict = self.predict(x_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "Simple Linear Regression2()"
标签:封装,函数库,predict,self,return,train,theta,Pycharm,def From: https://www.cnblogs.com/Cai-Gbro/p/16839885.html