1. t-Tests
1.1 One-Sample t-Test
from scipy import stats
import numpy as np

# Prepare the data
sample = np.array([1, 2, 3, 4, 5])  # sample data
popmean = 3  # hypothesized population mean

# Run the one-sample t-test
t_stat, p_value = stats.ttest_1samp(sample, popmean)

# Print the results
print("t-statistic:", t_stat)
print("p-value:", p_value)

# Interpret the result from the p-value
if p_value < 0.05:
    print("The sample mean is significantly different from the population mean.")
else:
    print("There is no significant difference between the sample mean and the population mean.")
t-statistic: 0.0
p-value: 1.0
There is no significant difference between the sample mean and the population mean.
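ttest_1samp runs a two-sided test by default. If the hypothesis is directional, SciPy 1.6+ also accepts an alternative keyword; a minimal sketch, assuming that SciPy version:

# One-sided variant: H1 is that the population mean is greater than popmean
t_stat, p_value = stats.ttest_1samp(sample, popmean, alternative='greater')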
1.2 Two Independent-Samples t-Test
from scipy import stats
import numpy as np

# Prepare the data
sample1 = np.array([1, 2, 3, 4, 5])
sample2 = np.array([2, 3, 4, 5, 6])

# Run the two-sample t-test
t_stat, p_value = stats.ttest_ind(sample1, sample2)

# Print the results
print("t-statistic:", t_stat)
print("p-value:", p_value)

# Interpret the result from the p-value
if p_value < 0.05:
    print("There is a significant difference between the two samples.")
else:
    print("There is no significant difference between the two samples.")
1.3 Paired-Samples t-Test
from scipy import stats
import numpy as np

# Prepare the data
sample1 = np.array([1, 2, 3, 4, 5])  # first measurement
sample2 = np.array([1.1, 2.1, 2.9, 4.1, 5.1])  # second measurement

# Run the paired-samples t-test
t_stat, p_value = stats.ttest_rel(sample1, sample2)

# Print the results
print("t-statistic:", t_stat)
print("p-value:", p_value)

# Interpret the result from the p-value
if p_value < 0.05:
    print("There is a significant difference between the two related samples.")
else:
    print("There is no significant difference between the two related samples.")
t-statistic: -1.4999999999999971
p-value: 0.20800000000000057
There is no significant difference between the two related samples.
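The paired t-test assumes the pairwise differences are approximately normal. For a quick small-sample screening, the Shapiro-Wilk test from the same scipy.stats module can be applied to the differences; a minimal sketch:

# Check normality of the paired differences (Shapiro-Wilk)
diff = sample2 - sample1
shapiro_stat, shapiro_p = stats.shapiro(diff)
print("Shapiro-Wilk p-value:", shapiro_p)  # p > 0.05 suggests no evidence against normality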
2. Analysis of Variance (ANOVA)
2.1 One-Way ANOVA
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Suppose we have three groups of sample data
data = {
    'group': ['group1']*5 + ['group2']*5 + ['group3']*5,
    'value': [85, 90, 88, 92, 87] + [78, 82, 84, 79, 80] + [92, 94, 96, 91, 95]
}
df = pd.DataFrame(data)

# Build the model with the ols function
model = ols('value ~ group', data=df).fit()

# Run the analysis of variance
anova_results = anova_lm(model, typ=2)

# Print the results
print(anova_results)
              sum_sq    df          F    PR(>F)
group     428.133333   2.0  36.908046  0.000007
Residual   69.600000  12.0        NaN       NaN
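The same one-way ANOVA can be cross-checked with scipy.stats.f_oneway, which takes the raw group arrays directly; a minimal sketch using the three groups above:

from scipy.stats import f_oneway

f_stat, p_value = f_oneway([85, 90, 88, 92, 87],
                           [78, 82, 84, 79, 80],
                           [92, 94, 96, 91, 95])
print("F-statistic:", f_stat, "p-value:", p_value)  # should match the anova_lm table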
3. Chi-Square Test
from scipy.stats import chi2_contingency
import pandas as pd

# Suppose we have a 2x2 contingency table of observed frequencies for two
# categorical variables, e.g.:
#                      Var 2 positive | Var 2 negative
# Var 1 positive             100      |       50
# Var 1 negative              80      |       70

# Create the 2x2 contingency table
observed = [[100, 50], [80, 70]]

# Run the chi-square test
chi2, p, dof, expected = chi2_contingency(observed)

# Print the results
print(f"Chi-squared: {chi2:.2f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:")
print(pd.DataFrame(expected, columns=['Variable 2 - Positive', 'Variable 2 - Negative'],
                   index=['Variable 1 - Positive', 'Variable 1 - Negative']))
Chi-squared: 5.01
P-value: 0.0251
Degrees of freedom: 1
Expected frequencies:
                       Variable 2 - Positive  Variable 2 - Negative
Variable 1 - Positive                   90.0                   60.0
Variable 1 - Negative                   90.0                   60.0
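Note that for 2x2 tables chi2_contingency applies Yates' continuity correction by default, which is why the statistic above is slightly smaller than the uncorrected value. The correction can be disabled; a minimal sketch:

# Uncorrected chi-square statistic (no Yates' continuity correction)
chi2_raw, p_raw, _, _ = chi2_contingency(observed, correction=False)
print(f"Uncorrected chi-squared: {chi2_raw:.2f}, p-value: {p_raw:.4f}")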
4. Correlation Analysis
4.1 Pearson Correlation
Using pandas
import pandas as pd

# Suppose we have data on two variables
data = {
    'Variable1': [1, 2, 3, 4, 5],
    'Variable2': [5, 4, 3, 2, 1]
}

# Put the data into a DataFrame
df = pd.DataFrame(data)

# Compute the Pearson correlation coefficient
correlation = df['Variable1'].corr(df['Variable2'], method='pearson')
print(f"Pearson correlation coefficient: {correlation}")
Pearson correlation coefficient: -0.9999999999999999
Using SciPy
from scipy.stats import pearsonr

# Suppose we have data on two variables
variable1 = [1, 2, 3, 4, 5]
variable2 = [5, 4, 3, 2, 1]

# Compute the Pearson correlation coefficient and p-value
correlation, p_value = pearsonr(variable1, variable2)
print(f"Pearson correlation coefficient: {correlation}")
print(f"P-value: {p_value}")
Pearson correlation coefficient: -1.0
P-value: 0.0
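pearsonr measures linear association and is sensitive to outliers. When the relationship is monotonic but not necessarily linear, Spearman's rank correlation is a common alternative; a minimal sketch:

from scipy.stats import spearmanr

rho, p_value = spearmanr(variable1, variable2)
print(f"Spearman correlation coefficient: {rho}")
print(f"P-value: {p_value}")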
4.2 Partial Correlation Analysis
import pandas as pd
import pingouin as pg

# Create an example dataset
data = pd.DataFrame({
    'currentGrade': [82, 88, 75, 74, 93, 97, 83, 90, 90, 80],
    'hours': [4, 3, 6, 5, 4, 5, 8, 7, 4, 6],
    'examScore': [88, 85, 76, 70, 92, 94, 89, 85, 90, 93],
})

# Compute the partial correlation, controlling for 'currentGrade'
partial_corr_result = pg.partial_corr(data=data, x='hours', y='examScore', covar='currentGrade')

# Print the results
print(partial_corr_result)
          n         r          CI95%     p-val
pearson  10  0.190626  [-0.54, 0.76]  0.623228
5. Linear Regression
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Generate synthetic data
np.random.seed(0)  # for reproducibility
X = 2 * np.random.rand(100, 1)  # 100 samples, 1 feature each
y = 4 + 3 * X + np.random.randn(100, 1)  # linear relationship y = 4 + 3x + noise

# Put the data into a DataFrame
df = pd.DataFrame(X, columns=['Feature'])
df['Target'] = y

# Create the linear regression model instance
model = LinearRegression()

# Prepare the feature matrix X and target variable Y
X = df[['Feature']]  # the feature matrix must be two-dimensional
Y = df['Target']     # the target variable can be one-dimensional

# Fit the model
model.fit(X, Y)

# Make predictions
Y_pred = model.predict(X)

# Visualize the results
plt.scatter(X, Y, color='blue', label='Actual data')       # actual data points
plt.plot(X, Y_pred, color='red', label='Regression line')  # fitted regression line
plt.title('Linear Regression')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.show()

# Inspect the model parameters
print(f"Intercept: {model.intercept_}")
print(f"Coefficient: {model.coef_[0]}")

# Evaluate model performance: mean squared error and R^2 score
mse = mean_squared_error(Y, Y_pred)
r2 = r2_score(Y, Y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
Intercept: 4.222151077447232
Coefficient: 2.968467510701018
Mean Squared Error: 0.9924386487246482
R^2 Score: 0.7469629925504755
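scikit-learn's LinearRegression reports only point estimates. If standard errors, t-statistics, and confidence intervals are also needed, the same fit can be done with statsmodels (already used in the ANOVA section above); a minimal sketch:

import statsmodels.api as sm

X_sm = sm.add_constant(df[['Feature']])       # add an explicit intercept column
ols_model = sm.OLS(df['Target'], X_sm).fit()
print(ols_model.summary())                    # coefficients with std errors, p-values, CIs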
6. Logistic Regression
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Create the data
data = {
    'Feature1': [1, 2, 2, 3, 4, 5, 5, 6, 7],
    'Feature2': [1, 2, 3, 4, 2, 5, 6, 7, 8],
    'Target': [0, 0, 0, 0, 1, 1, 1, 1, 1]
}

# Put the data into a DataFrame
df = pd.DataFrame(data)

# Inspect the data
print("Data:")
print(df)

# Prepare the feature matrix X and target variable Y
X = df[['Feature1', 'Feature2']]  # the feature matrix must be two-dimensional
Y = df['Target']                  # the target variable can be one-dimensional

# Split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create the logistic regression model instance
model = LogisticRegression()

# Fit the model
model.fit(X_train, Y_train)

# Make predictions
Y_pred = model.predict(X_test)

# Compute the accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

# Print the confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Print the classification report
class_report = classification_report(Y_test, Y_pred)
print("Classification Report:")
print(class_report)
# Visualize the decision boundary (only applicable to 2-D feature spaces)
if X.shape[1] == 2:
    x_min, x_max = X['Feature1'].min() - 1, X['Feature1'].max() + 1
    y_min, y_max = X['Feature2'].min() - 1, X['Feature2'].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    # Wrap the grid points in a DataFrame with the training column names;
    # passing a bare array here triggers sklearn's "X does not have valid
    # feature names" warning
    grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=['Feature1', 'Feature2'])
    Z = model.predict(grid).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.8)
    plt.scatter(X['Feature1'], X['Feature2'], c=Y, edgecolors='k', s=20)
    plt.xlabel('Feature1')
    plt.ylabel('Feature2')
    plt.title('Logistic Regression Decision Boundary')
    plt.show()
Data:
Feature1 Feature2 Target
0 1 1 0
1 2 2 0
2 2 3 0
3 3 4 0
4 4 2 1
5 5 5 1
6 5 6 1
7 6 7 1
8 7 8 1
Accuracy: 1.0
Confusion Matrix:
[[1 0]
[0 1]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2
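Beyond hard class labels, LogisticRegression exposes class membership probabilities via predict_proba, which are often more informative for borderline cases; a minimal sketch using the fitted model above:

# Predicted probabilities: one row per sample, columns are [P(class 0), P(class 1)]
probs = model.predict_proba(X_test)
print(probs)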
7. Survival Analysis
# Import the required libraries
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
from lifelines.utils import median_survival_times

# Create example data
data = {
    'PatientID': range(1, 11),
    'Time': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],  # survival times
    'Event': [1, 0, 1, 1, 0, 1, 0, 1, 0, 1]  # whether the event occurred (1 = event, 0 = censored)
}

# Put the data into a DataFrame
df = pd.DataFrame(data)

# Initialize the Kaplan-Meier survival function estimator
kmf = KaplanMeierFitter()

# Fit the survival model to the data
kmf.fit(df['Time'], event_observed=df['Event'])

# Plot the survival curve
kmf.plot_survival_function()

median_ = kmf.median_survival_time_
median_confidence_interval_ = median_survival_times(kmf.confidence_interval_)
print(f"Median survival time: {median_}, with 95% confidence interval: {median_confidence_interval_}")

# Run the log-rank test
# Suppose we have two groups; here we simply split the data in half for demonstration
group_1 = df[:5]
group_2 = df[5:]

# Fit survival curves for the two groups
kmf_1 = KaplanMeierFitter().fit(group_1['Time'], event_observed=group_1['Event'])
kmf_2 = KaplanMeierFitter().fit(group_2['Time'], event_observed=group_2['Event'])

# Run the log-rank test
results = logrank_test(group_1['Time'], group_2['Time'], event_observed_A=group_1['Event'], event_observed_B=group_2['Event'])
print("Log-rank test results:")
print(results)

# Display the plot
plt.show()
Median survival time: 40.0, with 95% confidence interval:
     KM_estimate_lower_0.95  KM_estimate_upper_0.95
0.5                     5.0                    50.0
Log-rank test results:
<lifelines.StatisticalResult: logrank_test>
               t_0 = -1
 null_distribution = chi squared
degrees_of_freedom = 1
         test_name = logrank_test
---
 test_statistic    p  -log2(p)
           4.91 0.03      5.23
8. Principal Component Analysis
8.1 Principal Component Analysis
# Import the required libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create the PCA object, reducing to 2 dimensions
pca = PCA(n_components=2)

# Run PCA on the dataset
X_pca = pca.fit_transform(X_scaled)

# Print the reduced data
print("Data after dimensionality reduction:")
print(X_pca)

# Visualize the results
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', marker='o', edgecolor='k')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Visualization')
plt.colorbar(label='Class')
plt.show()
Data after dimensionality reduction:
[[-2.26470281  0.4800266 ]
 [-2.08096115 -0.67413356]
 [-2.36422905 -0.34190802]
 ...
 [ 1.5211705   0.26906914]
 [ 1.37278779  1.01125442]
 [ 0.96065603 -0.02433167]]
(150 rows in total; full array output truncated here)
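The PCA object also records how much of the total variance each component captures, which is the usual way to justify the choice of n_components; a minimal sketch using the fitted pca above:

# Proportion of total variance captured by each principal component
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Cumulative:", pca.explained_variance_ratio_.cumsum())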
8.2 Factor Analysis
import pandas as pd
from sklearn import datasets
from factor_analyzer import FactorAnalyzer
from sklearn.preprocessing import StandardScaler

# Load the iris dataset
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_data, columns=iris.feature_names)

# Initialize the factor analysis object; here we assume 2 factors are to be extracted
fa = FactorAnalyzer(n_factors=2, rotation='varimax')

# Fit the factor analysis model
fa.fit(df_scaled)

# Communalities: the proportion of each variable's variance explained by the factors
communalities = fa.get_communalities()
print("Communalities:\n", communalities)

# Factor loadings matrix
loadings = fa.loadings_
print("\nFactor loadings:\n", loadings)

# Total variance explained
total_variance = fa.get_factor_variance()
print("\nTotal variance explained (variance, proportion, cumulative):\n", total_variance)

# Compute factor scores
factor_scores = fa.transform(df_scaled)
factor_scores_df = pd.DataFrame(factor_scores, columns=[f'Factor{i+1}' for i in range(fa.n_factors)])
print("\nFactor scores:\n", factor_scores_df.head())
Communalities:
 [0.81249138 0.99520493 1.01060002 0.90338341]

Factor loadings:
 [[ 0.90120548  0.01789008]
 [-0.15041281  0.98619517]
 [ 0.96415345 -0.28461929]
 [ 0.92140124 -0.23324486]]

Total variance explained (variance, proportion, cumulative):
 (array([2.61336746, 1.10831228]), array([0.65334187, 0.27707807]), array([0.65334187, 0.93041994]))

Factor scores:
     Factor1   Factor2
0 -1.157234  0.851609
1 -1.583209 -0.384376
2 -1.434077  0.110196
3 -1.302248 -0.088663
4 -1.032635  1.109801
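Before interpreting a factor solution, it is common to check whether the data are suitable for factor analysis at all. The factor_analyzer package ships Bartlett's test of sphericity and the KMO measure for this; a minimal sketch, assuming the import path used by recent factor_analyzer releases:

from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

chi_square_value, p_value = calculate_bartlett_sphericity(df_scaled)
kmo_all, kmo_model = calculate_kmo(df_scaled)
print("Bartlett's test: chi2 =", chi_square_value, ", p =", p_value)  # small p: correlations are factorable
print("Overall KMO:", kmo_model)  # values above ~0.6 are usually considered acceptable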
From: https://www.cnblogs.com/redufa/p/18587235