关于随机森林的定义，这里就不再赘述。
# Train and evaluate a Spark ML random-forest classifier on the "affairs"
# CSV data set: load the CSV, assemble the feature vector, split into
# train/test, fit the model, then print accuracy, weighted precision and the
# feature importances.
import findspark

# findspark.init() makes the local Spark installation importable, so it has to
# run before the first pyspark import rather than after it.
findspark.init()

from numpy import frompyfunc
from pyspark.ml import classification
from pyspark.ml.classification import RandomForestClassificationModel, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import spark_partition_id

# Location of the input CSV (header row expected, see header=True below).
DATA_PATH = 'D:/PythonCode/Python_dataming/pythons/.vscode/机器学习/affairs.csv'
# Columns combined into the single 'features' vector column.
FEATURE_COLS = ['rate_marriage', 'age', 'yrs_married', 'children', 'religious']
# Column used as the classification label.
LABEL_COL = 'affairs'

spark = SparkSession.builder.appName('test1').getOrCreate()

# Load the data and show its size and inferred schema.
df = spark.read.csv(DATA_PATH, inferSchema=True, header=True)
print(df.count())
# printSchema() writes to stdout itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
df.printSchema()

# Pack the individual feature columns into one vector column named 'features'.
df_ass = VectorAssembler(inputCols=FEATURE_COLS, outputCol='features')
df = df_ass.transform(df)
df.printSchema()

# Keep only what the classifier needs and split 75% / 25% into train / test.
# No seed is passed to randomSplit, so the split (and therefore the metrics
# printed below) can differ from run to run.
model_df = df.select(['features', LABEL_COL])
train_df, test_df = model_df.randomSplit([0.75, 0.25])

# Fit a 50-tree random forest on the training split and score the test split.
rf = RandomForestClassifier(labelCol=LABEL_COL, numTrees=50).fit(train_df)
rf_pre = rf.transform(test_df)
# show() also prints by itself and returns None.
rf_pre.show(100)

# Evaluate the test-set predictions.
rf_acc = MulticlassClassificationEvaluator(labelCol=LABEL_COL, metricName='accuracy').evaluate(rf_pre)
print(rf_acc)
# This metric is weighted precision (not AUC), so the variable is named accordingly.
rf_precision = MulticlassClassificationEvaluator(labelCol=LABEL_COL, metricName='weightedPrecision').evaluate(rf_pre)
print(rf_precision)
print(rf.featureImportances)

# Optional model persistence, left disabled: save the fitted model, load it
# back, and score a new DataFrame (new_df is not defined in this script).
#rf.save('./model')
#rf=RandomForestClassificationModel.load('./model')
#new_pre= rf.transform(new_df)
rate_marriage,age,yrs_married,children,religious,affairs
5,32,6,1,3,0
4,22,2.5,0,2,0
3,32,9,3,3,1
3,27,13,3,1,1
4,22,2.5,0,1,1
4,37,16.5,4,3,1
5,27,9,1,1,1
4,27,9,0,2,1
5,37,23,5.5,2,1
5,37,23,5.5,2,1
3,22,2.5,0,2,1
3,27,6,0,1,1
2,27,6,2,1,1
5,27,6,2,3,1
3,37,16.5,5.5,1,1
5,27,6,0,2,1
4,22,6,1,1,1
4,37,9,2,2,1
4,27,6,1,1,1
1,37,23,5.5,4,1
2,42,23,2,2,1
4,37,6,0,2,1
5,22,2.5,0,2,1
3,37,16.5,5.5,2,1
3,42,23,5.5,3,1
2,27,9,2,4,1
4,27,6,1,2,1
5,27,2.5,0,3,1
2,27,6,2,2,1
5,37,13,1,3,1
2,32,16.5,2,2,1
3,27,6,1,1,1
3,32,16.5,4,3,1
3,27,9,2,1,1
3,37,16.5,3,3,1
4,32,16.5,5.5,4,1
5,42,16.5,4,3,1
3,27,9,2,2,1
3,17.5,0.5,0,1,1
4,42,23,5.5,2,1
5,37,16.5,3,3,1
4,22,2.5,1,2,1
4,27,2.5,0,2,1
4,22,2.5,0,2,1
4,37,13,3,2,1
4,22,2.5,0,2,1
4,22,2.5,0,1,1
5,22,2.5,0,3,1
5,22,2.5,0,3,1
3,42,23,4,3,1
5,32,13,3,3,1
5,22,6,2,2,1
3,27,2.5,1,4,1
2,42,23,3,3,1
4,22,2.5,0,1,1
2,42,23,3,3,1
4,42,23,2,2,1
4,42,23,3,3,1
4,37,16.5,2,3,1
4,27,2.5,0,2,1
2,32,9,2,2,1
4,42,13,0,1,1
4,22,6,2,1,1
5,32,16.5,3,3,1
4,42,13,0,2,1
5,27,9,1,3,1
5,22,6,2,2,1
2,27,16.5,2,3,1
5,37,13,2,1,1
5,27,6,0,2,1
2,27,2.5,1,1,1
5,42,23,5.5,4,1
5,27,6,0,2,1
5,37,16.5,3,2,1
2,32,9,2,2,1
3,37,16.5,5.5,3,1
5,27,6,2,2,1
5,32,16.5,3,1,1
5,27,9,2,1,1
4,22,2.5,0,2,1
5,32,16.5,2,4,1
2,22,6,2,2,1
3,32,13,3,1,1
5,32,16.5,3,2,1
3,27,6,2,1,1
5,22,2.5,0,1,1
3,32,9,2,3,1
3,22,2.5,1,2,1
3,22,2.5,0,2,1
5,27,9,2,1,1
3,42,23,2,2,1
3,37,16.5,3,2,1
5,32,13,2,2,1
1,27,13,2,2,1
5,27,2.5,0,2,1
5,27,9,0,1,1
5,32,13,2,2,1
4,27,9,2,2,1
3,22,2.5,1,3,1
2,37,23,2,2,1
2,27,6,2,1,1
4,27,2.5,1,2,1
3,37,13,5.5,2,1
5,37,16.5,4,1,1
5,22,2.5,0,1,1
4,42,13,0,1,1
5,27,9,1,1,1
4,22,2.5,0,1,1
4,22,2.5,1,2,1
4,27,6,2,3,1
2,32,16.5,1,2,1
3,27,13,2,1,1
4,22,0.5,0,2,1
5,27,6,1,1,1
2,27,9,2,2,1
3,27,9,2,2,1
4,32,16.5,3,3,1
5,22,2.5,0,1,1
4,22,2.5,1,2,1
1,22,2.5,0,2,1
3,32,13,2,1,1