今日学习了 Spark 的数据清洗。
# Spark DataFrame data-cleaning examples: de-duplication, null removal, null filling.
# Source: https://www.cnblogs.com/syhxx/p/17726762.html
# NOTE(review): assumes `df` is an existing Spark DataFrame with at least the
# columns `name`, `age` and `job` -- it is created outside this snippet.

# --- De-duplication ---
# Without arguments, rows are compared on all columns.
df.dropDuplicates().show()
# With a column list, rows are duplicates when they match on those columns only.
df.dropDuplicates(['age', 'job']).show()

# --- Dropping nulls ---
# Default behaviour: drop every row that contains at least one null.
df.dropna().show()
# Keep only rows that have at least three non-null values.
df.dropna(thresh=3).show()
# Only `name` and `age` are inspected; with thresh=2 both must be non-null.
df.dropna(thresh=2, subset=['name', 'age']).show()

# --- Filling nulls ---
# A string fill value only replaces nulls in string-typed columns.
df.fillna("loss").show()
# Restrict the fill to the `job` column.
df.fillna("loss", subset=['job']).show()
# Per-column fill values supplied as a dict.
# DataFrames are immutable, so the filled result must be displayed (or
# assigned); the bare call would otherwise be discarded with no effect.
df.fillna({"name": "weizhi", "age": 2, "job": "daw"}).show()