// Local Spark configuration: master "local[2]", application name "tran".
val sparkConf = new SparkConf().setMaster("local[2]").setAppName("tran")
val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

// Sample student rows. Built directly as a Seq instead of an Array so that no
// implicit Array-to-Seq conversion (deprecated and copying on Scala 2.13) is needed.
val seq: Seq[Student] = Seq(Student("zs", 20, "男"), Student("ls", 21, "女"), Student("ww", 22, "男"))
val rdd: RDD[Student] = sparkSession.sparkContext.makeRDD(seq)
// The bean-class overload of createDataFrame is used, which is why Student
// exposes JavaBean-style getters through @BeanProperty.
val dataFrame: DataFrame = sparkSession.createDataFrame(rdd, classOf[Student])

// Sample score rows: "zs" and "ls" also appear in the student data, "ml" does not,
// and "ww" has no score row.
val seq1: Seq[StudentScore] = Seq(StudentScore("zs", 50), StudentScore("ls", 50), StudentScore("ml", 50))
val rdd1: RDD[StudentScore] = sparkSession.sparkContext.makeRDD(seq1)
val dataFrame1: DataFrame = sparkSession.createDataFrame(rdd1, classOf[StudentScore])
/**
 * A student record.
 *
 * Every field is a mutable `var` annotated with `@BeanProperty`, so JavaBean-style
 * `getX`/`setX` accessors are generated alongside the regular Scala accessors.
 *
 * @param name student name
 * @param age  student age
 * @param sex  student sex
 */
case class Student(
  @BeanProperty var name: String,
  @BeanProperty var age: Int,
  @BeanProperty var sex: String
)
/**
 * A score record for a student, identified by name.
 *
 * Both fields are mutable `var`s annotated with `@BeanProperty`, so JavaBean-style
 * `getX`/`setX` accessors are generated alongside the regular Scala accessors.
 *
 * @param name  student name
 * @param score the student's score
 */
case class StudentScore(
  @BeanProperty var name: String,
  @BeanProperty var score: Int
)
join
/**
 * Joins two DataFrames on their shared `name` column and prints the `name`, `age`
 * and `score` columns of the result.
 *
 * With the default join type this corresponds to:
 * {{{
 * select name, age, score
 * from student left join studentscore on student.name = studentscore.name
 * }}}
 *
 * `DataFrame.join` is called with three arguments:
 *  - the other DataFrame to join with;
 *  - the join columns as a `Seq` (there may be more than one);
 *  - the join type, e.g. `left`, `right`, `inner` or `full`.
 *
 * @param dataFrame  left side of the join; expected to provide `name` and `age`
 * @param dataFrame1 right side of the join; expected to provide `name` and `score`
 * @param joinType   join type handed to `DataFrame.join`; defaults to `"left"`, the
 *                   join type this method previously hard-coded
 */
def joinOpt(dataFrame: DataFrame, dataFrame1: DataFrame, joinType: String = "left"): Unit = {
  println("-------------------------join操作开始----------------------")
  // The join columns are given as a Seq, matching join's usingColumns parameter,
  // instead of relying on an implicit Array-to-Seq conversion.
  val joined = dataFrame
    .join(dataFrame1, Seq("name"), joinType)
    .select("name", "age", "score")
  joined.show()
  println("-------------------------join操作结束----------------------")
}
groupBy
/**
 * Demonstrates `groupBy` followed by aggregation, printing two results.
 *
 * A grouped DataFrame can be followed directly by a single aggregate shortcut such as
 * `max(col)`, `min(col)`, `count()`, `sum(col)` or `avg(col)`. To compute several
 * aggregates at once, use `agg` with a map of column name to aggregate function name:
 * {{{
 * Map("column" -> "sum/min/max/avg/count", "column" -> "sum/min/max/avg/count")
 * }}}
 *
 * The result of a grouped aggregation contains the grouping column (`sex` here)
 * plus one column per aggregate; the other input columns are not carried over.
 *
 * Printed results:
 *  - grouped by `sex`: the row count and the maximum `age` of each group;
 *  - grouped by `sex`: the row count of each group.
 *
 * @param dataFrame  the DataFrame to group; must contain `sex` and `age` columns
 * @param dataFrame1 unused; kept so the signature stays compatible with existing callers
 */
def groupByOpt(dataFrame: DataFrame, dataFrame1: DataFrame): Unit = {
  println("-------------------------groupBy1操作开始----------------------")
  // Several aggregates in one pass; no trailing select("*") is needed because
  // agg already returns every result column.
  val countAndMaxAge = dataFrame.groupBy("sex").agg(Map("*" -> "count", "age" -> "max"))
  countAndMaxAge.show()
  println("-------------------------groupBy1操作结束----------------------")

  println("-------------------------groupBy2操作开始----------------------")
  // Single aggregate shortcut directly on the grouped data.
  val countBySex = dataFrame.groupBy("sex").count()
  countBySex.show()
  println("-------------------------groupBy2操作结束----------------------")
}
sort
/**
 * Sorts the DataFrame by its `age` column and prints the result.
 *
 * @param dataFrame  the DataFrame to sort; must contain an `age` column
 * @param dataFrame1 unused; kept so the signature stays compatible with existing callers
 */
def sort(dataFrame: DataFrame, dataFrame1: DataFrame): Unit = {
  println("-------------------------sort操作开始----------------------")
  // sort already returns all columns, so no trailing select("*") is needed.
  val sortedByAge = dataFrame.sort("age")
  sortedByAge.show()
  println("-------------------------sort操作结束----------------------")
}
标签:转换,val,DataFrame,----------------------,dataFrame,算子,println,-------------------
From: https://www.cnblogs.com/jsqup/p/16637892.html