<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>hnkjzy.cn</groupId>
    <artifactId>weather15</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.5</version>
        </dependency>
    </dependencies>
</project>
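As posted, the POM only declares the Spark dependency. To compile the Scala source below with Maven you would normally also add the Scala library and a Scala compiler plugin; the following is a hedged sketch, not part of the original post (net.alchim31.maven:scala-maven-plugin is the commonly used plugin, and the 2.11.12 / 4.4.0 versions are typical choices):

    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <!-- must match the _2.11 suffix of spark-core_2.11 -->
        <version>2.11.12</version>
    </dependency>

    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.4.0</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>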
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Weather15 {
  def main(args: Array[String]): Unit = {
    // Point at a local Hadoop installation if needed on Windows
    // System.setProperty("hadoop.home.dir", "E:\\junior\\Hadoop\\hadoop-2.6.4")

    // 1. Create the SparkConf, setting the appName and master URL
    //    (the shell equivalent would be: spark-shell --master local[2])
    val sparkconf = new SparkConf().setAppName("Weather15").setMaster("local[1]") // run locally on Windows

    // 2. Create the SparkContext, the entry point for all computation; it creates the DAGScheduler and TaskScheduler
    val sparkContext = new SparkContext(sparkconf)

    // 3. Read the data files; the RDD can be thought of simply as a collection whose elements are Strings
    val data: RDD[String] = sparkContext.textFile("E:\\inputweather")

    // 4. Split each line on spaces to get its fields
    val words: RDD[Array[String]] = data.map(_.split(" "))
    val words2: RDD[String] = words.map(x => {
      // "阴/小雨" (overcast/light rain) ---> keep only the first part, "阴"
      if (x(1).indexOf("/") > 0) x(1).substring(0, x(1).indexOf("/")) else x(1)
    })

    // 5. Count each weather condition as 1, producing (word, 1) pairs
    val wordAndOne: RDD[(String, Int)] = words2.map(x => (x, 1))

    // 6. Sum the counts per weather condition; the first underscore is the running total, the second the next value
    val result: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)

    // 7. Collect and print the result
    // val finalResult: Array[(String, Int)] = result.collect()
    // println(finalResult.toBuffer)

    // Save as a single text file (coalesce to one partition, with a shuffle)
    result.coalesce(1, shuffle = true).saveAsTextFile("E:\\output15")

    // 8. Stop the SparkContext
    sparkContext.stop()
  }
}
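Two format details are worth noting. The job assumes the weather description is the second whitespace-separated field of each input line, and saveAsTextFile writes each pair via Tuple2.toString, producing lines like (晴,120); that is why the Python step below has to strip the surrounding parentheses. An alternative, sketched here against the same result RDD (this is not from the original post), is to format the pairs as plain CSV before saving:

// Write "weather,count" lines instead of Tuple2.toString "(weather,count)" lines
result
  .map { case (weather, count) => s"$weather,$count" }
  .coalesce(1, shuffle = true)
  .saveAsTextFile("E:\\output15")

With that output format, the pandas code could read the file directly and skip the split('(') / split(')') cleanup.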
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Use the SimHei font so the Chinese labels render correctly
matplotlib.rcParams['font.family'] = 'SimHei'
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Spark wrote lines like "(晴,120)", so split on the comma first...
data = pd.read_csv(r"E:\output15\part-00000", delimiter=',', header=None)
data.columns = ['天气情况', '天数']
# ...then strip the leading "(" from the weather column
data['天气情况'] = data['天气情况'].map(lambda x: x.split('(', 1)).str[1]
# ...and the trailing ")" from the count column, converting it to an integer for plotting
data['天数'] = data['天数'].map(lambda x: x.split(')', 1)).str[0].astype(int)
data.head()

plt.figure(figsize=(10, 8))
plt.pie(data['天数'], labels=data['天气情况'], autopct='%1.2f%%')
plt.title("2018年长沙全年各种类型天气占比数量图")  # share of each weather type in Changsha, 2018
plt.legend(loc='upper right', bbox_to_anchor=(1.7, 1.05), fontsize=10, borderaxespad=0.3)
plt.show()