import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class Job52Mapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input record is one line of text; convert it to a String.
        String line = value.toString();
        // Split on tabs, e.g. [大数据开发工程师, 上海吉祥航空股份有限公司, 上海, 1-1.5万/月]
        String[] arr = line.split("\t");
        if (arr.length > 1) {
            String post = arr[0];
            // Keep only "big data" (大数据) related postings.
            if (post.contains("大数据")) {
                if (arr.length > 2) {
                    String city = arr[2];
                    // Strip the district suffix, e.g. "上海-浦东新区" -> "上海".
                    int index = city.indexOf("-");
                    if (index > 0) {
                        city = city.substring(0, index);
                    }
                    if (arr.length > 3) {
                        String salary = arr[3];
                        // Normalize every salary format to thousand CNY per month,
                        // taking the midpoint of the advertised range. Formats not
                        // matched below fall through and are emitted as 0.
                        double avg = 0;
                        if (salary.contains("万/年") || salary.contains("万以上/年")) {
                            // e.g. "10-15万/年": 万 -> thousand (*10), per year -> per month (/12).
                            String str1 = salary.split("万")[0];
                            String[] arr1 = str1.split("-");
                            avg = (Double.parseDouble(arr1[0]) + Double.parseDouble(arr1[arr1.length - 1])) * 10 / 12 / 2;
                        }
                        if (salary.contains("万/月")) {
                            // e.g. "1-1.5万/月": 万 -> thousand (*10).
                            String str1 = salary.replace("万/月", "");
                            String[] arr1 = str1.split("-");
                            avg = (Double.parseDouble(arr1[0]) + Double.parseDouble(arr1[arr1.length - 1])) * 10 / 2;
                        }
                        if (salary.contains("千/月") || salary.contains("千以下/月")) {
                            // e.g. "6-8千/月": already in thousand/month.
                            String str1 = salary.split("千")[0];
                            String[] arr1 = str1.split("-");
                            avg = (Double.parseDouble(arr1[0]) + Double.parseDouble(arr1[arr1.length - 1])) / 2;
                        }
                        context.write(new Text(city), new DoubleWritable(avg));
                    }
                }
            }
        }
    }
}
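All three advertised formats are reduced to the midpoint of the range in thousand CNY per month: 万 (ten thousand) values are multiplied by 10, and yearly figures are divided by 12. Below is a minimal standalone sketch of the same conversion for spot-checking outside Hadoop; the class name, method name, and sample inputs are illustrative, not part of the original job.

public class SalaryNormalizeDemo {
    // Midpoint of the advertised range, normalized to thousand CNY per month.
    static double normalize(String salary) {
        String[] r;
        if (salary.contains("万/年") || salary.contains("万以上/年")) {
            r = salary.split("万")[0].split("-");
            return (Double.parseDouble(r[0]) + Double.parseDouble(r[r.length - 1])) * 10 / 12 / 2;
        }
        if (salary.contains("万/月")) {
            r = salary.replace("万/月", "").split("-");
            return (Double.parseDouble(r[0]) + Double.parseDouble(r[r.length - 1])) * 10 / 2;
        }
        if (salary.contains("千/月") || salary.contains("千以下/月")) {
            r = salary.split("千")[0].split("-");
            return (Double.parseDouble(r[0]) + Double.parseDouble(r[r.length - 1])) / 2;
        }
        return 0; // unmatched formats fall through, as in the Mapper
    }

    public static void main(String[] args) {
        System.out.println(normalize("10-15万/年")); // ~10.42
        System.out.println(normalize("1-1.5万/月")); // 12.5
        System.out.println(normalize("6-8千/月"));   // 7.0
    }
}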
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// Input: the Mapper's output grouped by key, e.g. <上海, <12.5, 9.0, 15.0>>
// Output: one record per city with the mean salary, e.g. <上海, 12.17>
public class Job52Reducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
        // The sum must be a double: an int sum would truncate every value,
        // and sum / count would then be integer division.
        double sum = 0;
        int count = 0;
        for (DoubleWritable v : values) {
            sum += v.get();
            count++;
        }
        context.write(key, new DoubleWritable(sum / count));
    }
}
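Because a mean of partial means is not the overall mean, this Reducer cannot simply be reused as a Combiner; a Combiner would have to carry (sum, count) pairs instead. A quick local check of the averaging step itself, with hypothetical values:

public class AvgDemo {
    public static void main(String[] args) {
        // Hypothetical per-posting salaries for one city, in thousand CNY/month.
        double[] values = {12.5, 9.0, 15.0};
        double sum = 0; // double: an int sum would truncate each value
        for (double v : values) {
            sum += v;
        }
        System.out.println(sum / values.length); // 12.166666666666666
    }
}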
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Job52Runner {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Create the job.
        Job job = Job.getInstance(conf, "job52");
        // Input and output paths come from the command line.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Wire up the driver, mapper, reducer, and output types.
        job.setJarByClass(Job52Runner.class);
        job.setMapperClass(Job52Mapper.class);
        job.setReducerClass(Job52Reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
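With the three classes packaged into a jar, the job is submitted with the standard hadoop jar command; the jar name and HDFS paths below are placeholders, not taken from the original post:

hadoop jar job52.jar Job52Runner /input/51job /output2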
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# SimHei renders the Chinese labels below.
matplotlib.rcParams['font.family'] = 'SimHei'
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Use a raw string so the backslashes in the Windows path are not read as
# escape sequences; the MapReduce output file has no header row, so name
# the columns explicitly instead of letting the first record become one.
data = pd.read_csv(r"D:\oc\shiyun1\yuan\output2\part-r-00000",
                   encoding="utf-8", delimiter='\t',
                   header=None, names=['地区名称', '平均薪资'])
# Keep the five cities with the highest average salary; the raw output
# is sorted by city name, not by salary.
data = data.sort_values('平均薪资', ascending=False).head()

plt.figure(figsize=(8, 5))
x = data["地区名称"]
y = data["平均薪资"]
# Draw the bar chart.
plt.bar(x, y, width=0.5, color="g")
plt.xticks(x, data["地区名称"])
plt.title("大数据相关职位地区前五的平均薪资")
plt.xlabel("城市名称")
plt.ylabel("平均薪资")
plt.legend(labels=["千/月"])  # unit: thousand CNY per month
plt.show()
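To keep a copy of the chart on disk, savefig has to run before show, since the figure is discarded once the window opened by show is closed; the file name and dpi below are arbitrary choices:

# Write the figure to a PNG before displaying it.
plt.savefig("top5_avg_salary.png", dpi=150, bbox_inches="tight")
plt.show()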