Hive职位岗位数据分析实战
文章目录
- Hive职位岗位数据分析实战
- 创建数据库
- 使用创建的数据库
- 创建表
- 在hdfs上加载数据
- 如果不在hdfs,在本地加载数据
- 查看数据
创建数据库
create database jobdata;
使用创建的数据库
use jobdata;
创建表
create table jobdata_origin(
city string comment "城市",
salary array<String> comment "薪资",
company array<String> comment "福利",
kill array<String> comment "技能")
comment "原始职位数据库"
row format delimited fields terminated by ','
collection items terminated by '-'
stored as textfile;
在hdfs上加载数据
load data inpath '/JobData/output/part-r-00000' overwrite into table jobdata_origin;
如果不在hdfs,在本地加载数据
load data local inpath '/root/part-r-00000' overwrite into table jobdata_origin;
查看数据
select*from jobdata_origin;
下面的方法创建jobdata_detail表,并且加载jobdata_origin表里面的数据
下面的方法可以加载数据:
create table jobdata_detail as select city,salary,company,kill,salary[0] low_salary,salary[1] high_salary,(salary[0]+salary[1])/2 avg_salary from jobdata_origin;
#查询细化的薪资
select*,salary[0],salary[1],(salary[0]+salary[1])/2 from jobdata_origin limit 10;
#创建表tmp_salay,扁平化处理后的薪资临时表
create table tmp_salary as select explode(jo.salary) from jobdata_origin jo;
#创建tmp_company,扁平化处理后的公司临时表
create table tmp_company as select explode(jo.company) from jobdata_origin jo;
#tmp_kill 扁平化处理后的技能临时表
create table tmp_kill as select explode(jo.kill) from jobdata_origin jo;
#对tmp_salary表的每一条数据进行泛化处理,将处理结果存储到中间表tm_salarylist中,命令如下:
create table tmp_salarylist as
select col,case when col>=0 and col<=5 then "0-5"
when col>=6 and col<=10 then "6-10"
when col>=11 and col<=15 then "11-15"
when col>=16 and col<=20 then "16-20"
when col>=21 and col<=25 then "21-25"
when col>=26 and col<=30 then "26-30"
when col>=31 and col<=35 then "31-35"
when col>=36 and col<=40 then "36-40"
when col>=41 and col<=45 then "41-45"
when col>=46 and col<=50 then "46-50"
when col>=51 and col<=55 then "51-55"
when col>=56 and col<=60 then "56-60"
when col>=61 and col<=65 then "61-65"
when col>=66 and col<=70 then "66-70"
when col>=71 and col<=75 then "71-75"
when col>=76 and col<=80 then "76-80"
when col>=81 and col<=85 then "81-85"
when col>=86 and col<=90 then "86-90"
when col>=91 and col<=95 then "91-95"
when col>=96 and col<=100 then "96-100"
when col>=101 then ">101" end from tmp_salary;
#查询泛化后的数据
select*from tmp_salarylist limit 10;
分析1,职位区域分析:
查询不同城市大数据岗位的需求量,查询结果保存到t_city_detail表
select city,count(*) from jobdata_origin group by city;
建表插入
create table t_city_detail as
select city,count(*) from jobdata_origin group by city;
查看创建的表并且排序
select*from t_city_detail sort by count desc;
职位薪资分析(全国薪资分布情况)
统计不同工资区间频率和占比
1查询
select c1,count(*) from tmp_salarylist group by c1;
2:建表
create table t_salary_detail as
select c1,count(1) from tmp_salarylist group by c1;
3.查看表结构
desc t_salary_detail;
3:验证
select * from t_salary_detail sort by c1 desc;
职位薪资分析(薪资的平均值、中位数和众数)
求薪资的平均值,平均值是统计中的一个重要概念。为集中趋势的最常用测度值,目的是确定一组数据的均衡点。
求薪资的众数,众数是指统计分布上具有明显集中趋势点的数值,代表数据的一般水平,也是一组数据中出现次数最多的数值。
求薪资的中位数,中位数又称中值,是统计学中的专有名词,是按顺序排列的一组数据中居于中间位置的数,代表一个样本、种群或概率分布中的一个数值。
薪资的平均值
select avg(avg_salary) from jobdata_detail;
薪资的众数
select avg_salary,count (*) cnt from jobdata_detail group by avg_salary sort by cnt desc limit 1 ;
薪资的中位数
select percentile(cast(avg_salary as BIGINT),0.5) from jobdata_detail;
求各个城市的招聘岗位数量和城市的平均薪资
select city,count(city),round(avg(avg_salary),2) as cnt from jobdata_detail group by city sort by cnt desc;
公司福利分析
对公司福利字段进行分析,统计大数据职位相关公司对员工福利常用的标签有哪些。即统计不同福利标签频率。结果存储到tmp_ods_company表中
select col,count(*) from tmp_company;
create table t_company_detail as select col company,count(*)count from tmp_company
group by col;
select * from t_company_detail sort by count desc limit 5;
职位技能要求分析
通过对技能标签分析,了解要从事大数据相关工作需要掌握哪些技能,招聘公司比较重视哪些技能。即统计不同技能标签出现的频率
select col,count(*) from tm_kill;
create table t_kill_detail as select col kill,count(*) count from tmp_kill
group by col;
select * from t_kill_detail sort by count desc limit 10;
能标签出现的频率
select col,count(*) from tm_kill;
create table t_kill_detail as select col kill,count(*) count from tmp_kill
group by col;
select * from t_kill_detail sort by count desc limit 10;