1.使用hive实现WordCount
(1) 创建数据库
-- Create the database that holds the word-count tables.
-- IF NOT EXISTS makes the script safely re-runnable.
CREATE DATABASE IF NOT EXISTS wordcount;
(2) 创建外部表
-- External table mapped onto the pre-existing HDFS directory: one row per input
-- line, stored in the single column `line`.
-- NOTE(review): '/home/hadoop/worddata' looks like a local path; LOCATION must be
-- an HDFS directory (e.g. hdfs:///home/hadoop/worddata) -- confirm on the cluster.
CREATE EXTERNAL TABLE IF NOT EXISTS word_data (
    line STRING  -- full text of one input line
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/home/hadoop/worddata';
(3) 映射数据表
-- NOTE(review): this LOAD is redundant -- the external table's LOCATION already
-- points at /home/hadoop/worddata, so its files are queryable without loading.
-- LOAD DATA INPATH also *moves* HDFS files into the table directory, which is a
-- no-op/hazard when source and target are the same path; confirm it is intended.
load data inpath '/home/hadoop/worddata' into table word_data;
(4) 这里假设我们的数据存放在hadoop下,路径为:/home/hadoop/worddata,里面主要是一些单词文件,内容大概为:
hello man what are you doing now my running hello kevin hi man
执行了上述hql就会创建一张表word_data,内容是这些文件的每行数据,每行数据存在字段line中,select * from word_data;就可以看到这些数据
(5) 根据MapReduce的规则,我们需要进行拆分,把每行数据拆分成单词,这里需要用到一个hive的内置表生成函数(UDTF):explode(array),参数是array,其实就是行变多列:
-- Staging table: exactly one word per row.
CREATE TABLE words (word STRING);

-- Split each line on spaces and flatten the resulting array into rows
-- with the built-in table-generating function explode().
INSERT INTO TABLE words
SELECT explode(split(line, " ")) AS word
FROM word_data;
(6) 查看words表内容
OK hello man what are you doing now my running hello kevin hi man
split是拆分函数,跟java的split功能一样,这里是按照空格拆分,所以执行完hql语句,words表里面就全部保存的单个单词
(7) group by统计单词
-- Word frequency: one row per distinct word with its occurrence count.
SELECT
    word,
    COUNT(*) AS cnt  -- name the aggregate instead of the auto-generated _c1
FROM wordcount.words
GROUP BY word;
wordcount.words 库名称.表名称,group by word这个word是create table words(word string) 命令创建的word string
结果:
are 1 doing 1 hello 2 hi 1 kevin 1 man 2 my 1 now 1 running 1 what 1 you 1
2.使用hive求TOP N
常用三个排名窗口函数:row_number() over() 生成连续且不重复的序号;rank() over() 对并列值给相同名次并跳号;dense_rank() over() 对并列值给相同名次但不跳号。一般配合 partition by / order by 使用,再在外层用 where 序号 <= N 取前 N 名。
3.使用Hive进行行列转换
1、问题
hive如何将 a b 1 a b 2 a b 3 c d 4 c d 5 c d 6 变为: a b 1,2,3 c d 4,5,6
2、数据
test.txt
a b 1 a b 2 a b 3 c d 4 c d 5 c d 6
3、答案
(1).建表
-- Rebuild the demo table. IF EXISTS keeps the script re-runnable: a bare
-- DROP TABLE errors out when the table does not exist yet.
DROP TABLE IF EXISTS tmp_jiangzl_test;

CREATE TABLE tmp_jiangzl_test (
    col1 STRING,
    col2 STRING,
    col3 STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- 加载数据: load the sample rows from the local file system.
LOAD DATA LOCAL INPATH '/home/jiangzl/shell/test.txt' INTO TABLE tmp_jiangzl_test;
(2).处理
-- Row-to-column: collapse the col3 values of each (col1, col2) group into one
-- comma-separated string. collect_list preserves duplicates; the original
-- collect_set silently de-duplicates, which would drop repeated col3 values.
SELECT
    col1,
    col2,
    concat_ws(',', collect_list(col3)) AS col3_csv
FROM tmp_jiangzl_test
GROUP BY col1, col2;
二、列转行
1、问题
hive如何将 a b 1,2,3 c d 4,5,6 变为: a b 1 a b 2 a b 3 c d 4 c d 5 c d 6
2、答案
(1). 建表
-- Recreate the demo table for the column-to-row example.
-- IF EXISTS avoids the error a bare DROP TABLE raises when the table is absent.
DROP TABLE IF EXISTS tmp_jiangzl_test;

CREATE TABLE tmp_jiangzl_test (
    col1 STRING,
    col2 STRING,
    col3 STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
(2). 处理:
-- Column-to-row: explode the comma-separated col3 back into one row per value.
-- LATERAL VIEW joins each source row with every element explode() produces.
SELECT
    t.col1,
    t.col2,
    v.col5
FROM tmp_jiangzl_test t
LATERAL VIEW explode(split(t.col3, ',')) v AS col5;
4.使用Hive进行留存率统计
游戏公司等会很关注用户留存率问题,这里给出一个模板
-- Run-time job settings; replace the xxx placeholders per environment.
SET mapreduce.job.queuename=xxx;  -- YARN queue to submit the job to
SET mapreduce.job.name=xxx;       -- human-readable job name for monitoring
SET mapreduce.job.reduces=19;     -- fixed reducer count for this job
-- N-day retention template: for each registration date (and device OS), the
-- count of new users and how many were active again exactly 1..4 days later,
-- formatted as "<pct>% | <count>".
-- NOTE(review): date1 / date2 are placeholders -- substitute real partition
-- bounds (e.g. via hivevar) before running.
SELECT
    '日期', '注册用户数', '次日留存率', '2日留存率', '3日留存率',
    dim_date,
    total_cnt,
    concat_ws('% | ', cast(round(dif_1cnt * 100 / total_cnt, 2) AS string), cast(dif_1cnt AS string)),
    concat_ws('% | ', cast(round(dif_2cnt * 100 / total_cnt, 2) AS string), cast(dif_2cnt AS string)),
    concat_ws('% | ', cast(round(dif_3cnt * 100 / total_cnt, 2) AS string), cast(dif_3cnt AS string)),
    concat_ws('% | ', cast(round(dif_4cnt * 100 / total_cnt, 2) AS string), cast(dif_4cnt AS string))
FROM (
    SELECT
        p1.state AS dim_date,
        p1.device_os,
        count(DISTINCT p1.user_id) AS total_cnt,
        -- a user counts toward dif_Ncnt when active exactly N days after registering
        count(DISTINCT if(datediff(p3.state, p1.state) = 1, p1.user_id, NULL)) AS dif_1cnt,
        count(DISTINCT if(datediff(p3.state, p1.state) = 2, p1.user_id, NULL)) AS dif_2cnt,
        count(DISTINCT if(datediff(p3.state, p1.state) = 3, p1.user_id, NULL)) AS dif_3cnt,
        count(DISTINCT if(datediff(p3.state, p1.state) = 4, p1.user_id, NULL)) AS dif_4cnt
    FROM (
        -- 日新增用户名单 (register_date, user_id, device_os)
        SELECT
            from_unixtime(unix_timestamp(cast(partition_date AS string), 'yyyyMMdd'), 'yyyy-MM-dd') AS state,
            user_id,
            -- BUG FIX: the original p1 never projected device_os, yet the outer
            -- query referenced p1.device_os -- the statement could not compile.
            device_os
        FROM user_active_day
        WHERE partition_date BETWEEN date1 AND date2
          AND user_is_new = 1
        GROUP BY 1, 2, 3
    ) p1
    LEFT OUTER JOIN (
        -- 期间活跃用户 (active_date, user_id)
        SELECT
            from_unixtime(unix_timestamp(cast(partition_date AS string), 'yyyyMMdd'), 'yyyy-MM-dd') AS state,
            user_id
        FROM active_users
        WHERE partition_date BETWEEN date1 AND date2
        GROUP BY 1, 2
    ) p3
        ON p3.user_id = p1.user_id
    GROUP BY 1, 2
) p4;
标签:实战,p1,word,string,dif,Hive,state,user From: https://www.cnblogs.com/yeyuzhuanjia/p/16834197.html