首页 > 其他分享 >Hive 练习题

Hive 练习题

时间:2022-11-13 12:00:17浏览次数:41  
标签:练习题 category views Hive t1 gulivideo orc select

image
image
准备数据

create table gulivideo_ori(
 videoId string,
 uploader string,
 age int,
 category array<string>,
 length int,
 views int,
 rate float,
 ratings int,
 comments int,
 relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

创建原始数据表:gulivideo_user_ori

create table gulivideo_user_ori(
 uploader string,
 videos int,
 friends int)
row format delimited 
fields terminated by "\t" 
stored as textfile;

创建 orc 存储格式带 snappy 压缩的表
gulivideo_orc

create table gulivideo_orc(
 videoId string, 
 uploader string, 
 age int, 
 category array<string>, 
 length int, 
 views int, 
 rate float, 
 ratings int, 
 comments int,
 relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");

gulivideo_user_orc

create table gulivideo_user_orc(
 uploader string,
 videos int,
 friends int)
row format delimited 
fields terminated by "\t" 
stored as orc
tblproperties("orc.compress"="SNAPPY");

向 ori 表插入数据

load data local inpath "/opt/module/data/video" into table gulivideo_ori;
load data local inpath "/opt/module/user" into table gulivideo_user_ori;

向 orc 表插入数据

insert into table gulivideo_orc select * from gulivideo_ori;
insert into table gulivideo_user_orc select * from gulivideo_user_ori;

业务分析

统计视频观看数 Top10

思路:使用 order by 按照 views 字段做一个全局排序即可,同时我们设置只显示前 10 条。

select
    videoId,
    views
from
    gulivideo_orc
order by 
    views desc
limit 10;

统计视频类别热度 Top10

类别热度:指定的每个类别下的视频数
1.由于类别是数组,要把类别给炸出来

selelct
    category_name
from
    gulivideo_orc laternal view expolode(category) category_tmp as category;t1

-- 如果与原数据有关系就用上面的,没有关系就用下面的

select
     explode(category) category_name
from
     gulivideo_orc;t1

2.按照类别分组,求count 并按照count排序,取前十

select 
    categorty_name,
	count(*) ct
from
   t1
gruop by category_name
order by ct desc
limit 10;

3.最终SQL:

select
    categorty_name,
	count(*) ct
from
   (select
       explode(category) category_name
    from
       gulivideo_orc)t1
gruop by category_name
order by ct desc
limit 10;

统计出视频观看数最高的 20 个视频的所属类别以及类别包含Top20 视频的个数

  1. 求出视频观看数最高的20个视频所属类别
select
     category,
	 viwes
from
    gulivideo_orc
order by views desc
limit 20;t1
  1. 所属类别炸开
select
    explode(category) category_ame
from
    t1;t2
  1. 计算各个类别包含Top20 的个数
select
    category_name
	count(*) ct
from
   t2
   group by category_name
  1. 最终sql
select
    category_name,
	COUNT(*) ct
from
    (SELECT
    explode(category) category_name
     from
	(SELECT
	   category,
		views
		from gulivideo_orc
		order by biews DESC
		limit 20)t1)t2
	group by category_name;

统计视频观看数 Top50 所关联视频的所属类别排序

  1. 求出视频观看数Top50的视频所关联视频的个数
SELECT 
   relatedId,
   views
from gulivideo_orc
order by biews DESC
limit 50;t1
  1. 将关联视频炸开
SELECT 
   explode(relatedId) related_id
from 
   t1;t2
  1. join 原表,取出关联视频所属的类别(数组)
SELECT 
     g.category
from 
     t2
join gulivideo_orc g
on t2.related_id = g.videoId;t3
  1. 炸裂类别字段
SELECT
   explode(category) category_name
from
    t3;t4
  1. 按照类别分组,求count,并按照count排序
select 
    category_name,
	COUNT(*)ct
from
    t4
group by category_name
order by ct desc;
  1. 最终SQL
select 
    category_name,
	COUNT(*)ct
from 
    ( SELECT
   explode(category) category_name
    from
	(select
	   g.category
	 FROM
	   (SELECT
		   explode(relatedId) related_id
		from (SELECT
		   relatedId,
		   views
		from gulivideo_orc
		order by biews DESC
		limit 50)t1)t2
	join gulivideo_orc g
	on t2.related_id = g.videoId)t3)t4
group by category_name
order by ct desc;

统计每个类别中的视频热度Top10,以Music为例子

  1. 重新建表(因为后面几个需要都有这个子查询表,所以不如直接建出表,直接使用)
create table gulivideo_orc_category(
 videoId string,
 uploader string,
 age int, 
 category sring,   -- 仅仅这个地方改变
 length int, 
 views int, 
 rate float, 
 ratings int, 
 comments int,
 relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
insert into table gulivideo_orc_category
-- 查询不带类型
select
 videoId,
 uploader,
 age,
 category,   -- 仅仅这个地方改变
 length,
 views,
 rate,
 ratings,
 comments,
 relatedId)
from gulivideo_orc lateral view explode(category) category_tmp as category_name;
  1. 最终SQL
select
     videoId,
	 views
from gulivideo_orc_category
where category=''Music
order bvy views desc
limlt 10;

统计每个类别视频观看数 Top3

  1. 求出每个类别视频观看数排名
select 
     category,
	 videoId,
	 views,
	 rank()over(partition by category order by views desc) rk
from gulivideo_orc_category;t1
  1. 取出前三名
select
    category,
	videoId,
	views
from
    t1
where tk <=3;
  1. 最终SQL
select
    category,
	videoId,
	views
from
    (select
     category,
	 videoId,
	 views,
	 rank()over(partition by category order by views desc) rk
      from gulivideo_orc_category)t1
where tk <=3;

统计上传视频最多的用户 Top10以及他们上传的视频观看次数在前20的视频

  1. 求出上传视频最多的20个用户
 SELECT
   uploader
 from gulivideo_user_orc
 order by videos DESC
 LIMIT 10;t1
  1. 跟视频表join,取出前十用户所上传的所有视频
 select
     t1.uploader,
	 videoId,
	 views
 FROM
    t1
join gulivideo_orc g
ON t1.uploader = g.uploader;t2
  1. 根据观看次数,对用户上传的视频进行排名
SELECT
     uploader,
	 videoId,
	 views,
	 rank()over(PARTITION BY uploader order BY views desc)rk
FROM
    t2;t3
  1. 取出每个上传者观看数前20的视频
SELECT
    uploader,
	videoId,
	views
from 
    t3
where rk<=20;
  1. 最终sql
SELECT
    uploader,
	videoId,
	views
	rank()over(PARTITION BY uploader order BY views desc)rk
FROM 
    (
	 select
		 t1.uploader,
		 videoId,
		 views
         from
	        (select
		      uploader
		 from gulivideo_user_orc
		 order by videos DESC
		 limit 10)t1
     join gulivideo_orc g
     ON t1.uploader = g.uploader)t2)t3
where rk<=20;

标签:练习题,category,views,Hive,t1,gulivideo,orc,select
From: https://www.cnblogs.com/catch-autumn/p/16885710.html

相关文章

  • Hive3源码总结2
    大数据技术之Hive源码2接上文2.4HQL生成AST(抽象语法树)2.5对AST进一步解析 接下来的步骤包括:1)将AST转换为QueryBlock进一步转换为OperatorTree;2)对OperatorTree进行逻辑优......
  • Hive性能优化
    一、Hive表设计优化1.1分区表1.1.1Hive查询基本原理Hive的设计思想是通过元数据将HDFS上的文件映射成表,基本的查询原理是当用户通过HQL语句对Hive中的表进行复杂数......
  • Hive实战
    1需求描述统计硅谷影音视频网站的常规指标,各种TopN指标:统计视频观看数Top10统计视频类别热度Top10统计出视频观看数最高的20个视频的所属类别以及类别包含Top20......
  • 1.练习题 2.31
    练习题2.31补码溢出的判断inttadd_ok(inta;intb){intneg_over=a>0&&b>0&&a+b<0;intpos_over=a<0&&b<0&&a+b>0;return!n......
  • Hive拓展项目之Youtube
    一、需求描述统计Youtube视频网站的常规指标,各种TopN指标:--统计视频观看数Top10--统计视频类别热度Top10--统计视频观看数Top20所属类别包含这Top20视频的个数--统计视频观......
  • (转)Hive中JOIN的用法以及一些注意事项总结。
    原文:https://www.dandelioncloud.cn/article/details/1529381803362369537Hive表连接的语法支持如下:join_table:table_referenceJOINtable_factor[join_condition]......
  • Spark3.1.2与Iceberg0.12.1整合-hadoop和hive的catalog,DDL,隐藏分区(按年,月,天,小时),create
    Spark3.1.2与Iceberg0.12.1整合Spark可以操作Iceberg数据湖,这里使用的Iceberg的版本为0.12.1,此版本与Spark2.4版本之上兼容。由于在Spark2.4版本中在操作Iceberg时不支持D......
  • Hive函数重要应用案例(窗口函数、拉链表)
    五、窗口函数应用实例5.1连续登陆用户需求当前有一份用户登录数据如下图所示,数据中有两个字段,分别是userId和loginTime。userId表示唯一的用户ID,唯一标识一个用户,log......
  • 学生之家-6道练习题
    让用户输入一个数判断其是奇数还是偶数并把结果输出输入一个溶液的ph值试判断该溶液是酸性还是碱性或是中性溶液并把结果输出(常温25℃条件下)给定一个年份判断是否是......
  • hive ODBC 安装
    Togettothisstage,thesearethestepsIfollowed(Istartedwiththeinstructionshere:​​http://wiki.apache.org/hadoop/Hive/HiveODBC):​​​UnixODBCBui......