本周继续进行了大数据的相关操作
分区表
-- 修改表名 alter table course_temp rename to course_common; -- 修改表属性 desc formatted course; -- 内部表改为外部表 alter table course set tblproperties ('EXTERNAL'='TRUE'); -- 修改表注释 alter table course set tblproperties ('comment'='this is a table comment'); /*添加表分区*/ -- 添加表分区 alter table score2 add partition (year='2019',month='01',day='10'); -- 修改表分区 alter table score2 partition (year='2023',month='08',day='05') rename to partition (year='2022',month='01',day='01'); -- 删除分区(删除元数据,数据本身还在) alter table score drop partition (year='2022',month='01',day='01'); select * from score2; -- 添加新列 alter table score add columns (v1 int,v2 string); -- 修改列明 alter table score change v2 v2new string; -- 删除表 drop table score; -- 清空表数据(无法清空外部表) truncate table course;
array类型
-- 创建一个array测试表 create table test.array1(name string,work_locations array<string>) row format delimited fields terminated by '\t' collection items terminated by ','; load data local inpath '/home/hadoop/test/data_for_array_type.txt' into table test.array1; -- 正常查询 select * from array1; -- 数组中第几个 select name,work_locations[1] from array1; -- 数组中个数 select name,size(work_locations) from array1; -- 数组里是否包含 select * from array1 where array_contains(work_locations,'tianjin');
map数据类型
-- 创建map册数表 create table test.test_map( id int, name string, member map<string,string>, age int )row format delimited fields terminated by ',' collection items terminated by "#" map keys terminated by ':'; /*collection items terminated by "#" 每个键值对之间的分隔符*/ /*map keys terminated by ':' 单个键值对内部,k和v的分隔符*/ load data local inpath '/home/hadoop/test/data_for_map_type.txt' into table test.test_map; select * from test.test_map; -- 查看成员中每个人个父亲是谁 select id,name,member['father'],member['mother'] from test.test_map; -- 取出map的全部key,返回类型是array select map_keys(member) from test.test_map; -- 去除map的全部value,返回类型是array select map_values(member) from test.test_map; -- size查看map的元素(K-V对)的个数 select size(member) from test.test_map; -- 查看谁有这个key select * from test.test_map where array_contains(map_keys(member),'sister'); -- 查看谁有这个value select * from test.test_map where array_contains(map_values(member),'王林');
struct数据类型
create table test.text_struct( id int, info struct<name:string,age:int> )row format delimited fields terminated by "#" collection items terminated by ':'; /*collection items terminated by ':' 表示struct中二级列的数据之间的分隔符*/ load data local inpath '/home/hadoop/test/data_for_struct_type.txt' into table test.text_struct; select * from test.text_struct; select id,info.name,info.age from test.text_struct;
正则表达式
create database itheima; use itheima; CREATE DATABASE itheima; USE itheima; CREATE TABLE itheima.orders ( orderId bigint COMMENT ' 订单 id', orderNo string COMMENT ' 订单编号 ', shopId bigint COMMENT ' 门店 id', userId bigint COMMENT ' 用户 id', orderStatus tinyint COMMENT ' 订单状态 -3: 用户拒收 -2: 未付款的订单 -1 :用户取消 0: 待发货 1: 配送中 2: 用户确认收货 ', goodsMoney double COMMENT ' 商品金额 ', deliverMoney double COMMENT ' 运费 ', totalMoney double COMMENT ' 订单金额(包括运费) ', realTotalMoney double COMMENT ' 实际订单金额(折扣后金额) ', payType tinyint COMMENT ' 支付方式 ,0: 未知 ;1: 支付宝, 2 :微信 ;3 、现金; 4 、其他 ', isPay tinyint COMMENT ' 是否支付 0: 未支付 1: 已支付 ', userName string COMMENT ' 收件人姓名 ', userAddress string COMMENT ' 收件人地址 ', userPhone string COMMENT ' 收件人电话 ', createTime timestamp COMMENT ' 下单时间 ', payTime timestamp COMMENT ' 支付时间 ', totalPayFee int COMMENT ' 总支付金额 ' ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'; load data local inpath '/home/hadoop/test/itheima_orders.txt' into table itheima.orders; CREATE TABLE itheima.users ( userId int, loginName string, loginSecret int, loginPwd string, userSex tinyint, userName string, trueName string, brithday date, userPhoto string, userQQ string, userPhone string, userScore int, userTotalScore int, userFrom tinyint, userMoney double, lockMoney double, createTime timestamp, payPwd string, rechargeMoney double ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'; load data local inpath '/home/hadoop/test/itheima_users.txt' into table itheima.users; -- 查找广东省数据 select * from itheima.orders where useraddress rlike '.*广东.*'; -- 查找××省。。。。 select * from orders where userAddress rlike '..省 ..市 ..县'; -- 查找用户姓为张王郑 select * from orders where userName rlike '[张王郑]\\S+'; -- 查找用户手机符合:188****0*** select * from orders where userPhone rlike '188\\S{4}[0-9]{3}';
UNION联合查询
-- 基础UNION select * from itheima.course where t_id = '周杰伦' UNION select * from itheima.course where t_id = '王力宏'; -- 去重演示 select * from itheima.orders union select * from itheima.orders; -- 不去重 select * from itheima.orders union all select * from itheima.orders; -- UNION写在FROM中,UNION写在子查询中 select t_id, count(*) from (select * from itheima.orders where t_id = '周杰伦' union all select * from itheima.orders where t_id = '王力宏') as u group by t_id; -- 用于INSERT SELECT insert overwrite table itheima.orders select * from orders union select * from orders;
数据抽样
-- 抽样查询 -- lie/其他条件不变,结果一致 select username, orderid, totalmoney from itheima.orders tablesample (bucket 3 out of 10 on username); -- hang/完全随机 select * from orders tablesample (bucket 3 out of 10 on rand()); -- 抽取100条/结果一致,从前往后找 select * from orders tablesample (100 rows); -- 取10%的数据 select * from orders tablesample (10 percent); -- 取1kb的数据 select * from orders tablesample (1K );
虚拟列
-- 虚拟列 -- 打开ROW__OFFSET__INSIDE__BLOCK set hive.exec.rowoffset=true; -- 第一个INPUT__FILE__NAME为显示数据行所在的具体文件 -- 第二个BLOCK__OFFSET__INSIDE__FILE为显示数据所在文件的偏移量 -- 第三个ROW__OFFSET__INSIDE__BLOCK显示数据所在HDFS块的偏移量 select id, icd, score, INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE, ROW__OFFSET__INSIDE__BLOCK from test.score; SELECT *, BLOCK__OFFSET__INSIDE__FILE FROM itheima.orders WHERE BLOCK__OFFSET__INSIDE__FILE < 1000; SELECT orderid, username, INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE, ROW__OFFSET__INSIDE__BLOCK FROM itheima.orders_bucket; SELECT INPUT__FILE__NAME, COUNT(*) FROM itheima.orders_bucket GROUP BY INPUT__FILE__NAME;
基本函数
-- 查看所有可用函数 show functions ; -- 查看函数的使用方式 describe function extended !; -- 数值函数 select round(3.14159265357906285,5); --四舍五入 -- 随机数,设置种子后每次随机数一致 select rand(); select rand(0); -- 绝对值 select abs(-9); -- 求PI select pi(); -- 集合函数 -- 求元素个数 select size(work_locations) from array1; select size(member) from test_map; -- 取出map的全部key select map_keys(member) from test_map; -- 取出map的全部value select map_values(member) from test_map; -- 查询array内是否包含指定元素,是就返回True select * from array1 where ARRAY_CONTAINS(work_locations, 'tianjin'); -- 排序 select *, sort_array(work_locations) from array1; -- 类型转换函数 -- 转二进制 select binary('hadoop'); -- 自由转换,类型转换失败报错或返回NULL select cast('1' as bigint); -- 日期函数 -- 当前时间戳 select current_timestamp(); -- 当前日期 select current_date(); -- 时间戳转日期 select to_date(current_timestamp()); -- 年月日季度等 select year('2020-01-11'); select month('2020-01-11'); select day('2020-01-11'); select quarter('2020-05-11'); select dayofmonth('2020-05-11'); select hour('2020-05-11 10:36:59'); select minute('2020-05-11 10:36:59'); select second('2020-05-11 10:36:59'); select weekofyear('2020-05-11 10:36:59'); -- 日期之间的天数 select datediff('2022-12-31', '2019-12-31'); -- 日期相加、相减 select date_add('2022-12-31', 5); select date_sub('2022-12-31', 5);
其他函数
-- 条件函数 -- if判断 select if(truename is null, 'niubi', truename) from users; -- null判断 select isnull(truename) from users; select isnotnull(truename) from users; -- nvl:如果value是null,返回 default value,否则返回value select nvl(truename, 'bu') from users; -- 返回第一个不是null的v,如果所有v都是null,则返回null select coalesce(truename, brithday) from users; -- 当a=b时返回c,当a=d时返回e,否则返回f select username, case username when '周杰伦' then '知名歌星' when '张鲁依' then '知名演员' else '不知道身份' end from users; select username, case when truename is null then '不知道名字' else '明星' end from users; -- a= true,return b;c=true,return d;else returns e; -- a可以是表达式 select truename, nullif(truename, null) from users; -- 返回结果不是true报错 select assert_true(2 < 1); -- 字符串函数 -- 连接字符串 select concat(loginname, username) from users; select concat_ws("------",loginname,username) from users; -- 统计长度 select username,length(loginname) from users; -- 转大小写 select lower('ASSasAS'); --转小写 select upper('sssss'); --转大写 -- 去除首尾空格 select trim(' asad '); -- 字符串分隔 select split('hadoop,bigdata,hdfs,dfs',','); -- 数据脱敏函数 -- hash加密(结果是16进制字符串) select mask_hash('hadoop'); -- 其他函数 -- hash加密,数字结果 select hash('hadoop'); -- 当前用户 select current_user(); -- 当前数据库 select current_database(); -- hive版本 select version(); -- 计算md5 select md5('hadoop');
案例
加载数据
-- 创建数据库 create database db_msg; -- 选择数据库 use db_msg; -- 如果表已存在就删除 drop table if exists db_msg.tb_msg_source ; -- 建表 create table db_msg.tb_msg_source( msg_time string comment "消息发送时间", sender_name string comment "发送人昵称", sender_account string comment "发送人账号", sender_sex string comment "发送人性别", sender_ip string comment "发送人ip地址", sender_os string comment "发送人操作系统", sender_phonetype string comment "发送人手机型号", sender_network string comment "发送人网络类型", sender_gps string comment "发送人的GPS定位", receiver_name string comment "接收人昵称", receiver_ip string comment "接收人IP", receiver_account string comment "接收人账号", receiver_os string comment "接收人操作系统", receiver_phonetype string comment "接收人手机型号", receiver_network string comment "接收人网络类型", receiver_gps string comment "接收人的GPS定位", receiver_sex string comment "接收人性别", msg_type string comment "消息类型", distance string comment "双方距离", message string comment "消息内容" ); -- 加载数据到表中,基于HDFS加载 load data local inpath '/home/hadoop/chat_data-30W.csv' into table tb_msg_source; -- 验证数据加载 select * from tb_msg_source tablesample(100 rows); -- 验证一下表的数量 select count(*) from tb_msg_source;
ETL数据清洗
create table db_msg.tb_msg_etl( msg_time string comment "消息发送时间", sender_name string comment "发送人昵称", sender_account string comment "发送人账号", sender_sex string comment "发送人性别", sender_ip string comment "发送人ip地址", sender_os string comment "发送人操作系统", sender_phonetype string comment "发送人手机型号", sender_network string comment "发送人网络类型", sender_gps string comment "发送人的GPS定位", receiver_name string comment "接收人昵称", receiver_ip string comment "接收人IP", receiver_account string comment "接收人账号", receiver_os string comment "接收人操作系统", receiver_phonetype string comment "接收人手机型号", receiver_network string comment "接收人网络类型", receiver_gps string comment "接收人的GPS定位", receiver_sex string comment "接收人性别", msg_type string comment "消息类型", distance string comment "双方距离", message string comment "消息内容", msg_day string comment "消息日", msg_hour string comment "消息小时", sender_lng double comment "经度", sender_lat double comment "纬度" ); INSERT OVERWRITE TABLE db_msg.tb_msg_etl SELECT *, DATE(msg_time) AS msg_day, HOUR(msg_time) AS msg_hour, SPLIT(sender_gps, ',')[0] AS sender_lng, SPLIT(sender_gps, ',')[1] AS sender_lat FROM db_msg.tb_msg_source WHERE LENGTH(sender_gps) > 0;
指标统计
--保存结果表 CREATE TABLE IF NOT EXISTS tb_rs_total_msg_cnt COMMENT "每日消息总量" AS SELECT msg_day, COUNT(*) AS total_msg_cnt FROM db_msg.tb_msg_etl GROUP BY msg_day; --保存结果表 CREATE TABLE IF NOT EXISTS tb_rs_hour_msg_cnt COMMENT "每小时消息量趋势" AS SELECT msg_hour, COUNT(*) AS total_msg_cnt, COUNT(DISTINCT sender_account) AS sender_user_cnt, COUNT(DISTINCT receiver_account) AS receiver_user_cnt FROM db_msg.tb_msg_etl GROUP BY msg_hour; CREATE TABLE IF NOT EXISTS tb_rs_loc_cnt COMMENT '今日各地区发送消息总量' AS SELECT msg_day, sender_lng, sender_lat, COUNT(*) AS total_msg_cnt FROM db_msg.tb_msg_etl GROUP BY msg_day, sender_lng, sender_lat; --保存结果表 CREATE TABLE IF NOT EXISTS tb_rs_user_cnt COMMENT "今日发送消息人数、接受消息人数" AS SELECT msg_day, COUNT(DISTINCT sender_account) AS sender_user_cnt, COUNT(DISTINCT receiver_account) AS receiver_user_cnt FROM db_msg.tb_msg_etl GROUP BY msg_day; --保存结果表 CREATE TABLE IF NOT EXISTS db_msg.tb_rs_s_user_top10 COMMENT "发送消息条数最多的Top10用户" AS SELECT sender_name AS username, COUNT(*) AS sender_msg_cnt FROM db_msg.tb_msg_etl GROUP BY sender_name ORDER BY sender_msg_cnt DESC LIMIT 10; CREATE TABLE IF NOT EXISTS db_msg.tb_rs_r_user_top10 COMMENT "接收消息条数最多的Top10用户" AS SELECT receiver_name AS username, COUNT(*) AS receiver_msg_cnt FROM db_msg.tb_msg_etl GROUP BY receiver_name ORDER BY receiver_msg_cnt DESC LIMIT 10; CREATE TABLE IF NOT EXISTS db_msg.tb_rs_sender_phone COMMENT "发送人的手机型号分布" AS SELECT sender_phonetype, COUNT(sender_account) AS cnt FROM db_msg.tb_msg_etl GROUP BY sender_phonetype; --保存结果表 CREATE TABLE IF NOT EXISTS db_msg.tb_rs_sender_os COMMENT "发送人的OS分布" AS SELECT sender_os, COUNT(sender_account) AS cnt FROM db_msg.tb_msg_etl GROUP BY sender_os
标签:comment,sender,第七,--,快乐,暑假,msg,select,string From: https://www.cnblogs.com/JJTyyds/p/17608106.html