首页 > 其他分享 >快乐暑假第七周

快乐暑假第七周

时间:2023-08-11 18:46:56浏览次数:43  
标签:comment sender 第七 -- 快乐 暑假 msg select string

本周继续进行了大数据的相关操作

分区表

-- 修改表名
alter table course_temp rename to course_common;

-- 修改表属性
desc formatted course;
-- 内部表改为外部表
alter table course set tblproperties ('EXTERNAL'='TRUE');
-- 修改表注释
alter table course set tblproperties ('comment'='this is a table comment');

/*添加表分区*/
-- 添加表分区
alter table score2 add partition (year='2019',month='01',day='10');
-- 修改表分区
alter table score2 partition (year='2023',month='08',day='05') rename to partition (year='2022',month='01',day='01');
-- 删除分区(删除元数据,数据本身还在)
alter table score drop partition (year='2022',month='01',day='01');
select * from score2;

-- 添加新列
alter table score add columns (v1 int,v2 string);
-- 修改列明
alter table score change v2 v2new string;
-- 删除表
drop table score;
-- 清空表数据(无法清空外部表)
truncate table course;

 

array类型

-- 创建一个array测试表
create table test.array1(name string,work_locations array<string>)
row format delimited fields terminated by '\t'
collection items terminated by ',';

load data local inpath '/home/hadoop/test/data_for_array_type.txt' into table test.array1;

-- 正常查询
select * from array1;
-- 数组中第几个
select name,work_locations[1] from array1;
-- 数组中个数
select name,size(work_locations) from array1;
-- 数组里是否包含
select * from array1 where array_contains(work_locations,'tianjin');

 

 

map数据类型

-- 创建map册数表
create table test.test_map(
id int,
name string,
member map<string,string>,
age int
)row format delimited fields terminated by ','
collection items terminated by "#"
map keys terminated by ':';
/*collection items terminated by "#" 每个键值对之间的分隔符*/
/*map keys terminated by ':' 单个键值对内部,k和v的分隔符*/

load data local inpath '/home/hadoop/test/data_for_map_type.txt' into table test.test_map;

select * from test.test_map;

-- 查看成员中每个人个父亲是谁
select id,name,member['father'],member['mother'] from test.test_map;

-- 取出map的全部key,返回类型是array
select map_keys(member) from test.test_map;

-- 去除map的全部value,返回类型是array
select map_values(member) from test.test_map;
-- size查看map的元素(K-V对)的个数
select size(member) from test.test_map;

-- 查看谁有这个key
select * from test.test_map where array_contains(map_keys(member),'sister');
-- 查看谁有这个value
select * from test.test_map where array_contains(map_values(member),'王林');

 

struct数据类型

create table test.text_struct(
id int,
info struct<name:string,age:int>
)row format delimited fields terminated by "#"
collection items terminated by ':';

/*collection items terminated by ':' 表示struct中二级列的数据之间的分隔符*/

load data local inpath '/home/hadoop/test/data_for_struct_type.txt' into table test.text_struct;

select * from test.text_struct;
select id,info.name,info.age from test.text_struct;

 

 

正则表达式

create database itheima;
use itheima;

CREATE DATABASE itheima;
USE itheima;
CREATE TABLE itheima.orders (
orderId bigint COMMENT ' 订单 id',
orderNo string COMMENT ' 订单编号 ',
shopId bigint COMMENT ' 门店 id',
userId bigint COMMENT ' 用户 id',
orderStatus tinyint COMMENT ' 订单状态 -3: 用户拒收 -2: 未付款的订单 -1 :用户取消 0: 待发货 1: 配送中 2: 用户确认收货 ',
goodsMoney double COMMENT ' 商品金额 ',
deliverMoney double COMMENT ' 运费 ',
totalMoney double COMMENT ' 订单金额(包括运费) ',
realTotalMoney double COMMENT ' 实际订单金额(折扣后金额) ',
payType tinyint COMMENT ' 支付方式 ,0: 未知 ;1: 支付宝, 2 :微信 ;3 、现金; 4 、其他 ',
isPay tinyint COMMENT ' 是否支付 0: 未支付 1: 已支付 ',
userName string COMMENT ' 收件人姓名 ',
userAddress string COMMENT ' 收件人地址 ',
userPhone string COMMENT ' 收件人电话 ',
createTime timestamp COMMENT ' 下单时间 ',
payTime timestamp COMMENT ' 支付时间 ',
totalPayFee int COMMENT ' 总支付金额 '
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

load data local inpath '/home/hadoop/test/itheima_orders.txt' into table itheima.orders;


CREATE TABLE itheima.users (
userId int,
loginName string,
loginSecret int,
loginPwd string,
userSex tinyint,
userName string,
trueName string,
brithday date,
userPhoto string,
userQQ string,
userPhone string,
userScore int,
userTotalScore int,
userFrom tinyint,
userMoney double,
lockMoney double,
createTime timestamp,
payPwd string,
rechargeMoney double
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

load data local inpath '/home/hadoop/test/itheima_users.txt' into table itheima.users;

-- 查找广东省数据
select * from itheima.orders where useraddress rlike '.*广东.*';

-- 查找××省。。。。
select * from orders where userAddress rlike '..省 ..市 ..县';

-- 查找用户姓为张王郑
select * from orders where userName rlike '[张王郑]\\S+';

-- 查找用户手机符合:188****0***
select * from orders where userPhone rlike '188\\S{4}[0-9]{3}';

 

 

UNION联合查询

-- 基础UNION
select *
from itheima.course
where t_id = '周杰伦'
UNION
select *
from itheima.course
where t_id = '王力宏';
-- 去重演示
select *
from itheima.orders
union
select *
from itheima.orders;
-- 不去重
select *
from itheima.orders
union all
select *
from itheima.orders;
-- UNION写在FROM中,UNION写在子查询中
select t_id, count(*)
from (select *
from itheima.orders
where t_id = '周杰伦'
union all
select *
from itheima.orders
where t_id = '王力宏') as u
group by t_id;
-- 用于INSERT SELECT
insert overwrite table itheima.orders
select *
from orders
union
select *
from orders;

 

 

数据抽样

-- 抽样查询

-- lie/其他条件不变,结果一致
select username, orderid, totalmoney
from itheima.orders tablesample (bucket 3 out of 10 on username);

-- hang/完全随机
select *
from orders tablesample (bucket 3 out of 10 on rand());

-- 抽取100条/结果一致,从前往后找
select *
from orders tablesample (100 rows);

-- 取10%的数据
select *
from orders tablesample (10 percent);

-- 取1kb的数据
select *
from orders tablesample (1K );

 

 

虚拟列

-- 虚拟列

-- 打开ROW__OFFSET__INSIDE__BLOCK
set hive.exec.rowoffset=true;

-- 第一个INPUT__FILE__NAME为显示数据行所在的具体文件
-- 第二个BLOCK__OFFSET__INSIDE__FILE为显示数据所在文件的偏移量
-- 第三个ROW__OFFSET__INSIDE__BLOCK显示数据所在HDFS块的偏移量
select id, icd, score, INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE, ROW__OFFSET__INSIDE__BLOCK
from test.score;

SELECT *, BLOCK__OFFSET__INSIDE__FILE FROM itheima.orders WHERE
BLOCK__OFFSET__INSIDE__FILE < 1000;
SELECT orderid, username, INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE,
ROW__OFFSET__INSIDE__BLOCK FROM itheima.orders_bucket;
SELECT INPUT__FILE__NAME, COUNT(*) FROM itheima.orders_bucket GROUP BY
INPUT__FILE__NAME;

 

 

基本函数

-- 查看所有可用函数
show functions ;
-- 查看函数的使用方式
describe function extended !;
-- 数值函数
select round(3.14159265357906285,5); --四舍五入
-- 随机数,设置种子后每次随机数一致
select rand();
select rand(0);
-- 绝对值
select abs(-9);
-- 求PI
select pi();

-- 集合函数
-- 求元素个数
select size(work_locations) from array1;
select size(member) from test_map;
-- 取出map的全部key
select map_keys(member) from test_map;
-- 取出map的全部value
select map_values(member) from test_map;
-- 查询array内是否包含指定元素,是就返回True
select * from array1 where ARRAY_CONTAINS(work_locations, 'tianjin');
-- 排序
select *, sort_array(work_locations) from array1;

-- 类型转换函数
-- 转二进制
select binary('hadoop');
-- 自由转换,类型转换失败报错或返回NULL
select cast('1' as bigint);
-- 日期函数
-- 当前时间戳
select current_timestamp();
-- 当前日期
select current_date();
-- 时间戳转日期
select to_date(current_timestamp());
-- 年月日季度等
select year('2020-01-11');
select month('2020-01-11');
select day('2020-01-11');
select quarter('2020-05-11');
select dayofmonth('2020-05-11');
select hour('2020-05-11 10:36:59');
select minute('2020-05-11 10:36:59');
select second('2020-05-11 10:36:59');
select weekofyear('2020-05-11 10:36:59');
-- 日期之间的天数
select datediff('2022-12-31', '2019-12-31');
-- 日期相加、相减
select date_add('2022-12-31', 5);
select date_sub('2022-12-31', 5);

 

 

其他函数

-- 条件函数
-- if判断
select if(truename is null, 'niubi', truename)
from users;
-- null判断
select isnull(truename)
from users;
select isnotnull(truename)
from users;
-- nvl:如果value是null,返回 default value,否则返回value
select nvl(truename, 'bu')
from users;
-- 返回第一个不是null的v,如果所有v都是null,则返回null
select coalesce(truename, brithday)
from users;
-- 当a=b时返回c,当a=d时返回e,否则返回f
select username, case username when '周杰伦' then '知名歌星' when '张鲁依' then '知名演员' else '不知道身份' end
from users;
select username, case when truename is null then '不知道名字' else '明星' end
from users;
-- a= true,return b;c=true,return d;else returns e;
-- a可以是表达式
select truename, nullif(truename, null)
from users;
-- 返回结果不是true报错
select assert_true(2 < 1);

-- 字符串函数
-- 连接字符串
select concat(loginname, username)
from users;
select concat_ws("------",loginname,username) from users;
-- 统计长度
select username,length(loginname)  from users;
-- 转大小写
select lower('ASSasAS');    --转小写
select upper('sssss');      --转大写
-- 去除首尾空格
select trim('   asad    ');
-- 字符串分隔
select split('hadoop,bigdata,hdfs,dfs',',');

-- 数据脱敏函数
-- hash加密(结果是16进制字符串)
select mask_hash('hadoop');


-- 其他函数
-- hash加密,数字结果
select hash('hadoop');
-- 当前用户
select current_user();
-- 当前数据库
select current_database();
-- hive版本
select version();
-- 计算md5
select md5('hadoop');

 

 

案例

 

加载数据

-- 创建数据库
create database db_msg;
-- 选择数据库
use db_msg;
-- 如果表已存在就删除
drop table if exists db_msg.tb_msg_source ;
-- 建表
create table db_msg.tb_msg_source(
msg_time string comment "消息发送时间",
sender_name string comment "发送人昵称",
sender_account string comment "发送人账号",
sender_sex string comment "发送人性别",
sender_ip string comment "发送人ip地址",
sender_os string comment "发送人操作系统",
sender_phonetype string comment "发送人手机型号",
sender_network string comment "发送人网络类型",
sender_gps string comment "发送人的GPS定位",
receiver_name string comment "接收人昵称",
receiver_ip string comment "接收人IP",
receiver_account string comment "接收人账号",
receiver_os string comment "接收人操作系统",
receiver_phonetype string comment "接收人手机型号",
receiver_network string comment "接收人网络类型",
receiver_gps string comment "接收人的GPS定位",
receiver_sex string comment "接收人性别",
msg_type string comment "消息类型",
distance string comment "双方距离",
message string comment "消息内容"
);
-- 加载数据到表中,基于HDFS加载
load data  local inpath '/home/hadoop/chat_data-30W.csv' into table tb_msg_source;
-- 验证数据加载
select * from tb_msg_source tablesample(100 rows);
-- 验证一下表的数量
select count(*) from tb_msg_source;

 

 

ETL数据清洗

create table db_msg.tb_msg_etl(
msg_time string comment "消息发送时间",
sender_name string comment "发送人昵称",
sender_account string comment "发送人账号",
sender_sex string comment "发送人性别",
sender_ip string comment "发送人ip地址",
sender_os string comment "发送人操作系统",
sender_phonetype string comment "发送人手机型号",
sender_network string comment "发送人网络类型",
sender_gps string comment "发送人的GPS定位",
receiver_name string comment "接收人昵称",
receiver_ip string comment "接收人IP",
receiver_account string comment "接收人账号",
receiver_os string comment "接收人操作系统",
receiver_phonetype string comment "接收人手机型号",
receiver_network string comment "接收人网络类型",
receiver_gps string comment "接收人的GPS定位",
receiver_sex string comment "接收人性别",
msg_type string comment "消息类型",
distance string comment "双方距离",
message string comment "消息内容",
msg_day string comment "消息日",
msg_hour string comment "消息小时",
sender_lng double comment "经度",
sender_lat double comment "纬度"
);
INSERT OVERWRITE TABLE db_msg.tb_msg_etl
SELECT
*,
DATE(msg_time) AS msg_day,
HOUR(msg_time) AS msg_hour,
SPLIT(sender_gps, ',')[0] AS sender_lng,
SPLIT(sender_gps, ',')[1] AS sender_lat
FROM db_msg.tb_msg_source
WHERE LENGTH(sender_gps) > 0;

 

 

指标统计

--保存结果表
CREATE TABLE IF NOT EXISTS tb_rs_total_msg_cnt
    COMMENT "每日消息总量" AS
SELECT msg_day,
       COUNT(*) AS total_msg_cnt
FROM db_msg.tb_msg_etl
GROUP BY msg_day;

--保存结果表
CREATE TABLE IF NOT EXISTS tb_rs_hour_msg_cnt
    COMMENT "每小时消息量趋势" AS
SELECT msg_hour,
       COUNT(*)                         AS total_msg_cnt,
       COUNT(DISTINCT sender_account)   AS sender_user_cnt,
       COUNT(DISTINCT receiver_account) AS receiver_user_cnt
FROM db_msg.tb_msg_etl
GROUP BY msg_hour;


CREATE TABLE IF NOT EXISTS tb_rs_loc_cnt
    COMMENT '今日各地区发送消息总量' AS
SELECT msg_day,
       sender_lng,
       sender_lat,
       COUNT(*) AS total_msg_cnt
FROM db_msg.tb_msg_etl
GROUP BY msg_day, sender_lng, sender_lat;


--保存结果表
CREATE TABLE IF NOT EXISTS tb_rs_user_cnt
    COMMENT "今日发送消息人数、接受消息人数" AS
SELECT msg_day,
       COUNT(DISTINCT sender_account)   AS sender_user_cnt,
       COUNT(DISTINCT receiver_account) AS receiver_user_cnt
FROM db_msg.tb_msg_etl
GROUP BY msg_day;


--保存结果表
CREATE TABLE IF NOT EXISTS db_msg.tb_rs_s_user_top10
    COMMENT "发送消息条数最多的Top10用户" AS
SELECT sender_name AS username,
       COUNT(*)    AS sender_msg_cnt
FROM db_msg.tb_msg_etl
GROUP BY sender_name
ORDER BY sender_msg_cnt DESC
LIMIT 10;


CREATE TABLE IF NOT EXISTS db_msg.tb_rs_r_user_top10
    COMMENT "接收消息条数最多的Top10用户" AS
SELECT receiver_name AS username,
       COUNT(*)      AS receiver_msg_cnt
FROM db_msg.tb_msg_etl
GROUP BY receiver_name
ORDER BY receiver_msg_cnt DESC
LIMIT 10;


CREATE TABLE IF NOT EXISTS db_msg.tb_rs_sender_phone
    COMMENT "发送人的手机型号分布" AS
SELECT sender_phonetype,
       COUNT(sender_account) AS cnt
FROM db_msg.tb_msg_etl
GROUP BY sender_phonetype;


--保存结果表
CREATE TABLE IF NOT EXISTS db_msg.tb_rs_sender_os
    COMMENT "发送人的OS分布" AS
SELECT sender_os,
       COUNT(sender_account) AS cnt
FROM db_msg.tb_msg_etl
GROUP BY sender_os

 

标签:comment,sender,第七,--,快乐,暑假,msg,select,string
From: https://www.cnblogs.com/JJTyyds/p/17608106.html

相关文章

  • 第七周总结
    下载好linux的远程连接软件Finalshell,开始学习。ls命令:展示当前目录内容ls-a:展示隐藏内容ls-l:展示内容ls-h:展示内容(文件大小)cd命令:切换当前目录cd文件位置pwd命令:查看当前工作目录mkdir命令:创建目录mkdir路径:mkdir-p路径:自动创建路径中没有的文件夹touch命令......
  • 每日汇报 第七周第四天 JAVA开学试题&pta
    今日学习:JAVA试题完成了主界面框架,使用while循环和switch语句实现了界面选项跳转功能,把ScoreInformation类完成了,之后就差把三个功能完善。明日计划:JAVA考试题,PTA遇到困难:JAVA怎么搞输出居中对齐啊,直接/t输出得了......
  • 暑假生活第四周
    本周我致力于深入学习SQLServer,并尝试了一些更复杂的知识点。以下是我一周的学习情况总结:学习时间:我每天投入平均4个小时学习SQLServer,总共学习了20个小时。学习内容:我着重学习了SQLServer高级查询技巧和性能优化方面的知识。具体包括以下几个方面:高级查询:我学习了如......
  • 8.5--暑假第四周总结
    对上周学习的关于MapReduce的内容进行了更加细致的学习,并学习了编写wordCount的案例讲解  ......
  • 23 暑假友谊赛 No.4(UKIEPC 2017)
    23暑假友谊赛No.4(UKIEPC2017)ProblemAAlienSunsethh,开始一眼差分,但是写寄了qwq,后来换枚举过了(Orz,但是看学长差分是能做的,我就说嘛,差分肯定能做(说下枚举思路吧,就是把每个区间都存起来,选出自转周期的最大值为\(ma\),然后去枚举\(0\simma\times1825\),每次看......
  • hfyz2023暑假集训邮寄
    Day1去晚2分钟,开始一直在换位置,平常坐位被占了,而且发现没有高二的(乐然后老师在上面……&@¥%……%#×&!发了件特别丑的衣服(hfyz出息了,四机房翻新还发定制衣服,还有新键鼠!!虽然没装上去),去拍了个照老师是szr佬,讲的ST表,树状数组,线段树,感觉就是新高一有点拉,没有一个会的,还有就是太简单......
  • 每天都很快乐,原来快乐如此简单!
        1.一年365天,一年有52周,每周的周五周六周日都是超级开心的日子,按照这个进行计算,有52*3=156天超级开心的日子; 2.每到发薪日都很快乐,每月有一天发薪日,发薪日的前一天和发薪日的后一天都很快乐,每年基本有12个发薪日,按此计算,有3*12=36个超级开心日; 3.每到假期我们都超......
  • 每日汇报 第七周第二天 JAVA复习&PTA
    今日学习:继续从网站中复习,PTA刷题时又复习了一下C++的vector用法和unordered_mapunordered_map是一个将key和value关联起来的容器,它可以高效的根据单个key值查找对应的value。key值应该是唯一的,key和value的数据类型可以不相同。unordered_map存储元素时是没有顺序的,......
  • 2023第七场牛客多校-We Love Strings
    I-WeLoveStrings_2023牛客暑期多校训练营7题意 做法:根号分治+容斥原理将字符串分为两类:len<=20直接位运算枚举出可能的所有答案,看是否存在符合的len>20采用容斥原理,计算出所有长度为i的字符串中(假设为n个),1个字符串可以表示的(1个元素的交集) ,2个字符串可以表示的......
  • 「赛后总结」暑假 CSP 模拟赛系列 2(8.1~8.3)
    「赛后总结」暑假CSP模拟赛系列2(8.1~8.3)点击查看目录目录「赛后总结」暑假CSP模拟赛系列2(8.1~8.3)20230801(letitdownround)T2神(eldenring)T4动(genshin)20230802(Max_QAQround2)T1随T3AT4C20230803(zero4338round)T2sT3pT4m20230801(letitdownround)蚌。整活大......