Hive表操作–增删改:
(1)创建一个自定义分隔符的普通内部表:
CREATE TABLE IF NOT EXISTS employee ( #employee为举例表名,根据实际情况进行替换,下同
name string,
work_place ARRAY<string>,
sex_age STRUCT<sex:string,age:int>,
skills_score MAP<string,int>,
depart_title MAP<STRING,ARRAY<STRING>> #小括号内均为“列名 类型,”格式,此处列举了多种典型情况
)
COMMENT 'This is a table' #COMMENT后面替换为你想要加的注释
ROW FORMAT DELIMITED
#行格式化限制后面几行为设定的分隔符和存储格式,根据数据实际需要进行替换,以下几项均可不写,不写时,使用默认分隔符
FIELDS TERMINATED BY '|' #字段分隔符'|'
COLLECTION ITEMS TERMINATED BY ',' #集合分隔符
MAP KEYS TERMINATED BY ':' #映射分隔符
STORED AS TEXTFILE #存储为文本形式
tblproperties ("skip.header.line.count"="1"); #跳过表头(可选操作)
(2)创建一个导入数据文件类型为csv的内部表:
CREATE TABLE my_table(
a string,
b string, ...) #a,b为列名;string为列类型
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' #用于导入csv文件的特殊类
WITH SERDEPROPERTIES (
"separatorChar" = "\t", #分隔符
"quoteChar" = "'", #引用符,即''内的内容作为一个整体
"escapeChar" = "\\"#忽略符,当遇到引用符时,就自动忽略分隔符,不论''里面的数据是否包含分隔符(separatorChar)
)
STORED AS TEXTFILE;
separatorChar,quoteChar,escapeChar均为可选属性默认情况下:
separatorChar为 ,
quoteChar为 "
escapeChar为 \
(3)创建外部表:
CREATE EXTERNAL TABLE my_table_external(
a string,
b string, ...) #a,b为列名;string为列类型
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' #用于导入csv文件的特殊类
WITH SERDEPROPERTIES (
"separatorChar" = "\t", #分隔符
"quoteChar" = "'", #引用符,即''内的内容作为一个整体
"escapeChar" = "\\"#忽略符,当遇到引用符时,就自动忽略分隔符,不论''里面的数据是否包含分隔符(separatorChar)
)
STORED AS TEXTFILE;
location hdfs_path
(4)创建临时表:
CREATE temporary TABLE my_table(
a string,
b string, ...) #a,b为列名;string为列类型
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' #用于导入csv文件的特殊类
WITH SERDEPROPERTIES (
"separatorChar" = "\t", #分隔符
"quoteChar" = "'", #引用符,即''内的内容作为一个整体
"escapeChar" = "\\"#忽略符,当遇到引用符时,就自动忽略分隔符,不论''里面的数据是否包含分隔符(separatorChar)
)
STORED AS TEXTFILE;
(5)向表中载入数据:
load data local inpath 'file_path' overwrite into table table_name; #file_path为你的本地文件路径,table_name为你要导入的表的名字
(6)向表中添加数据:
insert into table_name values (column1_value,column2_value,...) #column_value为每列的值
(7)修改表中数据:
update table_name set column_name1=column_newvalue1,column_name2=column_newvalue2,... where 条件
#支持同时修改多个列,条件可以为多个不同的条件用and或or进行连接
(8)删除表中数据:
delete from table_name where 条件 #条件可以为多个不同的条件用and或or进行连接
(9)清空表中数据:
truncate table table_name # truncate不能清空外部表中的数据
(10)删除表:
drop table_name
(11)创建静态#建表时定义分区,通过PARTITIONED BY定义分区
CREATE TABLE employee_partitioned(
name string,
work_place ARRAY<string>,
sex_age STRUCT<sex:string,age:int>,
skills_score MAP<string,int>,
depart_title MAP<STRING,ARRAY<STRING>> )
PARTITIONED BY (year INT, month INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
(12)静态分区表操作:
#ADD添加分区,DROP删除分区
ALTER TABLE employee_partitioned ADD PARTITION (year=2019,month=3) PARTITION (year=2019,month=4);
ALTER TABLE employee_partitioned DROP PARTITION (year=2019, month=4);
(13)创建动态分区表:
#先创建一个普通内部表即可
CREATE TABLE employee_partitioned(
name string,
work_place ARRAY<string>,
sex_age STRUCT<sex:string,age:int>,
skills_score MAP<string,int>,
depart_title MAP<STRING,ARRAY<STRING>> )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
#设置hive.exec.dynamic.partition.mode属性为nonstrice
set hive.exec.dynamic.partition.mode=nonstrict;
#insert方式添加动态分区
insert into table employee_partitioned partition(year, month)
select name,array('Toronto') as work_place,
named_struct("sex","male","age",30) as sex_age,
map("python",90) as skills_score,
map("r&d", array('developer')) as depart_title,
year(start_date) as year,month(start_date) as month
from employee_hr eh ;
Hive表操作–查
(1)简单查询表中数据:
select column1,column2,... from table_name where 条件1,条件2...
#column1,column2为列名;条件可以为一个或多个条件,不同条件间用and和or连接
#例如 where name='tom' and age = 20
(2)查询某列数据中不重复的数据:
select DISTINCT column1 from table_name #DISTINCT限定查询唯一数据
(3)CTE(Common Table Expression)查询方式:
WITH t1 AS (SELECT …) SELECT * FROM t1 # t1执行首次查询中查询的表,后面的select再从前面查询的结果中筛选出新的结果
#举例
#此句为统计订单最多的顾客的订单数,rank()是一种用于排序的开窗函数,后面会讲到
with a as(
select customer_id,count(transcation_id) as nums from transcation_details group by customer_id),
b as(
select *,rank() over(order by nums desc) rn from a)
select * from b;
(4)嵌套查询:
#此处为为一个嵌套查询样例,作用是与CTE方式相同,都是从上一级查询获得的结果中筛选出新的结果
SELECT * FROM (SELECT * FROM employee) a;
(5)列匹配正则表达式:
# Hive要想支持正则表达式查询需先将hive.support.quoted.identifiers设置为none
SET hive.support.quoted.identifiers = none;
SELECT `^o.*` FROM offers; #表示查询offers表中以字母o开头的,包含0个或多个换行符以外字符的列
(6)关联查询:
a. Inner Join(内连接):
从两表交集中查询结果(即区域C),隐式连接(Join)也是一种内连接。
select column1,column2 from table_a inner join table_b; #显示内连接
select column1,column2 from table_a join table_b #隐式内连接
b.Outer Join(外连接):
i.左外连接:
从整个左表中查询结果(即区域A)。
select column1,column2,... from table_a left join table_b;
select column1,column2,... from table_a left outer join table_b;
ii.右外连接:
从两表交集和右表其余部分中获取查询结果(即区域B)。
select column1,column2,... from table_a right join table_b;
select column1,column2,... from table_a right outer join table_b;
iii.全外连接:
从两表全部数据中查询结果(即AUBUC)。
select column1,column2,... from table_a full outer join table_b;
c.Cross Join(交叉连接,取笛卡尔积)
select column1,column2,... from table_a,table_b;
(7)合并多次查询结果(Union):
select column1 from table_a where column1='a'
union
select column2 from table_a where column2='b';
Hive数据排序:
(1).ORDER BY(默认正序即ASC)
select * from table_a order by column1; #按column1列从小到大进行排序
select * from table_a order by column1 desc; #按column1列从大到小进行排序
(2).SORT BY/DISTRIBUTE BY
SORT BY对每个Reducer中的数据进行排序:
当Reducer数量设置为1时,等于ORDER BY。
排序列必须出现在SELECT column列表中。
DISTRIBUTE BY类似于标准SQL中的GROUP BY:
确保具有匹配列值的行被分区到相同的Reducer。
不会对每个Reducer的输出进行排序。
通常使用在SORT BY语句之前。
例:
SELECT department_id , name, employee_id, evaluation_score
FROM employee_hr
DISTRIBUTE BY department_id SORT BY evaluation_score DESC;
(3).CLUSTER BY
CLUSTER BY大致相当于sort by和distribute by合在一起的效果,但它不支持ASC和DESC且排序列必须出现在SELECT column列表中。
SELECT name, employee_id FROM employee_hr CLUSTER BY name;
Hive聚合运算
(1)Group By(按列进行分组):
select category, max(offervalue) from offers group by category;
(2)Having(对GROUP BY聚合结果的条件过滤,不建议使用):
#以下为一个典型的Having语句
select sex_age.age from employee group by sex_age.age having count(*) <= 1;
(3)取某列的最大/最小值/平均值/和:
selects customer_id,max(price) `最大值` from offers;
selects customer_id,min(price) `最小值` from offers;
selects customer_id,avg(price) `平均值` from offers;
selects customer_id,sum(price) `总和` from offers;
Hive窗口函数
(1)ROW_NUMBER():对所有数值输出不同的序号,序号唯一连续。
(2)RANK():对相同数值,输出相同的序号,下一个序号跳过(1,1,3)。
(3)DENSE_RANK():对相同数值,输出相同的序号,下一个序号连续(1,1,2)。
(4)NLITE(n):将有序的数据集合平均分配到n个桶中, 将桶号分配给每一行,根据桶号,选取前或后 n分之几的数据。
(5)(目前排名- 1)/(总行数- 1),值相对于一组值的百分比排名。
SELECT
name, dept_num, salary,
ROW_NUMBER() OVER () AS row_num,
RANK() OVER (PARTITION BY dept_num ORDER BY salary) AS rank,
DENSE_RANK() OVER (PARTITION BY dept_num ORDER BY salary) AS dense_rank,
PERCENT_RANK() OVER(PARTITION BY dept_num ORDER BY salary) AS percent_rank,
NTILE(2) OVER(PARTITION BY dept_num ORDER BY salary) AS ntile
FROM employee_contract
ORDER BY dept_num, salary;
补:窗口类型:行类型窗口和范围类型窗口:
**行类型窗口:**以当前行为参照,通过向前或向后数任意行进行范围确定。
SELECT
name, dept_num AS dept, salary AS sal,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY name ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) win1 FROM employee_contract ORDER BY dept, name;
**范围类型窗口:**以当前行为参照,通过向前和向后数任意行进行范围确定。
SELECT department_id,SUM(salary) RANGE BETWEEN 500 PRECEDING AND 1000 FOLLOWING from salary group by department_id;
从本地导入表/导出表到本地
(1) 从本地导入表:
IMPORT TABLE employee FROM '/tmp/output3';
(2)导出表到本地:
EXPORT TABLE employee TO '/tmp/output3';
————————————————
原文链接:https://blog.csdn.net/welson650/article/details/105835663