一个值得记下来的窗口累计计算办法,使用的情况是:计算某个窗口内的累计值
1.Example
Part 1
CREATE TABLE test_table (
daily DATE,
person_num INT,
app_region STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;-- 一张包含了daily 日期、每天该区域登录人数 person_num 和区域 app_region 三个字段的 Hive 表
Part 2
DROP TABLE IF EXISTS test_table_cum;
CREATE TABLE IF NOT EXISTS test_table_cum AS
SELECT daily, app_region, person_num,
SUM(person_num) OVER (PARTITION BY app_region ORDER BY daily ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS ttl_person_num
FROM test_table
;-- 计算每个日期下的累计登录人数(比如 第一天为10 第二天为20 则ttl_person_cum值为30
2.Explanation
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
是一个窗口定义子句,用于指定窗口函数计算的行范围。(感觉是限制窗口大小!!
意味着窗口函数将计算从当前行开始,回溯到当前分区中的第一行,即对当前行及其之前的所有行进行累加计算
UNBOUNDED PRECEDING 表示窗口的起始行是当前分区中的第一行
CURRENT ROW 表示窗口的结束行是当前行
3.Extension
-
ROWS BETWEEN
AND
指定从行到 行的范围
例如,ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING 表示窗口包括当前行及其前后各一行 -
RANGE BETWEEN
AND
基于值范围进行定义,而不是基于行数。这对于时间序列数据很有用 -
要计算一周最后一天对应的数据 可以这么写
DROP TABLE IF EXISTS test_table_cum_weekly;
CREATE TABLE IF NOT EXISTS test_table_cum_weekly AS
WITH test_table_cum_week AS (
SELECT daily,
YEAR(DATE_ADD(daily, 26 - WEEKOFYEAR(daily))) iso_year,
WEEKOFYEAR(daily) AS iso_week,
app_region,
ttl_person_num,
ROW_NUMBER() OVER (PARTITION BY iso_year,iso_week,app_region ORDER BY daily DESC) AS rn --每周的最后一天日期
FROM test_table_cum
)
SELECT iso_year,iso_week, app_region, ttl_person_num
FROM test_table_cum_week
WHERE rn = 1
;
标签:窗口,daily,累计,person,num,Hive,test,table,cum
From: https://www.cnblogs.com/hirateyuriovo/p/18189745