样例数据
点击查看代码
insert overwrite table test.ysj_lianxu_not
select 'a', 't1'
union all
select 'a', 't2'
union all
select 'a', 't3'
union all
select 'b', 't3'
union all
select 'a', 't4'
union all
select 'a', 't5'
union all
select 'a', 't6'
union all
select 'a', 't7'
;
连续登录超过4天的用户
点击查看代码
SELECT diff,count(DISTINCT user_id ) d_ct
from (
SELECT user_id,click_time,date_sub(click_time,rk) as diff
from (
SELECT user_id,click_time,row_number() over(PARTITION BY user_id ORDER BY click_time ) as rk
from test.ysj_lianxu_not
) aa
) temp
GROUP BY diff
having count(DISTINCT user_id ) >=4
;
连续登录用户,且中间没有其他用户登录
- 第一种方式
点击查看代码
select user_id,diff,count(*)
from
(select user_id,rk_1-rk_2 diff
from
(select user_id
,row_number() over(order by click_time asc ) rk_1
,row_number() over(partition by user_id order by click_time asc ) rk_2
from test.ysj_lianxu_not
) b
) c
group by user_id,diff
having count(*)=4
;
第一种方式有以下的问题,应该用第二种方法
点击查看代码
--连续登陆天数的用户,并且中间没有其他用户登陆,此方法有问题,如果两个用户a和B,a用户和b用户在同一天的全局排序的顺序不同,则会造成误差
-- user_id click_time rk_1 rk_2
-- a t1 1 1
-- a t2 2 2
-- b t3 3 1
-- a t3 4 3 比如t3这一天,a排在b的后边,则会造成这一天数据和之后的日期连续,计算出来的a的连续登陆天数就会多一天
-- a t4 5 4
-- a t5 6 5
-- a t6 7 6
-- a t7 8 7
- 第二种方式:
点击查看代码
WITH temp as (
SELECt click_time
from (
SELECT click_time,count(distinct user_id) as ct
from test.ysj_lianxu_not
GROUP BY click_time
) aa
where ct = 1
)
select diff,user_id,count(*)
from (
select user_id,cast(replace(click_time,'t','') as int) - rk as diff
from (
SELECT t2.user_id,t1.click_time,row_number() over(PARTITION BY t2.user_id ORDER BY t1.click_time ASC) as rk
from temp t1
left join test.ysj_lianxu_not t2
on t1.click_time = t2.click_time
) aa
) bb
GROUP BY diff,user_id
;
连续登录天数,允许间隔3天
间隔见天都可以,只需要将sql中的space函数的参数改成对应的天数减1就能满足
点击查看代码
with temp as (
select 111 as uid,'2023-01-01' as dt
union all
select 111 as uid,'2023-01-03' as dt
union all
select 111 as uid,'2023-01-05' as dt
union all
select 111 as uid,'2023-01-06' as dt
union all
select 111 as uid,'2023-01-09' as dt
union all
select 111 as uid,'2023-01-10' as dt
union all
select 111 as uid,'2023-01-11' as dt
union all
select 111 as uid,'2023-01-13' as dt
union all
select 111 as uid,'2023-01-17' as dt
union all
select 111 as uid,'2023-01-19' as dt
union all
select 111 as uid,'2023-01-20' as dt
union all
select 222 as uid,'2023-01-01' as dt
union all
select 222 as uid,'2023-01-02' as dt
union all
select 222 as uid,'2023-01-05' as dt
union all
select 222 as uid,'2023-01-07' as dt
union all
select 222 as uid,'2023-01-08' as dt
),
temp2 as (
select *,date_add(tt.dt,sp.pos) as last_dt
from temp tt
lateral view posexplode(split(space(2),'')) sp
) ,
temp3 as (
select uid,last_dt
from temp2
group by last_dt,uid
)
select uid,diff,count(*) as ct
from (
select uid,date_sub(last_dt,rk) as diff
from (
select uid,last_dt,row_number() over(partition by uid order by last_dt ) as rk
from temp3
) result
) ed
group by uid,diff