目录
0 问题描述
分析用户在每个会话内的行为序列,找出最常见的前 N 种行为模式,并按用户分群。
用户表结构和数据
假设有名为user_behavior_log
的用户行为日志表,包含以下字段:
字段名 | 数据类型 | 描述 |
---|---|---|
user_id | INT | 用户 ID |
behavior | STRING | 用户行为,例如view , click , purchase |
timestamp | BIGINT | 行为发生的时间戳 |
user_group | STRING | 用户分组,例如group_a , group_b |
示例数据,例如:
user_id | behavior | timestamp | user_group |
---|---|---|---|
1 | view | 1678886400 | group_a |
1 | click | 1678886460 | group_a |
2 | view | 1678886520 | group_b |
1 | purchase | 1678886580 | group_a |
2 | view | 1678886640 | group_b |
2 | click | 1678886700 | group_b |
3 | view | 1678886760 | group_a |
3 | click | 1678886820 | group_a |
3 | view | 1678886880 | group_a |
1 | view | 1678887000 | group_a |
1 | click | 1678887060 | group_a |
SQL 结果输出
user_group | pattern | frequency |
---|---|---|
group_a | view->click->view | 1 |
group_a | view->click->purchase | 1 |
group_b | view->view->click | 1 |
1 数据准备
CREATE TABLE user_behavior_log (
user_id INT,
behavior STRING,
timestamp BIGINT,
user_group STRING
);
INSERT INTO TABLE user_behavior_log
SELECT *
FROM (
SELECT stack(11,
1, 'view', 1678886400, 'group_a',
1, 'click', 1678886460, 'group_a',
2, 'view', 1678886520, 'group_b',
1, 'purchase', 1678886580, 'group_a',
2, 'view', 1678886640, 'group_b',
2, 'click', 1678886700, 'group_b',
3, 'view', 1678886760, 'group_a',
3, 'click', 1678886820, 'group_a',
3, 'view', 1678886880, 'group_a',
1, 'view', 1678887000, 'group_a',
1, 'click', 1678887060, 'group_a'
) AS (user_id, behavior, timestamp, user_group)
) subquery;
----------------------------
create TABLE user_behavior_log as
(SELECT stack(11,
1, 'view', 1678886400, 'group_a',
1, 'click', 1678886460, 'group_a',
2, 'view', 1678886520, 'group_b',
1, 'purchase', 1678886580, 'group_a',
2, 'view', 1678886640, 'group_b',
2, 'click', 1678886700, 'group_b',
3, 'view', 1678886760, 'group_a',
3, 'click', 1678886820, 'group_a',
3, 'view', 1678886880, 'group_a',
1, 'view', 1678887000, 'group_a',
1, 'click', 1678887060, 'group_a'
) AS (user_id, behavior, `timestamp`, user_group));
2 问题分析
求解步骤
- 会话划分:根据用户 ID 和时间戳,将会话划分出来。
- 构建会话内行为序列:按照会话 ID、用户ID、USER_GROUP进行分组,并将会话的行为连接成一个序列。
- 定义行为模式:根据业务规则,进行用户行为模式匹配。
- 统计模式频次:统计不同行为模式的出现次数。
- 按用户分群:按用户分群,并分别统计每个用户群体的最常见行为模式。
会话划分:根据用户 ID 和时间戳,将会话划分出来。
WITH user_sessions AS (select user_id,
behavior,
`timestamp`,
user_group,
sum(diff) over (partition by user_id order by `timestamp`) session_id
from (SELECT user_id,
behavior,
`timestamp`,
user_group,
CASE
WHEN (`timestamp` - LAG(`timestamp`, 1, 0)
OVER (PARTITION BY user_id ORDER BY `timestamp`)) >
1800 THEN 1
ELSE 0 END diff
FROM user_behavior_log) t
)
select *
from user_sessions;
构建会话内行为序列:按照会话 ID、用户ID、USER_GROUP进行分组,并将会话的行为连接成一个序列。
WITH user_sessions AS (select user_id,
behavior,
`timestamp`,
user_group,
sum(diff) over (partition by user_id order by `timestamp`) session_id
from (SELECT user_id,
behavior,
`timestamp`,
user_group,
CASE
WHEN (`timestamp` - LAG(`timestamp`, 1, 0)
OVER (PARTITION BY user_id ORDER BY `timestamp`)) >
1800 THEN 1
ELSE 0 END diff
FROM user_behavior_log) t
)
,
session_behavior_sequences AS (
SELECT
user_id,
session_id,
user_group,
collect_list(behavior) AS behavior_sequence
FROM
user_sessions
GROUP BY
user_id,
session_id,
user_group
)
select *
from session_behavior_sequences;
定义行为模式:根据业务规则,进行用户行为模式匹配。
WITH user_sessions AS (select user_id,
behavior,
`timestamp`,
user_group,
sum(diff) over (partition by user_id order by `timestamp`) session_id
from (SELECT user_id,
behavior,
`timestamp`,
user_group,
CASE
WHEN (`timestamp` - LAG(`timestamp`, 1, 0)
OVER (PARTITION BY user_id ORDER BY `timestamp`)) >
1800 THEN 1
ELSE 0 END diff
FROM user_behavior_log) t
)
,
session_behavior_sequences AS (
SELECT
user_id,
session_id,
user_group,
collect_list(behavior) AS behavior_sequence
FROM
user_sessions
GROUP BY
user_id,
session_id,
user_group
)
,
session_behavior_patterns AS (
SELECT
user_id,
session_id,
user_group,
-- 将数组转换为字符串
regexp_replace(concat_ws(',', behavior_sequence), '^([^,]+,[^,]+,[^,]+).*', '$1') AS pattern_str,
size(behavior_sequence) AS seq_length
FROM
session_behavior_sequences
)
select *
from session_behavior_patterns;
session_behavior_patterns
子查询:- 首先使用
concat_ws(',', behavior_sequence)
将数组转换为以逗号分隔的字符串。 - 然后使用
regexp_replace
函数,通过正则表达式^([^,]+,[^,]+,[^,]+).*
匹配字符串开头的前三个逗号分隔的部分,并将其替换为匹配到的内容,即模拟了对数组前三个元素的截取,得到pattern_str
。同时计算数组的长度seq_length
。
- 首先使用
统计模式频次:统计不同行为模式的出现次数
WITH user_sessions AS (select user_id,
behavior,
`timestamp`,
user_group,
sum(diff) over (partition by user_id order by `timestamp`) session_id
from (SELECT user_id,
behavior,
`timestamp`,
user_group,
CASE
WHEN (`timestamp` - LAG(`timestamp`, 1, 0)
OVER (PARTITION BY user_id ORDER BY `timestamp`)) >
1800 THEN 1
ELSE 0 END diff
FROM user_behavior_log) t
)
,
session_behavior_sequences AS (
SELECT
user_id,
session_id,
user_group,
collect_list(behavior) AS behavior_sequence
FROM
user_sessions
GROUP BY
user_id,
session_id,
user_group
)
,
session_behavior_patterns AS (
SELECT
user_id,
session_id,
user_group,
-- 将数组转换为字符串
regexp_replace(concat_ws(',', behavior_sequence), '^([^,]+,[^,]+,[^,]+).*', '$1') AS pattern_str,
size(behavior_sequence) AS seq_length
FROM
session_behavior_sequences
)
,
pattern_frequencies AS (
SELECT
user_group,
-- 将字符串转换回类似模式的格式
replace(pattern_str, ',', '->') AS pattern,
count(*) AS frequency
FROM
session_behavior_patterns
WHERE
seq_length >= 3
GROUP BY
user_group,
pattern_str
)
select * from pattern_frequencies;
按用户分群:按用户分群,并分别统计每个用户群体的最常见行为模式。
WITH user_sessions AS (select user_id,
behavior,
`timestamp`,
user_group,
sum(diff) over (partition by user_id order by `timestamp`) session_id
from (SELECT user_id,
behavior,
`timestamp`,
user_group,
CASE
WHEN (`timestamp` - LAG(`timestamp`, 1, 0)
OVER (PARTITION BY user_id ORDER BY `timestamp`)) >
1800 THEN 1
ELSE 0 END diff
FROM user_behavior_log) t
)
,
session_behavior_sequences AS (
SELECT
user_id,
session_id,
user_group,
collect_list(behavior) AS behavior_sequence
FROM
user_sessions
GROUP BY
user_id,
session_id,
user_group
)
,
session_behavior_patterns AS (
SELECT
user_id,
session_id,
user_group,
-- 将数组转换为字符串
regexp_replace(concat_ws(',', behavior_sequence), '^([^,]+,[^,]+,[^,]+).*', '$1') AS pattern_str,
size(behavior_sequence) AS seq_length
FROM
session_behavior_sequences
)
,
pattern_frequencies AS (
SELECT
user_group,
-- 将字符串转换回类似模式的格式
replace(pattern_str, ',', '->') AS pattern,
count(*) AS frequency
FROM
session_behavior_patterns
WHERE
seq_length >= 3
GROUP BY
user_group,
pattern_str
)
,
ranked_patterns AS (
SELECT
user_group,
pattern,
frequency,
row_number() OVER (PARTITION BY user_group ORDER BY frequency DESC) AS rn
FROM
pattern_frequencies
)
SELECT
user_group,
pattern,
frequency
FROM
ranked_patterns
WHERE
rn <= 2;
3 小结
本文核心技巧总结
regexp_replace(concat_ws(',', behavior_sequence), '^([^,]+,[^,]+,[^,]+).*', '$1') AS pattern_str
整体函数作用:
regexp_replace
是 Hive 中的函数,用于基于正则表达式进行字符串替换。它的语法是regexp_replace(string A, string B, string C)
,表示在字符串A
中,将匹配正则表达式B
的部分替换为字符串C
。
正则表达式部分:
^([^,]+,[^,]+,[^,]+).*
:
^
:表示匹配字符串的开始位置。这确保我们从字符串的开头进行匹配。([^,]+,[^,]+,[^,]+)
:这是一个捕获组。
[^,]+
:表示匹配除逗号,
以外的一个或多个字符。在这里,它用于匹配数组中的每个元素(因为我们之前用逗号将数组元素连接成了字符串)。- 整个捕获组
([^,]+,[^,]+,[^,]+)
表示匹配以逗号分隔的前三个元素,即模拟获取数组的前三个元素。.*
:表示匹配零个或多个任意字符,直到字符串的末尾。这部分用于匹配捕获组之后的所有剩余字符,以便在替换时将其丢弃。
-
替换部分:
$1
:在regexp_replace
函数中,$1
表示对第一个捕获组的反向引用。也就是说,我们将整个匹配到的字符串(从开头到剩余部分)替换为捕获组([^,]+,[^,]+,[^,]+)
所匹配到的内容,即行为序列字符串的前三个以逗号分隔的元素。
实际效果:
- 假设
behavior_sequence
数组被concat_ws(',', behavior_sequence)
拼接成字符串"view,click,purchase,add_to_cart"
,经过regexp_replace
函数处理后,将得到"view,click,purchase"
,这就模拟了从数组中提取前三个元素的操作。
通过这种方式,即使 Hive 中没有 array_slice
函数,也能通过字符串操作和正则表达式来获取行为序列中的特定部分,作为行为模式提取的一部分。
往期精彩
其实数据分析,只是在筛选没有准备的人,写给正在求职的你们。。。
SQL进阶技巧:如何根据座位距离查找员工?| 员工座位安排问题