Hive从小时分区中删除重复数据
- 一、小时分区数据去重
- 二、重写小时分区数据
一、小时分区数据去重
小时分区数据去重后，写入到 Hive 临时表中
-- Step 1: deduplicate one hourly partition into a staging table.
-- Map-typed columns (lib / properties / extractor) are not comparable in
-- Hive, so SELECT DISTINCT cannot operate on them directly; serialize them
-- to JSON strings first, dedup on plain strings, then rebuild the maps.
with to_json_map as (
    select distinct _track_id
        ,time
        ,distinct_id
        ,to_json(lib) as lib
        ,event
        ,to_json(properties) as properties
        ,_flush_time
        ,map_id
        ,user_id
        ,login_id
        ,anonymous_id
        ,recv_time
        ,to_json(extractor) as extractor
        ,project_id
        ,project
        ,ver
        ,type
        ,device_id
    from ods_tracking_data_kafka_prod.sensor_tracking_kafka_nioapp_prod_1h_i
    where datetime = '2023061417'
),
from_json_map as (
    -- Rebuild the map columns and keep EVERY column: step 2 rewrites the
    -- production partition from this staging table, so any column dropped
    -- here is lost for good. (The original version kept only 6 columns.)
    -- NOTE(review): assumes tmp.app_prod_20230614 was created with this
    -- full 18-column schema — confirm before running.
    select
        _track_id as track_id   -- renamed for the staging table; aliased back in step 2
        ,time
        ,distinct_id
        ,from_json(lib,'map<string,string>') as lib
        ,event
        ,from_json(properties,'map<string,string>') as properties
        ,_flush_time
        ,map_id
        ,user_id
        ,login_id
        ,anonymous_id
        ,recv_time
        ,from_json(extractor,'map<string,string>') as extractor
        ,project_id
        ,project
        ,ver
        ,type
        ,device_id
    from to_json_map
)
-- Hive DML requires the TABLE keyword: INSERT OVERWRITE TABLE <name>.
insert overwrite table tmp.app_prod_20230614
partition (datetime = '2023061417')
select * from from_json_map
二、重写小时分区数据
从临时表中把数据写回小时分区中
-- Step 2: overwrite the hourly partition with the deduplicated rows from
-- the staging table. INSERT OVERWRITE replaces the WHOLE partition, so the
-- select list must cover every data column of the target table — the
-- original version selected only 6 of the 18 columns, which would drop the
-- rest of each row (or fail on a schema/column-count mismatch).
-- Hive DML requires the TABLE keyword: INSERT OVERWRITE TABLE <name>.
insert overwrite table ods_tracking_data_kafka_prod.sensor_tracking_kafka_nioapp_prod_1h_i
partition (datetime = '2023061417')
select
    track_id as _track_id   -- restore the original column name used by the target table
    ,time
    ,distinct_id
    ,lib
    ,event
    ,properties
    ,_flush_time
    ,map_id
    ,user_id
    ,login_id
    ,anonymous_id
    ,recv_time
    ,extractor
    ,project_id
    ,project
    ,ver
    ,type
    ,device_id
from tmp.app_prod_20230614
where datetime = '2023061417'