1、查询表结构
show create table data_lake_ods.test
CREATE TABLE spark_catalog.data_lake_ods.test (
`user_number` BIGINT NOT NULL,
`subclazz_number` BIGINT NOT NULL,
`clazz_number` BIGINT,
`clazz_lesson_number` BIGINT NOT NULL,
`lesson_live_property` BIGINT,
`lesson_video_property` BIGINT,
`lesson_live_length` BIGINT,
`lesson_video_length` BIGINT,
`lesson_standard_length` BIGINT,
`lesson_length` BIGINT,
`live_learn_duration` BIGINT,
`video_learn_duration` BIGINT,
`learn_duration` BIGINT,
`is_valid_live_learn` BIGINT,
`is_valid_learn` BIGINT,
`companion_learn_duration` BIGINT,
`learn_combine_duration` BIGINT,
`companion_lesson_length` BIGINT,
`is_should_attend_user` BIGINT,
`is_live_attend_user` BIGINT,
`is_combine_valid_learn_user` BIGINT,
`is_black_user` BIGINT)
USING iceberg
LOCATION '/user/hive/warehouse/data_lake_ods.db/test'
TBLPROPERTIES(
'catalog-database' = 'data_lake_ods',
'catalog-name' = 'spark_catalog',
'catalog-table' = 'test',
'catalog-type' = 'hive',
'connector' = 'iceberg',
'current-snapshot-id' = '5677214384524195741',
'format' = 'iceberg/parquet',
'format-version' = '2',
'identifier-fields' = '[clazz_lesson_number,subclazz_number,user_number]',
'table.drop.base-path.enabled' = 'true',
'uri' = 'thrift://127.0.0.1:7004,******',
'write-parallelism' = '16',
'write.distribution-mode' = 'hash',
'write.merge.mode' = 'merge-on-read',
'write.metadata.delete-after-commit.enabled' = 'true',
'write.metadata.metrics.default' = 'full',
'write.metadata.previous-versions-max' = '100',
'write.update.mode' = 'merge-on-read',
'write.upsert.enabled' = 'true')
2、得到表存储路径后查询最新的metadata.json文件
hdfs dfs -ls /user/hive/warehouse/data_lake_ods.db/test/metadata/
得到最新的文件是:/user/hive/warehouse/data_lake_ods.db/test/metadata/00059-1d8e6694-f873-45a3-9697-ce2762f78c3a.metadata.json
3、查看文件内容
hdfs dfs -cat /user/hive/warehouse/data_lake_ods.db/test/metadata/00059-1d8e6694-f873-45a3-9697-ce2762f78c3a.metadata.json
4、找到"current-snapshot-id" : 5677214384524195741。其实和建表中的id一样
"current-snapshot-id" : 5677214384524195741,
{
"sequence-number" : 59,
"snapshot-id" : 5677214384524195741,
"parent-snapshot-id" : 922737561337808536,
"timestamp-ms" : 1701683779323,
"summary" : {
"operation" : "overwrite",
"flink.operator-id" : "e883208d19e3c34f8aaf2a3168a63337",
"flink.job-id" : "000000001de734af0000000000000000",
"flink.max-committed-checkpoint-id" : "59",
"added-data-files" : "16",
"added-equality-delete-files" : "16",
"added-position-delete-files" : "16",
"added-delete-files" : "32",
"added-records" : "611418",
"added-files-size" : "19072474",
"added-position-deletes" : "245062",
"added-equality-deletes" : "366356",
"changed-partition-count" : "1",
"total-records" : "1084028212",
"total-files-size" : "41957333041",
"total-data-files" : "944",
"total-delete-files" : "1680",
"total-position-deletes" : "10749681",
"total-equality-deletes" : "1073278531"
},
"manifest-list" : "/user/hive/warehouse/data_lake_ods.db/test/metadata/snap-5677214384524195741-1-644d4e57-0953-40c0-915d-faf7dd8a3ab2.avro",
"schema-id" : 0
}
5、得到manifest-list文件路径,下载到本地分析
hadoop fs -get /user/hive/warehouse/data_lake_ods.db/test/metadata/snap-5677214384524195741-1-644d4e57-0953-40c0-915d-faf7dd8a3ab2.avro ~/test/iceberg
6、查看avro文件内容
java -jar ./avro-tools-1.8.1.jar tojson ./snap-5677214384524195741-1-644d4e57-0953-40c0-915d-faf7dd8a3ab2.avro | less |grep '5677214384524195741'
内容如下:
{"manifest_path":"/user/hive/warehouse/data_lake_ods.db/test/metadata/644d4e57-0953-40c0-915d-faf7dd8a3ab2-m0.avro","manifest_length":12233,"partition_spec_id":0,"content":0,"sequence_number":59,"min_sequence_number":59,"added_snapshot_id":567721438452419574,"added_data_files_count":16,"existing_data_files_count":0,"deleted_data_files_count":0,"added_rows_count":611418,"existing_rows_count":0,"deleted_rows_count":0,"partitions":{"array":[]}}
{"manifest_path":"/user/hive/warehouse/data_lake_ods.db/test/metadata/644d4e57-0953-40c0-915d-faf7dd8a3ab2-m1.avro","manifest_length":10241,"partition_spec_id":0,"content":1,"sequence_number":59,"min_sequence_number":59,"added_snapshot_id":567721438452419574,"added_data_files_count":32,"existing_data_files_count":0,"deleted_data_files_count":0,"added_rows_count":611418,"existing_rows_count":0,"deleted_rows_count":0,"partitions":{"array":[]}}
7、查询其中一个文件内容
hadoop fs -get /user/hive/warehouse/data_lake_ods.db/test/metadata/644d4e57-0953-40c0-915d-faf7dd8a3ab2-m0.avro ~/test/iceberg
java -jar ./avro-tools-1.8.1.jar tojson ./644d4e57-0953-40c0-915d-faf7dd8a3ab2-m0.avro | less
数据样例:
{"status":1,"snapshot_id":{"long":5677214384524195741},"sequence_number":null,"file_sequence_number":null,"data_file":{"content":0,"file_path":"/user/hive/warehouse/data_lake_ods.db/test/data/00009-0-a2f03c5f-eec9-4a15-bafe-9b360af4fde5-00162.parquet","file_format":"PARQUET","partition":{},"record_count":38138,"file_size_in_bytes":899517,"column_sizes":{"array":[{"key":1,"value":178768},{"key":2,"value":90942},{"key":3,"value":68674},{"key":4,"value":95511},{"key":5,"value":9520},{"key":6,"value":6952},{"key":7,"value":47567},{"key":8,"value":42993},{"key":9,"value":29052},{"key":10,"value":39705},{"key":11,"value":59926},{"key":12,"value":40560},{"key":13,"value":67603},{"key":14,"value":8252},{"key":15,"value":8365},{"key":16,"value":7477},{"key":17,"value":67708},{"key":18,"value":6603},{"key":19,"value":5850},{"key":20,"value":5150},{"key":21,"value":5096},{"key":22,"value":246}]},"value_counts":{"array":[{"key":1,"value":38138},{"key":2,"value":38138},{"key":3,"value":38138},{"key":4,"value":38138},{"key":5,"value":38138},{"key":6,"value":38138},{"key":7,"value":38138},{"key":8,"value":38138},{"key":9,"value":38138},{"key":10,"value":38138},{"key":11,"value":38138},{"key":12,"value":38138},{"key":13,"value":38138},{"key":14,"value":38138},{"key":15,"value":38138},{"key":16,"value":38138},{"key":17,"value":38138},{"key":18,"value":38138},{"key":19,"value":38138},{"key":20,"value":38138},{"key":21,"value":38138},{"key":22,"value":38138}]},"null_value_counts":{"array":[{"key":1,"value":0},{"key":2,"value":0},{"key":3,"value":0},{"key":4,"value":0},{"key":5,"value":11091},{"key":6,"value":11092},{"key":7,"value":11091},{"key":8,"value":11092},{"key":9,"value":11091},{"key":10,"value":11091},{"key":11,"value":11091},{"key":12,"value":11091},{"key":13,"value":11091},{"key":14,"value":11091},{"key":15,"value":11091},{"key":16,"value":11091},{"key":17,"value":11091},{"key":18,"value":11091},{"key":19,"value":0},{"key":20,"value":0},{"key":21,"value":0},{"key":22,"value":0}]},"nan_value_counts":{"array":[]},"lower_bounds":{"array":[{"key":1,"value":"'\u0006\u0002\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":3,"value":"\u0000\u0000<U+008D><Ú\u000B\t\u0000"},{"key":4,"value":",\u0000ß<Ú\u000B\t\u0000"},{"key":5,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":6,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":7,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":8,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":9,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":10,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":11,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":12,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":13,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":14,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":15,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":16,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":17,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":18,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":19,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":20,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":21,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":22,"value":"\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000"}]},"upper_bounds":{"array":[{"key":1,"value":"C<U+0095>\u0014\u0006<U+008E>å\u0000\u0000"},{"key":2,"value":"<U+0080>\u0004 <U+0084>´ÇW\u0000"},{"key":3,"value":"\u0000à`t_º{\u0005"},{"key":4,"value":"\u0000°ÃZ\"<U+0097>|\u0005"},{"key":5,"value":"\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":6,"value":"\u0003\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":7,"value":"<U+0081>L\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":8,"value":"zL\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":9,"value":"`T\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":10,"value":"$4\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":11,"value":"18\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":12,"value":"£8\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":13,"value":"~:\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":14,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":15,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":16,"value":"<U+0091>\u001A\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":17,"value":"~:\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":18,"value":"B\u001B\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":19,"value":"\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":20,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":21,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":22,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"}]},"key_metadata":null,"split_offsets":{"array":[4]},"equality_ids":null,"sort_order_id":{"int":0}}}
8、根据第六步得到的文件路径查看文件内容
(1)查topN数据
hadoop jar ./parquet-tools-1.11.0.jar head -n 10 /user/hive/warehouse/data_lake_ods.db/test/data/00009-0-a2f03c5f-eec9-4a15-bafe-9b360af4fde5-00162.parquet
数据样例:
user_number = 19100811
subclazz_number = 22388336842768640
clazz_number = 10995946454325888
clazz_lesson_number = 10995946455505548
lesson_live_property = 0
lesson_video_property = 0
lesson_live_length = 6091
lesson_video_length = 0
lesson_standard_length = 7200
lesson_length = 6091
live_learn_duration = 476
video_learn_duration = 0
learn_duration = 476
is_valid_live_learn = 0
is_valid_learn = 0
companion_learn_duration = 0
learn_combine_duration = 476
companion_lesson_length = 0
is_should_attend_user = 0
is_live_attend_user = 1
is_combine_valid_learn_user = 0
is_black_user = 0
(2)查元数据,表格式
hadoop jar ./parquet-tools-1.11.0.jar schema /user/hive/warehouse/data_lake_ods.db/test/data/00009-0-a2f03c5f-eec9-4a15-bafe-9b360af4fde5-00162.parquet
message table {
required int64 user_number = 1;
required int64 subclazz_number = 2;
optional int64 clazz_number = 3;
required int64 clazz_lesson_number = 4;
optional int64 lesson_live_property = 5;
optional int64 lesson_video_property = 6;
optional int64 lesson_live_length = 7;
optional int64 lesson_video_length = 8;
optional int64 lesson_standard_length = 9;
optional int64 lesson_length = 10;
optional int64 live_learn_duration = 11;
optional int64 video_learn_duration = 12;
optional int64 learn_duration = 13;
optional int64 is_valid_live_learn = 14;
optional int64 is_valid_learn = 15;
optional int64 companion_learn_duration = 16;
optional int64 learn_combine_duration = 17;
optional int64 companion_lesson_length = 18;
optional int64 is_should_attend_user = 19;
optional int64 is_live_attend_user = 20;
optional int64 is_combine_valid_learn_user = 21;
optional int64 is_black_user = 22;
}
(3)查看meta信息
hadoop jar /home/hadoop/test/iceberg/parquet-tools-1.11.0.jar meta /user/hive/warehouse/data_lake_ods.db/test/data/00009-0-a2f03c5f-eec9-4a15-bafe-9b360af4fde5-00162.parquet
file: /user/hive/warehouse/data_lake_ods.db/test/data/00009-0-a2f03c5f-eec9-4a15-bafe-9b360af4fde5-00162.parquet
creator: parquet-mr version 1.13.1 (build db4183109d5b734ec5930d870cdae161e408ddba)
extra: iceberg.schema = {"type":"struct","schema-id":0,"identifier-field-ids":[1,2,4],"fields":[{"id":1,"name":"user_number","required":true,"type":"long"},{"id":2,"name":"subclazz_number","required":true,"type":"long"},{"id":3,"name":"clazz_number","required":false,"type":"long"},{"id":4,"name":"clazz_lesson_number","required":true,"type":"long"},{"id":5,"name":"lesson_live_property","required":false,"type":"long"},{"id":6,"name":"lesson_video_property","required":false,"type":"long"},{"id":7,"name":"lesson_live_length","required":false,"type":"long"},{"id":8,"name":"lesson_video_length","required":false,"type":"long"},{"id":9,"name":"lesson_standard_length","required":false,"type":"long"},{"id":10,"name":"lesson_length","required":false,"type":"long"},{"id":11,"name":"live_learn_duration","required":false,"type":"long"},{"id":12,"name":"video_learn_duration","required":false,"type":"long"},{"id":13,"name":"learn_duration","required":false,"type":"long"},{"id":14,"name":"is_valid_live_learn","required":false,"type":"long"},{"id":15,"name":"is_valid_learn","required":false,"type":"long"},{"id":16,"name":"companion_learn_duration","required":false,"type":"long"},{"id":17,"name":"learn_combine_duration","required":false,"type":"long"},{"id":18,"name":"companion_lesson_length","required":false,"type":"long"},{"id":19,"name":"is_should_attend_user","required":false,"type":"long"},{"id":20,"name":"is_live_attend_user","required":false,"type":"long"},{"id":21,"name":"is_combine_valid_learn_user","required":false,"type":"long"},{"id":22,"name":"is_black_user","required":false,"type":"long"}]}
file schema: table
--------------------------------------------------------------------------------
user_number: REQUIRED INT64 R:0 D:0
subclazz_number: REQUIRED INT64 R:0 D:0
clazz_number: OPTIONAL INT64 R:0 D:1
clazz_lesson_number: REQUIRED INT64 R:0 D:0
lesson_live_property: OPTIONAL INT64 R:0 D:1
lesson_video_property: OPTIONAL INT64 R:0 D:1
lesson_live_length: OPTIONAL INT64 R:0 D:1
lesson_video_length: OPTIONAL INT64 R:0 D:1
lesson_standard_length: OPTIONAL INT64 R:0 D:1
lesson_length: OPTIONAL INT64 R:0 D:1
live_learn_duration: OPTIONAL INT64 R:0 D:1
video_learn_duration: OPTIONAL INT64 R:0 D:1
learn_duration: OPTIONAL INT64 R:0 D:1
is_valid_live_learn: OPTIONAL INT64 R:0 D:1
is_valid_learn: OPTIONAL INT64 R:0 D:1
companion_learn_duration: OPTIONAL INT64 R:0 D:1
learn_combine_duration: OPTIONAL INT64 R:0 D:1
companion_lesson_length: OPTIONAL INT64 R:0 D:1
is_should_attend_user: OPTIONAL INT64 R:0 D:1
is_live_attend_user: OPTIONAL INT64 R:0 D:1
is_combine_valid_learn_user: OPTIONAL INT64 R:0 D:1
is_black_user: OPTIONAL INT64 R:0 D:1
row group 1: RC:38138 TS:1269627 OFFSET:4
--------------------------------------------------------------------------------
user_number: INT64 GZIP DO:4 FPO:112797 SZ:178768/238215/1.33 VC:38138 ENC:BIT_PACKED,PLAIN_DICTIONARY ST:[min: 132647, max: 252398150128963, num_nulls: 0]
subclazz_number: INT64 GZIP DO:178772 FPO:216044 SZ:90942/116042/1.28 VC:38138 ENC:BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 24707901098558592, num_nulls: 0]
clazz_number: INT64 GZIP DO:269714 FPO:287587 SZ:68674/79130/1.15 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 2546306737045504, max: 395114311462215680, num_nulls: 0]
clazz_lesson_number: INT64 GZIP DO:338388 FPO:374110 SZ:95511/115465/1.21 VC:38138 ENC:BIT_PACKED,PLAIN_DICTIONARY ST:[min: 2546306742419500, max: 395357041109217280, num_nulls: 0]
lesson_live_property: INT64 GZIP DO:433899 FPO:433948 SZ:9520/11698/1.23 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 3, num_nulls: 11091]
lesson_video_property: INT64 GZIP DO:443419 FPO:443469 SZ:6952/9243/1.33 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 3, num_nulls: 11092]
lesson_live_length: INT64 GZIP DO:450371 FPO:456873 SZ:47567/64261/1.35 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 19585, num_nulls: 11091]
lesson_video_length: INT64 GZIP DO:497938 FPO:505857 SZ:42993/70204/1.63 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 19578, num_nulls: 11092]
lesson_standard_length: INT64 GZIP DO:540931 FPO:543782 SZ:29052/49418/1.70 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 21600, num_nulls: 11091]
lesson_length: INT64 GZIP DO:569983 FPO:575653 SZ:39705/61757/1.56 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 13348, num_nulls: 11091]
live_learn_duration: INT64 GZIP DO:609688 FPO:625930 SZ:59926/99798/1.67 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 14385, num_nulls: 11091]
video_learn_duration: INT64 GZIP DO:669614 FPO:680817 SZ:40560/80714/1.99 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 14499, num_nulls: 11091]
learn_duration: INT64 GZIP DO:710174 FPO:728644 SZ:67603/106804/1.58 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 14974, num_nulls: 11091]
is_valid_live_learn: INT64 GZIP DO:777777 FPO:777819 SZ:8252/8680/1.05 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 11091]
is_valid_learn: INT64 GZIP DO:786029 FPO:786071 SZ:8365/8705/1.04 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 11091]
companion_learn_duration: INT64 GZIP DO:794394 FPO:795351 SZ:7477/12507/1.67 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 6801, num_nulls: 11091]
learn_combine_duration: INT64 GZIP DO:801871 FPO:820446 SZ:67708/107108/1.58 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 14974, num_nulls: 11091]
companion_lesson_length: INT64 GZIP DO:869579 FPO:869883 SZ:6603/9775/1.48 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 6978, num_nulls: 11091]
is_should_attend_user: INT64 GZIP DO:876182 FPO:876227 SZ:5850/9754/1.67 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 2, num_nulls: 0]
is_live_attend_user: INT64 GZIP DO:882032 FPO:882077 SZ:5150/5094/0.99 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
is_combine_valid_learn_user: INT64 GZIP DO:887182 FPO:887227 SZ:5096/5042/0.99 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
is_black_user: INT64 GZIP DO:892278 FPO:892323 SZ:246/213/0.87 VC:38138 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
标签:u0000,Iceberg,learn,value,38138,key,sql,spark,lesson From: https://www.cnblogs.com/robots2/p/17876930.html