首页 > 其他分享 >hive解析多层json嵌套

hive解析多层json嵌套

时间:2022-10-16 11:22:44浏览次数:82  
标签:OBJECT rate GET JSON amount hive 嵌套 json fixed

原有的json数据如下:

 

 

 现在如何取到两层嵌套里面的json?

 

思路分析:

1、重点是取到内层的json并解析,但是内层是一个json数组。怎么解析json数组?

2、将内层json数组的中括号替换成{},然后json分组内的分隔符逗号替换成为;方便之后的split。

3、到这一步就变成一个可以explode的json了。

 

第一种写法:

WITH temp_data AS (
SELECT '{
"fixed_charge": {
"discount": 1,
"fixed_fee": -1,
"end_amount": -1,
"actual_rate": -1,
"origin_rate": -1,
"start_amount": -1
},
"float_charges": [
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 1000000,
"actual_rate": 0.01,
"origin_rate": 0.01,
"start_amount": 0
},
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 3000000,
"actual_rate": 0.006,
"origin_rate": 0.006,
"start_amount": 1000000
},
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 5000000,
"actual_rate": 0.002,
"origin_rate": 0.002,
"start_amount": 3000000
},
{
"discount": -1,
"fixed_fee": 1000,
"end_amount": -999,
"actual_rate": -1,
"origin_rate": -1,
"start_amount": 5000000
}
],
"user_discount_flag": 0
}' AS doc)
SELECT
GET_JSON_OBJECT(doc, '$.fixed_charge.discount') as fixed_charge_discount,
GET_JSON_OBJECT(doc, '$.fixed_charge.fixed_fee') as fixed_charge_fixed_fee,
GET_JSON_OBJECT(doc, '$.fixed_charge.end_amount') as fixed_charge_end_amount,
GET_JSON_OBJECT(doc, '$.fixed_charge.actual_rate') as fixed_charge_actual_rate,
GET_JSON_OBJECT(doc, '$.fixed_charge.origin_rate') as fixed_charge_origin_rate,
GET_JSON_OBJECT(doc, '$.fixed_charge.start_amount') as fixed_charge_start_amount,
GET_JSON_OBJECT(float_charges, '$.discount') as float_charges_discount,
GET_JSON_OBJECT(float_charges, '$.fixed_fee') as float_charges_fixed_fee,
GET_JSON_OBJECT(float_charges, '$.end_amount') as float_charges_end_amount,
GET_JSON_OBJECT(float_charges, '$.actual_rate') as float_charges_actual_rate,
GET_JSON_OBJECT(float_charges, '$.origin_rate') as float_charges_origin_rate,
GET_JSON_OBJECT(float_charges, '$.start_amount') as float_charges_start_amount,
GET_JSON_OBJECT(doc, '$.user_discount_flag') as user_discount_flag
FROM temp_data T
LATERAL VIEW explode(SPLIT(regexp_replace(regexp_replace(GET_JSON_OBJECT(T.doc, '$.float_charges'),'\\}\\,\\{', '\\}\\;\\{'),'\\[|\\]',''),';')) as float_charges;

运行截图如下:

 

第二种方法:

UDTF函数解决

 

1、需要定义一个udtf函数批量取数组中的json串。这种的效率更高一些。因为没有那么多函数嵌套

代码如下:

WITH a_json
AS (SELECT '
{
"fixed_charge": {
"discount": 1,
"fixed_fee": -1,
"end_amount": -1,
"actual_rate": -1,
"origin_rate": -1,
"start_amount": -1
},
"float_charges": [
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 1000000,
"actual_rate": 0.01,
"origin_rate": 0.01,
"start_amount": 0
},
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 3000000,
"actual_rate": 0.006,
"origin_rate": 0.006,
"start_amount": 1000000
},
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 5000000,
"actual_rate": 0.002,
"origin_rate": 0.002,
"start_amount": 3000000
},
{
"discount": -1,
"fixed_fee": 1000,
"end_amount": -999,
"actual_rate": -1,
"origin_rate": -1,
"start_amount": 5000000
}
],
"user_discount_flag": 0
}' AS doc)
SELECT GET_JSON_OBJECT(doc, '$.fixed_charge.discount'),
GET_JSON_OBJECT(doc, '$.fixed_charge.fixed_fee'),
GET_JSON_OBJECT(doc, '$.fixed_charge.end_amount'),
GET_JSON_OBJECT(doc, '$.fixed_charge.actual_rate'),
GET_JSON_OBJECT(doc, '$.fixed_charge.origin_rate'),
GET_JSON_OBJECT(doc, '$.fixed_charge.start_amount'),
GET_JSON_OBJECT(float_charges, '$.discount'),
GET_JSON_OBJECT(float_charges, '$.fixed_fee'),
GET_JSON_OBJECT(float_charges, '$.end_amount'),
GET_JSON_OBJECT(float_charges, '$.actual_rate'),
GET_JSON_OBJECT(float_charges, '$.origin_rate'),
GET_JSON_OBJECT(float_charges, '$.start_amount'),
GET_JSON_OBJECT(doc, '$.float_charges')
FROM a_json t
LATERAL VIEW
EXPLODE_JSON_ARRAY(GET_JSON_OBJECT(doc, '$.float_charges')) tmp AS float_charges;

运行截图省略,
EXPLODE_JSON_ARRAY函数就是自定义的函数。


附录:UTDF函数

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.json.JSONArray;


public class ExplodeJSONArray extends GenericUDTF {

private PrimitiveObjectInspector inputOI;

@Override
public void close() throws HiveException {

}

@Override
public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {

if(argOIs.length!=1){
throw new UDFArgumentException("explode_json_array函数只能接收1个参数");
}

ObjectInspector argOI = argOIs[0];

if(argOI.getCategory()!=ObjectInspector.Category.PRIMITIVE){
throw new UDFArgumentException("explode_json_array函数只能接收基本数据类型的参数");
}

PrimitiveObjectInspector primitiveOI = (PrimitiveObjectInspector) argOI;
inputOI=primitiveOI;

if(primitiveOI.getPrimitiveCategory()!=PrimitiveObjectInspector.PrimitiveCategory.STRING){
throw new UDFArgumentException("explode_json_array函数只能接收STRING类型的参数");
}


ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
fieldNames.add("item");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);
}

@Override
public void process(Object[] args) throws HiveException {

Object arg = args[0];
String jsonArrayStr = PrimitiveObjectInspectorUtils.getString(arg, inputOI);

JSONArray jsonArray = new JSONArray(jsonArrayStr);

for (int i = 0; i < jsonArray.length(); i++) {
String json = jsonArray.getString(i);

String[] result = {json};

forward(result);
}

}

}


 

标签:OBJECT,rate,GET,JSON,amount,hive,嵌套,json,fixed
From: https://www.cnblogs.com/miduofanxiang/p/16795822.html

相关文章