hive解析多层json嵌套

标签：OBJECT rate GET JSON amount hive 嵌套 json fixed

原有的json数据如下：

现在如何取到两层嵌套里面的json？

思路分析：

1、重点是取到内层的json并解析，但是内层是一个json数组。怎么解析json数组？

2、将内层json数组的中括号替换成{}，然后json分组内的分隔符逗号替换成为；方便之后的split。

3、到这一步就变成一个可以explode的json了。

第一种写法：

WITH temp_data AS (
    SELECT '{
   "fixed_charge": {
      "discount": 1,
      "fixed_fee": -1,
      "end_amount": -1,
      "actual_rate": -1,
      "origin_rate": -1,
      "start_amount": -1
   },
   "float_charges": [
      {
         "discount": 10,
         "fixed_fee": -1,
         "end_amount": 1000000,
         "actual_rate": 0.01,
         "origin_rate": 0.01,
         "start_amount": 0
      },
      {
         "discount": 10,
         "fixed_fee": -1,
         "end_amount": 3000000,
         "actual_rate": 0.006,
         "origin_rate": 0.006,
         "start_amount": 1000000
      },
      {
         "discount": 10,
         "fixed_fee": -1,
         "end_amount": 5000000,
         "actual_rate": 0.002,
         "origin_rate": 0.002,
         "start_amount": 3000000
      },
      {
         "discount": -1,
         "fixed_fee": 1000,
         "end_amount": -999,
         "actual_rate": -1,
         "origin_rate": -1,
         "start_amount": 5000000
      }
   ],
   "user_discount_flag": 0
}' AS doc)
SELECT
       GET_JSON_OBJECT(doc, '$.fixed_charge.discount') as fixed_charge_discount,
       GET_JSON_OBJECT(doc, '$.fixed_charge.fixed_fee') as fixed_charge_fixed_fee,
       GET_JSON_OBJECT(doc, '$.fixed_charge.end_amount') as fixed_charge_end_amount,
       GET_JSON_OBJECT(doc, '$.fixed_charge.actual_rate') as fixed_charge_actual_rate,
       GET_JSON_OBJECT(doc, '$.fixed_charge.origin_rate') as fixed_charge_origin_rate,
       GET_JSON_OBJECT(doc, '$.fixed_charge.start_amount') as fixed_charge_start_amount,
       GET_JSON_OBJECT(float_charges, '$.discount') as float_charges_discount,
       GET_JSON_OBJECT(float_charges, '$.fixed_fee') as float_charges_fixed_fee,
       GET_JSON_OBJECT(float_charges, '$.end_amount') as float_charges_end_amount,
       GET_JSON_OBJECT(float_charges, '$.actual_rate') as float_charges_actual_rate,
       GET_JSON_OBJECT(float_charges, '$.origin_rate') as float_charges_origin_rate,
       GET_JSON_OBJECT(float_charges, '$.start_amount') as float_charges_start_amount,
       GET_JSON_OBJECT(doc, '$.user_discount_flag') as user_discount_flag
FROM temp_data T
 LATERAL VIEW  explode(SPLIT(regexp_replace(regexp_replace(GET_JSON_OBJECT(T.doc, '$.float_charges'),'\\}\\,\\{', '\\}\\;\\{'),'\\[|\\]',''),';'))  as  float_charges;

运行截图如下：

第二种方法：

UDTF函数解决

1、需要定义一个udtf函数批量取数组中的json串。这种的效率更高一些。因为没有那么多函数嵌套

代码如下：

WITH a_json
         AS (SELECT '
{
   "fixed_charge": {
      "discount": 1,
      "fixed_fee": -1,
      "end_amount": -1,
      "actual_rate": -1,
      "origin_rate": -1,
      "start_amount": -1
   },
   "float_charges": [
      {
         "discount": 10,
         "fixed_fee": -1,
         "end_amount": 1000000,
         "actual_rate": 0.01,
         "origin_rate": 0.01,
         "start_amount": 0
      },
      {
         "discount": 10,
         "fixed_fee": -1,
         "end_amount": 3000000,
         "actual_rate": 0.006,
         "origin_rate": 0.006,
         "start_amount": 1000000
      },
      {
         "discount": 10,
         "fixed_fee": -1,
         "end_amount": 5000000,
         "actual_rate": 0.002,
         "origin_rate": 0.002,
         "start_amount": 3000000
      },
      {
         "discount": -1,
         "fixed_fee": 1000,
         "end_amount": -999,
         "actual_rate": -1,
         "origin_rate": -1,
         "start_amount": 5000000
      }
   ],
   "user_discount_flag": 0
}' AS doc)
SELECT GET_JSON_OBJECT(doc, '$.fixed_charge.discount'),
       GET_JSON_OBJECT(doc, '$.fixed_charge.fixed_fee'),
       GET_JSON_OBJECT(doc, '$.fixed_charge.end_amount'),
       GET_JSON_OBJECT(doc, '$.fixed_charge.actual_rate'),
       GET_JSON_OBJECT(doc, '$.fixed_charge.origin_rate'),
       GET_JSON_OBJECT(doc, '$.fixed_charge.start_amount'),
       GET_JSON_OBJECT(float_charges, '$.discount'),
       GET_JSON_OBJECT(float_charges, '$.fixed_fee'),
       GET_JSON_OBJECT(float_charges, '$.end_amount'),
       GET_JSON_OBJECT(float_charges, '$.actual_rate'),
       GET_JSON_OBJECT(float_charges, '$.origin_rate'),
       GET_JSON_OBJECT(float_charges, '$.start_amount'),
       GET_JSON_OBJECT(doc, '$.float_charges')
FROM a_json t
         LATERAL VIEW
             EXPLODE_JSON_ARRAY(GET_JSON_OBJECT(doc, '$.float_charges')) tmp AS float_charges;

运行截图省略，

EXPLODE_JSON_ARRAY函数就是自定义的函数。


附录：UTDF函数

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.json.JSONArray;

public class ExplodeJSONArray extends GenericUDTF {

private PrimitiveObjectInspector inputOI;

@Override
public void close() throws HiveException {

}

@Override
public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {

if(argOIs.length!=1){
throw new UDFArgumentException("explode_json_array函数只能接收1个参数");
}

ObjectInspector argOI = argOIs[0];

if(argOI.getCategory()!=ObjectInspector.Category.PRIMITIVE){
throw new UDFArgumentException("explode_json_array函数只能接收基本数据类型的参数");
}

PrimitiveObjectInspector primitiveOI = (PrimitiveObjectInspector) argOI;
inputOI=primitiveOI;

if(primitiveOI.getPrimitiveCategory()!=PrimitiveObjectInspector.PrimitiveCategory.STRING){
throw new UDFArgumentException("explode_json_array函数只能接收STRING类型的参数");
}

ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
fieldNames.add("item");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);
}

@Override
public void process(Object[] args) throws HiveException {

Object arg = args[0];
String jsonArrayStr = PrimitiveObjectInspectorUtils.getString(arg, inputOI);

JSONArray jsonArray = new JSONArray(jsonArrayStr);

for (int i = 0; i < jsonArray.length(); i++) {
String json = jsonArray.getString(i);

String[] result = {json};

forward(result);
}

}

标签：OBJECT,rate,GET,JSON,amount,hive,嵌套,json,fixed
From： https://www.cnblogs.com/miduofanxiang/p/16795822.html

相关文章

赞助商

阅读排行