原有的json数据如下:
现在如何取到两层嵌套里面的json?
思路分析:
1、重点是取到内层的json并解析,但是内层是一个json数组。怎么解析json数组?
2、将内层json数组的中括号替换成{},然后json分组内的分隔符逗号替换成为;方便之后的split。
3、到这一步就变成一个可以explode的json了。
第一种写法:
WITH temp_data AS (
SELECT '{
"fixed_charge": {
"discount": 1,
"fixed_fee": -1,
"end_amount": -1,
"actual_rate": -1,
"origin_rate": -1,
"start_amount": -1
},
"float_charges": [
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 1000000,
"actual_rate": 0.01,
"origin_rate": 0.01,
"start_amount": 0
},
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 3000000,
"actual_rate": 0.006,
"origin_rate": 0.006,
"start_amount": 1000000
},
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 5000000,
"actual_rate": 0.002,
"origin_rate": 0.002,
"start_amount": 3000000
},
{
"discount": -1,
"fixed_fee": 1000,
"end_amount": -999,
"actual_rate": -1,
"origin_rate": -1,
"start_amount": 5000000
}
],
"user_discount_flag": 0
}' AS doc)
SELECT
GET_JSON_OBJECT(doc, '$.fixed_charge.discount') as fixed_charge_discount,
GET_JSON_OBJECT(doc, '$.fixed_charge.fixed_fee') as fixed_charge_fixed_fee,
GET_JSON_OBJECT(doc, '$.fixed_charge.end_amount') as fixed_charge_end_amount,
GET_JSON_OBJECT(doc, '$.fixed_charge.actual_rate') as fixed_charge_actual_rate,
GET_JSON_OBJECT(doc, '$.fixed_charge.origin_rate') as fixed_charge_origin_rate,
GET_JSON_OBJECT(doc, '$.fixed_charge.start_amount') as fixed_charge_start_amount,
GET_JSON_OBJECT(float_charges, '$.discount') as float_charges_discount,
GET_JSON_OBJECT(float_charges, '$.fixed_fee') as float_charges_fixed_fee,
GET_JSON_OBJECT(float_charges, '$.end_amount') as float_charges_end_amount,
GET_JSON_OBJECT(float_charges, '$.actual_rate') as float_charges_actual_rate,
GET_JSON_OBJECT(float_charges, '$.origin_rate') as float_charges_origin_rate,
GET_JSON_OBJECT(float_charges, '$.start_amount') as float_charges_start_amount,
GET_JSON_OBJECT(doc, '$.user_discount_flag') as user_discount_flag
FROM temp_data T
LATERAL VIEW explode(SPLIT(regexp_replace(regexp_replace(GET_JSON_OBJECT(T.doc, '$.float_charges'),'\\}\\,\\{', '\\}\\;\\{'),'\\[|\\]',''),';')) as float_charges;
运行截图如下:
第二种方法:
UDTF函数解决
1、需要定义一个udtf函数批量取数组中的json串。这种的效率更高一些。因为没有那么多函数嵌套
代码如下:
WITH a_json
AS (SELECT '
{
"fixed_charge": {
"discount": 1,
"fixed_fee": -1,
"end_amount": -1,
"actual_rate": -1,
"origin_rate": -1,
"start_amount": -1
},
"float_charges": [
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 1000000,
"actual_rate": 0.01,
"origin_rate": 0.01,
"start_amount": 0
},
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 3000000,
"actual_rate": 0.006,
"origin_rate": 0.006,
"start_amount": 1000000
},
{
"discount": 10,
"fixed_fee": -1,
"end_amount": 5000000,
"actual_rate": 0.002,
"origin_rate": 0.002,
"start_amount": 3000000
},
{
"discount": -1,
"fixed_fee": 1000,
"end_amount": -999,
"actual_rate": -1,
"origin_rate": -1,
"start_amount": 5000000
}
],
"user_discount_flag": 0
}' AS doc)
SELECT GET_JSON_OBJECT(doc, '$.fixed_charge.discount'),
GET_JSON_OBJECT(doc, '$.fixed_charge.fixed_fee'),
GET_JSON_OBJECT(doc, '$.fixed_charge.end_amount'),
GET_JSON_OBJECT(doc, '$.fixed_charge.actual_rate'),
GET_JSON_OBJECT(doc, '$.fixed_charge.origin_rate'),
GET_JSON_OBJECT(doc, '$.fixed_charge.start_amount'),
GET_JSON_OBJECT(float_charges, '$.discount'),
GET_JSON_OBJECT(float_charges, '$.fixed_fee'),
GET_JSON_OBJECT(float_charges, '$.end_amount'),
GET_JSON_OBJECT(float_charges, '$.actual_rate'),
GET_JSON_OBJECT(float_charges, '$.origin_rate'),
GET_JSON_OBJECT(float_charges, '$.start_amount'),
GET_JSON_OBJECT(doc, '$.float_charges')
FROM a_json t
LATERAL VIEW
EXPLODE_JSON_ARRAY(GET_JSON_OBJECT(doc, '$.float_charges')) tmp AS float_charges;
运行截图省略,
EXPLODE_JSON_ARRAY函数就是自定义的函数。
附录:UTDF函数
import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.json.JSONArray;
public class ExplodeJSONArray extends GenericUDTF {
private PrimitiveObjectInspector inputOI;
@Override
public void close() throws HiveException {
}
@Override
public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
if(argOIs.length!=1){
throw new UDFArgumentException("explode_json_array函数只能接收1个参数");
}
ObjectInspector argOI = argOIs[0];
if(argOI.getCategory()!=ObjectInspector.Category.PRIMITIVE){
throw new UDFArgumentException("explode_json_array函数只能接收基本数据类型的参数");
}
PrimitiveObjectInspector primitiveOI = (PrimitiveObjectInspector) argOI;
inputOI=primitiveOI;
if(primitiveOI.getPrimitiveCategory()!=PrimitiveObjectInspector.PrimitiveCategory.STRING){
throw new UDFArgumentException("explode_json_array函数只能接收STRING类型的参数");
}
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
fieldNames.add("item");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);
}
@Override
public void process(Object[] args) throws HiveException {
Object arg = args[0];
String jsonArrayStr = PrimitiveObjectInspectorUtils.getString(arg, inputOI);
JSONArray jsonArray = new JSONArray(jsonArrayStr);
for (int i = 0; i < jsonArray.length(); i++) {
String json = jsonArray.getString(i);
String[] result = {json};
forward(result);
}
}
}
标签:OBJECT,rate,GET,JSON,amount,hive,嵌套,json,fixed From: https://www.cnblogs.com/miduofanxiang/p/16795822.html