优缺点:
1.JsonSerDe对json数据格式有严格要求,创建的表结果必须与json结构能对应上,否则会报错。
配置 org.openx.data.jsonserde.JsonSerDe 能解决格式异常报错导致整个任务终止的问题。
必须定义好详细字段嵌套的规则类型,不适合提取ODS原文。
2.get_json_object解析多列字段时性能较低,但可以配合json_tuple使用,能满足大部分场景。
测试数据:
{"error":0,"status":"success","results":[{"currentCity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}]}
方法一:org.apache.hive.hcatalog.data.JsonSerDe
创建表:
create table test_json ( error int, status string, results array<STRUCT<currentCity:string,index:array<map<string,string>>>> ) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' STORED AS TEXTFILE;
导入数据:
[hdfs@hadoop01 jsontest]$ vim testjson.txt [hdfs@hadoop01 jsontest]$ hadoop fs -put testjson.txt /warehouse/tablespace/managed/hive/test_json 0: jdbc:hive2://hadoop01:2181/default> show create table test_json; +----------------------------------------------------+ | createtab_stmt | +----------------------------------------------------+ | CREATE TABLE `test_json`( | | `error` int COMMENT 'from deserializer', | | `status` string COMMENT 'from deserializer', | | `results` array<struct<currentcity:string,index:array<map<string,string>>>> COMMENT 'from deserializer') | | ROW FORMAT SERDE | | 'org.apache.hive.hcatalog.data.JsonSerDe' | | STORED AS INPUTFORMAT | | 'org.apache.hadoop.mapred.TextInputFormat' | | OUTPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' | | LOCATION | | 'hdfs://hadoop01:8020/warehouse/tablespace/managed/hive/test_json' | | TBLPROPERTIES ( | | 'bucketing_version'='2', | | 'transactional'='true', | | 'transactional_properties'='insert_only', | | 'transient_lastDdlTime'='1591669486') | +----------------------------------------------------+ 17 rows selected (0.085 seconds) 0: jdbc:hive2://hadoop01:2181/default> select * from test_json; +------------------+-------------------+----------------------------------------------------+ | test_json.error | test_json.status | test_json.results | +------------------+-------------------+----------------------------------------------------+ | 0 | success | [{"currentcity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}] | +------------------+-------------------+----------------------------------------------------+ 1 row selected (0.116 seconds) 0: jdbc:hive2://hadoop01:2181/default> select results from test_json; +----------------------------------------------------+ | results | +----------------------------------------------------+ | [{"currentcity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}] | +----------------------------------------------------+ 1 row selected (0.146 seconds) 0: jdbc:hive2://hadoop01:2181/default> select results[0].currentCity from test_json; +--------------+ | currentcity | +--------------+ | 青岛 | +--------------+ 1 row selected (0.132 seconds) 0: jdbc:hive2://hadoop01:2181/default> select results[0].index[0]['des'] from test_json; +----------------------------------+ | _c0 | +----------------------------------+ | 建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。 | +----------------------------------+ 1 row selected (0.109 second
方法二:get_json_object&json_tuple
创建表:
CREATE EXTERNAL TABLE IF NOT EXISTS default.test_json01( jsonStr String COMMENT 'jsonstr' ) COMMENT 'json test' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;
0: jdbc:hive2://hadoop01:2181/default> select * from test_json01; +----------------------------------------------------+ | test_json01.jsonstr | +----------------------------------------------------+ | {"error":0,"status":"success","results":[{"currentCity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}]} | +----------------------------------------------------+ 1 row selected (0.087 seconds) 0: jdbc:hive2://hadoop01:2181/default> select get_json_object(jsonstr, '$.results') from test_json01; +----------------------------------------------------+ | _c0 | +----------------------------------------------------+ | [{"currentCity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}] | +----------------------------------------------------+ 1 row selected (0.128 seconds) 0: jdbc:hive2://hadoop01:2181/default> select get_json_object(jsonstr, '$.results[0].currentCity') from test_json01; +------+ | _c0 | +------+ | 青岛 | +------+ 1 row selected (0.099 seconds) 0: jdbc:hive2://hadoop01:2181/default> select json_tuple(jsonstr, 'error', 'status', 'results') from test_json01; +-----+----------+----------------------------------------------------+ | c0 | c1 | c2 | +-----+----------+----------------------------------------------------+ | 0 | success | [{"currentCity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}] | +-----+----------+----------------------------------------------------+ 1 row selected (0.112 seconds) 0: jdbc:hive2://hadoop01:2181/default> select get_json_object(jsonstr, '$.results[0].currentCity') from test_json01; +------+ | _c0 | +------+ | 青岛 | +------+ 1 row selected (0.094 seconds) 0: jdbc:hive2://hadoop01:2181/default> select json_tuple(jsonstr, 'error', 'status', 'results[0]') from test_json01; +-----+----------+-------+ | c0 | c1 | c2 | +-----+----------+-------+ | 0 | success | NULL | +-----+----------+-------+ 1 row selected (0.109 seconds)