Hive写入JSON数据

发布时间 2024-01-08 16:53:42作者: 粒子先生

优缺点:

1.JsonSerDe对json数据格式有严格要求,创建的表结果必须与json结构能对应上,否则会报错。

配置 org.openx.data.jsonserde.JsonSerDe 能解决格式异常报错导致整个任务终止的问题。

必须定义好详细字段嵌套的规则类型,不适合提取ODS原文。

2.get_json_object解析多列字段时性能较低,但可以配合json_tuple使用,能满足大部分场景。

测试数据:

{"error":0,"status":"success","results":[{"currentCity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}]}

方法一:org.apache.hive.hcatalog.data.JsonSerDe

创建表:

create table test_json
(
error int,
status string,
results array<STRUCT<currentCity:string,index:array<map<string,string>>>>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;

导入数据:

[hdfs@hadoop01 jsontest]$ vim testjson.txt
[hdfs@hadoop01 jsontest]$ hadoop fs -put testjson.txt /warehouse/tablespace/managed/hive/test_json


0: jdbc:hive2://hadoop01:2181/default> show create table test_json;
+----------------------------------------------------+
| createtab_stmt |
+----------------------------------------------------+
| CREATE TABLE `test_json`( |
| `error` int COMMENT 'from deserializer', |
| `status` string COMMENT 'from deserializer', |
| `results` array<struct<currentcity:string,index:array<map<string,string>>>> COMMENT 'from deserializer') |
| ROW FORMAT SERDE |
| 'org.apache.hive.hcatalog.data.JsonSerDe' |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.mapred.TextInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION |
| 'hdfs://hadoop01:8020/warehouse/tablespace/managed/hive/test_json' |
| TBLPROPERTIES ( |
| 'bucketing_version'='2', |
| 'transactional'='true', |
| 'transactional_properties'='insert_only', |
| 'transient_lastDdlTime'='1591669486') |
+----------------------------------------------------+
17 rows selected (0.085 seconds)
0: jdbc:hive2://hadoop01:2181/default> select * from test_json;
+------------------+-------------------+----------------------------------------------------+
| test_json.error | test_json.status | test_json.results |
+------------------+-------------------+----------------------------------------------------+
| 0 | success | [{"currentcity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}] |
+------------------+-------------------+----------------------------------------------------+
1 row selected (0.116 seconds)
0: jdbc:hive2://hadoop01:2181/default> select results from test_json;
+----------------------------------------------------+
| results |
+----------------------------------------------------+
| [{"currentcity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}] |
+----------------------------------------------------+
1 row selected (0.146 seconds)
0: jdbc:hive2://hadoop01:2181/default> select results[0].currentCity from test_json;
+--------------+
| currentcity |
+--------------+
| 青岛 |
+--------------+
1 row selected (0.132 seconds)
0: jdbc:hive2://hadoop01:2181/default> select results[0].index[0]['des'] from test_json;
+----------------------------------+
| _c0 |
+----------------------------------+
| 建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。 |
+----------------------------------+
1 row selected (0.109 second

方法二:get_json_object&json_tuple

创建表:

CREATE EXTERNAL TABLE IF NOT EXISTS default.test_json01(
jsonStr String COMMENT 'jsonstr'
) COMMENT 'json test'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;
0: jdbc:hive2://hadoop01:2181/default> select * from test_json01;
+----------------------------------------------------+
| test_json01.jsonstr |
+----------------------------------------------------+
| {"error":0,"status":"success","results":[{"currentCity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}]} |
+----------------------------------------------------+
1 row selected (0.087 seconds)
0: jdbc:hive2://hadoop01:2181/default> select get_json_object(jsonstr, '$.results') from test_json01;
+----------------------------------------------------+
| _c0 |
+----------------------------------------------------+
| [{"currentCity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}] |
+----------------------------------------------------+
1 row selected (0.128 seconds)
0: jdbc:hive2://hadoop01:2181/default> select get_json_object(jsonstr, '$.results[0].currentCity') from test_json01;
+------+
| _c0 |
+------+
| 青岛 |
+------+
1 row selected (0.099 seconds)
0: jdbc:hive2://hadoop01:2181/default> select json_tuple(jsonstr, 'error', 'status', 'results') from test_json01;
+-----+----------+----------------------------------------------------+
| c0 | c1 | c2 |
+-----+----------+----------------------------------------------------+
| 0 | success | [{"currentCity":"青岛","index":[{"title":"穿衣","zs":"较冷","tipt":"穿衣指数","des":"建议着厚外套加毛衣等服装。年老体弱者宜着大衣、呢外套加羊毛衫。"},{"title":"紫外线强度","zs":"中等","tipt":"紫外线强度指数","des":"属中等强度紫外线辐射天气,外出时建议涂擦SPF高于15、PA+的防晒护肤品,戴帽子、太阳镜。"}]}] |
+-----+----------+----------------------------------------------------+
1 row selected (0.112 seconds)
0: jdbc:hive2://hadoop01:2181/default> select get_json_object(jsonstr, '$.results[0].currentCity') from test_json01;
+------+
| _c0 |
+------+
| 青岛 |
+------+
1 row selected (0.094 seconds)
0: jdbc:hive2://hadoop01:2181/default> select json_tuple(jsonstr, 'error', 'status', 'results[0]') from test_json01;
+-----+----------+-------+
| c0 | c1 | c2 |
+-----+----------+-------+
| 0 | success | NULL |
+-----+----------+-------+
1 row selected (0.109 seconds)