Hive数据加载使用复杂的json结构



我试图通过具有自由模式数组和结构的JSON文件加载外部表。

create external table longtimedata_v4
(measurementPointName string,measurementPointId string,dataTypeId string,dataTypeName string,channels ARRAY<            struct<name:string,unit:string,dvalues:ARRAY<struct<atimedt:string,dvalue:string>>>>)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION '/warehouse/tablespace/external/hive/longtimedata'
{
"measurementPointName": "Saab Scania, BBY2908, Sibbhult",
"measurementPointId": 35,
"dataTypeId": 1,
"dataTypeName": "Voltage",
"channels": [
{
"name": "U1 RMS",
"unit": "V",
"values": [
{
"time": "2022-10-24T00:00:00Z",
"value": 21348.7695
},
{
"time": "2022-10-24T00:10:00Z",
"value": 21342.7656
},
{
"time": "2022-10-24T00:20:00Z",
"value": 21401.0957
},
{
"time": "2022-10-24T00:30:00Z",
"value": 21415.2129
},
{
"time": "2022-10-24T00:40:00Z",
"value": 21456.9062
},
{
"time": "2022-10-24T00:50:00Z",
"value": 21501.541
},
{
"time": "2022-10-24T01:00:00Z",
"value": 21527.4355
},
{
"time": "2022-10-24T01:10:00Z",
"value": 21510.9512
},
{
"time": "2022-10-24T01:20:00Z",
"value": 21436.2266
},
{
"time": "2022-10-24T01:30:00Z",
"value": 21423.5176
},
{
"time": "2022-10-24T01:40:00Z",
"value": 21415.6211
},
{
"time": "2022-10-24T01:50:00Z",
"value": 21416.8008
},
{
"time": "2022-10-24T02:00:00Z",
"value": 21460.6016
},
{
"time": "2022-10-24T02:10:00Z",
"value": 21502.0195
},
{
"time": "2022-10-24T02:20:00Z",
"value": 21405.748
},
{
"time": "2022-10-24T02:30:00Z",
"value": 21321.9062
},
{
"time": "2022-10-24T02:40:00Z",
"value": 21394.6074
},
{
"time": "2022-10-24T02:50:00Z",
"value": 21392.3496
},
{
"time": "2022-10-24T03:00:00Z",
"value": 21412.6875
},
{
"time": "2022-10-24T03:10:00Z",
"value": 21320.5469
},
{
"time": "2022-10-24T03:20:00Z",
"value": 21309.9746
},
{
"time": "2022-10-24T03:30:00Z",
"value": 21229.5879
},
{
"time": "2022-10-24T03:40:00Z",
"value": 21174.2891
},
{
"time": "2022-10-24T03:50:00Z",
"value": 21100.6406
},
{
"time": "2022-10-24T04:00:00Z",
"value": 21188.2207
},
{
"time": "2022-10-24T04:10:00Z",
"value": 21146.1191
},
{
"time": "2022-10-24T04:20:00Z",
"value": 21328.709
},
{
"time": "2022-10-24T04:30:00Z",
"value": 21216.4844
},
{
"time": "2022-10-24T04:40:00Z",
"value": 21104.7871
},
{
"time": "2022-10-24T04:50:00Z",
"value": 21096.8398
},
{
"time": "2022-10-24T05:00:00Z",
"value": 21147.1895
},
{
"time": "2022-10-24T05:10:00Z",
"value": 21330.084
},
{
"time": "2022-10-24T05:20:00Z",
"value": 21205.3965
},
{
"time": "2022-10-24T05:30:00Z",
"value": 21116.1309
},
{
"time": "2022-10-24T05:40:00Z",
"value": 21189.4648
},
{
"time": "2022-10-24T05:50:00Z",
"value": 21113.0703
},
{
"time": "2022-10-24T06:00:00Z",
"value": 21074.2109
},
{
"time": "2022-10-24T06:10:00Z",
"value": 21093.0605
},
{
"time": "2022-10-24T06:20:00Z",
"value": 21117.6934
},
{
"time": "2022-10-24T06:30:00Z",
"value": 21087.8496
},
{
"time": "2022-10-24T06:40:00Z",
"value": 21077.8047
}
]
}
]
}

我的数据被加载,但不是完全加载。数组channels.dvalues.atimedtdvalue仅与null一起出现。我尝试通过load语句和其他方式加载数据来填充值数组和底层结构,但没有成功。

SELECT channels[5].name,channels.unit,channels[5].dvalues.atimedt
FROM observematare.longtimedata_v4; 
U2 max  ["V","V","V","V","V","V","V","V","V","V","V","V"]   NULL
2   U2 max  ["V","V","V","V","V","V","V","V","V","V","V","V"]   NULL

问题中没有提到hive版本

你也没有提到在表中使用不同的列名作为数据文件中的字段名的原因。我这么认为是因为values和time是保留关键字。

失败(值为null)原因-列名与json data中的字段名不同

有两种方法可以解决这个问题

  1. 修改数据文件中的字段名以匹配CREATE EXTERNAL TABLE语句
  2. 中的列名
  3. 使用反勾号(')覆盖保留的关键字-也称为quoted identifiers

关于使用Hive文档中的保留关键字

保留关键字允许作为标识符,如果您按照在列名中支持引用标识符(版本0.13.0及以后,请参阅HIVE-6013)中的描述引用它们。大多数关键字通过HIVE-6617保留,以减少语法中的歧义(版本1.2.0及更高版本)。如果用户仍然希望使用这些保留关键字作为标识符,有两种方法:(1)使用带引号的标识符,(2)设置hive.support.sql11.reserved.keywords=false。(2.1.0及更早版本)

  1. 修改数据文件
  2. 字段名
{
"measurementPointName": "Saab Scania, BBY2908, Sibbhult",
"measurementPointId": 35,
"dataTypeId": 1,
"dataTypeName": "Voltage",
"channels": [
{
"name": "U1 RMS",
"unit": "V",
"dvalues": [
{
"atimedt": "2022-10-24T00:00:00Z",
"dvalue": 21348.7695
},
{
"atimedt": "2022-10-24T00:10:00Z",
"dvalue": 21342.7656
},
{
"atimedt": "2022-10-24T00:20:00Z",
"dvalue": 21401.0957
},
{
"atimedt": "2022-10-24T00:30:00Z",
"dvalue": 21415.2129
},
{
"atimedt": "2022-10-24T00:40:00Z",
"dvalue": 21456.9062
},
{
"atimedt": "2022-10-24T00:50:00Z",
"dvalue": 21501.541
},
{
"atimedt": "2022-10-24T01:00:00Z",
"dvalue": 21527.4355
},
{
"atimedt": "2022-10-24T01:10:00Z",
"dvalue": 21510.9512
},
{
"atimedt": "2022-10-24T01:20:00Z",
"dvalue": 21436.2266
},
{
"atimedt": "2022-10-24T01:30:00Z",
"dvalue": 21423.5176
},
{
"atimedt": "2022-10-24T01:40:00Z",
"dvalue": 21415.6211
},
{
"atimedt": "2022-10-24T01:50:00Z",
"dvalue": 21416.8008
},
{
"atimedt": "2022-10-24T02:00:00Z",
"dvalue": 21460.6016
},
{
"atimedt": "2022-10-24T02:10:00Z",
"dvalue": 21502.0195
},
{
"atimedt": "2022-10-24T02:20:00Z",
"dvalue": 21405.748
},
{
"atimedt": "2022-10-24T02:30:00Z",
"dvalue": 21321.9062
},
{
"atimedt": "2022-10-24T02:40:00Z",
"dvalue": 21394.6074
},
{
"atimedt": "2022-10-24T02:50:00Z",
"dvalue": 21392.3496
},
{
"atimedt": "2022-10-24T03:00:00Z",
"dvalue": 21412.6875
},
{
"atimedt": "2022-10-24T03:10:00Z",
"dvalue": 21320.5469
},
{
"atimedt": "2022-10-24T03:20:00Z",
"dvalue": 21309.9746
},
{
"atimedt": "2022-10-24T03:30:00Z",
"dvalue": 21229.5879
},
{
"atimedt": "2022-10-24T03:40:00Z",
"dvalue": 21174.2891
},
{
"atimedt": "2022-10-24T03:50:00Z",
"dvalue": 21100.6406
},
{
"atimedt": "2022-10-24T04:00:00Z",
"dvalue": 21188.2207
},
{
"atimedt": "2022-10-24T04:10:00Z",
"dvalue": 21146.1191
},
{
"atimedt": "2022-10-24T04:20:00Z",
"dvalue": 21328.709
},
{
"atimedt": "2022-10-24T04:30:00Z",
"dvalue": 21216.4844
},
{
"atimedt": "2022-10-24T04:40:00Z",
"dvalue": 21104.7871
},
{
"atimedt": "2022-10-24T04:50:00Z",
"dvalue": 21096.8398
},
{
"atimedt": "2022-10-24T05:00:00Z",
"dvalue": 21147.1895
},
{
"atimedt": "2022-10-24T05:10:00Z",
"dvalue": 21330.084
},
{
"atimedt": "2022-10-24T05:20:00Z",
"dvalue": 21205.3965
},
{
"atimedt": "2022-10-24T05:30:00Z",
"dvalue": 21116.1309
},
{
"atimedt": "2022-10-24T05:40:00Z",
"dvalue": 21189.4648
},
{
"atimedt": "2022-10-24T05:50:00Z",
"dvalue": 21113.0703
},
{
"atimedt": "2022-10-24T06:00:00Z",
"dvalue": 21074.2109
},
{
"atimedt": "2022-10-24T06:10:00Z",
"dvalue": 21093.0605
},
{
"atimedt": "2022-10-24T06:20:00Z",
"dvalue": 21117.6934
},
{
"atimedt": "2022-10-24T06:30:00Z",
"dvalue": 21087.8496
},
{
"atimedt": "2022-10-24T06:40:00Z",
"dvalue": 21077.8047
}
]
}
]
}

Select语句:

SELECT channels[0].name,channels.unit,channels[0].dvalues.atimedt FROM longtimedata_v4;

输出:请注意列名

INFO  : OK
+---------+--------+----------------------------------------------------+
|  name   |  unit  |                      atimedt                       |
+---------+--------+----------------------------------------------------+
| U1 RMS  | ["V"]  | ["2022-10-24T00:00:00Z","2022-10-24T00:10:00Z","2022-10-24T00:20:00Z","2022-10-24T00:30:00Z","2022-10-24T00:40:00Z","2022-10-24T00:50:00Z","2022-10-24T01:00:00Z","2022-10-24T01:10:00Z","2022-10-24T01:20:00Z","2022-10-24T01:30:00Z","2022-10-24T01:40:00Z","2022-10-24T01:50:00Z","2022-10-24T02:00:00Z","2022-10-24T02:10:00Z","2022-10-24T02:20:00Z","2022-10-24T02:30:00Z","2022-10-24T02:40:00Z","2022-10-24T02:50:00Z","2022-10-24T03:00:00Z","2022-10-24T03:10:00Z","2022-10-24T03:20:00Z","2022-10-24T03:30:00Z","2022-10-24T03:40:00Z","2022-10-24T03:50:00Z","2022-10-24T04:00:00Z","2022-10-24T04:10:00Z","2022-10-24T04:20:00Z","2022-10-24T04:30:00Z","2022-10-24T04:40:00Z","2022-10-24T04:50:00Z","2022-10-24T05:00:00Z","2022-10-24T05:10:00Z","2022-10-24T05:20:00Z","2022-10-24T05:30:00Z","2022-10-24T05:40:00Z","2022-10-24T05:50:00Z","2022-10-24T06:00:00Z","2022-10-24T06:10:00Z","2022-10-24T06:20:00Z","2022-10-24T06:30:00Z","2022-10-24T06:40:00Z"] |
+---------+--------+----------------------------------------------------+
  1. 在创建表时使用引号标识符而不更改列名
create external table longtimedata_v4 (
measurementPointName string,
measurementPointId string, 
dataTypeId string,
dataTypeName string, 
channels ARRAY < struct < name : string, 
unit : string, `values` : ARRAY < struct < `time` : string, 
`value` : string >>>>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION '/warehouse/tablespace/external/hive/longtimedata'

和select statement

SELECT channels[0].name,channels.unit,channels[0].`values`.`time` FROM longtimedata_v4;

输出:请注意列名

INFO  : OK
+---------+--------+----------------------------------------------------+
|  name   |  unit  |                        time                        |
+---------+--------+----------------------------------------------------+
| U1 RMS  | ["V"]  | ["2022-10-24T00:00:00Z","2022-10-24T00:10:00Z","2022-10-24T00:20:00Z","2022-10-24T00:30:00Z","2022-10-24T00:40:00Z","2022-10-24T00:50:00Z","2022-10-24T01:00:00Z","2022-10-24T01:10:00Z","2022-10-24T01:20:00Z","2022-10-24T01:30:00Z","2022-10-24T01:40:00Z","2022-10-24T01:50:00Z","2022-10-24T02:00:00Z","2022-10-24T02:10:00Z","2022-10-24T02:20:00Z","2022-10-24T02:30:00Z","2022-10-24T02:40:00Z","2022-10-24T02:50:00Z","2022-10-24T03:00:00Z","2022-10-24T03:10:00Z","2022-10-24T03:20:00Z","2022-10-24T03:30:00Z","2022-10-24T03:40:00Z","2022-10-24T03:50:00Z","2022-10-24T04:00:00Z","2022-10-24T04:10:00Z","2022-10-24T04:20:00Z","2022-10-24T04:30:00Z","2022-10-24T04:40:00Z","2022-10-24T04:50:00Z","2022-10-24T05:00:00Z","2022-10-24T05:10:00Z","2022-10-24T05:20:00Z","2022-10-24T05:30:00Z","2022-10-24T05:40:00Z","2022-10-24T05:50:00Z","2022-10-24T06:00:00Z","2022-10-24T06:10:00Z","2022-10-24T06:20:00Z","2022-10-24T06:30:00Z","2022-10-24T06:40:00Z"] |
+---------+--------+----------------------------------------------------+

相关内容

  • 没有找到相关文章

最新更新