我有一个具有如下数据的表格,
id code data1 data2 country
1 1 A NULL IND
1 1 B B NZ
1 1 CA
1 1 C Z WI
1 1 D S UK
2 2 NULL NULL IND
2 2 S NULL NZ
2 2 NULL K CA
2 2 T T WI
2 2 R K UK
3 3 NULL A WI
3 3 NULL a UK
记录将根据国家领域的优先级进行人口。优先级是Ind,NZ,CA,WI,UK
如果data1中有任何空,data2字段数据将从下一个优先级记录中填充。
所以,我的预期结果是:目标表:
id code data1 data2 country
1 1 A B IND
2 2 S K IND
3 3 NULL A WI
任何人都可以帮助我查询以实现上述结果集。
我添加了更多的行,以更好地了解查询。
Hive具有first_value()
函数,可用于此目的:
select distinct id, code,
first_value(data1) over (partition by id, code
order by (case when data1 is not null then 1 else 2 end),
(case country when 'IND' then 1 when 'NZ' then 2 when 'CA' then 3 when 'WI' then 4 when 'UK' then 5 else 6 end)
) as data1,
first_value(data2) over (partition by id, code
order by (case when data2 is not null then 1 else 2 end),
(case country when 'IND' then 1 when 'NZ' then 2 when 'CA' then 3 when 'WI' then 4 when 'UK' then 5 else 6 end)
) as data2,
first_value(country) over (partition by id, code
order by (case when data1 is not null then 1 else 2 end),
(case country when 'IND' then 1 when 'NZ' then 2 when 'CA' then 3 when 'WI' then 4 when 'UK' then 5 else 6 end)
) as country
from t;
我不是带有窗口功能的select distinct
的忠实拥护者。在这种情况下,这似乎是最简单的解决方案。
用例获得优先级并在其上使用first_value。
select id, max(code), max(data1), max(data2), max(country)
from (
select
id,
code,
first_value(data1) over (partition by id
order by case when data1 is null or data1 = '' then 1 else 0 end * 10 + priority) data1,
first_value(data2) over (partition by id
order by case when data2 is null or data2 = '' then 1 else 0 end * 10 + priority) data2,
first_value(country) over (partition by id
order by case when country is null or country = '' then 1 else 0 end * 10 + priority) country
from (
select
t.*,
case country
when 'IND' then 1
when 'NZ' then 2
when 'CA' then 3
when 'WI' then 4
when 'UK' then 5
end priority
from your_table t
) t
) t group by id;
生产:
ID MAX(CODE) MAX(DATA1) MAX(DATA2) MAX(COUNTRY)
1 1 A B IND
2 2 S K IND
3 3 NULL A WI
编辑:
您可以使用FIELD
函数(在Hive,MySQL中可用)来产生@Dudu在下面的评论中建议的优先级:
field(country,'IND','NZ','CA','WI','UK')
请参阅:
- https://cwiki.apache.org/confluence/display/hive/languagemanual dudf
基于struct min的另一种方法。
对于我正在使用函数field
(field(country,'IND','NZ','CA','WI','UK')
)的顺序。
由于缺少它,我将其添加到了文档中。https://cwiki.apache.org/confluence/display/hive/languagemanual dudf
select id
,min (code) as code
,min (case when coalesce(trim(data1),'') <> '' then struct(field(country,'IND','NZ','CA','WI','UK'),data1) end).col2 as data1
,min (case when coalesce(trim(data2),'') <> '' then struct(field(country,'IND','NZ','CA','WI','UK'),data2) end).col2 as data2
,min (struct(field(country,'IND','NZ','CA','WI','UK'),country)).col2 as country
from mytable
group by id
order by id
;
演示
create table mytable
(
id int
,code int
,data1 string
,data2 string
,country string
);
insert into mytable values
(1 ,1 ,'A' ,NULL ,'IND')
,(1 ,1 ,'B' ,'B' ,'NZ' )
,(1 ,1 ,'' ,'' ,'CA' )
,(1 ,1 ,'C' ,'Z' ,'WI' )
,(1 ,1 ,'D' ,'S' ,'UK' )
,(2 ,2 ,NULL ,NULL ,'IND')
,(2 ,2 ,'S' ,NULL ,'NZ' )
,(2 ,2 ,NULL ,'K' ,'CA' )
,(2 ,2 ,'T' ,'T' ,'WI' )
,(2 ,2 ,'R' ,'K' ,'UK' )
,(3 ,3 ,NULL ,'A' ,'WI' )
,(3 ,3 ,NULL ,'a' ,'UK' )
;
select id
,min (code) as code
,min (case when coalesce(trim(data1),'') <> '' then struct(field(country,'IND','NZ','CA','WI','UK'),data1) end).col2 as data1
,min (case when coalesce(trim(data2),'') <> '' then struct(field(country,'IND','NZ','CA','WI','UK'),data2) end).col2 as data2
,min (struct(field(country,'IND','NZ','CA','WI','UK'),country)).col2 as country
from mytable
group by id
order by id
;
+----+------+-------+-------+---------+
| id | code | data1 | data2 | country |
+----+------+-------+-------+---------+
| 1 | 1 | A | B | IND |
| 2 | 2 | S | K | IND |
| 3 | 3 | NULL | A | WI |
+----+------+-------+-------+---------+