HIVE-根据日期计算具有窗口的分区的统计信息



我见过类似于我的问题的解决方案,但没有一个对我很有效。我也相信应该有办法让它发挥作用。

给定一张带有的表格

目标1<1><1><1><1><1>
ID 日期
1 20200-01-01
1 2020-01-02
1 22020-01-03 0
1 20200-01-04
1 22020-01-04 0
1 20200-06-01
1 20200-06-02
1 20200-06-03 0
1 20200-06-04
1 20200-06-04 0
2 2020-01-01 1

如果你可以将月数近似为天数,那么你可以在Hive:中使用窗口函数

select id, date, 
count(*) over(
partition by id 
order by unix_timestamp(date)
range 60 * 60 * 24 * 90 preceding -- 90 days
) as count_3,
sum(target) over(
partition by id 
order by unix_timestamp(date)
range 60 * 60 * 24 * 90 preceding
) as sum_3,
count(*) over(
partition by id 
order by unix_timestamp(date)
range 60 * 60 * 24 * 360 preceding -- 360 days
) as count_12,
sum(target) over(
partition by id 
order by unix_timestamp(date)
range 60 * 60 * 24 * 360 preceding
) as sum_12
from mytable

您可以在同一查询中聚合:

select id, date, 
sum(count(*)) over(
partition by id 
order by unix_timestamp(date)
range 60 * 60 * 24 * 90 preceding -- 90 days
) as count_3,
sum(sum(target)) over(
partition by id 
order by unix_timestamp(date)
range 60 * 60 * 24 * 90 preceding
) as sum_3,
sum(count(*)) over(
partition by id 
order by unix_timestamp(date)
range 60 * 60 * 24 * 360 preceding -- 360 days
) as count_12,
sum(sum(target)) over(
partition by id 
order by unix_timestamp(date)
range 60 * 60 * 24 * 360 preceding
) as sum_12
from mytable
group by id, date, unix_timestamp(date)

如果您可以估计间隔(1个月=30天(:(GMB答案的改进(

with t as (
select ID, Date,
sum(target) target,
count(target) c_target
from table
group by ID, Date
)
select ID, Date,
sum(target) over(
partition by ID
order by unix_timestamp(Date, 'yyyy-MM-dd')
range 60 * 60 * 24 * 90 preceding
) sum_3,
sum(c_target) over(
partition by ID
order by unix_timestamp(Date, 'yyyy-MM-dd')
range 60 * 60 * 24 * 90 preceding
) count_3,
sum(target) over(
partition by ID
order by unix_timestamp(Date, 'yyyy-MM-dd')
range 60 * 60 * 24 * 360 preceding
) sum_12,
sum(c_target) over(
partition by ID
order by unix_timestamp(Date, 'yyyy-MM-dd')
range 60 * 60 * 24 * 360 preceding
) count_12
from t

或者,如果你想要精确的间隔,你可以进行自联接(但代价高昂(:

with t as (
select ID, Date,
sum(target) target,
count(target) c_target
from table
group by ID, Date
)
select
t_3month.ID, 
t_3month.Date, 
t_3month.sum_3, 
t_3month.count_3, 
sum(t3.target) sum_12, 
sum(t3.c_target) count_12
from (
select 
t1.ID, 
t1.Date,
sum(t2.target) sum_3,
sum(t2.c_target) count_3
from t t1
left join t t2
on t2.Date > t1.Date - interval 3 month and
t2.Date <= t1.Date and
t1.ID = t2.ID
group by t1.ID, t1.Date
) t_3month
left join t t3
on t3.Date > t_3month.Date - interval 12 month and
t3.Date <= t_3month.Date and
t_3month.ID = t3.ID
group by t_3month.ID, t_3month.Date, t_3month.sum_3, t_3month.count_3
order by ID, Date;

最新更新