我见过类似于我的问题的解决方案,但没有一个对我很有效。我也相信应该有办法让它发挥作用。
给定一张带有的表格
ID | 日期 | 目标|
---|---|---|
1 | 20200-01-01 | 1|
1 | 2020-01-02 | <1>|
1 | 22020-01-03 | 0 |
1 | 20200-01-04 | <1>|
1 | 22020-01-04 | 0 |
1 | 20200-06-01 | <1>|
1 | 20200-06-02 | <1>|
1 | 20200-06-03 | 0 |
1 | 20200-06-04 | <1>|
1 | 20200-06-04 | 0 |
2 | 2020-01-01 | 1 |
如果你可以将月数近似为天数,那么你可以在Hive:中使用窗口函数
select id, date,
count(*) over(
partition by id
order by unix_timestamp(date)
range 60 * 60 * 24 * 90 preceding -- 90 days
) as count_3,
sum(target) over(
partition by id
order by unix_timestamp(date)
range 60 * 60 * 24 * 90 preceding
) as sum_3,
count(*) over(
partition by id
order by unix_timestamp(date)
range 60 * 60 * 24 * 360 preceding -- 360 days
) as count_12,
sum(target) over(
partition by id
order by unix_timestamp(date)
range 60 * 60 * 24 * 360 preceding
) as sum_12
from mytable
您可以在同一查询中聚合:
select id, date,
sum(count(*)) over(
partition by id
order by unix_timestamp(date)
range 60 * 60 * 24 * 90 preceding -- 90 days
) as count_3,
sum(sum(target)) over(
partition by id
order by unix_timestamp(date)
range 60 * 60 * 24 * 90 preceding
) as sum_3,
sum(count(*)) over(
partition by id
order by unix_timestamp(date)
range 60 * 60 * 24 * 360 preceding -- 360 days
) as count_12,
sum(sum(target)) over(
partition by id
order by unix_timestamp(date)
range 60 * 60 * 24 * 360 preceding
) as sum_12
from mytable
group by id, date, unix_timestamp(date)
如果您可以估计间隔(1个月=30天(:(GMB答案的改进(
with t as (
select ID, Date,
sum(target) target,
count(target) c_target
from table
group by ID, Date
)
select ID, Date,
sum(target) over(
partition by ID
order by unix_timestamp(Date, 'yyyy-MM-dd')
range 60 * 60 * 24 * 90 preceding
) sum_3,
sum(c_target) over(
partition by ID
order by unix_timestamp(Date, 'yyyy-MM-dd')
range 60 * 60 * 24 * 90 preceding
) count_3,
sum(target) over(
partition by ID
order by unix_timestamp(Date, 'yyyy-MM-dd')
range 60 * 60 * 24 * 360 preceding
) sum_12,
sum(c_target) over(
partition by ID
order by unix_timestamp(Date, 'yyyy-MM-dd')
range 60 * 60 * 24 * 360 preceding
) count_12
from t
或者,如果你想要精确的间隔,你可以进行自联接(但代价高昂(:
with t as (
select ID, Date,
sum(target) target,
count(target) c_target
from table
group by ID, Date
)
select
t_3month.ID,
t_3month.Date,
t_3month.sum_3,
t_3month.count_3,
sum(t3.target) sum_12,
sum(t3.c_target) count_12
from (
select
t1.ID,
t1.Date,
sum(t2.target) sum_3,
sum(t2.c_target) count_3
from t t1
left join t t2
on t2.Date > t1.Date - interval 3 month and
t2.Date <= t1.Date and
t1.ID = t2.ID
group by t1.ID, t1.Date
) t_3month
left join t t3
on t3.Date > t_3month.Date - interval 12 month and
t3.Date <= t_3month.Date and
t_3month.ID = t3.ID
group by t_3month.ID, t_3month.Date, t_3month.sum_3, t_3month.count_3
order by ID, Date;