我试图创建一个逻辑,重新计算使用Apache Hive或Spark SQL相邻行的数据,但我不确定如何,所以我问一个问题。
12
11 1
10 2
9 3
8 4
7 5
6
hh val
1 5
2 4
3 3
4 2
5 1
6 5
7 4
8 3
9 2
10 1
11 5
12 4
重新计算逻辑为:将相邻两个时区的值相加。12点重新计算为19,将10点位置的1、11点位置的5、1点位置的5和2点位置的4加到12点位置的4。
用程序语言似乎很简单,但用SQL很难表达。如果有人知道怎么做,请帮忙。
不能表示为简单的聚合函数(sum)
我不确定我是否正确理解了您的需求,但是您可以考虑使用窗口函数
将相邻两个时区的值相加
-- sample data
WITH sample_table AS (
SELECT 1 hh, 5 val UNION ALL
SELECT 2 hh, 4 val UNION ALL
SELECT 3 hh, 3 val UNION ALL
SELECT 4 hh, 2 val UNION ALL
SELECT 5 hh, 1 val UNION ALL
SELECT 6 hh, 5 val UNION ALL
SELECT 7 hh, 4 val UNION ALL
SELECT 8 hh, 3 val UNION ALL
SELECT 9 hh, 3 val UNION ALL
SELECT 10 hh, 1 val UNION ALL
SELECT 11 hh, 5 val UNION ALL
SELECT 12 hh, 4 val
)
-- query starts here
SELECT hh, val, recalculation FROM (
SELECT *, SUM(val) OVER w AS recalculation
FROM (
SELECT 0 AS day, * FROM sample_table UNION ALL
SELECT 1 AS day, * FROM sample_table UNION ALL
SELECT 2 AS day, * FROM sample_table
) t
-- below window frame covers adjacent 2 time zones (includes 5 hours).
WINDOW w AS (ORDER BY day, hh ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING)
) t
WHERE day = 1 ORDER BY hh;
-- query result
+----+-----+---------------+
| hh | val | recalculation |
+----+-----+---------------+
| 1 | 5 | 21 |
| 2 | 4 | 18 |
| 3 | 3 | 15 |
| 4 | 2 | 15 |
| 5 | 1 | 15 |
| 6 | 5 | 15 |
| 7 | 4 | 16 |
| 8 | 3 | 16 |
| 9 | 3 | 16 |
| 10 | 1 | 16 |
| 11 | 5 | 18 |
| 12 | 4 | 19 |
+----+-----+---------------+