我需要一个连续日期的唯一ID
源表
SNAPSHOT_DATE | CHANNEL | CASE_ID|
---|---|---|
2022-10-18 | 网页 | 521nzT3HQA|
2022-19 | 网页 | 521nzT3HQA |
2022-10-20 | web | 521nzT3HQA |
2022-10-23 | web | 521nzT3HQA |
2022-10-24 | web | 521nzT3HQA |
2022-10-25 | web | 521nzT3HQA |
2022-10-18 | 电话 | 521nzT3HQA//tr>|
2022-19 | 电话 | 521nzT3HQA |
2022-10-21 | 电话 | 521nzT3HQA |
2022-10-22 | 电话 | 521nzT3HQA |
2022-10-18 | 电话 | 52LnlJQAS |
2022-10-26 | 电话 | 52LnlJQAS |
2022-10-20 | 电话 | 521nzT3HQA |
2022-10-24 | 电话 | 521nzT3HQA|
2022-10-25 | 电话 | 521nzT3HQA
这是搜索间隙的典型问题。
像这样的东西应该起作用:
WITH sample_data AS (
SELECT SNAPSHOT_DATE::date AS SNAPSHOT_DATE, CHANNEL, CASE_ID
FROM (VALUES ('2022-10-18', 'web', '521nzT3HQA'),
('2022-10-19', 'web', '521nzT3HQA'),
('2022-10-20', 'web', '521nzT3HQA'),
('2022-10-23', 'web', '521nzT3HQA'),
('2022-10-24', 'web', '521nzT3HQA'),
('2022-10-25', 'web', '521nzT3HQA'),
('2022-10-18', 'phone', '521nzT3HQA'),
('2022-10-19', 'phone', '521nzT3HQA'),
('2022-10-21', 'phone', '521nzT3HQA'),
('2022-10-22', 'phone', '521nzT3HQA'),
('2022-10-18', 'phone', '52LnlJQAS' ),
('2022-10-26', 'phone', '52LnlJQAS' ),
('2022-10-20', 'phone', '521nzT3HQA'),
('2022-10-24', 'phone', '521nzT3HQA'),
('2022-10-25', 'phone', '521nzT3HQA')) T(SNAPSHOT_DATE, CHANNEL, CASE_ID)
), grp AS (
SELECT SNAPSHOT_DATE, CHANNEL, CASE_ID
, MIN(SNAPSHOT_DATE) OVER(PARTITION BY CASE_ID, CHANNEL) AS MIN_SNAPSHOT_DATE
, DENSE_RANK() OVER(PARTITION BY CASE_ID, CHANNEL ORDER BY SNAPSHOT_DATE)
+ DATEDIFF(DAY, SNAPSHOT_DATE, MIN_SNAPSHOT_DATE) AS DIFF
FROM sample_data
)
SELECT SNAPSHOT_DATE, CHANNEL, CASE_ID
, CASE_ID || CHANNEL || MIN(SNAPSHOT_DATE) OVER(PARTITION BY CASE_ID, CHANNEL, DIFF) AS ID
FROM grp
ORDER BY CASE_ID, CHANNEL, SNAPSHOT_DATE;
下面的查询解决了您的问题。它利用滞后来检测一组channel和case_id中的非连续日期。此外,使用滚动和来创建标志,该标志用于分组,以便获得连续日期范围内最早的快照日期。
select snapshot_date
, channel
, case_id
-- get the minimum date over channel, case_id and the group flag
, case_id ||channel || min(snapshot_date) over (partition by channel, case_id, grp_flag) as compound_id
from (
select sum(non_cons_dt) over( partition by channel, case_id order by snapshot_date rows between unbounded preceding and current row) as grp_flag --rolling sum as group flag, will increment on non consecutive date
, x.*
from (
select
iff(dateadd('day', 1, lag(snapshot_date) over(partition by channel, case_id order by snapshot_date)) = snapshot_date, 0, 1) as non_cons_dt -- set 1 if there is a non consecutive date within the group
, snapshot_date
, channel
, case_id
from test
) x
) y