我有一个查询:
select y, m, count(distinct id) as y_m_cnt, *other columns* from tab group by y, m, *other columns*
),
t2 as (
select y, count(distinct id) as y_cnt, *other columns* from tab group by y, *other columns*
)
select * t1.y, t1.m, t1.y_m_cnt, t2.y_cnt, t1.*other columns* from t1 left join t2 on t1.y=t2.y, t1.*other columns* = t2.*other columns*
其中y为年,m为月。我的想法是,我想在两个不同的聚合中计数唯一的id,它们只在周期中不同(一个是年+月,另一个是年)。上面的代码是工作和几乎ok,但我不喜欢它。是否有可能重写这段代码并使其更短?
注:这段代码是简化的。实际上它有更多的列,所以我需要选择相同的列两次,然后我有非常大的连接条件(查询超过100行)。这就是为什么我要简化它。找到了一些分区的解决方案,但是它们没有解决唯一id的问题。
编辑:希望它能在没有完整查询的情况下工作,但似乎不是。问题是我只需要year_count,因为聚合没有mm, dd。有可能使这个查询更短吗?
with full_count as (
select param1,
yy,
mm,
dd,
param2,
param3,
param4,
count(distinct id1) as cnt_id1,
count(distinct id2) as cnt_id2,
count(distinct id3) as cnt_id3,
cast(null as string) as param5,
cast(null as string) as param6,
cast(null as string) as param7,
cast(null as string) as param8,
param9,
param10,
param11,
param12,
param13,
param14,
param15,
param16,
param17,
param18,
param19,
param20,
param21,
param22
from table
group by param1,
yy,
mm,
dd,
param2,
param3,
param4,
param9,
param10,
param11,
param12,
param13,
param14,
param15,
param16,
param17,
param18,
param19,
param20,
param21,
param22
),
year_count as (
select param1,
yy,
param2,
param3,
param4,
count(distinct id1) as cnt_id1_yy,
count(distinct id2) as cnt_id2_yy,
count(distinct id3) as cnt_id3_yy,
cast(null as string) as param5,
cast(null as string) as param6,
cast(null as string) as param7,
cast(null as string) as param8,
param9,
param10,
param11,
param12,
param13,
param14,
param15,
param16,
param17,
param18,
param19,
param20,
param21,
param22
from table
group by param1,
yy,
param2,
param3,
param4,
param9,
param10,
param11,
param12,
param13,
param14,
param15,
param16,
param17,
param18,
param19,
param20,
param21,
param22
)
select fc.param1 as param1,
fc.yy as yy,
fc.mm as mm,
fc.dd as dd,
fc.param2 as param2,
fc.param3 as param3,
fc.param4 as param4,
fc.cnt_id1 as cnt_id1,
fc.cnt_id2 as cnt_id2,
fc.cnt_id3 as cnt_id3,
yc.cnt_id1_yy as cnt_id1_yy,
yc.cnt_id2_yy as cnt_id2_yy,
yc.cnt_id3_yy as cnt_id3_yy,
fc.param5 as param5,
fc.param6 as param6,
fc.param7 as param7,
fc.param8 as param8,
fc.param9 as param9,
fc.param10 as param10,
fc.param11 as param11,
fc.param12 as param12,
fc.param13 as param13,
fc.param14 as param14,
fc.param15 as param15,
fc.param16 as param16,
fc.param17 as param17,
fc.param18 as param18,
fc.param19 as param19,
fc.param20 as param20,
fc.param21 as param21,
fc.param22 as param22
from full_count fc
left join year_count yc on
coalesce(fc.param1, 'tmp') = coalesce(yc.param1, 'tmp') and
coalesce(fc.yy, 'tmp') = coalesce(yc.yy, 'tmp') and
coalesce(fc.param2, 'tmp') = coalesce(yc.param2, 'tmp') and
coalesce(fc.param3, 'tmp') = coalesce(yc.param3, 'tmp') and
coalesce(fc.param4, 'tmp') = coalesce(yc.param4, 'tmp') and
coalesce(fc.param6, 'tmp') = coalesce(yc.param6, 'tmp') and
coalesce(fc.param9, 'tmp') = coalesce(yc.param9, 'tmp') and
coalesce(fc.param10, 'tmp') = coalesce(yc.param10, 'tmp') and
coalesce(fc.param11, 'tmp') = coalesce(yc.param11, 'tmp') and
coalesce(fc.param12, 'tmp') = coalesce(yc.param12, 'tmp') and
coalesce(fc.param13, 'tmp') = coalesce(yc.param13, 'tmp') and
coalesce(fc.param14, 'tmp') = coalesce(yc.param14, 'tmp') and
coalesce(fc.param15, 'tmp') = coalesce(yc.param15, 'tmp') and
coalesce(fc.param16, 'tmp') = coalesce(yc.param16, 'tmp') and
coalesce(fc.param17, 'tmp') = coalesce(yc.param17, 'tmp') and
coalesce(fc.param18, 'tmp') = coalesce(yc.param18, 'tmp') and
coalesce(fc.param19, 'tmp') = coalesce(yc.param19, 'tmp') and
coalesce(fc.param20, 'tmp') = coalesce(yc.param20, 'tmp') and
coalesce(fc.param21, 'tmp') = coalesce(yc.param21, 'tmp') and
coalesce(fc.param22, 'tmp') = coalesce(yc.param22, 'tmp')
您不指定您正在使用的数据库,但许多支持count(distinct)
作为窗口函数:
select t.*,
count(distinct id) over (partition by y) as y_unique,
count(distinct id) over (partition by y, m) as ym_unique
from tab t;
在不支持distinct
的窗口函数的数据库中有解决方法。