Kusto:自联接表并从不同行获取值



使用下面类似的数据集,我可以通过使用扫描运算符来获得所需的输出,以填充测试数据集中的前向字符串/布尔,但对于更大的数据集来说,这是超时的,因为每个属性都有许多事件,并且有数百万个属性。寻找更好/更快的方法。

let T = datatable(PropertyId:int, Tenant:string, Owner:string, NoisyNeighbour:bool , PropertyTitle:string, EventDate:datetime )
[
1, "", "", bool(0),"",datetime(2022-08-01 00:00),
1, "", "abc", bool(null),"",datetime(2022-08-01 01:00),
1, "X","", bool(null),"Title updated",datetime(2022-08-02 00:00),
1, "X", "cde",bool(null),"",datetime(2022-08-03 00:00),
1, "A1", "",bool(null),"",datetime(2022-08-03 00:00),
1, "A2", "",bool(null),"",datetime(2022-08-03 02:00),
1, "A2", "def",bool(null),"",datetime(2022-08-03 03:00),
1, "B", "", bool(null),"",datetime(2022-08-04 00:00),
1, "C","", bool(1),"",datetime(2022-08-05 00:00),
1, "D", "xyz",bool(null),"",datetime(2022-08-06 00:00),
]; T

预期结果:用NoisyNeighbor标记房产的租户,标记噪音的日期,房产的当前所有者,房产所有权

基于arg_max((

  • 对于每个属性,查找最后一次用NN标记的时间
  • 对于每个属性,只获取NN事件之前的事件(包括其自身(
  • 对于每个属性,取Tenant&最后一个事件(用NN标记的事件(的CCD_ 2和最后一个CCD_;不为空的PropertyTitle

let T = datatable(PropertyId:int, Tenant:string, Owner:string, NoisyNeighbour:bool , PropertyTitle:string, EventDate:datetime )
[
1 ,""   ,""    ,false      ,""              ,datetime(2022-08-01 00:00)
,1 ,""   ,"abc" ,bool(null) ,""              ,datetime(2022-08-01 01:00)
,1 ,"X"  ,""    ,bool(null) ,"Title updated" ,datetime(2022-08-02 00:00)
,1 ,"X"  ,"cde" ,bool(null) ,""              ,datetime(2022-08-03 00:00)
,1 ,"A1" ,""    ,bool(null) ,""              ,datetime(2022-08-03 00:00)
,1 ,"A2" ,""    ,bool(null) ,""              ,datetime(2022-08-03 02:00)
,1 ,"A2" ,"def" ,bool(null) ,""              ,datetime(2022-08-03 03:00)
,1 ,"B"  ,""    ,bool(null) ,""              ,datetime(2022-08-04 00:00)
,1 ,"C"  ,""    ,true       ,""              ,datetime(2022-08-05 00:00)
,1 ,"D"  ,"xyz" ,bool(null) ,""              ,datetime(2022-08-06 00:00)
];
T
| where         NoisyNeighbour
| summarize     max(EventDate) by PropertyId
| join          kind=inner T on PropertyId
| where         EventDate <= max_EventDate
| summarize     arg_max(EventDate, Tenant, NoisyNeighbour)
,arg_max(iff(isnotempty(Owner)         ,EventDate ,datetime(null)) ,Owner) 
,arg_max(iff(isnotempty(PropertyTitle) ,EventDate ,datetime(null)) ,PropertyTitle) 
by PropertyId
| project-away  max*
租户NoisyNeighbor>truedef标题已更新
PropertyId事件日期所有者PropertyTitle
12022-08-05T00:00ZC

分区运算符用于并行化扫描运算符的工作。

let T = datatable(PropertyId:int, Tenant:string, Owner:string, NoisyNeighbour:bool , PropertyTitle:string, EventDate:datetime )
[
1 ,""   ,""    ,false      ,""              ,datetime(2022-08-01 00:00)
,1 ,""   ,"abc" ,bool(null) ,""              ,datetime(2022-08-01 01:00)
,1 ,"X"  ,""    ,bool(null) ,"Title updated" ,datetime(2022-08-02 00:00)
,1 ,"X"  ,"cde" ,bool(null) ,""              ,datetime(2022-08-03 00:00)
,1 ,"A1" ,""    ,bool(null) ,""              ,datetime(2022-08-03 00:00)
,1 ,"A2" ,""    ,bool(null) ,""              ,datetime(2022-08-03 02:00)
,1 ,"A2" ,"def" ,bool(null) ,""              ,datetime(2022-08-03 03:00)
,1 ,"B"  ,""    ,bool(null) ,""              ,datetime(2022-08-04 00:00)
,1 ,"C"  ,""    ,true       ,""              ,datetime(2022-08-05 00:00)
,1 ,"D"  ,"xyz" ,bool(null) ,""              ,datetime(2022-08-06 00:00)
];
T
|   partition hint.strategy=native by PropertyId 
(
order by EventDate asc
|   scan with 
(
step s1 : true =>   Owner           = coalesce(Owner        , s1.Owner)
,PropertyTitle   = coalesce(PropertyTitle, s1.PropertyTitle);
)
|   where NoisyNeighbour
)
所有者NoisyNeighbortrue更新标题
PropertyId租户PropertyTitle事件日期
1Cdef2022-08-05T000:00:00Z

最新更新