使用下面类似的数据集,我可以通过使用扫描运算符来获得所需的输出,以填充测试数据集中的前向字符串/布尔,但对于更大的数据集来说,这是超时的,因为每个属性都有许多事件,并且有数百万个属性。寻找更好/更快的方法。
let T = datatable(PropertyId:int, Tenant:string, Owner:string, NoisyNeighbour:bool , PropertyTitle:string, EventDate:datetime )
[
1, "", "", bool(0),"",datetime(2022-08-01 00:00),
1, "", "abc", bool(null),"",datetime(2022-08-01 01:00),
1, "X","", bool(null),"Title updated",datetime(2022-08-02 00:00),
1, "X", "cde",bool(null),"",datetime(2022-08-03 00:00),
1, "A1", "",bool(null),"",datetime(2022-08-03 00:00),
1, "A2", "",bool(null),"",datetime(2022-08-03 02:00),
1, "A2", "def",bool(null),"",datetime(2022-08-03 03:00),
1, "B", "", bool(null),"",datetime(2022-08-04 00:00),
1, "C","", bool(1),"",datetime(2022-08-05 00:00),
1, "D", "xyz",bool(null),"",datetime(2022-08-06 00:00),
]; T
预期结果:用NoisyNeighbor标记房产的租户,标记噪音的日期,房产的当前所有者,房产所有权
基于arg_max((
- 对于每个属性,查找最后一次用NN标记的时间
- 对于每个属性,只获取NN事件之前的事件(包括其自身(
- 对于每个属性,取
Tenant
&最后一个事件(用NN标记的事件(的CCD_ 2和最后一个CCD_;不为空的PropertyTitle
let T = datatable(PropertyId:int, Tenant:string, Owner:string, NoisyNeighbour:bool , PropertyTitle:string, EventDate:datetime )
[
1 ,"" ,"" ,false ,"" ,datetime(2022-08-01 00:00)
,1 ,"" ,"abc" ,bool(null) ,"" ,datetime(2022-08-01 01:00)
,1 ,"X" ,"" ,bool(null) ,"Title updated" ,datetime(2022-08-02 00:00)
,1 ,"X" ,"cde" ,bool(null) ,"" ,datetime(2022-08-03 00:00)
,1 ,"A1" ,"" ,bool(null) ,"" ,datetime(2022-08-03 00:00)
,1 ,"A2" ,"" ,bool(null) ,"" ,datetime(2022-08-03 02:00)
,1 ,"A2" ,"def" ,bool(null) ,"" ,datetime(2022-08-03 03:00)
,1 ,"B" ,"" ,bool(null) ,"" ,datetime(2022-08-04 00:00)
,1 ,"C" ,"" ,true ,"" ,datetime(2022-08-05 00:00)
,1 ,"D" ,"xyz" ,bool(null) ,"" ,datetime(2022-08-06 00:00)
];
T
| where NoisyNeighbour
| summarize max(EventDate) by PropertyId
| join kind=inner T on PropertyId
| where EventDate <= max_EventDate
| summarize arg_max(EventDate, Tenant, NoisyNeighbour)
,arg_max(iff(isnotempty(Owner) ,EventDate ,datetime(null)) ,Owner)
,arg_max(iff(isnotempty(PropertyTitle) ,EventDate ,datetime(null)) ,PropertyTitle)
by PropertyId
| project-away max*
PropertyId | 事件日期 | 租户NoisyNeighbor所有者 | >PropertyTitle |
---|---|---|---|
1 | 2022-08-05T00:00Z | C | truedef标题已更新
分区运算符用于并行化扫描运算符的工作。
let T = datatable(PropertyId:int, Tenant:string, Owner:string, NoisyNeighbour:bool , PropertyTitle:string, EventDate:datetime )
[
1 ,"" ,"" ,false ,"" ,datetime(2022-08-01 00:00)
,1 ,"" ,"abc" ,bool(null) ,"" ,datetime(2022-08-01 01:00)
,1 ,"X" ,"" ,bool(null) ,"Title updated" ,datetime(2022-08-02 00:00)
,1 ,"X" ,"cde" ,bool(null) ,"" ,datetime(2022-08-03 00:00)
,1 ,"A1" ,"" ,bool(null) ,"" ,datetime(2022-08-03 00:00)
,1 ,"A2" ,"" ,bool(null) ,"" ,datetime(2022-08-03 02:00)
,1 ,"A2" ,"def" ,bool(null) ,"" ,datetime(2022-08-03 03:00)
,1 ,"B" ,"" ,bool(null) ,"" ,datetime(2022-08-04 00:00)
,1 ,"C" ,"" ,true ,"" ,datetime(2022-08-05 00:00)
,1 ,"D" ,"xyz" ,bool(null) ,"" ,datetime(2022-08-06 00:00)
];
T
| partition hint.strategy=native by PropertyId
(
order by EventDate asc
| scan with
(
step s1 : true => Owner = coalesce(Owner , s1.Owner)
,PropertyTitle = coalesce(PropertyTitle, s1.PropertyTitle);
)
| where NoisyNeighbour
)
PropertyId | 租户 | 所有者NoisyNeighborPropertyTitle | 事件日期 |
---|---|---|---|
1 | C | def | true更新标题2022-08-05T000:00:00Z |