识别重复记录组(TSQL)



我在MSSQL数据库中有一个巨大的表(超过95 000 000条记录(

id
1 2 3 4 5 6 7 8 9

此查询对95M条记录的性能可能并不理想,但这应该可以做到。

查找包含多行的组的精确匹配

DROP TABLE IF EXISTS #Config
CREATE TABLE #Config 
(id int
,configuration_id int
,equipment_group_id int
,name VARCHAR(100)
,price INT
)
INSERT INTO #Config
VALUES
(1,1,100,'item1',10)
,(2,1,100,'item2',20)
,(3,1,100,'item3',30)
,(4,2,100,'item1',10)
,(5,2,100,'item2',20)
,(6,2,100,'item3',30)
,(7,3,100,'item1',10)
,(8,3,100,'item2',20)
,(9,3,100,'item3',31)

;WITH cte_ConfigCount AS (
SELECT *,ConfigTotalRowCnt = COUNT(*) OVER (PARTITION BY A.configuration_id) /*Counts how many rows in each config*/
FROM #Config AS A
)
SELECT 
A.configuration_id
,B.configuration_id
,TextDescription = CONCAT('Config #',A.configuration_id,' matches Config #',B.configuration_id)
,A.ConfigTotalRowCnt
,RowsMatch = COUNT(*)
FROM cte_ConfigCount AS A
INNER JOIN cte_ConfigCount AS B
ON A.configuration_id < B.configuration_id /*Don't join to self and only join 1 way (so don't have one row with A-B and another row with B-A)*/
AND a.equipment_group_id = B.equipment_group_id
AND A.name = B.name
AND A.price = B.price
GROUP BY A.configuration_id,A.ConfigTotalRowCnt,B.configuration_id
HAVING A.ConfigTotalRowCnt = COUNT(*) /*Only return where the total row for the config matches the rows where the configs match*/

最新更新