你好,我想为Set Cover问题实现一个T-SQL查询,但一直无法找到关于如何在SQL中做到这一点的任何提示。
在我的情况下,我的表只有两列(IDnumber
和Mut
),我想找到IDNumber
的最小数量,以获得每个Mut
的一个。我真的想要每个Mut
获得三个IDnumbers
,但我想我最好从一个开始,因为这可能更容易。
DECLARE @myTable TABLE (
IDnumber int,
Mut varchar(1))
INSERT @myTable VALUES
(1,'C'), (1,'N'), (1,'Z'), (1,'M'), (1,'E'), (2,'E'), (3,'B'), (3,'N'), (3,'D'), (3,'K'),
(3,'W'), (4,'O'), (4,'G'), (4,'N'), (4,'B'), (4,'U'), (4,'C'), (5,'Q'), (5,'H'), (6,'K'),
(6,'Y'), (6,'M'), (6,'A'), (6,'O'), (6,'U'), (6,'J'), (7,'H'), (7,'U'), (7,'M'), (7,'L'),
(8,'B'), (8,'K'), (8,'P'), (9,'Y'), (9,'K'), (10,'Z'), (11,'R'), (12,'X'), (12,'R'),
(12,'O'), (12,'Z'), (4,'C'), (1,'Z'), (4,'S'), (6,'E'), (5,'G'), (4,'C'), (4,'S'), (4,'H'),
(6,'D'), (7,'W'), (3,'U'), (6,'N'), (7,'Y'), (6,'N'), (6,'F'), (4,'C'), (4,'I'), (7,'P'),
(10,'H'), (10,'Z'), (10,'S'), (7,'Z'), (6,'B'), (7,'Z'), (8,'X'), (8,'J'), (8,'P'), (10,'K'),
(8,'K'), (12,'P'), (8,'W'), (10,'M'), (12,'F'), (9,'T'), (9,'D'), (14,'Y'), (12,'P'),
(14,'J'), (13,'D'), (15,'H'), (12,'J'), (6,'H'), (2,'Z'), (8,'G'), (10,'Q'), (6,'D'),
(5,'X'), (9,'T'), (6,'W'), (6,'K'), (10,'W'), (7,'J'), (11,'W'), (12,'V'), (9,'F'), (7,'F'),
(4,'M'), (5,'K'), (12,'G'), (12,'T'), (15,'T'), (13,'W'), (7,'J'), (9,'T'), (10,'U'), (9,'S'),
(10,'L'), (10,'J'), (10,'H'), (11,'H'), (12,'S'), (12,'A'), (14,'L'), (13,'K'), (13,'D'),
(4,'M'), (3,'N'), (4,'F'), (7,'M'), (7,'V'), (5,'R'), (4,'K'), (5,'F'), (7,'G'), (8,'M'),
(4,'X'), (7,'F'), (9,'S'), (7,'N'), (6,'W'), (6,'W'), (5,'S'), (9,'Z'), (10,'I'), (11,'Y'),
(11,'D'), (9,'X'), (7,'G'), (9,'S'), (9,'H'), (9,'T'), (8,'J'), (10,'U'), (9,'F'), (9,'S'),
(7,'D'), (14,'R'), (10,'F'), (7,'E'), (15,'M'), (12,'F'), (5,'C'), (8,'E'), (16,'G'), (11,'V'),
(10,'I'), (12,'I'), (11,'Y'), (12,'I'), (14,'J'), (15,'D'), (19,'J'), (16,'B'), (12,'G'),
(9,'J'), (18,'J'), (18,'C'), (16,'Q'), (18,'P'), (13,'F'), (19,'T'), (15,'J'), (15,'R'),
(15,'Q'), (15,'O'), (11,'A'), (24,'B'), (19,'S'), (22,'I'), (15,'X'), (20,'T'), (15,'E'),
(9,'V'), (8,'H'), (16,'N'), (17,'H')
-- Since the above list was generated by a bunch of random numbers/letters I need to
-- delete the duplicates
;WITH cte AS (
SELECT IDnumber, mut,
row_number() OVER(PARTITION BY IDNumber, mut ORDER BY IDNumber) AS [rn]
FROM @myTable
)
DELETE cte WHERE [rn] > 1
SELECT *
FROM ( SELECT IDnumber, Mut FROM @myTable) AS S
PIVOT
(COUNT(Mut) FOR mut IN ([A],[B],[C],[D],[E],[F],[G],[H],[I],[J],[K],[L],[M],[N],[O],[P],
[Q],[R],[S],[T],[U],[V],[W],[X],[Y],[Z])) AS pvt
因此,您可以从数据透视表中看到,最小的IDnumbers
将是3,5,7和12。
如何实现这个算法?在我看来,我可以找到所有的组合(2^6)它们可能是集合然后确定哪些集合有所有的mut。具有最小数量的id的集合就是最小集合。
这种蛮力可能有效,但效率极低。我的真实世界案例并不庞大,我有43个独特的Muts
(不是例子中的9个)和~2000个IDnumbers
,但我认为这需要一些时间来运行,因为2^2000是相当大的…
谢谢!
下面的方法为每个IDNumber
计算Mut
值的位掩码,然后使用递归CTE生成完成集合的所有可能组合。该代码输出所有可能的IdNumber
组合,给出最终结果,不包括那些一个或多个IdNumber
在组合中冗余的(例如样本数据集中的1,2,3,4,5,6
)。
要将其转换为实际数据,主要区别在于您几乎肯定需要找到另一种机制来将位值分配给Mut
值。(我可以在这里欺骗一下,通过操作每个Mut
字母的十进制ASCII码来计算位值- POWER(2,ASCII(Mut) - 64)
)。
DECLARE @myTable TABLE (
IDnumber int,
Mut varchar(1))
INSERT @myTable VALUES
(1,'A'),(1,'B'),(1,'C'),(1,'D'),
(2,'A'),(2,'C'),(2,'D'),
(3,'A'),(3,'B'),(3,'F'),(3,'Z'),
(4,'A'),(4,'B'),(4,'E'),(4,'F'),
(5,'Y'),
(6,'X'),(6,'Z')
-- calculate the target bitmask
DECLARE @target bigint = ( SELECT SUM(POWER(2,ASCII(Mut) - 64))
FROM (SELECT DISTINCT Mut FROM @myTable) AS x
)
;WITH baseCTE
AS
(
--calculate a starting bitmask for each ID
SELECT IDnumber, sum(mutbit) AS bitmask
FROM (SELECT DISTINCT IDnumber, CAST(POWER(2,ASCII(Mut) - 64) AS bigint) AS mutbit FROM @myTable) AS x
GROUP BY IDnumber
)
,recCTE
AS
(
SELECT IDnumber, bitmask, CAST(IdNumber AS varchar(max)) AS trail, 1 AS lev
FROM baseCTE
UNION ALL
SELECT b.IDnumber, b.bitmask | r.bitmask, CAST(CONCAT(r.trail,',',b.IDnumber) AS varchar(max)), r.lev + 1
FROM recCTE AS r
JOIN baseCTE AS b
ON b.IDnumber > r.IDnumber
WHERE b.bitmask | r.bitmask <> r.bitmask --ignore combinations which don't add any new Mut values
)
SELECT trail, lev
FROM recCTE
WHERE bitmask = @target
ORDER BY lev
NB。SQL Server位运算符仅在一个或另一个值是整数类型的情况下工作,因此此解决方案仅限于64个不同的Mut
值(其中掩码是bigint
) -要使其超出此范围,您必须使用自定义的位比较器(可能使用CLR)。但是,由于问题提到您有43个Mut
值,因此现在可能适合您。
我想要一个更大的数据集来测试,但这与你的结果相匹配,至少是给定的数据集…
DECLARE @myTable TABLE (
IDnumber INT,
Mut VARCHAR(1))
DECLARE @results TABLE (
IDnumber INT)
INSERT @myTable VALUES
(1,'A'),(1,'B'),(1,'C'),(1,'D'),
(2,'A'),(2,'C'),(2,'D'),
(3,'A'),(3,'B'),(3,'F'),(3,'Z'),
(4,'A'),(4,'B'),(4,'E'),(4,'F'),
(5,'Y'),
(6,'X'),(6,'Z')
DECLARE @IDnumber INT
WHILE EXISTS (SELECT 1 FROM @myTable)
BEGIN
WITH MutRank AS
(
-- Find how many IDNumbers are associated with each mut
SELECT Mut,
COUNT(DISTINCT IDnumber) AS IDnumberCnt
FROM @myTable
GROUP BY Mut
), MutRarity AS
(
-- Translate the IDNumberCnt into a weighted rarity so dupes at
-- a IDNumberCnt level reduce their rarity compared to the next level
SELECT Mut,
RANK() OVER (ORDER BY IDnumberCnt DESC) AS MutRarity
FROM MutRank
)
-- Grab the IDNumber that is associated to the most "rare" muts
SELECT @IDnumber = n.IDnumber
FROM (SELECT TOP 1 m.IDnumber,
SUM(MutRarity) AS IDNumberValue
FROM @myTable m
JOIN MutRarity mr
ON m.Mut = mr.Mut
GROUP BY m.IDnumber
ORDER BY IDNumberValue DESC, IDnumber) n
-- Save the number in our results
INSERT @results (IDnumber)
SELECT @IDnumber
-- Purge all records for the "covered" muts and the selected IDNumber
DELETE m2
FROM @myTable m1
JOIN @myTable m2
ON m1.Mut = m2.Mut
AND m1.IDnumber = @IDnumber
END
SELECT *
FROM @results
ORDER BY IDnumber