在R?中生成N个非常大的列表的不同排列



我有一个数字向量,比如1:8000。我想生成这个向量的n个不同的排列。有什么方法可以做到这一点,而不必计算所有排列(自8000!

function distinct_permutations(L, N){
# Return N distinct permutations of L as a list of lists
return(x)
}
x <- seq(1:8000)

使用RcppAlgos::permuteSample,这里展示了n=3个样本中的10个。

set.seed(42)
RcppAlgos::permuteSample(v=10, m=10, n=3)
#      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,]    9    8    5    3    4    1   10    6    2     7
# [2,]    1    3    5    9    7    6    8   10    2     4
# [3,]    2    8    5    9    4    7    3   10    6     1

我将实现我在评论中提出的第一个想法,并与其他用户已经提供的想法进行比较。我的解决方案更通用,因为任何一组值都可以提供(其他值必须被视为索引),我提供了一个向量列表,我的解决方案在许多情况下都更快。

当前解决方案:

  • perm_n我自己的解决方案由sampleunique驱动
  • RcppAlgos::permuteSample
  • arrangements::permutations

EDIT:添加了额外的方法

perm_n <- function(vec, n) {
# sample 2 * N samples as a risk of duplicates for bigger samples
unique(
lapply(seq_len(n * 2), function(x) sample(vec, length(vec)))
)[seq_len(n)]
}
microbenchmark::microbenchmark(
RcppAlgos::permuteSample(v=10, m=10, n=10),
perm_n(1:10, 10),
arrangements::permutations(10, 10, nsample = 10)
)
#> Warning in microbenchmark::microbenchmark(RcppAlgos::permuteSample(v = 10, :
#> less accurate nanosecond times to avoid potential integer overflows
#> Unit: microseconds
#>                                              expr     min       lq      mean
#>  RcppAlgos::permuteSample(v = 10, m = 10, n = 10) 230.174 249.7105 1605.4817
#>                                  perm_n(1:10, 10)  65.559  68.0190  182.8387
#>  arrangements::permutations(10, 10, nsample = 10)   3.649   4.1205  238.7746
#>    median       uq      max neval
#>  1038.961 2292.556 28600.70   100
#>    71.217   82.861 10617.61   100
#>     4.633    7.298 23295.99   100
microbenchmark::microbenchmark(
RcppAlgos::permuteSample(v=8000, m=8000, n=10),
perm_n(1:8000, 10),
arrangements::permutations(8000, 8000, nsample = 10),
times = 10
)
#> Unit: milliseconds
#>                                                  expr      min         lq
#>  RcppAlgos::permuteSample(v = 8000, m = 8000, n = 10) 199.5939 199.858108
#>                                    perm_n(1:8000, 10)   4.3911   4.428697
#>  arrangements::permutations(8000, 8000, nsample = 10) 593.0577 594.338993
#>       mean     median         uq        max neval
#>  200.44507 200.282540 200.944936 201.960178    10
#>    4.49205   4.457807   4.555182   4.663914    10
#>  600.67245 598.277678 604.146808 619.717132    10
microbenchmark::microbenchmark(
RcppAlgos::permuteSample(v=8000, m=8000, n=300),
perm_n(1:8000, 300),
arrangements::permutations(8000, 8000, nsample = 300),
times = 3
)
#> Unit: milliseconds
#>                                                   expr        min         lq
#>  RcppAlgos::permuteSample(v = 8000, m = 8000, n = 300)  5939.2831  5965.0268
#>                                    perm_n(1:8000, 300)   136.7174   137.3693
#>  arrangements::permutations(8000, 8000, nsample = 300) 17875.7788 17877.2049
#>       mean     median         uq        max neval
#>   5984.365  5990.7704  6006.9057  6023.0410     3
#>    138.048   138.0212   138.7133   139.4053     3
#>  17891.089 17878.6310 17898.7438 17918.8566     3

创建于2022-11-23与reprex v2.0.2

对于大型向量,@polkas是正确的,lapplysample将更高效。

为已经提供的解决方案添加更多健壮性:

library(RcppAlgos)
perm_n.safe <- function(v, n) {
if (n/permuteCount(v) > 0.01) return(as.list(data.frame(t(permuteSample(v, n = n)))))
k <- 0L
out <- vector("list", n)
m <- n

while (k < m) {
s <- unique(lapply(1:n, function(i) sample(v)))
out[(k + 1L):(k + length(s))] <- s
k <- k + length(s)
n <- ceiling(1.1*n/k)
}

out
}

这避免了n的盲目加倍,并保证返回n样本。

set.seed(148461194)
microbenchmark::microbenchmark(
perm_n = perm_n(1:8000, 300),
perm_n.safe = perm_n.safe(1:8000, 300),
times = 10
)
#> Unit: milliseconds
#>         expr      min       lq     mean   median       uq      max neval
#>       perm_n 229.9223 231.2907 233.5303 233.0133 235.8125 237.5975    10
#>  perm_n.safe 117.1336 118.6889 123.5767 119.3938 122.4304 147.8789    10
microbenchmark::microbenchmark(
perm_n = perm_n(1:5, 100),
perm_n.safe = perm_n.safe(1:5, 100)
)
#> Unit: microseconds
#>         expr   min     lq    mean median    uq    max neval
#>       perm_n 728.8 766.75 908.462 789.85 993.4 3327.8   100
#>  perm_n.safe 168.9 186.90 211.721 200.80 228.4  414.1   100
sum(lengths(perm_n(1:5, 100)) == 5)
#> [1] 92
sum(lengths(perm_n.safe(1:5, 100)) == 5)
#> [1] 100

最新更新