返回项目的唯一组合R



给定多个向量,我想返回向量中列的唯一组合。下面是一个有效但计算速度没有我想要的那么快的例子:

library(dplyr)
c_sort_collapse <- function(...){
c(...) %>% 
sort() %>% 
paste(collapse = ".")
}
unique_set <- function(...){
list(...) %>% 
purrr::pmap_chr(c_sort_collapse) %>% 
unique()
}
unique_set(c("a", "b", "a"), c("a", "a", "b"))
#> [1] "a.a" "a.b"

有没有一种方法可以更快/更好地向量化,即不依赖于循环遍历每个项目(就像purrr::pmap_chr()步骤中发生的那样(?

此解决方案仅在需要组合两个向量时有效,但速度相当快。我冒昧地给了其他方法stringi::stri_sort()的优势,它已经快了一个数量级。

library(dplyr)
library(stringi)
set.seed(123)
x <- sample(letters, 1000, replace = TRUE)
set.seed(12)
y <- sample(letters, 1000, replace = TRUE)
c_sort_collapse <- function(...){
c(...) |> 
stri_sort() |> 
paste(collapse = ".")
}
unique_set <- function(...){
list(...) |> 
purrr::pmap_chr(c_sort_collapse) |> 
unique()
}
unique_set_matrix <- function(...){
matrix(c(...), nrow = length(list(...)), byrow = TRUE) |>
apply(2, stri_sort) |>
asplit(2) |>
unique() |>
sapply(paste, collapse = ".")
}
pminmax <- function(x, y) {
paste(pmin.int(x, y), pmax.int(x, y), sep = ".") |> unique()
}
all.equal(sort(unique_set(x, y)), sort(pminmax(x, y)))
#> [1] TRUE
bench::mark(
tidy = unique_set(x, y),
matrix = unique_set_matrix(x, y),
Map = Map((x,y) paste0(stri_sort(c(x,y)) , collapse = ".") , x , y) |>
unique() |> unlist(FALSE, FALSE),
pminmax = pminmax(x, y),
iterations = 20, check = FALSE
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 tidy         6.13ms   6.24ms      160.    45.4KB     17.8
#> 2 matrix       5.18ms   5.55ms      168.   229.8KB     29.7
#> 3 Map           5.7ms   6.83ms      151.    33.4KB     16.7
#> 4 pminmax     484.6µs 487.85µs     2035.    49.6KB      0

创建于2022-08-05由reprex包(v2.0.1(

我们可以使用Base RMap函数

library(stringi)
Map((x,y) paste0(stri_sort(c(x,y)) , collapse = ".") , x , y) |>
unique() |> unlist()
  • 输出
[1] "a.a" "a.b"
  • 数据
x <- c("a", "b", "a")
y <- c("a", "a", "b")

编辑

我认为使用stringi库中的stri_sort在我的microbenchmark测试中会取得很好的结果,试试吧

library(stringi)
x <- c("a", "b", "a")
y <- c("a", "a", "b")
microbenchmark::microbenchmark(
tidy = unique_set(x, y),
base = Map((x,y) paste0(stri_sort(c(x,y)) , collapse = ".") , x , y) |>
unique() |> unlist(),
times = 1000
)
#<============ Results ===============>#
Unit: microseconds
expr    min      lq      mean   median       uq      max neval
tidy 93.284 96.8405 102.09465 100.1805 103.4035  244.337  1000
base 37.271 39.2190  44.15599  41.6520  43.5285 1860.692  1000

这里是另一个仅适用于两个向量的解决方案。它基于data.table包中的fifelse。我还与@teunband解决方案的pminmax进行了比较:

library(data.table)
pminmax <- function(x, y) {
paste(pmin.int(x, y), pmax.int(x, y), sep = ".") |> unique()
}
dtfifelse = function(x, y) {
{idx=x<y; paste0(fifelse(idx, x, y), ".", fifelse(idx, y, x)) |> unique()}
}
compare = function(n) {
set.seed(0123)
# random sample of strings with up to 10 characters
x = stringi::stri_rand_strings(n, sample(10, n, TRUE))   
y = stringi::stri_rand_strings(n, sample(10, n, TRUE))   
#
bench::mark(
pminmax = pminmax(x, y),
dt_fifelse = dtfifelse(x, y),
iterations = 5, 
check = TRUE
)[1:9]
}
compare(1e3)
# A tibble: 2 x 9
expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
<bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 pminmax      1.99ms   2.03ms      468.    43.4KB        0     5     0    10.68ms
2 dt_fifelse   1.61ms   1.72ms      546.    47.4KB        0     5     0     9.16ms
compare(1e4)
# A tibble: 2 x 9
expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
<bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 pminmax     10.42ms  11.03ms      89.6     480KB        0     5     0     55.8ms
2 dt_fifelse   8.16ms   8.34ms     120.      519KB        0     5     0     41.7ms
compare(1e5)
# A tibble: 2 x 9
expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
<bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 pminmax       124ms    127ms      7.74    4.43MB     1.93     4     1      517ms
2 dt_fifelse    100ms    105ms      9.51    4.81MB     0        5     0      526ms
compare(1e6)
# A tibble: 2 x 9
expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
<bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
1 pminmax       1.31s    1.34s     0.748    42.3MB     1.12     2     3      2.67s
2 dt_fifelse    1.07s    1.07s     0.932    46.1MB     1.40     2     3      2.15s