给定多个向量,我想返回向量中列的唯一组合。下面是一个有效但计算速度没有我想要的那么快的例子:
library(dplyr)
c_sort_collapse <- function(...){
c(...) %>%
sort() %>%
paste(collapse = ".")
}
unique_set <- function(...){
list(...) %>%
purrr::pmap_chr(c_sort_collapse) %>%
unique()
}
unique_set(c("a", "b", "a"), c("a", "a", "b"))
#> [1] "a.a" "a.b"
有没有一种方法可以更快/更好地向量化,即不依赖于循环遍历每个项目(就像purrr::pmap_chr()
步骤中发生的那样(?
此解决方案仅在需要组合两个向量时有效,但速度相当快。我冒昧地给了其他方法stringi::stri_sort()
的优势,它已经快了一个数量级。
library(dplyr)
library(stringi)
set.seed(123)
x <- sample(letters, 1000, replace = TRUE)
set.seed(12)
y <- sample(letters, 1000, replace = TRUE)
c_sort_collapse <- function(...){
c(...) |>
stri_sort() |>
paste(collapse = ".")
}
unique_set <- function(...){
list(...) |>
purrr::pmap_chr(c_sort_collapse) |>
unique()
}
unique_set_matrix <- function(...){
matrix(c(...), nrow = length(list(...)), byrow = TRUE) |>
apply(2, stri_sort) |>
asplit(2) |>
unique() |>
sapply(paste, collapse = ".")
}
pminmax <- function(x, y) {
paste(pmin.int(x, y), pmax.int(x, y), sep = ".") |> unique()
}
all.equal(sort(unique_set(x, y)), sort(pminmax(x, y)))
#> [1] TRUE
bench::mark(
tidy = unique_set(x, y),
matrix = unique_set_matrix(x, y),
Map = Map((x,y) paste0(stri_sort(c(x,y)) , collapse = ".") , x , y) |>
unique() |> unlist(FALSE, FALSE),
pminmax = pminmax(x, y),
iterations = 20, check = FALSE
)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 tidy 6.13ms 6.24ms 160. 45.4KB 17.8
#> 2 matrix 5.18ms 5.55ms 168. 229.8KB 29.7
#> 3 Map 5.7ms 6.83ms 151. 33.4KB 16.7
#> 4 pminmax 484.6µs 487.85µs 2035. 49.6KB 0
创建于2022-08-05由reprex包(v2.0.1(
我们可以使用Base R
Map
函数
library(stringi)
Map((x,y) paste0(stri_sort(c(x,y)) , collapse = ".") , x , y) |>
unique() |> unlist()
- 输出
[1] "a.a" "a.b"
- 数据
x <- c("a", "b", "a")
y <- c("a", "a", "b")
编辑
我认为使用stringi
库中的stri_sort
在我的microbenchmark
测试中会取得很好的结果,试试吧
library(stringi)
x <- c("a", "b", "a")
y <- c("a", "a", "b")
microbenchmark::microbenchmark(
tidy = unique_set(x, y),
base = Map((x,y) paste0(stri_sort(c(x,y)) , collapse = ".") , x , y) |>
unique() |> unlist(),
times = 1000
)
#<============ Results ===============>#
Unit: microseconds
expr min lq mean median uq max neval
tidy 93.284 96.8405 102.09465 100.1805 103.4035 244.337 1000
base 37.271 39.2190 44.15599 41.6520 43.5285 1860.692 1000
这里是另一个仅适用于两个向量的解决方案。它基于data.table包中的fifelse
。我还与@teunband解决方案的pminmax
进行了比较:
library(data.table)
pminmax <- function(x, y) {
paste(pmin.int(x, y), pmax.int(x, y), sep = ".") |> unique()
}
dtfifelse = function(x, y) {
{idx=x<y; paste0(fifelse(idx, x, y), ".", fifelse(idx, y, x)) |> unique()}
}
compare = function(n) {
set.seed(0123)
# random sample of strings with up to 10 characters
x = stringi::stri_rand_strings(n, sample(10, n, TRUE))
y = stringi::stri_rand_strings(n, sample(10, n, TRUE))
#
bench::mark(
pminmax = pminmax(x, y),
dt_fifelse = dtfifelse(x, y),
iterations = 5,
check = TRUE
)[1:9]
}
compare(1e3)
# A tibble: 2 x 9
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 pminmax 1.99ms 2.03ms 468. 43.4KB 0 5 0 10.68ms
2 dt_fifelse 1.61ms 1.72ms 546. 47.4KB 0 5 0 9.16ms
compare(1e4)
# A tibble: 2 x 9
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 pminmax 10.42ms 11.03ms 89.6 480KB 0 5 0 55.8ms
2 dt_fifelse 8.16ms 8.34ms 120. 519KB 0 5 0 41.7ms
compare(1e5)
# A tibble: 2 x 9
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 pminmax 124ms 127ms 7.74 4.43MB 1.93 4 1 517ms
2 dt_fifelse 100ms 105ms 9.51 4.81MB 0 5 0 526ms
compare(1e6)
# A tibble: 2 x 9
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
1 pminmax 1.31s 1.34s 0.748 42.3MB 1.12 2 3 2.67s
2 dt_fifelse 1.07s 1.07s 0.932 46.1MB 1.40 2 3 2.15s