我试图过滤r中的数据框,基于以下事实:三列中的前两列的行条目相同。我所做的是如下(其中df是一个3列数据框)
not_duplicate <- df[!(duplicated(dplyr::select(df, col1, col2))), ]
这需要相当长的时间。有没有更快/更好的方法来做这件事?谢谢!
试试这个data.table
选项-
library(data.table)
setDT(df)
unique(df, by = c('col1', 'col2'))
这会使它更快吗?
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
iris %>% distinct(across(1:2), .keep_all = TRUE) %>% as_tibble()
#> # A tibble: 117 x 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> 7 4.6 3.4 1.4 0.3 setosa
#> 8 5 3.4 1.5 0.2 setosa
#> 9 4.4 2.9 1.4 0.2 setosa
#> 10 4.9 3.1 1.5 0.1 setosa
#> # … with 107 more rows
由reprex包(v2.0.1)在2018-10-05创建
<标题>速度比较使用unique
和data.table
是迄今为止最快的方法:
library(microbenchmark)
library(tidyverse)
library(data.table)
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
#> The following object is masked from 'package:purrr':
#>
#> transpose
df <- nycflights13::flights %>% rename(col1 = year, col2 = month)
microbenchmark(
question = {
df[!(duplicated(dplyr::select(df, col1, col2))), ]
},
Weed = {
df %>%
group_by(col1, col2) %>%
mutate(nrep = n()) %>%
filter(nrep == 1) %>%
ungroup()
},
danlooo = {
df %>% distinct(across(1:2), .keep_all = TRUE)
},
ronak = {
setDT(df)
unique(df, by = c('col1', 'col2'))
},
times = 10
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> question 4.059082 4.131558 79.471833 4.211303 6.153801 453.642332 10
#> Weed 17.702280 19.095162 21.956365 20.565925 26.074310 29.571600 10
#> danlooo 8.079582 8.133334 9.912199 8.686196 9.827751 15.959280 10
#> ronak 1.262237 1.386761 1.648035 1.454734 1.572879 3.419493 10
#> cld
#> a
#> a
#> a
#> a
由reprex包(v2.0.1)在2018-10-05创建
标题>我们可以使用collapse
中的funique
library(collapse)
funique(iris, cols = 5)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 7.0 3.2 4.7 1.4 versicolor
3 6.3 3.3 6.0 2.5 virginica
在OP的情况下
funique(df, cols = c('col1', 'col2'))
您可以使用[
:
df[!(duplicated(df[, c("col1", "col2")])),]
或使用附加粘贴:
df[!(duplicated(paste(df$col1, df$col2))),]
基准:
df <- data.frame(col1 = rep(letters[1:3], each=4)
, col2 = rep(letters[1:4], each=3),
col3 = rep(letters[1:2], 6))
library(data.table)
dt <- as.data.table(df)
library(collapse)
library(dplyr)
bench::mark(check = FALSE
, dplyr = df[!(duplicated(dplyr::select(df, col1, col2))), ]
, dplyr2 = (df %>% distinct(across(c(col1, col2)), .keep_all = TRUE) %>% as_tibble())
, base = df[!(duplicated(df[, c("col1", "col2")])),]
, base2 = df[!(duplicated(paste(df$col1, df$col2))),]
, dt = unique(dt, by = c('col1', 'col2'))
, funique = funique(df, cols = c("col1", "col2"))
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 dplyr 1.47ms 1.53ms 647. 6.44KB 14.8 305 7
#2 dplyr2 3.05ms 3.16ms 314. 1.75MB 15.2 145 7
#3 base 105.72µs 112.72µs 8525. 0B 19.0 4039 9
#4 base2 59.55µs 64.29µs 15093. 0B 16.6 7270 8
#5 dt 42.47µs 50.2µs 20178. 92.66KB 12.5 9699 6
#6 funique 18.48µs 20.25µs 47910. 50.81KB 19.2 9996 4
你可以试试
df %>%
group_by(col1, col2) %>%
mutate(nrep = n()) %>%
filter(nrep == 1) %>%
ungroup()