我试图通过id列表来子集相同的数据帧。我有一个大约50,000行的数据帧和一个大约1,000个数据帧的列表。列表中的每个数据帧有100到1000行,并且具有相同的结构。
考虑这个例子:
df1 <- data.frame(id = sample(sample(1000:3000, 1000), 50000, TRUE), info = runif(50000, 200, 300))
set.seed(1)
l <- replicate(1000,
data.frame(id = sample(1000:3000, sample(400:700, 1), replace = TRUE)),
simplify = FALSE)
我想根据l
中的id对df1
进行子集化。我可以通过使用%in%
执行半连接或子集来做到这一点:
library(dplyr)
semi_join(df1, l[[1]], "id")
df1[df1$id %in% l[[1]]$id, ]
我正在寻找一个快速的解决方案,可扩展到数千个数据帧的列表。到目前为止,我将其封装在lapply
中(但可能有更快的矢量化解决方案)。
lapply(l, (x) semi_join(df1, x, "id"))
下面是一个带有解决方案的基准测试:
bc <-
bench::mark(dplyr = lapply(l, (x) semi_join(df1, x, "id")),
baseR = lapply(l, (x) df1[df1$id %in% x$id, ]),
unique = lapply(l, (x) df1[df1$id %in% unique(x$id), ]),
data.table = {df2 <- setDT(df1); lapply(l, (x) df2[df2$id %in% unique(x$id), ])},
iterations = 10, check = FALSE)
#> bc
# A tibble: 4 × 13
# expression min median `itr/sec` mem_alloc gc/se…¹ n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 dplyr 2.25s 2.43s 0.416 1.64GB 4.03 10 97
#2 baseR 9.04s 9.55s 0.105 1.56GB 0.536 10 51
#3 unique 10.3s 10.95s 0.0912 1.57GB 0.420 10 46
#4 data.table 10.21s 10.9s 0.0916 979.5MB 0.458 10 50
任何包都可以使用。
在rbindlist
上进行一次data.table
连接将会很快。
library(data.table)
library(dplyr)
microbenchmark::microbenchmark(
dplyr = lapply(l, (x) semi_join(df1, x, "id")),
baseR = lapply(l, (x) df1[df1$id %in% x$id, ]),
unique = lapply(l, (x) df1[df1$id %in% unique(x$id), ]),
data.table = {setDT(df2); lapply(l, (x) df2[id %in% unique(x$id)])},
bindJoinSplit = split(setDT(df2, key = "id")[unique(rbindlist(l, idcol = "df")), on = "id", allow.cartesian = TRUE, nomatch = 0], by = "df", keep.by = FALSE),
times = 10,
setup = df2 <- copy(df1)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> dplyr 1555.2198 1569.4590 1630.778 1619.9118 1626.8791 1857.1259 10
#> baseR 1079.4916 1087.2524 1131.043 1126.8084 1179.5400 1196.9150 10
#> unique 1171.3705 1214.4915 1253.490 1234.3908 1274.9278 1398.2908 10
#> data.table 1925.1388 1950.9440 1978.538 1982.5494 1995.3917 2038.3573 10
#> bindJoinSplit 368.8109 380.4029 412.595 401.4089 437.4869 503.8478 10
如果可以避免绑定和分裂,这将会更快更容易。From?data.table::split
:
注意处理列表中的数据。桌子通常会很多比操作单个数据慢。表按组使用by参数
假设数据已构建到单个data.table
而不是data.frame
列表中。我们还可以将结果保存在单个data.table
中,而不是保存在data.frame
列表中。
dt <- rbindlist(l, idcol = "df")
microbenchmark::microbenchmark(
bindJoinSplit = split(setDT(df2, key = "id")[unique(rbindlist(l, idcol = "df")), on = "id", allow.cartesian = TRUE, nomatch = 0], by = "df", keep.by = FALSE),
join = setDT(df2, key = "id")[unique(dt), on = "id", allow.cartesian = TRUE, nomatch = 0],
times = 10,
setup = df2 <- copy(df1)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> bindJoinSplit 317.1514 326.3613 343.2588 333.3914 364.1945 398.5467 10
#> join 188.5818 191.4344 198.7380 194.4094 208.7339 219.9847 10
数据:
set.seed(1)
df1 <- data.frame(id = sample(sample(1000:3000, 1000), 50000, TRUE), info = runif(50000, 200, 300))
l <- replicate(1000,
data.frame(id = sample(1000:3000, sample(400:700, 1), replace = TRUE)),
simplify = FALSE)