有了一个数据帧,我想生成一个包含命名向量(每行一个向量)的新列表列。每个向量从其他2个数据框架列中派生其名称和值。但是我卡住了,因为我想这么做:
- 由集团
- 尽可能提高计算效率
例子让我们从{ggplot2}
中取mpg
数据集来说明by group原则。我想把成对的cty
和hwy
值放在一起,按manufacturer
&year
。我们可以这样写:
library(ggplot2)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
my_mpg <-
mpg %>%
select(manufacturer, year, cty, hwy)
via_tidyr_nest <-
my_mpg %>%
group_by(manufacturer, year) %>%
nest()
via_tidyr_nest
#> # A tibble: 30 x 3
#> # Groups: manufacturer, year [30]
#> manufacturer year data
#> <chr> <int> <list>
#> 1 audi 1999 <tibble [9 x 2]>
#> 2 audi 2008 <tibble [9 x 2]>
#> 3 chevrolet 2008 <tibble [12 x 2]>
#> 4 chevrolet 1999 <tibble [7 x 2]>
#> 5 dodge 1999 <tibble [16 x 2]>
#> 6 dodge 2008 <tibble [21 x 2]>
#> 7 ford 1999 <tibble [15 x 2]>
#> 8 ford 2008 <tibble [10 x 2]>
#> 9 honda 1999 <tibble [5 x 2]>
#> 10 honda 2008 <tibble [4 x 2]>
#> # ... with 20 more rows
由reprex包(v0.3.0)于2021-09-27创建
这是完美的,除了我不想要一个嵌套的标签,而是一个嵌套的命名向量。(原因是:一旦我们将输出作为对象存储在环境中,命名的vector版本比嵌套的tibble版本要轻)。
工作但不希望的解决方案将取via_tidyr_nest
并将嵌套的tibble转换为命名向量。
expected_output <-
via_tidyr_nest %>%
mutate(desired_named_vec = map(.x = data, .f = ~pull(.x, cty, hwy))) %>%
select(-data)
expected_output
#> # A tibble: 30 x 3
#> # Groups: manufacturer, year [30]
#> manufacturer year desired_named_vec
#> <chr> <int> <list>
#> 1 audi 1999 <int [9]>
#> 2 audi 2008 <int [9]>
#> 3 chevrolet 2008 <int [12]>
#> 4 chevrolet 1999 <int [7]>
#> 5 dodge 1999 <int [16]>
#> 6 dodge 2008 <int [21]>
#> 7 ford 1999 <int [15]>
#> 8 ford 2008 <int [10]>
#> 9 honda 1999 <int [5]>
#> 10 honda 2008 <int [4]>
#> # ... with 20 more rows
这是不希望的,因为它通过绕道来实现所需的输出。首先,它创建一个tibble,然后将其转换为一个命名向量。虽然在本例中处理时间可以忽略不计,但实际上我有一个很大的数据集(1000万行)。因此,添加任何额外的步骤都是昂贵的。相反,我希望以尽可能少的步骤到达expected_output
。
1次尝试失败:
library(purrr)
via_summarise_map2_setnames <-
my_mpg %>%
group_by(manufacturer, year) %>%
summarise(named_vec = map2(.x = cty, .y = hwy, .f = ~setNames(.x, .y)))
#> `summarise()` has grouped output by 'manufacturer', 'year'. You can override using the `.groups` argument.
via_summarise_map2_setnames
#> # A tibble: 234 x 3
#> # Groups: manufacturer, year [30]
#> manufacturer year named_vec
#> <chr> <int> <list>
#> 1 audi 1999 <int [1]>
#> 2 audi 1999 <int [1]>
#> 3 audi 1999 <int [1]>
#> 4 audi 1999 <int [1]>
#> 5 audi 1999 <int [1]>
#> 6 audi 1999 <int [1]>
#> 7 audi 1999 <int [1]>
#> 8 audi 1999 <int [1]>
#> 9 audi 1999 <int [1]>
#> 10 audi 2008 <int [1]>
#> # ... with 224 more rows
你知道如何直接从my_mpg
到expected_output
,而不需要在两者之间创建一个标签吗?
编辑
只是关于这个问题的一个大概的想法。我知道tidyr::nest()
的默认行为是返回一个嵌套的标题。但我没有发现任何关于这个决定的讨论。换句话说,如果我们想自己选择嵌套数据的类,该怎么办?它可以是默认的tibble
,也可以是data.frame
、data.table
、named vector
等。无论用户选择什么作为输出类
这里有一个方法。在设置名称之前,先将cty
和hwy
强制转换为"list"
。这似乎有效。
library(purrr)
library(dplyr)
data(mpg, package = "ggplot2")
my_mpg <-
mpg %>%
select(manufacturer, year, cty, hwy)
my_mpg %>%
group_by(manufacturer, year) %>%
summarise(named_vec = map2(list(cty), list(hwy), ~set_names(.x, .y)))
#`summarise()` has grouped output by 'manufacturer'. You can override using the `.groups` argument.
## A tibble: 30 x 3
## Groups: manufacturer [15]
# manufacturer year named_vec
# <chr> <int> <list>
# 1 audi 1999 <int [9]>
# 2 audi 2008 <int [9]>
# 3 chevrolet 1999 <int [7]>
# 4 chevrolet 2008 <int [12]>
# 5 dodge 1999 <int [16]>
# 6 dodge 2008 <int [21]>
# 7 ford 1999 <int [15]>
# 8 ford 2008 <int [10]>
# 9 honda 1999 <int [5]>
#10 honda 2008 <int [4]>
## … with 20 more rows
因为这个问题是一个性能问题,这里是4个被提议的解决方案的基准,到目前为止,上面的问题,Nicolas2的,Till的和我的。
f <- function(X) {
X %>%
group_by(manufacturer, year) %>%
nest() %>%
mutate(desired_named_vec = map(.x = data, .f = ~pull(.x, cty, hwy))) %>%
select(-data)
}
g <- function(X) {
df1 <- X %>% group_by(manufacturer, year)
df2 <- attr(df1,"groups")
Map(function(rows) {
r <- df1[rows,"cty",drop=TRUE]
setNames(r,df1[rows,"hwy",drop=TRUE])
},
df2$.rows
) -> l
data.frame(manufacturer=df2$manufacturer,year=df2$year,named_vector=I(l))
}
h <- function(X){
X %>%
group_by(manufacturer, year) %>%
summarise(named_vec = map2(list(cty), list(hwy), ~set_names(.x, .y)), .groups = "drop")
}
i <- function(X){
X |>
select(manufacturer, year, cty, hwy) |>
group_by(manufacturer, year) |>
group_modify((x, ...) tibble(res = list(deframe(x))))
}
mb <- microbenchmark(
Emman = f(my_mpg),
Nicolas2 = g(my_mpg),
Rui = h(my_mpg),
Till = i(my_mpg)
)
print(mb, unit = "relative", order = "median")
#Unit: relative
# expr min lq mean median uq max neval cld
# Rui 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100 a
# Nicolas2 1.527957 1.468524 1.478286 1.482185 1.471565 1.724004 100 b
# Emman 4.504185 4.230921 4.215643 4.234087 4.148188 4.170934 100 c
# Till 6.264028 5.813678 5.883107 5.810876 5.744080 5.666524 100 d
此处可使用dplyr::group_modify()
和tibble::deframe()
。而不是deframe()
,pull(x, cty, hwy)
从你的问题将工作一样。
library(tidyverse)
mpg |>
select(manufacturer, year, cty, hwy) |>
group_by(manufacturer, year) |>
group_modify((x, ...) tibble(res = list(deframe(x))))
#> # A tibble: 30 × 3
#> # Groups: manufacturer, year [30]
#> manufacturer year res
#> <chr> <int> <list>
#> 1 audi 1999 <int [9]>
#> 2 audi 2008 <int [9]>
#> 3 chevrolet 1999 <int [7]>
#> 4 chevrolet 2008 <int [12]>
#> 5 dodge 1999 <int [16]>
#> 6 dodge 2008 <int [21]>
#> 7 ford 1999 <int [15]>
#> 8 ford 2008 <int [10]>
#> 9 honda 1999 <int [5]>
#> 10 honda 2008 <int [4]>
#> # … with 20 more rows
已编辑:将" map "替换为" map "
我希望这能有用。你的解决方案在"f"之内,我的建议在"g"之内。它使用由dplyr的'group_by'创建的索引来收集构建命名向量所需的数据。
f <- function () {
via_tidyr_nest <-
my_mpg %>%
group_by(manufacturer, year) %>%
nest()
expected_output <-
via_tidyr_nest %>%
mutate(desired_named_vec = map(.x = data, .f = ~pull(.x, cty, hwy))) %>%
select(-data)
}
g <- function () {
df1 <- my_mpg %>% group_by(manufacturer, year)
df2 <- attr(df1,"groups")
Map(function(rows) {
r <- df1[rows,"cty",drop=TRUE]
setNames(r,df1[rows,"hwy",drop=TRUE])
},
df2$.rows
) -> l
df <- data.frame(manufacturer=df2$manufacturer,year=df2$year,named_vector=I(l))
}
# other solutions
h <- function () {
hdf <- my_mpg %>%
group_by(manufacturer, year) %>%
summarise(named_vec = map2(list(cty), list(hwy), ~set_names(.x, .y)))
}
k <- function() {
mpg |>
select(manufacturer, year, cty, hwy) |>
group_by(manufacturer, year) |>
group_modify((x, ...) tibble(res = list(deframe(x))))
}
library(microbenchmark)
microbenchmark(OP=f(),Nicolas2=g(),Rui=h(),Till=k())
Unit: milliseconds
expr min lq mean median uq max neval
OP 21.8917 22.64035 24.389126 23.28235 24.70075 39.9593 100
Nicolas2 3.0507 3.15920 3.481469 3.24625 3.57840 7.3173 100
Rui 6.5460 6.75300 7.505564 7.16255 7.64390 12.0359 100
Till 31.2364 32.31115 34.940356 32.92990 36.11505 107.2709 100
目前为止,@RuiBarradas的解决方案是最快的。我想出了一个data.table
版本,它似乎提高了一点速度。
library(ggplot2)
library(purrr)
library(dplyr, warn.conflicts = FALSE)
library(data.table, warn.conflicts = FALSE)
my_mpg <-
mpg %>%
select(manufacturer, year, hwy, cty)
my_mpg %>%
as.data.table() %>%
.[,.(named_vec = map2(.x = list(cty), .y = list(hwy), .f = ~setNames(.x, .y))),.(manufacturer, year)] %>%
as_tibble()
#> # A tibble: 30 x 3
#> manufacturer year named_vec
#> <chr> <int> <list>
#> 1 audi 1999 <int [9]>
#> 2 audi 2008 <int [9]>
#> 3 chevrolet 2008 <int [12]>
#> 4 chevrolet 1999 <int [7]>
#> 5 dodge 1999 <int [16]>
#> 6 dodge 2008 <int [21]>
#> 7 ford 1999 <int [15]>
#> 8 ford 2008 <int [10]>
#> 9 honda 1999 <int [5]>
#> 10 honda 2008 <int [4]>
#> # ... with 20 more rows
由reprex包(v0.3.0)于2021-09-28创建
基准测试library(ggplot2)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(tibble)
library(purrr)
library(data.table, warn.conflicts = FALSE)
my_mpg <-
mpg %>%
select(manufacturer, year, cty, hwy)
f <- function(X) {
X %>%
group_by(manufacturer, year) %>%
nest() %>%
mutate(desired_named_vec = map(.x = data, .f = ~pull(.x, cty, hwy))) %>%
select(-data)
}
g <- function(X) {
df1 <- my_mpg %>% group_by(manufacturer, year)
df2 <- attr(df1,"groups")
Map(function(rows) {
r <- df1[rows,"cty",drop=TRUE]
setNames(r,df1[rows,"hwy",drop=TRUE])
},
df2$.rows
) -> l
data.frame(manufacturer=df2$manufacturer,year=df2$year,named_vector=I(l))
}
h <- function(X){
X %>%
group_by(manufacturer, year) %>%
summarise(named_vec = map2(list(cty), list(hwy), ~set_names(.x, .y)), .groups = "drop")
}
i <- function(X){
X |>
select(manufacturer, year, cty, hwy) |>
group_by(manufacturer, year) |>
group_modify((x, ...) tibble(res = list(deframe(x))))
}
j <- function(X){
X %>%
as.data.table() %>%
.[,.(named_vec = map2(.x = list(cty), .y = list(hwy), .f = ~setNames(.x, .y))),.(manufacturer, year)] %>%
as_tibble()
}
library(microbenchmark)
library(bench)
mb <- microbenchmark(
Emman_OP = f(my_mpg),
Nicolas2 = g(my_mpg),
Rui = h(my_mpg),
Till = i(mpg),
Emman_data.table_version_of_Rui = j(my_mpg)
)
print(mb, unit = "relative", order = "median")
#> Unit: relative
#> expr min lq mean median
#> Emman_data.table_version_of_Rui 1.000000 1.000000 1.000000 1.000000
#> Rui 2.472627 2.457073 2.392998 2.409865
#> Nicolas2 3.317832 3.317378 3.116434 3.295358
#> Emman_OP 10.255926 10.472251 9.842886 10.674290
#> Till 14.061003 14.333661 13.115049 14.937978
#> uq max neval
#> 1.000000 1.000000 100
#> 2.395210 2.191381 100
#> 3.258533 2.719938 100
#> 10.572811 5.331644 100
#> 14.086673 5.418907 100
很好,data.table
对@Rui的适应是最快的。但是!
如果我们看一下内存分配(这是性能的另一个方面):
bm <- bench::mark(Emman_OP = f(my_mpg),
Nicolas2 = g(my_mpg),
Rui = h(my_mpg),
Till = i(mpg),
Emman_data.table_version_of_Rui = j(my_mpg), check = FALSE)
summary(bm, relative = TRUE)
#> # A tibble: 5 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Emman_OP 12.0 11.8 1.24 13.8 1.18
#> 2 Nicolas2 3.75 3.67 3.84 1 1.08
#> 3 Rui 2.77 2.76 5.18 1.49 1.06
#> 4 Till 15.1 15.7 1 7.10 1.45
#> 5 Emman_data.table_version_of_Rui 1 1 14.3 7.53 1
我们可以看到@Nicols2是最轻的,@Rui的原版也不错,但是我的数据。表版就没那么多了。我想知道为什么,以及是否有一种方法可以使data.table
在内存分配方面也更有效地运行。