r-在具有包含数据帧的列表列的tibble中,如何用自定义函数包装mutate(foo=map2(..))



我想围绕涉及dplyr::mutate()purrr::map2()的过程编写一个包装函数。

要进行演示,请考虑以下名为trb:的可移植

df_1 <- mtcars[, c("am", "disp")]
df_2 <- mtcars[, c("mpg", "carb")]
trb <-
tibble::tibble(dat_a = list(df_1),
dat_b = list(df_2))
trb
#> # A tibble: 1 x 2
#>   dat_a         dat_b        
#>   <list>        <list>       
#> 1 <df [32 x 2]> <df [32 x 2]>

我想更改trb中的另一列,称为dat_c,它将包括一个数据帧,其中一列来自dat_a,另一列来自于dat_b。以下代码允许我实现它:

library(dplyr)
library(purrr)
output <- 
trb %>%
mutate(dat_c = map2(.x = dat_a, .y = dat_b, .f = ~data.frame(my_lovely_am = .x$am, 
suberb_carb_col = .y$carb)))
output %>%
pull(dat_c)
#> [[1]]
#>    my_lovely_am suberb_carb_col
#> 1             1               4
#> 2             1               4
#> 3             1               1
#> 4             0               1
#> 5             0               2
#> 6             0               1
# I removed the rest of the rows

如何将上述mutate()过程封装在自定义函数中?特别有问题的是当引用.x$bar.y$foo时。如何指定要从包装函数的参数中获取的列名?

我想象的是一个类似于的自定义函数

create_dat_c <- function(.trb, colname_dat_a, colname_dat_b, header_a, header_b) {
.trb %>%
mutate(dat_c = map2(.x = dat_a, .y = dat_b, .f = ~data.frame(header_a = .x$colname_dat_a, 
header_b = .y$colname_dat_b)))
}

并用调用

create_dat_c(trb, 
colname_dat_a = am, 
colname_dat_b = carb, 
header_a = "splendid_am", 
header_b = "wonderful_carb")
# and returns:
## # A tibble: 1 x 3
##   dat_a         dat_b         dat_c        
##   <list>        <list>        <list>       
## 1 <df [32 x 2]> <df [32 x 2]> <df [32 x 2]>  <<-~-~- dat_c has 2 cols: splendid_am & wonderful_carb

总之,这是我为data.frame(header_a = .x$colname_dat_a, header_b = .y$colname_dat_b)而奋斗的部分。如何使其与包装器的参数配合良好?

下面是实现这一点的函数-

library(dplyr)
library(purrr)
create_dat_c <- function(.trb, colname_dat_a, colname_dat_b, header_a, header_b) {
.trb %>%
mutate(dat_c = map2(.x = dat_a, .y = dat_b, 
.f = ~tibble(!!header_a := .x %>% pull({{colname_dat_a}}), 
!!header_b := .y %>% pull({{colname_dat_b}}))))
}

result <- create_dat_c(trb, 
colname_dat_a = am, 
colname_dat_b = carb, 
header_a = "splendid_am", 
header_b = "wonderful_carb")
result
# A tibble: 1 x 3
#  dat_a         dat_b         dat_c            
#  <list>        <list>        <list>           
#1 <df [32 × 2]> <df [32 × 2]> <tibble [32 × 2]>
result$dat_c
#[[1]]
# A tibble: 32 x 2
#   splendid_am wonderful_carb
#         <dbl>          <dbl>
# 1           1              4
# 2           1              4
# 3           1              1
# 4           0              1
# 5           0              2
# 6           0              1
# 7           0              4
# 8           0              2
# 9           0              2
#10           0              4
# … with 22 more rows

data.frame不支持!!name :=语法,这就是我使用tibble的原因。如果你倾向于使用data.frame,你可以使用-

create_dat_c <- function(.trb, colname_dat_a, colname_dat_b, header_a, header_b) {
.trb %>%
mutate(dat_c = map2(.x = dat_a, .y = dat_b, 
.f = ~setNames(data.frame(.x %>% pull({{colname_dat_a}}), 
.y %>% pull({{colname_dat_b}})), c(header_a, header_b))))
}

以下是tidyr包中unnestnest的替代方案:

library(tidyr)
library(dplyr)
result <- trb %>% 
unnest(cols = c(dat_a, dat_b)) %>% 
mutate(my_lovely_am = am,
suberb_carb_col = carb) %>% 
nest(dat_a = 1:2, 
dat_b = 3:4,
dat_c = 5:6) 

输出:

dat_a             dat_b             dat_c            
<list>            <list>            <list>           
1 <tibble [32 x 2]> <tibble [32 x 2]> <tibble [32 x 2]>

检查:

result$dat_c
my_lovely_am suberb_carb_col
<dbl>           <dbl>
1            1               4
2            1               4
3            1               1
4            0               1
5            0               2
6            0               1
7            0               4
8            0               2
9            0               2
10            0               4
# ... with 22 more rows

我们实际上不需要使用purrr。dplyr可以自己做到这一点:

out <- trb %>%
rowwise %>% 
mutate(dat_c = list(tibble(am = dat_a$am, carb = dat_b$carb))) %>%
ungroup

给予:

> out
# A tibble: 1 x 3
dat_a         dat_b         dat_c            
<list>        <list>        <list>           
1 <df [32 x 2]> <df [32 x 2]> <tibble [32 x 2]>
> str(out)
tibble [1 x 3] (S3: tbl_df/tbl/data.frame)
$ dat_a:List of 1
..$ :'data.frame':    32 obs. of  2 variables:
.. ..$ am  : num [1:32] 1 1 1 0 0 0 0 0 0 0 ...
.. ..$ disp: num [1:32] 160 160 108 258 360 ...
$ dat_b:List of 1
..$ :'data.frame':    32 obs. of  2 variables:
.. ..$ mpg : num [1:32] 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
.. ..$ carb: num [1:32] 4 4 1 1 2 1 4 2 2 4 ...
$ dat_c:List of 1
..$ : tibble [32 x 2] (S3: tbl_df/tbl/data.frame)
.. ..$ am  : num [1:32] 1 1 1 0 0 0 0 0 0 0 ...
.. ..$ carb: num [1:32] 4 4 1 1 2 1 4 2 2 4 ...

相关内容

  • 没有找到相关文章