在 R 中,使用非标准计算从 data.frame 中选择特定变量



我已经设置了几个像关系数据库一样的大型data.frames,我想创建一个函数来查找我需要的任何变量,并从该特定的data.frame中获取它并将其添加到我当前正在处理的data.frame中。我有一种方法可以做到这一点,但它需要暂时列出所有 data.frames,这似乎效率低下。我怀疑非标准评估会为我解决这个问题,但我不确定该怎么做。

以下是有效但似乎效率低下的方法:

Table1 <- data.frame(ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10),
ColC = rnorm(10))
Table2 <- data.frame(ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10),
ColF = rnorm(10))
Table3 <- data.frame(ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10),
ColI = rnorm(10))
Key <- data.frame(Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9])))
# function for grabbing info from other tables
grab <- function(StartDF, ColNames){
AllDFs <- list(Table1, Table2, Table3)
names(AllDFs) <- c("Table1", "Table2", "Table3")
# Determine which data.frames have that column
WhichDF <- Key %>% filter(ColumnName %in% ColNames) %>% 
select(Table)
TempDF <- StartDF
for(i in 1:length(ColNames)){
ToAdd <- AllDFs[WhichDF[i, 1]]
ToAdd <- ToAdd[[1]] %>% 
select(c(ColNames[i], ID))
TempDF <- TempDF %>% left_join(ToAdd)
rm(ToAdd)
}
return(TempDF)

}
grab(Table1, c("ColE", "ColH"))

相反,很棒的是这样的:

grab <- function(StartDF, ColNames){
# Some function that returns the column names of all the data.frames
# without me creating a new object that is a list of them
# Some function that left_joins the correct data.frame plus the column
# "ID" to my starting data.frame, again without needing to create that list 
# of all the data.frames
}

我们可以直接获取从"Key"数据集的"表"列返回的对象的值,而不是手动创建listmget

library(dplyr)
library(purrr)
grab <- function(StartDF, ColNames){

# filter the rows of Key based on the ColNames input
# pull the Table column as a vector
# column was factor, so convert to character class
# return the value of the objects with mget in a list
Tables <- Key %>% 
filter(ColumnName %in% ColNames) %>% 
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv) 

TempDF <- StartDF
# use the same left_joins in a loop after selecting only the
# ID and corresponding columns from 'ColNames'
for(i in seq_along(ColNames)){
ToAdd  <- Tables[[i]] %>%
select(ColNames[i], ID)          
TempDF <- TempDF %>% 
left_join(ToAdd)
rm(ToAdd)
}
TempDF

}
grab(Table1, c("ColE", "ColH"))

或者另一种选择是reduce

grab <- function(StartDF, ColNames) {
#only change is that instead of a for loop
# use reduce with left_join after selecting the corresponding columns
# with map
Key %>%
filter(ColumnName %in% ColNames) %>% 
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv)  %>%
map2(ColNames, ~ .x %>%
select(ID, .y)) %>%
append(list(Table1), .)  %>%
reduce(left_join)
}
grab(Table1, c("ColE", "ColH"))
#   ID       ColA       ColB        ColC        ColE        ColH
#1   A -0.9490093  0.5177143 -1.91015491  0.07777086  1.86277670
#2   B -0.7182786 -1.1019146 -0.70802738 -0.73965230  0.18375660
#3   C  0.5064516 -1.6904354  1.11106206  2.04315508 -0.65365228
#4   D  0.9362477  0.5260682 -0.03419651 -0.51628310 -1.17104181
#5   E  0.5636047 -0.9470895  0.43303304 -2.95928629  1.86425049
#6   F  1.0598531  0.4144901  0.10239896  1.57681703 -0.05382603
#7   G  1.1335047 -0.8282173 -0.28327898  2.02917831  0.50768462
#8   H  0.2941341  0.3261185 -0.15528127 -0.46470035 -0.86561320
#9   I -2.1434905  0.6567689  0.02298549  0.90822132  0.64360337
#10  J  0.4291258  1.3410147  0.67544567  0.12466251  0.75989623

接受的解决方案中存在一个严重的错误。如果您不注意ColNames参数中的排序,则该函数将不起作用。此外,我重新定义了您的数据以改用 tibbles。它们与数据框基本相同,但它们的默认设置更好(例如,您不需要 StringsAsFactor = FALSE(

library(tidyverse)
Table1 <- tibble(
ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10), ColC = rnorm(10)
)
Table2 <- tibble(
ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10), ColF = rnorm(10)
)
Table3 <- tibble(
ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10), ColI = rnorm(10)
)
Key <- tibble(
Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9]))
)
grab_akrun <- function(StartDF, ColNames) {
#only change is that instead of a for loop
# use reduce with left_join after selecting the corresponding columns
# with map
Key %>%
filter(ColumnName %in% ColNames) %>% 
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv)  %>%
map2(ColNames, ~ .x %>%
select(ID, .y)) %>%
append(list(Table1), .)  %>%
reduce(left_join)
}
grab_akrun(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343

这有效,但如果您更改顺序:

grab_akrun(Table1, c("ColH", "ColE"))
#> Error: Unknown column `ColH`

相反,您应该像这样处理它:

grab_new <- function(StartDF, ColNames) {
Key %>% 
filter(ColumnName %in% ColNames) %>% 
pluck("Table") %>%
mget(inherits = TRUE) %>% 
map(~select(.x, ID, intersect(colnames(.x), ColNames))) %>% 
reduce(left_join, .init = StartDF)
}
grab_new(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343
grab_new(Table1, c("ColH", "ColE"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343

按预期工作。

创建于 2020-01-21 由 reprex 软件包 (v0.3.0(

最新更新