r语言 - 如何根据某些条件优化粘贴单个/多个列名及其值



我想粘贴列名及其值。它必须基于一些条件(if语句),它可以基于单个变量或多个变量。

下面是一个显示数据的小示例。我想加快这个过程,得到与fun2、fun3和fun4相同的结果。

为了使其尽可能简单,如果列a、b、c和d的值大于零,则只需要设置一条规则为missing。但是,我留下了规则的名称,因为它可以是不同的,比如"规则1"。比;0和规则2;

library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a  = c(0, NA,  0,  1,  0, NA,  1,  1,  0, 1),
b  = c(0, NA, NA,  0,  1,  0,  1, NA,  1, 1),
c  = c(0, NA,  0, NA,  0,  1, NA,  1,  1, 1),
d  = c(0, NA,  1,  1,  0,  1,  0,  1, NA, 1),
re = "")

这就是数据的样子:

id         t1         t2  a  b  c  d re
1  0.6883367 -0.3454049  0  0  0  0 '' 
2 -1.0653127 -1.3035077 NA NA NA NA '' 
3  0.5210550  0.8489376  0 NA  0  1 '' 
4  0.3697369 -0.1135827  1  0 NA  1 '' 
5  1.3195759 -1.5431305  0  1  0  0 '' 
6 -0.2106836 -0.3421900 NA  0  1  1 '' 
7 -0.2258871 -2.1644697  1  1 NA  0 '' 
8 -0.7132686  1.7673775  1 NA  1  1 '' 
9  0.9467068  1.8188665  0  1  1 NA '' 
10 -0.3900479  1.7306935  1  1  1  1 '' 

下面是期望的输出。这个想法是保留一个列,其中包含一个描述,说明某些值被设置为缺失的原因。在这个例子中,只有前两个个体有t1和t2的记录。个体1、个体2和个体3有t1的记录,个体1、个体2、个体5、个体7和个体9有t2的记录。

id       t1     t2     a     b     c     d    re                                      
1  -0.182   1.43      0     0     0     0   ""                                      
2  -1.31    0.733    NA    NA    NA    NA   ""                                      
3  -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1);"                       
4  NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1);"        
5  NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
6  NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1);"        
7  NA      -0.345     1     1    NA     0   "Rule1:t1(a=1 b=1); "                   
8  NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"   
9  NA      -1.22      0     1     1    NA   "Rule1:t1(b=1 c=1); "                   
10  NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"

首次尝试(fun1)。不是预期的结果,因为它在mutate中查找单个空格。所有其他函数(fun2、fun3和fun4)都输出正确的结果。

fun1 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id    t1     t2     a     b     c     d    re                                      
<int> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
1    NA   1.43     0     0     0     0   "Rule1:t1(  ); "                        
2    NA   0.733   NA    NA    NA    NA   "Rule1:t1(  ); "                        
3    NA  NA        0    NA     0     1   "Rule2:t2(d=1); Rule1:t1(  ); "         
4    NA  NA        1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1  ); "      
5    NA   1.78     0     1     0     0   "Rule1:t1( b=1 ); "                     
6    NA  NA       NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(  c=1); "      
7    NA  -0.345    1     1    NA     0   "Rule1:t1(a=1 b=1 ); "                  
8    NA  NA        1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
9    NA  -1.22     0     1     1    NA   "Rule1:t1( b=1 c=1); "                  
10    NA  NA        1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

函数2 (fun2)使用"trimws"

fun2 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id      t1     t2     a     b     c     d    re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
1 -0.182   1.43      0     0     0     0   ""                                      
2 -1.31    0.733    NA    NA    NA    NA   ""                                      
3 -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1); "                       
4 NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1); "        
5 NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
6 NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1); "        
7 NA      -0.345     1     1    NA     0   "Rule1:t1(a=1 b=1); "                   
8 NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
9 NA      -1.22      0     1     1    NA   "Rule1:t1(b=1 c=1); "                   
10 NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

函数3 (fun3)使用"gsub"使用正则表达式。

fun3 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := gsub("\s+","", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id      t1     t2     a     b     c     d    re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
1 -0.182   1.43      0     0     0     0   ""                                      
2 -1.31    0.733    NA    NA    NA    NA   ""                                      
3 -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1); "                       
4 NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1); "        
5 NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
6 NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1); "        
7 NA      -0.345     1     1    NA     0   "Rule1:t1(a=1b=1); "                   
8 NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1c=1); "   
9 NA      -1.22      0     1     1    NA   "Rule1:t1(b=1c=1); "                   
10 NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "

函数4 (fun4)在正则表达式mutate中使用stri_detect

fun4 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id      t1     t2     a     b     c     d re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
1 -0.182   1.43      0     0     0     0 ""                                      
2 -1.31    0.733    NA    NA    NA    NA ""                                      
3 -0.0613 NA         0    NA     0     1 "Rule2:t2(d=1); "                       
4 NA      NA         1     0    NA     1 "Rule2:t2(d=1); Rule1:t1(a=1  ); "      
5 NA       1.78      0     1     0     0 "Rule1:t1( b=1 ); "                     
6 NA      NA        NA     0     1     1 "Rule2:t2(d=1); Rule1:t1(  c=1); "      
7 NA      -0.345     1     1    NA     0 "Rule1:t1(a=1 b=1 ); "                  
8 NA      NA         1    NA     1     1 "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
9 NA      -1.22      0     1     1    NA "Rule1:t1( b=1 c=1); "                  
10 NA      NA         1     1     1     1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

使用更多数据进行基准测试

n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a  = sample(c(0, NA, 1), n, replace = TRUE),
b  = sample(c(0, NA, 1), n, replace = TRUE),
c  = sample(c(0, NA, 1), n, replace = TRUE),
d  = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
benchmark(fun1(dat),
fun2(dat),
fun3(dat),
fun4(dat))
Benchmark summary:
Time units : milliseconds 
expr n.eval min lw.qu median mean up.qu  max total relative
fun1(dat)    100 642   653    660  668   666  774 66800     1.00
fun2(dat)    100 742   756    763  773   768  874 77300     1.16
fun3(dat)    100 765   779    785  794   791  903 79400     1.19
fun4(dat)    100 743   756    763  777   770 1010 77700     1.16

有没有人知道如何加快这个过程?

谢谢。

首先,我承认我没能打败基准测试(感谢你的挑战)。可能有一些方法可以从中获得一点速度,但让我推荐一种方法,它可以做同样的事情(小数据更快,大数据差不多),但支持按规则函数。这不是你直接要求的,但你暗示了每个规则的不同功能。

更新(我代码,感谢@Cole找到了我早期探索的残余。)

RULES <- list(
Rule1 = list(
rule = "Rule1",
lhs = "t1",
rhs = c("a", "b", "c"),
fun = function(z) !is.na(z) & z > 0
),
Rule2 = list(
rule = "Rule2",
lhs = "t2",
rhs = "d",
fun = is.na
)
)
fun9 <- function(dat, RULES = list()) {
nr <- nrow(dat)
# RE <- lapply(seq_along(RULES), function(ign) rep("", nr))
RE <- asplit(matrix("", nrow = length(RULES), ncol = nr), 1)
for (r in seq_along(RULES)) {
fun <- RULES[[r]]$fun
lhs <- RULES[[r]]$lhs
for (rhs in RULES[[r]]$rhs) {
lgl <- do.call(fun, list(dat[[rhs]]))
set(dat, which(lgl), lhs, NA)
RE[[r]][lgl] <- sprintf("%s %s=1", RE[[r]][lgl], rhs)
}
ind <- nzchar(RE[[r]])
RE[[r]][ind] <- sprintf("%s:%s(%s)", RULES[[r]]$rule, lhs, RE[[r]][ind])
}
set(dat, j = "re", value = do.call(paste, c(RE, sep = ";")))
}

RULES和使用fun9的前提应该是不言自明的。

使用小数据进行基准测试似乎很有希望:

set.seed(2021)
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a  = c(0, NA,  0,  1,  0, NA,  1,  1,  0, 1),
b  = c(0, NA, NA,  0,  1,  0,  1, NA,  1, 1),
c  = c(0, NA,  0, NA,  0,  1, NA,  1,  1, 1),
d  = c(0, NA,  1,  1,  0,  1,  0,  1, NA, 1),
re = "")
fun9(dat, RULES)[]
#        id         t1         t2     a     b     c     d                                re
#     <int>      <num>      <num> <num> <num> <num> <num>                            <char>
#  1:     1 -0.1224600 -1.0822049     0     0     0     0                                 ;
#  2:     2  0.5524566         NA    NA    NA    NA    NA                   ;Rule2:t2( d=1)
#  3:     3  0.3486495  0.1819954     0    NA     0     1                                 ;
#  4:     4         NA  1.5085418     1     0    NA     1                   Rule1:t1( a=1);
#  5:     5         NA  1.6044701     0     1     0     0                   Rule1:t1( b=1);
#  6:     6         NA -1.8414756    NA     0     1     1                   Rule1:t1( c=1);
#  7:     7         NA  1.6233102     1     1    NA     0               Rule1:t1( a=1 b=1);
#  8:     8         NA  0.1313890     1    NA     1     1               Rule1:t1( a=1 c=1);
#  9:     9         NA         NA     0     1     1    NA Rule1:t1( b=1 c=1);Rule2:t2( d=1)
# 10:    10         NA  1.5133183     1     1     1     1           Rule1:t1( a=1 b=1 c=1);
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# # A tibble: 2 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                  time             gc                  
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                  <list>           <list>              
# 1 fun4(dat)          9.52ms   11.1ms      88.5     316KB     2.06    43     1      486ms <NULL> <Rprofmem[,3] [84 x 3]> <bch:tm [44]>    <tibble [44 x 3]>   
# 2 fun9(dat, RULES)   97.5us  113.5us    7760.       416B     6.24  3731     3      481ms <NULL> <Rprofmem[,3] [2 x 3]>  <bch:tm [3,734]> <tibble [3,734 x 3]>

仅从`itr/sec`来看,这个fun9看起来快一点。

数据量较大:

set.seed(2021)
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a  = sample(c(0, NA, 1), n, replace = TRUE),
b  = sample(c(0, NA, 1), n, replace = TRUE),
c  = sample(c(0, NA, 1), n, replace = TRUE),
d  = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                   time         gc              
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                   <list>       <list>          
# 1 fun4(dat)           1.24s    1.24s     0.806    62.9MB     1.61     1     2      1.24s <NULL> <Rprofmem[,3] [150 x 3]> <bch:tm [1]> <tibble [1 x 3]>
# 2 fun9(dat, RULES) 296.11ms  315.4ms     3.17     53.8MB     4.76     2     3    630.8ms <NULL> <Rprofmem[,3] [70 x 3]>  <bch:tm [2]> <tibble [2 x 3]>

虽然此解决方案不使用tidytable或其流,但它更快。re的清理是另一个步骤,可能会把这个速度降低到致命的水平:-)。

旁注:我试图使用lapply,mget和其他技巧在data.table数据环境中做的事情,但最终,使用data.table::set(https://stackoverflow.com/a/16846530/3358272)和简单向量似乎是最快的。

相关内容

最新更新