R数据帧:循环通过多个列和行值



我是R的新手。我有一个包含数百万行的大型数据帧,如下所示:

Whole   code1       P_1   Q_1   code2   P_2   Q_2   code3   P_3   Q_3
64      a       0.2   0.1   b      0.3    0.2   d      0.1    0.9
55      a       0.5   0.3   c      0.1    0.3   b      0.4    0.4
70      b       0.4   0.1   d      0.2    0.5   NULL   0.7    0.7
26      c       0.7   0.5   a      0.2    0.6   b      0.2    0.2
47      a       0.8   0.7   d      0.1    0.2   NULL   0.6    0.8
35      d       0.2   0.8   b      0.8    0.1   a      0.2    0.1

我正在根据code1、code2和code3中的值查找三个输出字段。

> Output1   :   If code1 is 'a' or 'b', then Output1 = Whole*P_1, else Output1 = Whole* Q_1
> Output2   :  If code1 is 'a' or 'b', then Output1 = Whole*P_2, else Output2 = Whole* Q_2
> Output3   :  If code1 is 'a' or 'b', then Output1 = Whole*P_3, else Output3 = Whole* Q_3

如果以下代码可以更正,我们将不胜感激:

df1 %>%    
for (i in 1:6) {
if (paste0("code", i) %in% c("a", "b")) {
mutate (paste0("Output", i) = Whole * paste0("P_", i) )
} else {    
mutate (paste0("Output", i) = Whole * paste0("Q_", i) )
}   
} 
library(dplyr)
df1 %>%
mutate(
Output1 = Whole * if_else(code1 %in% c('a', 'b'), P_1, Q_1),
Output2 = Whole * if_else(code1 %in% c('a', 'b'), P_2, Q_2),
Output3 = Whole * if_else(code1 %in% c('a', 'b'), P_3, Q_3)
)
#   Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
# 1    64     a 0.2 0.1     b 0.3 0.2     d 0.1 0.9    12.8    19.2     6.4
# 2    55     a 0.5 0.3     c 0.1 0.3     b 0.4 0.4    27.5     5.5    22.0
# 3    70     b 0.4 0.1     d 0.2 0.5  NULL 0.7 0.7    28.0    14.0    49.0
# 4    26     c 0.7 0.5     a 0.2 0.6     b 0.2 0.2    13.0    15.6     5.2
# 5    47     a 0.8 0.7     d 0.1 0.2  NULL 0.6 0.8    37.6     4.7    28.2
# 6    35     d 0.2 0.8     b 0.8 0.1     a 0.2 0.1    28.0     3.5     3.5

如果你的数据更通用(不是硬编码的,也不是超过"3"组的列(,那么我们可以重新塑造数据,进行赋值,然后重新塑造它。

library(tidyr)
df1 %>%
rename_at(vars(starts_with("code")), ~ gsub("(\D+)", "\1_", .)) %>%
pivot_longer(
-Whole,
names_to = c(".value", "set"),
names_sep = "_"
) %>%
mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q)) %>%
pivot_wider(
id_cols = Whole,
names_from = set,
values_from = c(code, P, Q, Output),
names_sep = "_"
)
# # A tibble: 6 x 13
#   Whole code_1 code_2 code_3   P_1   P_2   P_3   Q_1   Q_2   Q_3 Output_1 Output_2 Output_3
#   <int> <chr>  <chr>  <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>    <dbl>
# 1    64 a      b      d        0.2   0.3   0.1   0.1   0.2   0.9     12.8     19.2     57.6
# 2    55 a      c      b        0.5   0.1   0.4   0.3   0.3   0.4     27.5     16.5     22  
# 3    70 b      d      NULL     0.4   0.2   0.7   0.1   0.5   0.7     28       35       49  
# 4    26 c      a      b        0.7   0.2   0.2   0.5   0.6   0.2     13        5.2      5.2
# 5    47 a      d      NULL     0.8   0.1   0.6   0.7   0.2   0.8     37.6      9.4     37.6
# 6    35 d      b      a        0.2   0.8   0.2   0.8   0.1   0.1     28       28        7  

顺便说一句,一般来说,我建议保持"长"格式,不要重新扩大它。这种"长"形式通常更适合其他整洁的函数(包括ggplot2(,并且很容易扩展到任意计数。这将导致这样的数据:

df1 %>%
rename_at(vars(starts_with("code")), ~ gsub("(\D+)", "\1_", .)) %>%
pivot_longer(
-Whole,
names_to = c(".value", "set"),
names_sep = "_"
) %>%
mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q))
# # A tibble: 18 x 6
#    Whole set   code      P     Q Output
#    <int> <chr> <chr> <dbl> <dbl>  <dbl>
#  1    64 1     a       0.2   0.1   12.8
#  2    64 2     b       0.3   0.2   19.2
#  3    64 3     d       0.1   0.9   57.6
#  4    55 1     a       0.5   0.3   27.5
#  5    55 2     c       0.1   0.3   16.5
#  6    55 3     b       0.4   0.4   22  
#  7    70 1     b       0.4   0.1   28  
#  8    70 2     d       0.2   0.5   35  
#  9    70 3     NULL    0.7   0.7   49  
# 10    26 1     c       0.7   0.5   13  
# 11    26 2     a       0.2   0.6    5.2
# 12    26 3     b       0.2   0.2    5.2
# 13    47 1     a       0.8   0.7   37.6
# 14    47 2     d       0.1   0.2    9.4
# 15    47 3     NULL    0.6   0.8   37.6
# 16    35 1     d       0.2   0.8   28  
# 17    35 2     b       0.8   0.1   28  
# 18    35 3     a       0.2   0.1    7  

(短得多。(

我们可以使用map2。获取具有"P"、"Q"的列的名称。随后是CCD_ 3之后的数字。然后使用map2在相应的列上循环,应用转换逻辑并将列与原始数据集绑定

library(dplyr)
library(purrr)
library(stringr)
ps <-  names(df1)[str_detect(names(df1), "^P_\d+$")]
qs <-  names(df1)[str_detect(names(df1), "^Q_\d+$")]
map2_dfc(ps, qs, ~ df1 %>%
transmute(Output = Whole *  
case_when(code1 %in% c('a', 'b') ~ !! rlang::sym(.x),
TRUE ~ !! rlang::sym(.y)))) %>% 
rename_all(~ str_remove(., fixed("..."))) %>%     
bind_cols(df1, .)
#   Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
#1    64     a 0.2 0.1     b 0.3 0.2     d 0.1 0.9    12.8    19.2     6.4
#2    55     a 0.5 0.3     c 0.1 0.3     b 0.4 0.4    27.5     5.5    22.0
#3    70     b 0.4 0.1     d 0.2 0.5  NULL 0.7 0.7    28.0    14.0    49.0
#4    26     c 0.7 0.5     a 0.2 0.6     b 0.2 0.2    13.0    15.6     5.2
#5    47     a 0.8 0.7     d 0.1 0.2  NULL 0.6 0.8    37.6     4.7    28.2
#6    35     d 0.2 0.8     b 0.8 0.1     a 0.2 0.1    28.0     3.5     3.5

数据

df1 <- structure(list(Whole = c(64L, 55L, 70L, 26L, 47L, 35L), code1 = c("a", 
"a", "b", "c", "a", "d"), P_1 = c(0.2, 0.5, 0.4, 0.7, 0.8, 0.2
), Q_1 = c(0.1, 0.3, 0.1, 0.5, 0.7, 0.8), code2 = c("b", "c", 
"d", "a", "d", "b"), P_2 = c(0.3, 0.1, 0.2, 0.2, 0.1, 0.8), Q_2 = c(0.2, 
0.3, 0.5, 0.6, 0.2, 0.1), code3 = c("d", "b", "NULL", "b", "NULL", 
"a"), P_3 = c(0.1, 0.4, 0.7, 0.2, 0.6, 0.2), Q_3 = c(0.9, 0.4, 
0.7, 0.2, 0.8, 0.1)), class = "data.frame", row.names = c(NA, 
-6L))

根据您有多少行,这种data.table方法可能会更快。

library(data.table)
setDT(df1)[,Logical := (code1 == "a" | code1 == "b")][
,`:=`(Output1 = numeric(),Output2 = numeric(), Output3 = numeric())
][Logical == TRUE,`:=`(Output1 = Whole * P_1,
Output2 = Whole * P_2,
Output3 = Whole * P_3)
][Logical == FALSE,`:=`(Output1 = Whole * Q_1,
Output2 = Whole * Q_2,
Output3 = Whole * Q_3)
][,.(Output1,Output2,Output3)]
Output1 Output2 Output3
1:    12.8    19.2     6.4
2:    27.5     5.5    22.0
3:    28.0    14.0    49.0
4:    13.0    15.6     5.2
5:    37.6     4.7    28.2
6:    28.0     3.5     3.5

最新更新