r语言 - 添加一个变量的新级别,并为每个 id 为其分配其他变量的平均值



我想在数据框中为某个变量添加一个新水平,并把同一变量中其他水平的平均值赋给它,但不知道该怎么做。具体来说,我想在变量 trt 中添加水平 "base",它在 "pointA"、"pointB" 和 "pointC" 上的值取同一 id 下 trt 为 "OA" 和 "OB" 的两行的平均值。

# Example data: 3 ids x 5 treatments with three randomly drawn score columns.
# No seed is set here, so the drawn values differ from run to run.
id <- rep(seq_len(3), each = 5)
trt <- rep(c("A", "OA", "B", "OB", "base"), times = 3)
pointA <- sample(seq_len(10), size = 15, replace = TRUE)
pointB <- sample(seq_len(10), size = 15, replace = TRUE)
pointC <- sample(seq_len(10), size = 15, replace = TRUE)
df <- data.frame(id, trt, pointA, pointB, pointC)
df
##>   id  trt pointA pointB pointC
##>1   1    A      3      3      5
##>2   1   OA     10      6      4
##>3   1    B      9      9      7
##>4   1   OB     10      5      6
##>5   1 base      9      7      3
##>6   2    A      2      9      6
##>7   2   OA      6      3      4
##>8   2    B      6      4     10
##>9   2   OB      8      2      4
##>10  2 base      9      8      4
##>11  3    A      7      1      8
##>12  3   OA      3     10      2
##>13  3    B      2      4      6
##>14  3   OB      2      2      9
##>15  3 base      3      6      8
# Fill every "base" row with the per-id mean of the "OA" and "OB" rows.
# Rows are located by id and trt rather than by hard-coded row numbers, so
# the code keeps working when ids are added or the row order changes.
point_cols <- c("pointA", "pointB", "pointC")
for (cur_id in unique(df$id)) {
  in_id <- df$id == cur_id
  targets <- in_id & df$trt == "base"
  if (!any(targets)) {
    next  # this id has no "base" row to fill
  }
  donors <- in_id & df$trt %in% c("OA", "OB")
  # as.list() supplies one mean per column, so each value is recycled down
  # its own column even if an id has more than one "base" row.
  df[targets, point_cols] <- as.list(colMeans(df[donors, point_cols]))
}

我希望它看起来像:

df
##>   id  trt pointA pointB pointC
##>1   1    A    3.0    3.0    5.0
##>2   1   OA   10.0    6.0    4.0
##>3   1    B    9.0    9.0    7.0
##>4   1   OB   10.0    5.0    6.0
##>5   1 base   10.0    5.5    5.0
##>6   2    A    2.0    9.0    6.0
##>7   2   OA    6.0    3.0    4.0
##>8   2    B    6.0    4.0   10.0
##>9   2   OB    8.0    2.0    4.0
##>10  2 base    7.0    2.5    4.0
##>11  3    A    7.0    1.0    8.0
##>12  3   OA    3.0   10.0    2.0
##>13  3    B    2.0    4.0    6.0
##>14  3   OB    2.0    2.0    9.0
##>15  3 base    2.5    6.0    5.5

我们可以按 "id" 分组,然后用 mutate_at(新版 dplyr 中可改用 mutate 配合 across)和 replace,把 "trt" 为 "base" 的位置替换为该列中 "trt" 为 "OA"/"OB" 的那部分值的 mean:

library(dplyr)

# Within each id, overwrite the "base" entry of every point* column with the
# mean of that column's "OA" and "OB" entries. across() replaces the
# superseded mutate_at(). The result is still grouped by id; append
# ungroup() if later steps should not be grouped.
df %>%
  group_by(id) %>%
  mutate(across(
    starts_with("point"),
    ~ replace(.x, trt == "base", mean(.x[trt %in% c("OA", "OB")]))
  ))
# Groups:   id [3]
#      id trt   pointA pointB pointC
#   <int> <chr>  <dbl>  <dbl>  <dbl>
# 1     1 A        3      3      5  
# 2     1 OA      10      6      4  
# 3     1 B        9      9      7  
# 4     1 OB      10      5      6  
# 5     1 base    10      5.5    5  
# 6     2 A        2      9      6  
# 7     2 OA       6      3      4  
# 8     2 B        6      4     10  
# 9     2 OB       8      2      4  
#10     2 base     7      2.5    4  
#11     3 A        7      1      8  
#12     3 OA       3     10      2  
#13     3 B        2      4      6  
#14     3 OB       2      2      9  
#15     3 base     2.5    6      5.5

另一种做法是先用 filter 选出 "trt" 为 "OA"/"OB" 的行,按 id 计算各 "point" 列的 mean,再用 bind_rows 把结果与原数据中其余的行合并:

# Build the "base" rows from scratch: average the point* columns over the
# "OA"/"OB" rows of each id, label the result "base", append the original
# non-"base" rows, and sort so that "base" comes last within each id.
df %>%
  filter(trt %in% c("OA", "OB")) %>%
  group_by(id, trt = "base") %>%
  # across() replaces the superseded summarise_at(); "drop_last" spells out
  # the default regrouping, leaving the result grouped by id.
  summarise(across(starts_with("point"), mean), .groups = "drop_last") %>%
  bind_rows(filter(df, trt != "base")) %>%
  arrange(id, trt == "base")

或者使用 base R 的 split/unsplit:

# Base R alternative: handle each id's rows separately, then put the pieces
# back together in the original row order.
unsplit(
  lapply(split(df, df$id), function(grp) {
    donor_rows <- grp$trt %in% c("OA", "OB")
    # Columns 3 to 5 are the point columns; the "base" row gets their means.
    grp[grp$trt == "base", 3:5] <- colMeans(grp[donor_rows, 3:5])
    grp
  }),
  df$id
)

数据

# Fixed copy of the example data printed in the question.
df <- structure(
  list(
    id = rep(1:3, each = 5L),
    trt = rep(c("A", "OA", "B", "OB", "base"), times = 3L),
    pointA = c(3L, 10L, 9L, 10L, 9L, 2L, 6L, 6L, 8L, 9L, 7L, 3L, 2L, 2L, 3L),
    pointB = c(3L, 6L, 9L, 5L, 7L, 9L, 3L, 4L, 2L, 8L, 1L, 10L, 4L, 2L, 6L),
    pointC = c(5L, 4L, 7L, 6L, 3L, 6L, 4L, 10L, 4L, 4L, 8L, 2L, 6L, 9L, 8L)
  ),
  class = "data.frame",
  row.names = as.character(1:15)
)

对于基本 R 解决方案:

像以前一样重新创建df,但首先设置种子:

# Recreate the example data, fixing the RNG seed first so that the three
# draws below are repeatable.
set.seed(1)
id <- rep(seq_len(3), each = 5)
trt <- rep(c("A", "OA", "B", "OB", "base"), times = 3)
pointA <- sample(seq_len(10), size = 15, replace = TRUE)
pointB <- sample(seq_len(10), size = 15, replace = TRUE)
pointC <- sample(seq_len(10), size = 15, replace = TRUE)
df <- data.frame(id, trt, pointA, pointB, pointC)

首先,找出 trt 的值为 "base" 的行号:

# Row numbers of the records whose treatment is "base".
base_row_ind_vec <- which(df[["trt"]] == "base")

然后遍历这些行,把第 3 列到第 5 列的值替换为同一 id 下 trt 为 "OA" 和 "OB" 的对应值的平均值:

# For every "base" row, overwrite columns 3 to 5 (the point columns) with
# the mean of the "OA" and "OB" rows that share the same id.
for (base_row in base_row_ind_vec) {
  same_id <- df$id == df[base_row, "id"]
  donor_rows <- same_id & df$trt %in% c("OA", "OB")
  df[base_row, 3:5] <- lapply(df[donor_rows, 3:5], mean)
}

然后df包含以下内容:

id  trt pointA pointB pointC
1   1    A    9.0    9.0   10.0
2   1   OA    4.0    5.0    6.0
3   1    B    7.0    5.0    4.0
4   1   OB    1.0    9.0    4.0
5   1 base    2.5    7.0    5.0
6   2    A    7.0    5.0    9.0
7   2   OA    2.0    5.0    7.0
8   2    B    3.0    2.0    6.0
9   2   OB    1.0   10.0    9.0
10  2 base    1.5    7.5    8.0
11  3    A    5.0    1.0    9.0
12  3   OA   10.0    4.0    7.0
13  3    B    6.0    3.0    8.0
14  3   OB   10.0    6.0    6.0
15  3 base   10.0    5.0    6.5

最新更新