我想在数据框中为某个变量添加一个水平,并把同一变量中其他水平的平均值赋给它,但不知道该怎么做。具体来说,我想在变量 trt 中添加水平 "base",它在变量 "pointA"、"pointB" 和 "pointC" 上的取值应为同一 id 内 trt 为 "OA" 和 "OB" 的两行的平均值。下面是示例数据,以及我目前逐个单元格手动赋值的做法:
# Build a 15-row example: three ids, each carrying the five trt levels, plus
# three score columns drawn at random from 1..10 (with replacement).
# No seed is set here, so the values differ from run to run.
id <- rep(seq_len(3), each = 5)
trt <- rep(c("A", "OA", "B", "OB", "base"), times = 3)
pointA <- sample(1:10, size = 15, replace = TRUE)
pointB <- sample(1:10, size = 15, replace = TRUE)
pointC <- sample(1:10, size = 15, replace = TRUE)
df <- data.frame(
  id = id,
  trt = trt,
  pointA = pointA,
  pointB = pointB,
  pointC = pointC
)
df
##> id trt pointA pointB pointC
##>1 1 A 3 3 5
##>2 1 OA 10 6 4
##>3 1 B 9 9 7
##>4 1 OB 10 5 6
##>5 1 base 9 7 3
##>6 2 A 2 9 6
##>7 2 OA 6 3 4
##>8 2 B 6 4 10
##>9 2 OB 8 2 4
##>10 2 base 9 8 4
##>11 3 A 7 1 8
##>12 3 OA 3 10 2
##>13 3 B 2 4 6
##>14 3 OB 2 2 9
##>15 3 base 3 6 8
# For every id, overwrite the "base" row of each point column with the mean of
# that id's "OA" and "OB" rows. Rows and columns are located by value and by
# name, so this keeps working if rows are reordered or more ids are added.
point_cols <- c("pointA", "pointB", "pointC")
for (cur_id in unique(df$id)) {
  in_id <- df$id == cur_id
  is_base <- in_id & df$trt %in% "base"
  is_donor <- in_id & df$trt %in% c("OA", "OB")
  # Skip ids that have no "base" row to fill.
  if (any(is_base)) {
    # A list value assigns one mean per column, whatever the number of rows.
    df[is_base, point_cols] <- as.list(colMeans(df[is_donor, point_cols]))
  }
}
我希望它看起来像:
# Print the data frame; the commented rows that follow show the desired result.
df
##> id trt pointA pointB pointC
##>1 1 A 3.0 3.0 5.0
##>2 1 OA 10.0 6.0 4.0
##>3 1 B 9.0 9.0 7.0
##>4 1 OB 10.0 5.0 6.0
##>5 1 base 10.0 5.5 5.0
##>6 2 A 2.0 9.0 6.0
##>7 2 OA 6.0 3.0 4.0
##>8 2 B 6.0 4.0 10.0
##>9 2 OB 8.0 2.0 4.0
##>10 2 base 7.0 2.5 4.0
##>11 3 A 7.0 1.0 8.0
##>12 3 OA 3.0 10.0 2.0
##>13 3 B 2.0 4.0 6.0
##>14 3 OB 2.0 2.0 9.0
##>15 3 base 2.5 6.0 5.5
我们可以按 "id" 分组,然后用 mutate_at(在新版 dplyr 中对应 mutate 加 across)配合 replace,把 "trt" 为 "base" 的位置替换为 "trt" 是 "OA"/"OB" 的那部分值的 mean:
library(dplyr)

# Within each id, overwrite the "base" entry of every point* column with the
# mean of that id's "OA" and "OB" entries. mutate(across()) replaces the
# superseded mutate_at(). The result stays grouped by id; append ungroup() if
# later steps should not be grouped.
df %>%
  group_by(id) %>%
  mutate(across(
    starts_with("point"),
    ~ replace(.x, trt == "base", mean(.x[trt %in% c("OA", "OB")]))
  ))
# Groups: id [3]
# id trt pointA pointB pointC
# <int> <chr> <dbl> <dbl> <dbl>
# 1 1 A 3 3 5
# 2 1 OA 10 6 4
# 3 1 B 9 9 7
# 4 1 OB 10 5 6
# 5 1 base 10 5.5 5
# 6 2 A 2 9 6
# 7 2 OA 6 3 4
# 8 2 B 6 4 10
# 9 2 OB 8 2 4
#10 2 base 7 2.5 4
#11 3 A 7 1 8
#12 3 OA 3 10 2
#13 3 B 2 4 6
#14 3 OB 2 2 9
#15 3 base 2.5 6 5.5
另一种选择是先 filter 出 "trt" 为 "OA"/"OB" 的行,按 id 计算各 "point" 列的 mean,再用 bind_rows 把结果与原数据中非 "base" 的行合并:
# Average the "OA"/"OB" rows per id into a new row labelled "base", stack it
# with the original non-"base" rows, then sort so "base" comes last within
# each id. summarise(across()) replaces the superseded summarise_at(), and
# .groups = "drop" returns an ungrouped result.
df %>%
  filter(trt %in% c("OA", "OB")) %>%
  group_by(id, trt = "base") %>%
  summarise(across(starts_with("point"), mean), .groups = "drop") %>%
  bind_rows(filter(df, trt != "base")) %>%
  arrange(id, trt == "base")
或者使用 base R 的 split/unsplit:
# Base R: split df by id, patch each piece, then let unsplit() put the rows
# back in their original order.
unsplit(
  lapply(split(df, df$id), function(grp) {
    # Rows of this id that feed the average, restricted to columns 3-5.
    donors <- grp[grp$trt %in% c("OA", "OB"), 3:5]
    grp[grp$trt == "base", 3:5] <- colMeans(donors)
    grp
  }),
  df$id
)
数据
# Literal copy of the example data, with the same values as the table printed
# in the question, so the snippets can be run without random sampling.
df <- structure(
  list(
    id = rep(1:3, each = 5L),
    trt = rep(c("A", "OA", "B", "OB", "base"), times = 3L),
    pointA = c(3L, 10L, 9L, 10L, 9L, 2L, 6L, 6L, 8L, 9L, 7L, 3L, 2L, 2L, 3L),
    pointB = c(3L, 6L, 9L, 5L, 7L, 9L, 3L, 4L, 2L, 8L, 1L, 10L, 4L, 2L, 6L),
    pointC = c(5L, 4L, 7L, 6L, 3L, 6L, 4L, 10L, 4L, 4L, 8L, 2L, 6L, 9L, 8L)
  ),
  class = "data.frame",
  row.names = as.character(1:15)
)
下面是一个 base R 解决方案。像之前一样重新创建 df,但先设置随机种子:
# Fix the RNG seed so the three draws below are reproducible, then rebuild the
# example data frame (three ids x five trt levels, scores in 1..10).
set.seed(1)
id <- rep(seq_len(3), each = 5)
trt <- rep(c("A", "OA", "B", "OB", "base"), times = 3)
# Draw order matters: pointA, then pointB, then pointC.
pointA <- sample.int(10, size = 15, replace = TRUE)
pointB <- sample.int(10, size = 15, replace = TRUE)
pointC <- sample.int(10, size = 15, replace = TRUE)
df <- data.frame(
  id = id,
  trt = trt,
  pointA = pointA,
  pointB = pointB,
  pointC = pointC
)
首先,找出 trt 取值为 "base" 的行:
# Row positions whose trt value is "base".
base_row_ind_vec <- which(df[["trt"]] == "base")
然后遍历这些行,把第 3 至第 5 列的值替换为同一 id 中 trt 为 "OA" 和 "OB" 的两行对应值的平均值:
# For each "base" row, replace columns 3-5 with the per-column mean of the
# "OA"/"OB" rows that share its id.
for (row_idx in base_row_ind_vec) {
  donors <- df$id == df[row_idx, "id"] & df$trt %in% c("OA", "OB")
  # lapply() yields one mean per column, assigned column by column.
  df[row_idx, 3:5] <- lapply(df[donors, 3:5], mean)
}
此时 df 的内容如下:
id trt pointA pointB pointC
1 1 A 9.0 9.0 10.0
2 1 OA 4.0 5.0 6.0
3 1 B 7.0 5.0 4.0
4 1 OB 1.0 9.0 4.0
5 1 base 2.5 7.0 5.0
6 2 A 7.0 5.0 9.0
7 2 OA 2.0 5.0 7.0
8 2 B 3.0 2.0 6.0
9 2 OB 1.0 10.0 9.0
10 2 base 1.5 7.5 8.0
11 3 A 5.0 1.0 9.0
12 3 OA 10.0 4.0 7.0
13 3 B 6.0 3.0 8.0
14 3 OB 10.0 6.0 6.0
15 3 base 10.0 5.0 6.5