我有两个data.frames
(下面的代码)。data.frames
匹配项中的行 ID。
set.seed(12345)
df1 <- data.frame(id=c(letters),
g1=c(rep(0,7), rep(1,18), "NA"),
g2=c(rep("NA", 3), rep(0,23)),
g3=c(rep(0,5), rep("NA",20),1),
g4=c(sample(c(0,1), replace=TRUE, size=26)),
g5=c(sample(c(0,1), replace=TRUE, size=20), "NA", sample(c(0,1), replace=TRUE, size=5)),
g6=c(rep(1,26)),
g7=c(sample(c(0,1), replace=TRUE, size=26)),
g8=c(rep(0,5), rep("NA",21)),
g9=c(rep("NA",26)),
g10=c(rep(0,26)))
df1[,2:11] <- sapply(df1[,2:11], as.numeric)
df2 <- data.frame(id=c(letters),
b1=c(runif(7, 3, 7.8), "NA", runif(18, 6, 18)),
b2=c(runif(7, -1, 4), "NA", runif(18, 0, 5)),
b3=c(runif(20, 0, 16), "NA", runif(5, -1, 2)),
b4=c(runif(7, 5, 29), rep("NA", 3), runif(3, -1, 2)),
b5=c(runif(3, 3, 8), rep("NA",23)),
b6=c(rep("NA",21), runif(5, 0, 19)),
b7=c(rep("NA",26)),
b8=c(runif(26, 1, 9)),
b9=c(runif(7, -1, 4), "NA", runif(18, -1, 4)),
b10=c(runif(6, -1, 4), rep("NA", 2), runif(18, 0, 5)),
b11=c(runif(7, 18, 23), "NA", runif(18, 12, 19)),
b12=c(runif(7, 0, 4), "NA", runif(18, 0, 4)),
b13=c(runif(7, 1, 4), "NA", runif(14, -2, 18), rep("NA", 4)),
b14=c(runif(6, 6, 8), rep("NA", 3), runif(17, 0, 5)),
b15=c(runif(7, 11, 12), "NA", runif(18, -1, 5)),
b16=c(runif(7, 3, 4), "NA", runif(18, 12, 21)),
b17=c(rep("NA", 8), runif(16, 1, 8), rep("NA", 2)))
df2[,2:18] <- sapply(df2[,2:18], as.numeric)
我想使用 t 检验来测试对于df1
中的每个"g"列,0 和 1 组在df2
中的值是否明显不同。
例如,对于 b1:
t.test(df2$b1[1:8], df2$b1[9:26])$p.val
#[1] 3.846501e-07
我想用结果创建一个新df3
,如下所示:
df3 <- data.frame(g=rep("g1", 3),
b=c("b1", "b2", "b3"),
mean_0=c(mean(na.omit(df2$b1[1:8])),0,0),
mean_1=c(mean(na.omit(df2$b1[9:26])),0,0),
p.val=c(t.test(df2$b1[1:8], df2$b1[9:26])$p.val,1,1),
p.adjust=c(0,1,1))
df3 <- df3[order(df3$p.val),]
我不知道如何编写这个难题。谁能帮忙?
这是我对传统 for 循环的尝试:
#preallocate the result matrix
resultlength <- (ncol(df1)-1) * (ncol(df2)-1)
results <- data.frame(b=vector(mode= "integer", length=resultlength), g=vector(mode= "integer", length=resultlength), p = vector(mode= "numeric", length=resultlength))
#index
i=0
for (g in (2:ncol(df1)) ) {
gindex <- g -1
independent <- df1[, g]
for(col in (2:ncol(df2)) ) {
b=col-1
tdf <- data.frame(dependent=df2[, col], independent )
tdf <- tdf[complete.cases(tdf), ]
# Check to ensure at least a 0 & 1 is present
if (length(unique(tdf$independent)) == 2) {
tresult <- tryCatch( t.test(tdf$dependent ~ tdf$independent)$p.value, error = function(e) 1.2)
}
else
{
tresult <-1.1 #error for lacking 2 groups
}
i <- i +1
results[i,] <- c(b, gindex, tresult)
}
}
这确实有一些错误检查和更正。 现在,如果其中一个组中只有一个值,它将失败。
一种可能的解决方案是使用purrr::map2()
迭代 "g" 和 "b" 列的不同组合,例如
library(tidyverse)
library(broom)
set.seed(12345)
df1 <- data.frame(id=c(letters),
g1=c(rep(0,7), rep(1,18), "NA"),
g2=c(rep("NA", 3), rep(0,23)),
g3=c(rep(0,5), rep("NA",20),1),
g4=c(sample(c(0,1), replace=TRUE, size=26)),
g5=c(sample(c(0,1), replace=TRUE, size=20), "NA", sample(c(0,1), replace=TRUE, size=5)),
g6=c(rep(1,26)),
g7=c(sample(c(0,1), replace=TRUE, size=26)),
g8=c(rep(0,5), rep("NA",21)),
g9=c(rep("NA",26)),
g10=c(rep(0,26)))
df1[,2:11] <- sapply(df1[,2:11], as.numeric)
df2 <- data.frame(id=c(letters),
b1=c(runif(7, 3, 7.8), "NA", runif(18, 6, 18)),
b2=c(runif(7, -1, 4), "NA", runif(18, 0, 5)),
b3=c(runif(20, 0, 16), "NA", runif(5, -1, 2)),
b4=c(runif(7, 5, 29), rep("NA", 3), runif(3, -1, 2)),
b5=c(runif(3, 3, 8), rep("NA",23)),
b6=c(rep("NA",21), runif(5, 0, 19)),
b7=c(rep("NA",26)),
b8=c(runif(26, 1, 9)),
b9=c(runif(7, -1, 4), "NA", runif(18, -1, 4)),
b10=c(runif(6, -1, 4), rep("NA", 2), runif(18, 0, 5)),
b11=c(runif(7, 18, 23), "NA", runif(18, 12, 19)),
b12=c(runif(7, 0, 4), "NA", runif(18, 0, 4)),
b13=c(runif(7, 1, 4), "NA", runif(14, -2, 18), rep("NA", 4)),
b14=c(runif(6, 6, 8), rep("NA", 3), runif(17, 0, 5)),
b15=c(runif(7, 11, 12), "NA", runif(18, -1, 5)),
b16=c(runif(7, 3, 4), "NA", runif(18, 12, 21)),
b17=c(rep("NA", 8), runif(16, 1, 8), rep("NA", 2)))
df2[,2:18] <- sapply(df2[,2:18], as.numeric)
# In your example you use "t.test(df2$b1[1:8], df2$b1[9:26])$p.val"
# but these columns don't line up with the values in df1;
# you should instead use "t.test(df2$b1[1:7], df2$b1[8:25])$p.val", e.g.
t.test(df2$b1[1:7], df2$b1[8:25])$p.val
#> [1] 8.505498e-07
t.test(df2[df1$g1 == 0,]$b1, df2[df1$g1 == 1,]$b1)$p.val
#> [1] 8.505498e-07
df3 <- data.frame(g=rep("g1", 3),
b=c("b1", "b2", "b3"),
mean_0=c(mean(na.omit(df2$b1[1:8])),0,0),
mean_1=c(mean(na.omit(df2$b1[9:26])),0,0),
p.val=c(t.test(df2$b1[1:8], df2$b1[9:26])$p.val,1,1),
p.adjust=c(0,1,1))
df3 <- df3[order(df3$p.val),]
df3
#> g b mean_0 mean_1 p.val p.adjust
#> 1 g1 b1 4.870685 12.10767 3.846501e-07 0
#> 2 g1 b2 0.000000 0.00000 1.000000e+00 1
#> 3 g1 b3 0.000000 0.00000 1.000000e+00 1
df4 <- map2(.x = rep(c("g1", "g4", "g5", "g7"), each = 4),
.y = rep(c("b1", "b2", "b3", "b4"), times = 4),
.f = ~tidy(t.test(df2[df1[[.x]] == 0,][[.y]],
df2[df1[[.x]] == 1,][[.y]])) %>%
mutate(g = .x,
b = .y) %>%
select(g,
b,
"mean_0" = estimate1,
"mean_1" = estimate2,
p.value))
result <- bind_rows(df4)
result
#> # A tibble: 16 × 5
#> g b mean_0 mean_1 p.value
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 g1 b1 4.87 12.0 0.000000851
#> 2 g1 b2 2.41 2.74 0.584
#> 3 g1 b3 7.82 7.53 0.907
#> 4 g1 b4 19.1 11.5 0.0611
#> 5 g4 b1 10.7 9.85 0.683
#> 6 g4 b2 2.59 2.78 0.798
#> 7 g4 b3 7.84 7.11 0.782
#> 8 g4 b4 12.8 13.9 0.822
#> 9 g5 b1 10.1 9.81 0.883
#> 10 g5 b2 2.48 2.81 0.614
#> 11 g5 b3 8.05 7.06 0.743
#> 12 g5 b4 10.8 14.6 0.471
#> 13 g7 b1 9.52 11.3 0.325
#> 14 g7 b2 2.57 3.04 0.396
#> 15 g7 b3 8.20 5.51 0.318
#> 16 g7 b4 16.5 8.31 0.0923
创建于 2022-08-09 由 reprex 软件包 (v2.0.1)
铌。这仅适用于t.test()
不返回错误的列。在这个例子中,我"精心挑选"了"g"和"b"列,它们不全是零/全是一/全NA,所以这个例子就完成了,所以你可能需要改变你的实际数据的方法。