r语言 - 如何将观测值的存在或不存在转换为具有这种格式的二进制事件计数的矩阵?



我正在尝试将遗传估算程序的输入转换为不同的格式,以便我可以在下游分析中使用它。 输入外观的玩具示例如下:

input <- data.frame(A1 = c("a", "a", "b"), A2 = c("b", "a", "b"),
row.names = c("ind1", "ind2", "ind3"), stringsAsFactors = FALSE)
A1 A2
ind1  a  b
ind2  a  a
ind3  b  b

我需要一个矩阵(或数据框,我不介意(,每个单独的两列,每个可能的观察一行。然后,如果每个个体的两个观测值相同,则第二列和该观测值行中将有一个"1"。否则,两个观测值行的第一列中都会有一个"1"。所需的输出如下所示:

output <- matrix(c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), nrow = 2, ncol = 6,
dimnames = list(c("a", "b"),
c("ind1_1", "ind1_2", "ind2_1", "ind2_2", "ind3_1", "ind3_2")))
ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
a      1      0      0      1      0      0
b      1      0      0      0      0      1

我试图创建一个全为零的矩阵,但后来我很难找到应该有"1"的位置,或多或少是这样的:

observations <- sort(unique(c(input$A1, input$A2)))
individuals <- row.names(input)
output2 <- data.frame(matrix(0, nrow = length(observations),
ncol = length(individuals) * 2), row.names = observations)
colnames(output2) <- rep(individuals, each = 2)

然后,我考虑使用带有条件函数的应用语句,如果每个人的观察结果相等或不同,则具有不同的结果。但如果你想出不同的想法,我愿意接受建议。我不介意其他类似语言(python,perl...(的解决方案。

当然,现实比这更复杂,所以我真的很感激可扩展的解决方案。这是具有五个测量值的原始输入的示例:

ID locus allele1 allele2      prob     matching
397    FAM_308  HLAA   26:01   29:02 0.9805655 0.0006153191
677   FAM_2235  HLAA   03:01   03:01 0.9917792 0.0043972647
274 882_cas326  HLAA   01:01   02:01 0.8891524 0.0001758429
246 851_cas295  HLAA   02:01   03:01 0.9468442 0.0002267387
95  678_cas122  HLAA   02:01   02:01 0.9643058 0.0004104801

在玩具示例中,各个 ID(行名称(位于 ID 列中,A1 是等位基因 1 列,A2 是等位基因 2 列。预期输出如下:

FAM_308 FAM_308 FAM_2235 FAM_2235 882_cas326 882_cas326 851_cas295 851_cas295
01:01       0       0        0        0          1          0          0          0
02:01       0       0        0        0          1          0          1          0
03:01       0       0        0        1          0          0          1          0
26:01       1       0        0        0          0          0          0          0
29:02       1       0        0        0          0          0          0          0
678_cas122 678_cas122
01:01          0          0
02:01          0          1
03:01          0          0
26:01          0          0
29:02          0          0

非常感谢您的贡献!

这是使用您的虚拟数据的解决方案。应该很容易适应真实的东西。

library(dplyr)
A1 <- c("a", "a", "b")
A2 <- c("b", "a", "b")
In <- c("ind1", "ind2", "ind3")
alleles <- data.frame(In, A1, A2)
result <- 
bind_rows(alleles, alleles, .id="Index") %>% 
arrange(In) %>% 
mutate(a=case_when(
Index == 1 & A1 == "a" & A2 == "b" ~ 1,
Index == 2 & A1 == "a" & A2 == "a" ~ 1,
TRUE ~ 0
)) %>% 
mutate(b=case_when(
Index == 1 & A1 == "a" & A2 == "b" ~ 1,
Index == 2 & A1 == "b" & A2 == "b" ~ 1,
TRUE ~ 0
))
reshaped <- result %>% 
mutate(new_name=paste(In, Index, sep="_")) %>% 
select(new_name, a, b) %>% 
t
final <- as.matrix(reshaped[2:3,])
colnames(final) <- reshaped[1,]
rownames(final) <- c("a", "b")
final
ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
a "1"    "0"    "0"    "1"    "0"    "0"   
b "1"    "0"    "0"    "0"    "0"    "1"

编辑:一个更通用的解决方案,避免每个等位基因case_when。使用真实数据样本(我认为(:

library(dplyr)
library(tidyr)
ID <- c("FAM_308", "FAM_2235", "882_cas326", "851_cas295", "678_cas122")
allele1 <- c("26:01", "03:01", "01:01", "02:01", "02:01")
allele2 <- c("29:02", "03:01", "02:01", "03:01", "02:01")
DD <- data.frame(ID, allele1, allele2, stringsAsFactors = FALSE) %>% arrange(ID, allele1, allele2)
DD_long <- gather(DD, Allele, Value, -ID)
all_rows <- unique(DD_long$Value)
all_cols <- unique(DD_long$ID)
mm <- matrix(
0,
nrow = length(all_rows),
ncol = length(all_cols) * 2 ,
dimnames = list(all_rows, c(
paste(all_cols, 1, sep = "_"), paste(all_cols, 2, sep = "_")
))
)
# function to fill rows, 
# but don't keep track of whether alleles match
fill_row <- function(row, mat) {
x <- filter(DD_long, Value == row) %>% 
mutate(z=paste(ID, gsub("allele", "", Allele), sep="_")) %>% 
select(z) %>% unlist %>% unname
cat("found allele ", row, "in individual ", x, "nn")
mat[row, x] <- 1
mat
}
for (i in seq_along(all_rows)) {
mm <- fill_row(all_rows[i], mm)
}
# reorganize the 1s and 0s dependent on whether alleles match
reorganize_row <- function(row, col, mat) {
if (sum(mat[row,grep(col, colnames(mm))]) == 1) {
mat[row, grep(col, x = colnames(mat))[1]] <- 1
mat[row, grep(col, x = colnames(mat))[2]] <- 0
}
if (sum(mat[row,grep(col, colnames(mm))]) == 2) {
mat[row, grep(col, x = colnames(mat))[1]] <- 0
mat[row, grep(col, x = colnames(mat))[2]] <- 1
}
mat 
}
# nested loop, sorry
for (i in seq_along(all_rows)) {
for (j in seq_along(all_cols)) {
mm <- reorganize_row(all_rows[i], col = all_cols[j], mat = mm)
}
}
# sort the matrix to be as in example
nn <- mm[c("01:01", "02:01", "03:01", "26:01", "29:02"),
c(
"FAM_308_1",
"FAM_308_2",
"FAM_2235_1",
"FAM_2235_2",
"882_cas326_1",
"882_cas326_2",
"851_cas295_1",
"851_cas295_2",
"678_cas122_1",
"678_cas122_2"
)]
colnames(nn) <- gsub("_1|_2", "", x = colnames(nn))
nn
FAM_308 FAM_308 FAM235 FAM235 882_cas326 882_cas326 851_cas295 851_cas295 678_cas122 678_cas122
01:01       0       0      0      0          1          0          0          0          0          0
02:01       0       0      0      0          1          0          1          0          0          1
03:01       0       0      0      1          0          0          1          0          0          0
26:01       1       0      0      0          0          0          0          0          0          0
29:02       1       0      0      0          0          0          0          0          0          0

使用基数 R,我们可以获得观察的所有unique值。对于每行中的每个观察结果,我们根据条件返回输出。将所有结果绑定在一起,并分配列名和行名。首先在共享input数据上执行此操作

unique_vals <- unique(unlist(input))
cols <- c(t(outer(rownames(input), c("_1", "_2"), paste0)))
output <- do.call(rbind.data.frame, lapply(unique_vals, function(x) 
c(apply(input, 1, function(y) 
if (all(y == x)) c(0, 1) else if (any(y == x)) c(1, 0) else c(0, 0)))))
names(output) <- cols
rownames(output) <- unique_vals

output
#  ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
#a      1      0      0      1      0      0
#b      1      0      0      0      0      1

现在将其应用于原始数据帧 (df(

vals <- c("allele1", "allele2")
unique_vals <- sort(unique(unlist(df[vals])))
cols <- c(t(outer(df$ID, c("_1", "_2"), paste0)))
output <- do.call(rbind.data.frame, lapply(unique_vals, function(x) 
c(apply(df[vals], 1, function(y) 
if (all(y == x)) c(0, 1) else if (any(y == x)) c(1, 0) else c(0, 0)))))
names(output) <- cols
output
#      FAM_308_1 FAM_308_2 FAM_2235_1 FAM_2235_2 882_cas326_1 882_cas326_2
#01:01         0         0          0          0            1            0
#02:01         0         0          0          0            1            0
#03:01         0         0          0          1            0            0
#26:01         1         0          0          0            0            0
#29:02         1         0          0          0            0            0
#      851_cas295_1 851_cas295_2 678_cas122_1 678_cas122_2
#01:01            0            0            0            0
#02:01            1            0            0            1
#03:01            1            0            0            0
#26:01            0            0            0            0
#29:02            0            0            0            0

具有相同名称的列不是一个好的做法,因此在列名称中添加"_1""_2"

df在哪里

df <- structure(list(ID = c("FAM_308", "FAM_2235", "882_cas326", "851_cas295", 
"678_cas122"), locus = c("HLAA", "HLAA", "HLAA", "HLAA", "HLAA"
), allele1 = c("26:01", "03:01", "01:01", "02:01", "02:01"), 
allele2 = c("29:02", "03:01", "02:01", "03:01", "02:01"), 
prob = c(0.9805655, 0.9917792, 0.8891524, 0.9468442, 0.9643058
), matching = c(0.0006153191, 0.0043972647, 0.0001758429, 
0.0002267387, 0.0004104801)), class = "data.frame", row.names = c("397", 
"677", "274", "246", "95"))

最新更新