我正在尝试将遗传估算程序的输入转换为不同的格式,以便我可以在下游分析中使用它。 输入外观的玩具示例如下:
input <- data.frame(A1 = c("a", "a", "b"), A2 = c("b", "a", "b"),
row.names = c("ind1", "ind2", "ind3"), stringsAsFactors = FALSE)
A1 A2
ind1 a b
ind2 a a
ind3 b b
我需要一个矩阵(或数据框,我不介意(,每个单独的两列,每个可能的观察一行。然后,如果每个个体的两个观测值相同,则第二列和该观测值行中将有一个"1"。否则,两个观测值行的第一列中都会有一个"1"。所需的输出如下所示:
output <- matrix(c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), nrow = 2, ncol = 6,
dimnames = list(c("a", "b"),
c("ind1_1", "ind1_2", "ind2_1", "ind2_2", "ind3_1", "ind3_2")))
ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
a 1 0 0 1 0 0
b 1 0 0 0 0 1
我试图创建一个全为零的矩阵,但后来我很难找到应该有"1"的位置,或多或少是这样的:
observations <- sort(unique(c(input$A1, input$A2)))
individuals <- row.names(input)
output2 <- data.frame(matrix(0, nrow = length(observations),
ncol = length(individuals) * 2), row.names = observations)
colnames(output2) <- rep(individuals, each = 2)
然后,我考虑使用带有条件函数的应用语句,如果每个人的观察结果相等或不同,则具有不同的结果。但如果你想出不同的想法,我愿意接受建议。我不介意其他类似语言(python,perl...(的解决方案。
当然,现实比这更复杂,所以我真的很感激可扩展的解决方案。这是具有五个测量值的原始输入的示例:
ID locus allele1 allele2 prob matching
397 FAM_308 HLAA 26:01 29:02 0.9805655 0.0006153191
677 FAM_2235 HLAA 03:01 03:01 0.9917792 0.0043972647
274 882_cas326 HLAA 01:01 02:01 0.8891524 0.0001758429
246 851_cas295 HLAA 02:01 03:01 0.9468442 0.0002267387
95 678_cas122 HLAA 02:01 02:01 0.9643058 0.0004104801
在玩具示例中,各个 ID(行名称(位于 ID 列中,A1 是等位基因 1 列,A2 是等位基因 2 列。预期输出如下:
FAM_308 FAM_308 FAM_2235 FAM_2235 882_cas326 882_cas326 851_cas295 851_cas295
01:01 0 0 0 0 1 0 0 0
02:01 0 0 0 0 1 0 1 0
03:01 0 0 0 1 0 0 1 0
26:01 1 0 0 0 0 0 0 0
29:02 1 0 0 0 0 0 0 0
678_cas122 678_cas122
01:01 0 0
02:01 0 1
03:01 0 0
26:01 0 0
29:02 0 0
非常感谢您的贡献!
这是使用您的虚拟数据的解决方案。应该很容易适应真实的东西。
library(dplyr)
A1 <- c("a", "a", "b")
A2 <- c("b", "a", "b")
In <- c("ind1", "ind2", "ind3")
alleles <- data.frame(In, A1, A2)
result <-
bind_rows(alleles, alleles, .id="Index") %>%
arrange(In) %>%
mutate(a=case_when(
Index == 1 & A1 == "a" & A2 == "b" ~ 1,
Index == 2 & A1 == "a" & A2 == "a" ~ 1,
TRUE ~ 0
)) %>%
mutate(b=case_when(
Index == 1 & A1 == "a" & A2 == "b" ~ 1,
Index == 2 & A1 == "b" & A2 == "b" ~ 1,
TRUE ~ 0
))
reshaped <- result %>%
mutate(new_name=paste(In, Index, sep="_")) %>%
select(new_name, a, b) %>%
t
final <- as.matrix(reshaped[2:3,])
colnames(final) <- reshaped[1,]
rownames(final) <- c("a", "b")
final
ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
a "1" "0" "0" "1" "0" "0"
b "1" "0" "0" "0" "0" "1"
编辑:一个更通用的解决方案,避免每个等位基因case_when
。使用真实数据样本(我认为(:
library(dplyr)
library(tidyr)
ID <- c("FAM_308", "FAM_2235", "882_cas326", "851_cas295", "678_cas122")
allele1 <- c("26:01", "03:01", "01:01", "02:01", "02:01")
allele2 <- c("29:02", "03:01", "02:01", "03:01", "02:01")
DD <- data.frame(ID, allele1, allele2, stringsAsFactors = FALSE) %>% arrange(ID, allele1, allele2)
DD_long <- gather(DD, Allele, Value, -ID)
all_rows <- unique(DD_long$Value)
all_cols <- unique(DD_long$ID)
mm <- matrix(
0,
nrow = length(all_rows),
ncol = length(all_cols) * 2 ,
dimnames = list(all_rows, c(
paste(all_cols, 1, sep = "_"), paste(all_cols, 2, sep = "_")
))
)
# function to fill rows,
# but don't keep track of whether alleles match
fill_row <- function(row, mat) {
x <- filter(DD_long, Value == row) %>%
mutate(z=paste(ID, gsub("allele", "", Allele), sep="_")) %>%
select(z) %>% unlist %>% unname
cat("found allele ", row, "in individual ", x, "nn")
mat[row, x] <- 1
mat
}
for (i in seq_along(all_rows)) {
mm <- fill_row(all_rows[i], mm)
}
# reorganize the 1s and 0s dependent on whether alleles match
reorganize_row <- function(row, col, mat) {
if (sum(mat[row,grep(col, colnames(mm))]) == 1) {
mat[row, grep(col, x = colnames(mat))[1]] <- 1
mat[row, grep(col, x = colnames(mat))[2]] <- 0
}
if (sum(mat[row,grep(col, colnames(mm))]) == 2) {
mat[row, grep(col, x = colnames(mat))[1]] <- 0
mat[row, grep(col, x = colnames(mat))[2]] <- 1
}
mat
}
# nested loop, sorry
for (i in seq_along(all_rows)) {
for (j in seq_along(all_cols)) {
mm <- reorganize_row(all_rows[i], col = all_cols[j], mat = mm)
}
}
# sort the matrix to be as in example
nn <- mm[c("01:01", "02:01", "03:01", "26:01", "29:02"),
c(
"FAM_308_1",
"FAM_308_2",
"FAM_2235_1",
"FAM_2235_2",
"882_cas326_1",
"882_cas326_2",
"851_cas295_1",
"851_cas295_2",
"678_cas122_1",
"678_cas122_2"
)]
colnames(nn) <- gsub("_1|_2", "", x = colnames(nn))
nn
FAM_308 FAM_308 FAM235 FAM235 882_cas326 882_cas326 851_cas295 851_cas295 678_cas122 678_cas122
01:01 0 0 0 0 1 0 0 0 0 0
02:01 0 0 0 0 1 0 1 0 0 1
03:01 0 0 0 1 0 0 1 0 0 0
26:01 1 0 0 0 0 0 0 0 0 0
29:02 1 0 0 0 0 0 0 0 0 0
使用基数 R,我们可以获得观察的所有unique
值。对于每行中的每个观察结果,我们根据条件返回输出。将所有结果绑定在一起,并分配列名和行名。首先在共享input
数据上执行此操作
unique_vals <- unique(unlist(input))
cols <- c(t(outer(rownames(input), c("_1", "_2"), paste0)))
output <- do.call(rbind.data.frame, lapply(unique_vals, function(x)
c(apply(input, 1, function(y)
if (all(y == x)) c(0, 1) else if (any(y == x)) c(1, 0) else c(0, 0)))))
names(output) <- cols
rownames(output) <- unique_vals
output
# ind1_1 ind1_2 ind2_1 ind2_2 ind3_1 ind3_2
#a 1 0 0 1 0 0
#b 1 0 0 0 0 1
现在将其应用于原始数据帧 (df
(
vals <- c("allele1", "allele2")
unique_vals <- sort(unique(unlist(df[vals])))
cols <- c(t(outer(df$ID, c("_1", "_2"), paste0)))
output <- do.call(rbind.data.frame, lapply(unique_vals, function(x)
c(apply(df[vals], 1, function(y)
if (all(y == x)) c(0, 1) else if (any(y == x)) c(1, 0) else c(0, 0)))))
names(output) <- cols
output
# FAM_308_1 FAM_308_2 FAM_2235_1 FAM_2235_2 882_cas326_1 882_cas326_2
#01:01 0 0 0 0 1 0
#02:01 0 0 0 0 1 0
#03:01 0 0 0 1 0 0
#26:01 1 0 0 0 0 0
#29:02 1 0 0 0 0 0
# 851_cas295_1 851_cas295_2 678_cas122_1 678_cas122_2
#01:01 0 0 0 0
#02:01 1 0 0 1
#03:01 1 0 0 0
#26:01 0 0 0 0
#29:02 0 0 0 0
具有相同名称的列不是一个好的做法,因此在列名称中添加"_1"
和"_2"
。
df
在哪里
df <- structure(list(ID = c("FAM_308", "FAM_2235", "882_cas326", "851_cas295",
"678_cas122"), locus = c("HLAA", "HLAA", "HLAA", "HLAA", "HLAA"
), allele1 = c("26:01", "03:01", "01:01", "02:01", "02:01"),
allele2 = c("29:02", "03:01", "02:01", "03:01", "02:01"),
prob = c(0.9805655, 0.9917792, 0.8891524, 0.9468442, 0.9643058
), matching = c(0.0006153191, 0.0043972647, 0.0001758429,
0.0002267387, 0.0004104801)), class = "data.frame", row.names = c("397",
"677", "274", "246", "95"))