我想为DNA序列列表生成一个热编码矩阵。我试图从以下链接解决我的问题 如何为 DNA 序列生成一个热编码?但是有些解决方案只给出了一个DNA序列,而不是DNA序列列表。
例如
def one_hot_encode(seq):
mapping = dict(zip("ACGT", range(4)))
seq2 = [mapping[i] for i in seq]
return np.eye(4)[seq2]
one_hot_encode("AACGT")
在上面给定的代码中,如果我运行one_hot_encode("AACGT","GGTAC","CGTAC")
它将失败,我也想生成矩阵作为输出。
目前,我在R中工作,下面是我在r数据帧中的DNA序列(单列文件)
ACTTTA
TTGATG
CTTACG
GTACGT
预期产出
1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0
0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0
0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1
是否可以在 R 中执行此操作?
下面是使用基本 R 的解决方案,该解决方案生成解决方案的转置。也就是说,它为每个单独的字符创建一个单热列向量并连接这些列(即始终有四行,无论字符串的数量如何)。
(有关生成与问题相同格式的解决方案,请参阅底部。
sequences = c('ACTTTA', 'TTGATG', 'GATTACA')
strsplit(sequences, '') |>
lapply(match, table = c('A', 'C', 'G', 'T')) |>
lapply((x) {
m = diag(0L, nrow = 4L, ncol = length(x))
m[cbind(x, seq_along(x))] = 1L
m
}) |>
do.call('cbind', args = _)
或者,或者(更短,但不一定更易读):
strsplit(sequences, '') |>
lapply(match, table = c('A', 'C', 'G', 'T')) |>
unlist() %>%
{diag(1L, 4L)[, .]}
不幸的是,这需要"magrittr"管道,但我们可以通过将最后一行的表达式抽象为函数来更改它(并使其更具可读性):
num_to_one_hot = function (x, bits) {
diag(1L, bits)[, x]
}
这是一个很好的通用函数,用于对数字向量进行独热编码。此外,我们可以在第一步之后unlist
,这使我们能够 避免lapply
.这样,我们的DNA编码代码就变成了:
strsplit(sequences, '') |>
unlist() |>
match(c('A', 'C', 'G', 'T')) |>
num_to_one_hot(bits = 4L)
。我发现这是迄今为止所有替代方案中最不言自明(也是最易读的!)。它也是完全矢量化的,不使用lapply
或类似功能,因此效率也更高。
为了完整起见,这里有一个解决方案,通过将先前算法生成的矩阵从 col-major 方向转换为行主要方向,生成与问题中请求的输出相同的输出:
strsplit(sequences, '') |>
unlist() |>
match(c('A', 'C', 'G', 'T')) |>
num_to_one_hot(4L) |>
matrix(nrow = length(sequences), byrow = TRUE)
我对所有当前发布的解决方案进行了基准测试,使用num_to_one_hot
的解决方案(下面的Konrad2
和Konrad3
)是最快的:
> bench::mark(Maël(), Paul(), Konrad1(), Konrad2(), Konrad3(), GKi1(), GKi2(), Thomas(), check = FALSE)
# # A tibble: 8 × 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 Maël() 776µs 818.25µs 1154. 1.36KB 13.3 522 6 452.2ms <NULL> <Rprofmem> <bench_tm> <tibble>
# 2 Paul() 1.07ms 1.17ms 743. 4.38KB 2.04 365 1 491.2ms <NULL> <Rprofmem> <bench_tm> <tibble>
# 3 Konrad1() 21.7µs 23.7µs 39401. 432B 15.8 9996 4 253.7ms <NULL> <Rprofmem> <bench_tm> <tibble>
# 4 Konrad2() 4.8µs 5.7µs 148619. 1.69KB 14.9 9999 1 67.3ms <NULL> <Rprofmem> <bench_tm> <tibble>
# 5 Konrad3() 6.6µs 7.9µs 108540. 2.11KB 10.9 9999 1 92.1ms <NULL> <Rprofmem> <bench_tm> <tibble>
# 6 GKi1() 9.2µs 10.8µs 83596. 960B 16.7 9998 2 119.6ms <NULL> <Rprofmem> <bench_tm> <tibble>
# 7 GKi2() 258.3µs 278µs 3442. 960B 0 1721 0 500ms <NULL> <Rprofmem> <bench_tm> <tibble>
# 8 Thomas() 10µs 11.6µs 77604. 2.39KB 7.76 9999 1 128.8ms <NULL> <Rprofmem> <bench_tm> <tibble>
在基本 R 中,您可以执行以下操作:
dna <- c("A", "C", "G", "T")
dna_seq <- c("ACTTTA", "TTGATG")
one_hot_encode <- function(x){
spl <- strsplit(x, "")[[1]]
fa <- factor(spl, levels = dna)
sapply(fa, table) |>
Reduce(f = c, x = _)
}
data.frame(do.call(rbind, lapply(dna_seq, one_hot_encode)))
输出
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24
1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
2 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0
在 base R 中,我们可以使用match
和utf8ToInt
来查找映射位置,并matrix
来格式化输出,例如
dna <- c("ACTTTA", "TTGATG", "CTTACG", "GTACGT")
matrix(t(diag(4)[match(unlist(lapply(dna, utf8ToInt)), utf8ToInt("ACGT")), ]), nrow = length(dna), byrow = TRUE)
这样我们将获得
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
[1,] 1 0 0 0 0 1 0 0 0 0 0 1 0 0
[2,] 0 0 0 1 0 0 0 1 0 0 1 0 1 0
[3,] 0 1 0 0 0 0 0 1 0 0 0 1 1 0
[4,] 0 0 1 0 0 0 0 1 1 0 0 0 0 1
[,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
[1,] 0 1 0 0 0 1 1 0 0 0
[2,] 0 0 0 0 0 1 0 0 1 0
[3,] 0 0 0 1 0 0 0 0 1 0
[4,] 0 0 0 0 1 0 0 0 0 1
library(stringr)
dataIn <- c(
"ACTTTA",
"TTGATG",
"CTTACG",
"GTACGT"
)
one_hot_encode <- function(baseSeq) {
outSeq <- stringr::str_replace_all(baseSeq, c("A" = "1000",
"C" = "0100",
"G" = "0010",
"T" = "0001"))
outSeq <- str_extract_all(outSeq, boundary("character"))
unlist(outSeq)
}
data.frame(do.call(rbind,lapply(dataIn, one_hot_encode)))
给
> data.frame(do.call(rbind,lapply(dataIn, one_hot_encode)))
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24
1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
2 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0
3 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0
4 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1
一些行和列名称可能会整理输出,但我认为这本质上是您所追求的?
另一种基本方式。
s <- c("ACTTTA", "TTGATG", "CTTACG", "GTACGT")
dna <- c("A", "C", "G", "T")
lup <- setNames(asplit(diag(length(dna)), 1), dna)
lapply(strsplit(s, "", TRUE), (x) unlist(lup[x], FALSE, FALSE))
#[[1]]
# [1] 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
#
#[[2]]
# [1] 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0
#
#[[3]]
# [1] 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0
#
#[[4]]
# [1] 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1
或使用gsub
. <- gsub("A", "1000", s)
. <- gsub("C", "0100", .)
. <- gsub("G", "0010", .)
. <- gsub("T", "0001", .)
cbind(s, .)
# s .
#[1,] "ACTTTA" "100001000001000100011000"
#[2,] "TTGATG" "000100010010100000010010"
#[3,] "CTTACG" "010000010001100001000010"
#[4,] "GTACGT" "001000011000010000100001"
lapply(strsplit(., "", TRUE), as.integer)
#[[1]]
# [1] 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0
#
#[[2]]
# [1] 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0
#
#[[3]]
# [1] 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0
#
#[[4]]
# [1] 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1
或
lapply(strsplit(s, "", TRUE), (x) c(diag(length(dna))[,match(x, dna)]))
或
matrix(diag(4)[,unlist(lapply(strsplit(s, "", TRUE), match, dna), FALSE, FALSE)], length(s), byrow = TRUE)
或
. <- chartr("ACGT", "1-4", s)
. <- strsplit(., "", TRUE)
matrix(diag(4)[, as.integer(unlist(.))], length(s), byrow = TRUE)
或
. <- paste(s, collapse="")
. <- chartr("ACGT", "1-4", .)
matrix(diag(1L, 4L)[, utf8ToInt(.) - utf8ToInt("0")], length(s), byrow = TRUE)
或
matrix(diag(1L, 4L)[,rep(1:4, utf8ToInt("ACGT") - 64)[utf8ToInt(paste(s, collapse="")) - 64]], length(s), byrow = TRUE)
<小时 />基准
library(magrittr) #For Konrad
s <- c("ACTTTA", "TTGATG", "CTTACG", "GTACGT")
dna <- c("A", "C", "G", "T")
bench::mark(check=FALSE,
GKi1 = {lup <- setNames(asplit(diag(length(dna)), 1), dna)
lapply(strsplit(s, "", TRUE), (x) unlist(lup[x], FALSE, FALSE))},
GKi2 = {. <- gsub("A", "1000", s, fixed=TRUE)
. <- gsub("C", "0100", ., fixed=TRUE)
. <- gsub("G", "0010", ., fixed=TRUE)
gsub("T", "0001", ., fixed=TRUE)},
GKi3 = lapply(strsplit(s, "", TRUE), (x) c(diag(length(dna))[,match(x, dna)])),
GKi4 = matrix(diag(4)[,unlist(lapply(strsplit(s, "", TRUE), match, dna), FALSE, FALSE)], length(s), byrow = TRUE),
GKi5 = matrix(diag(4)[, as.integer(unlist(strsplit(chartr("ACGT", "1-4", s), "", TRUE)))], length(s), byrow = TRUE),
GKi6 = matrix(diag(1L, 4L)[, utf8ToInt(chartr("ACGT", "1-4", paste(s, collapse=""))) - utf8ToInt("0")], length(s), byrow = TRUE),
GKi7 = matrix(diag(1L, 4L)[,rep(1:4, utf8ToInt("ACGT") - 64)[utf8ToInt(paste(s, collapse="")) - 64]], length(s), byrow = TRUE),
Konrad = {
strsplit(s, '') |>
lapply(match, table = c('A', 'C', 'G', 'T')) |>
unlist() %>%
{replace(diag(0L, 4L, length(.)), cbind(., seq_along(.)), 1L)}
},
Thomas = matrix(t(diag(4)[match(unlist(lapply(s, utf8ToInt)), utf8ToInt("ACGT")), ]), nrow = length(dna), byrow = TRUE)
)
结果
expression min median itr/s…¹ mem_al…² gc/se…³ n_itr n_gc total…⁴ result
<bch:expr> <bch:t> <bch:t> <dbl> <bch:by> <dbl> <int> <dbl> <bch:t> <list>
1 GKi1 23µs 26.31µs 33203. 3.96MB 69.9 9979 21 300.5ms <NULL>
2 GKi2 6.97µs 8.27µs 109392. 0B 98.5 9991 9 91.3ms <NULL>
3 GKi3 12.39µs 15.41µs 51404. 169.84KB 87.5 9983 17 194.2ms <NULL>
4 GKi4 8.21µs 10.67µs 80760. 1.59KB 121. 9985 15 123.6ms <NULL>
5 GKi5 7.03µs 8.43µs 108751. 6.2KB 76.2 9993 7 91.9ms <NULL>
6 GKi6 6.2µs 7.62µs 118818. 864B 95.1 9992 8 84.1ms <NULL>
7 GKi7 5.64µs 7.46µs 113596. 1.08KB 90.9 9992 8 88ms <NULL>
8 Konrad 11.18µs 13.91µs 60428. 8.92KB 103. 9983 17 165.2ms <NULL>
9 Thomas 8.4µs 10.75µs 77019. 6.34KB 69.4 9991 9 129.7ms <NULL>