r - 列出序列(DNA)的所有突变



我有一个DNA序列,我想在DNA序列读取列表中找到它的所有实例或任何可能的突变。我正在使用 grepl 来执行此操作,因为在我使用它的实例中它比 matchPattern 更快。我使用 parLapply 将我的突变载体提供给 grepl 函数。但我感兴趣的是制作一种简单的方法来自动生成我的序列突变载体。最初我输入了每个突变,但这为人为错误留下了空间,如果序列被延长,则需要键入更多的突变。此外,我当前的代码只允许 1 个突变,并且某些序列应该比其他序列允许更多的突变。我不是在寻找为我编写循环的人,而只是给我一个解释任何字符串的建议。

现在,我有一种半自动的方式来产生突变。它现在可以生成载体,而无需我将它们全部输入出来,但仅适用于 8 个核苷酸长序列。必须有更好的方法来生成任何长度的任何核苷酸序列的载体。

这是我的代码:

#My sequence of interest
seq1 <- "GGCGACTG"
lenseq1 <- nchar(seq1)
#A vector of the length of the sequence I wish to create all mutations of
mutsinseq1 <- rep(seq1, 5*lenseq1+4*(lenseq1-1)+1)
#The possible substitutions, insertions, and deletions to the sequence of interest
possnuc <- c("A","T","C","G","")
lenpossnuc <- length(possnuc)
#changing all elements of the vector except for the first
#the first 8 if statements are nucleotide substitutions or deletions
#the other if statements allow for inserts between nucleotides
for(i in 2:length(mutsinseq1)){
if(i<7){
mutsinseq1[i] <- paste(possnuc[i-1],substr(seq1,2,lenseq1),sep = "") 
} else if(i<12){
mutsinseq1[i] <- paste(substr(seq1,1,1),possnuc[i-6],substr(seq1,3,lenseq1),sep = "")
} else if(i<17){
mutsinseq1[i] <- paste(substr(seq1,1,2),possnuc[i-11],substr(seq1,4,lenseq1),sep = "")
} else if(i<22){
mutsinseq1[i] <- paste(substr(seq1,1,3),possnuc[i-16],substr(seq1,5,lenseq1),sep = "")
} else if(i<27){
mutsinseq1[i] <- paste(substr(seq1,1,4),possnuc[i-21],substr(seq1,6,lenseq1),sep = "")
} else if(i<32){
mutsinseq1[i] <- paste(substr(seq1,1,5),possnuc[i-26],substr(seq1,7,lenseq1),sep = "")
} else if(i<37){
mutsinseq1[i] <- paste(substr(seq1,1,6),possnuc[i-31],substr(seq1,8,lenseq1),sep = "")
} else if(i<42){
mutsinseq1[i] <- paste(substr(seq1,1,7),possnuc[i-36],sep = "")
} else if(i<46){
mutsinseq1[i] <- paste(substr(seq1,1,1),possnuc[i-41],substr(seq1,2,lenseq1),sep = "")
} else if(i<50){
mutsinseq1[i] <- paste(substr(seq1,1,2),possnuc[i-45],substr(seq1,3,lenseq1),sep = "")
} else if(i<54){
mutsinseq1[i] <- paste(substr(seq1,1,3),possnuc[i-49],substr(seq1,4,lenseq1),sep = "")
} else if(i<58){
mutsinseq1[i] <- paste(substr(seq1,1,4),possnuc[i-53],substr(seq1,5,lenseq1),sep = "")
} else if(i<62){
mutsinseq1[i] <- paste(substr(seq1,1,5),possnuc[i-57],substr(seq1,6,lenseq1),sep = "")
} else if(i<66){
mutsinseq1[i] <- paste(substr(seq1,1,6),possnuc[i-61],substr(seq1,7,lenseq1),sep = "")
} else{
mutsinseq1[i] <- paste(substr(seq1,1,7),possnuc[i-65],substr(seq1,8,lenseq1),sep = "")
}
}
#getting rid of duplicate mutations
mutsinseq1 <- mutsinseq1[-which(duplicated(mutsinseq1))]

以下是我希望生成的(由我当前的代码生成):

mutsinseq1
[1] "GGCGACTG"  "AGCGACTG"  "TGCGACTG"  "CGCGACTG"  "GCGACTG"   "GACGACTG"  "GTCGACTG"  "GCCGACTG"  "GGAGACTG"  "GGTGACTG"  "GGGGACTG"  "GGGACTG"   "GGCAACTG" 
[14] "GGCTACTG"  "GGCCACTG"  "GGCACTG"   "GGCGTCTG"  "GGCGCCTG"  "GGCGGCTG"  "GGCGCTG"   "GGCGAATG"  "GGCGATTG"  "GGCGAGTG"  "GGCGATG"   "GGCGACAG"  "GGCGACCG" 
[27] "GGCGACGG"  "GGCGACG"   "GGCGACTA"  "GGCGACTT"  "GGCGACTC"  "GGCGACT"   "GAGCGACTG" "GTGCGACTG" "GCGCGACTG" "GGGCGACTG" "GGACGACTG" "GGTCGACTG" "GGCCGACTG"
[40] "GGCAGACTG" "GGCTGACTG" "GGCGGACTG" "GGCGAACTG" "GGCGTACTG" "GGCGCACTG" "GGCGATCTG" "GGCGACCTG" "GGCGAGCTG" "GGCGACATG" "GGCGACTTG" "GGCGACGTG" "GGCGACTAG"
[53] "GGCGACTCG" "GGCGACTGG"

如何解决问题?

在其他语言中,你可以使用一系列嵌套循环来做到这一点,但在 R 中,有一些不错的组合函数。以下是执行所需操作的整体功能:

library(stringr)
library(purrr)
library(dplyr)
mutate_sequence <- function(string, num = 1, nucleotides = c("A","T","C","G","_")) {
l_str <- str_length(string)
choices <- cross(list(
cols = combn(seq_len(l_str), num, simplify = F),
muts = cross(rerun(num, nucleotides)) %>% map(unlist)
))
choice_matrix <- 
map_dfr(choices, as_tibble, .id = "rows") %>% 
mutate(rows = as.numeric(rows))
seq_matrix <- str_split(rep(string, max(choice_matrix$rows)), "", simplify = T)
seq_matrix[as.matrix(choice_matrix[,1:2])] <- str_to_lower(choice_matrix$muts)
apply(seq_matrix, 1, paste, collapse = "")
}

我使用了一些包来使事情变得更容易一些,但它都可以转换为基本R。

下面是示例输出:

mutate_sequence("ATCG", num = 2)
[1] "aaCG" "aTaG" "aTCa" "AaaG" "AaCa" "ATaa" "taCG" "tTaG" "tTCa" "AtaG" "AtCa" "ATta" "caCG" "cTaG"
[15] "cTCa" "AcaG" "AcCa" "ATca" "gaCG" "gTaG" "gTCa" "AgaG" "AgCa" "ATga" "_aCG" "_TaG" "_TCa" "A_aG"
[29] "A_Ca" "AT_a" "atCG" "aTtG" "aTCt" "AatG" "AaCt" "ATat" "ttCG" "tTtG" "tTCt" "AttG" "AtCt" "ATtt"
[43] "ctCG" "cTtG" "cTCt" "ActG" "AcCt" "ATct" "gtCG" "gTtG" "gTCt" "AgtG" "AgCt" "ATgt" "_tCG" "_TtG"
[57] "_TCt" "A_tG" "A_Ct" "AT_t" "acCG" "aTcG" "aTCc" "AacG" "AaCc" "ATac" "tcCG" "tTcG" "tTCc" "AtcG"
[71] "AtCc" "ATtc" "ccCG" "cTcG" "cTCc" "AccG" "AcCc" "ATcc" "gcCG" "gTcG" "gTCc" "AgcG" "AgCc" "ATgc"
[85] "_cCG" "_TcG" "_TCc" "A_cG" "A_Cc" "AT_c" "agCG" "aTgG" "aTCg" "AagG" "AaCg" "ATag" "tgCG" "tTgG"
[99] "tTCg" "AtgG" "AtCg" "ATtg" "cgCG" "cTgG" "cTCg" "AcgG" "AcCg" "ATcg" "ggCG" "gTgG" "gTCg" "AggG"
[113] "AgCg" "ATgg" "_gCG" "_TgG" "_TCg" "A_gG" "A_Cg" "AT_g" "a_CG" "aT_G" "aTC_" "Aa_G" "AaC_" "ATa_"
[127] "t_CG" "tT_G" "tTC_" "At_G" "AtC_" "ATt_" "c_CG" "cT_G" "cTC_" "Ac_G" "AcC_" "ATc_" "g_CG" "gT_G"
[141] "gTC_" "Ag_G" "AgC_" "ATg_" "__CG" "_T_G" "_TC_" "A__G" "A_C_" "AT__"

我将突变设置为小写或"_"以使其位置一目了然,但您可以轻松地更改它以使它们恢复为"正常"序列。

所以每行都做一些事情:

l_str <- str_length(string)

获取字符串中的字符数。

combn(seq_len(l_str), num, simplify = F)

1)这是序列(索引)上所有可能的位置组合,一次num突变的数量。

rerun(num, nucleotides)

2)这会将您的核苷酸载体重复num次,并使其成为一个列表。 然后cross(rerun(num, nucleotides))把该列表中的每个组合都作为一个列表,所以你把核苷酸的每一个可能的组合都重复。cross(rerun(num, nucleotides)) %>% map(unlist)将列表的最深层折叠为向量。

因此,最后两个块为您提供了所有可能的位置选择,然后是每种可能的替换组合。然后我们需要它们成对的所有可能组合!

choices <- cross(list(
cols = combn(seq_len(l_str), num, simplify = F),
muts = cross(rerun(num, nucleotides)) %>% map(unlist)
))

对于上述输出,这意味着:

[[1]]
[[1]]$`cols`
[1] 1 2
[[1]]$muts
[1] "A" "A"

[[2]]
[[2]]$`cols`
[1] 1 2
[[2]]$muts
[1] "T" "A"
...

所以首先对于位置1/2,它给了我们A/AT/AG/AC/A_/A等。然后每个组合再次用于位置 1/3,然后是位置 1/4,然后是 2/3,然后是2/4,然后是3/4

所以现在你有这个长长的清单,让我们把它变成更好的东西。首先,我们将每个元素制作成一个带有colsmuts的数据帧,然后将它们全部绑定到一个数据帧中,每个元素都有一个标识符,称为rows

map_dfr(choices, as_tibble, .id = "rows")
# A tibble: 50 x 3
rows   cols muts 
<chr> <int> <chr>
1 1         1 A    
2 1         2 A    
3 2         1 T    
4 2         2 A    
5 3         1 C    
6 3         2 A    
7 4         1 G    
8 4         2 A    
9 5         1 _    
10 5         2 A
# ... with 40 more rows

这给了我们一个很长的数据帧。rows中的每一个都是一个输出字符串,cols告诉我们字符串中的哪个位置将被替换。muts是将进入这些位置的角色。为了稍后执行子集化,我们将使用mutate(...)rows转换为数字。

seq_matrix <- str_split(rep(string, max(choice_matrix$rows)), "", simplify = T)

现在,我们获取您的原始字符串并重复多次,因为choice_matrix告诉我们我们将有突变的序列。然后我们获取该向量,并沿字符边界拆分每个向量:

[,1] [,2] [,3] [,4]
[1,] "A"  "T"  "C"  "G" 
[2,] "A"  "T"  "C"  "G" 
[3,] "A"  "T"  "C"  "G" 
[4,] "A"  "T"  "C"  "G" 
[5,] "A"  "T"  "C"  "G" 
[6,] "A"  "T"  "C"  "G" 
...

现在我们有一个大矩阵,R在这些大矩阵上运行速度很快。我们本可以使用矩阵运算完成所有其他步骤,但这似乎比使用此列表组合函数要多得多。

seq_matrix[as.matrix(choice_matrix[,1:2])] <- str_to_lower(choice_matrix$muts)

这根据choice_matrix中的rowscols来识别每个位置。然后,它将muts中的适当值放入其中。这也是您可以取出str_to_lower以防止它们小写的地方。您将更改默认参数nucleotides以使"_"变为""

[,1] [,2] [,3] [,4]
[1,] "a"  "a"  "C"  "G" 
[2,] "a"  "T"  "a"  "G" 
[3,] "a"  "T"  "C"  "a" 
[4,] "A"  "a"  "a"  "G" 
[5,] "A"  "a"  "C"  "a" 
[6,] "A"  "T"  "a"  "a" 
...

因此,第 1 行在位置 1 和 2 中获得了"A"和"A"。然后第 2 行在位置 1 和 3 中得到"A"和"A",依此类推。现在我们只需要在每一行上apply(这就是apply(..., 1, ...)中的1所做的)一个函数,将每一行合并为一个字符串。那将是paste(..., collapse = "").

这将快速为您带来巨大的产出。如果你在原来的8个核苷酸序列上进行3个突变,你会得到7000的输出。4个突变为43750。而且每次都变慢了,在我的桌面上运行 4 个突变大约需要 5 秒。您可以预先计算输出长度,即choose(l_str, num) * length(nucleotides)^num.


再次更新:

为了处理插入和删除,我们只需要字符矩阵为每个可能的插入都有一个插槽。这是那个版本:

mutate_sequence <- function(string, num = 1, nucleotides = c("A","T","C","G","")) {
if (num < 1) {return(string)}
l_str <- str_length(string)
l_pos <- (num + 1)*(l_str - 1) + 1
choices <- cross(list(
cols = combn(seq_len(l_pos), num, simplify = F),
muts = cross(rerun(num, nucleotides)) %>% map(unlist)
))
choice_matrix <- 
map_dfr(choices, as_data_frame, .id = "rows") %>% 
mutate(rows = as.numeric(rows))
blanks <- character(l_pos)
orig_pos <- (seq_len(l_str) - 1) * (num+1) + 1
blanks[orig_pos] <- str_split(string, "", simplify = T)
seq_matrix <- matrix(
rep(blanks, max(choice_matrix$rows)), 
ncol = l_pos, byrow = T
)
seq_matrix[as.matrix(choice_matrix[,1:2])] <- str_to_lower(choice_matrix$muts)
sequences <- apply(seq_matrix, 1, paste, collapse = "")
sequences[!duplicated(str_to_upper(sequences))]
}

这与上述函数的版本基本相同,但首先创建一个空白向量,每次插入都有足够的点。对于每个原始核苷酸,您需要在其之后插入一个额外的点,除了最后一个。这适用于l_pos <- (num + 1)*(l_str - 1) + 1职位。character(l_pos)给你空白,然后你用原始核苷酸填充空白(seq_len(l_str) - 1) * (num+1) + 1.

例如,具有两个突变的ATCG变得"A" "" "" "T" "" "" "C" "" "" "G"。函数的其余部分工作相同,只是将每个可能的核苷酸(或缺失)放在每个可能的位置。

然后,paste将它们全部重新组合在一起之前的输出如下所示:

[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "a"  "a"  ""   "T"  ""   ""   "C"  ""   ""   "G"  
[2,] "a"  ""   "a"  "T"  ""   ""   "C"  ""   ""   "G"  
[3,] "a"  ""   ""   "a"  ""   ""   "C"  ""   ""   "G"  
[4,] "a"  ""   ""   "T"  "a"  ""   "C"  ""   ""   "G"  
[5,] "a"  ""   ""   "T"  ""   "a"  "C"  ""   ""   "G" 
...  

然后在paste每一行之后,我们可以检查duplicated重复并排除这些重复。我们也可以去掉小写突变,改用unique(sequences)。现在输出比以前短得多,前 55 个中的 278 个:

[1] "aaTCG"  "aaCG"   "aTaCG"  "aTaG"   "aTCaG"  "aTCa"   "AaaTCG" "AaaCG"  "AaTaCG" "AaTaG"  "AaTCaG"
[12] "AaTCa"  "AaaG"   "AaCaG"  "AaCa"   "ATaaCG" "ATaaG"  "ATaCaG" "ATaCa"  "ATaa"   "ATCaaG" "ATCaa" 
[23] "taTCG"  "taCG"   "tTaCG"  "tTaG"   "tTCaG"  "tTCa"   "AtaTCG" "AtTaCG" "AtTaG"  "AtTCaG" "AtTCa" 
[34] "ATta"   "ATCtaG" "ATCta"  "caTCG"  "caCG"   "cTaCG"  "cTaG"   "cTCaG"  "cTCa"   "AcaTCG" "AcaCG" 
[45] "AcTaCG" "AcTaG"  "AcTCaG" "AcTCa"  "AcaG"   "AcCaG"  "AcCa"   "ATcaCG" "ATcCaG" "ATcCa"  "gaTCG"
...

已编辑 第三次完全修订以更好地解决这个问题!顺便说一下,这里的关键解决方案(以三个帮助程序函数的形式)不需要Biostrings包。

据我了解,一个简短的DNA查询序列要与大量的参考DNA序列相匹配。这里的转折点是,在参考DNA序列中搜索DNA查询序列上插入或删除形式的任意数量的变异。

Biostrings包中vmatchPattern()的函数可以识别给定模式的匹配项,这些匹配项在一组参考序列中具有任意数量的不匹配。此外,vmatchPattern()可以识别给定模式的匹配项,并可能插入或删除(插入缺失)。然而,与matchPattern()不同的是,vmatchPattern()不能同时做这两件事。

这里寻求的解决方案是生成DNA查询序列的生成变体,然后将其传递给搜索函数,例如grepl()或此处建议的vmatchPattern()

此处发布的修订后的解决方案包括三个功能。makeDel()将生成具有任意删除次数的短序列的所有可能变体。配套函数makeIns()将生成短序列的变体,插入在symbol中指定为 IUPAC 符号。makeSub()将使用IUPAC代码在symbol中指定的基础进行所需的替换。这种方法生成其他碱基的所有可能组合,允许字符串直接用于模式匹配函数,包括vmatchPaterrn

如果要使用它,这将确保包Biostrings可用。此代码适用于 3.60 及更高版本的 R。

if (!require("Biostrings")) {
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Biostrings")
}
library(Biostrings)

现在一些测试数据。将使用原始查询序列"GGCGACTG"作为"查询"和200到400个核苷酸之间的1000个随机序列将用作参考集。

seq1 <- "GGCGACTG"
set.seed(1234)
ref <- replicate(1e3,
sample(c("A", "C", "G", "T"), sample(200:400, 1), replace = TRUE),
simplify = FALSE)
ref <- sapply(ref, paste, collapse = "")
ref <- DNAStringSet(ref)

在继续解决方案之前,让我们先看看仅使用模式可以找到什么。

# how times does the pattern occur? 
table(vcountPattern(seq1, ref)) # just 3 times
>   0   1 
> 997   3 
# how many times allowing for one mismatch?
# once in 96 sequences and twice in three sequences
n <- vcountPattern(seq1, ref, max.mismatch = 1)
table(n)
> n
>   0   1   2 
> 901  96   3 
# examine the matched sequences
m <- vmatchPattern(seq1, ref, max.mismatch = 1) # find the patterns
sel <- which.max(n) # select the first one with 2 matches
Views(ref[[sel]], m[[sel]]) # examine the matches
>   Views on a 384-letter DNAString subject
> subject: TCGCGTCGCACTTCTGCTAACACAGC...GCCCAGTCGACTGCTGCTCGGATTGC
> views:
>     start end width
> [1]   104 111     8 [GGCGACCG]
> [2]   364 371     8 [GTCGACTG] 

下面是用于生成变体的三个帮助程序函数。参数seq可以是字符串,如"GGGCGACTG"或DNAString对象。参数n是一个整数,它指定删除、插入或替换的上限。这些函数将创建具有 0、1、...、n 个删除、插入或替换的变体。如果n设置为 0,则该函数将返回原始序列。makeIns()makeSub()的参数symbol应该是单个IUPAC字符,以指定将插入或替换哪些碱基。默认值"N"指定所有可能的基数("A"、"C"、"G"和"T")。

makeDel()使用combn()来标识删除位置。makeIns()makeSub()的逻辑稍微复杂一些,因为需要允许插入彼此相邻,并且需要创建所有组合。在这里,我选择不在查询序列的开头或结尾添加插入。

所有函数都返回适合在vmatchPattern()grep()中使用的字符向量。

要在 DNA 字符串中创建缺失:

##
## makeDel - create 1:n deletions in a character string (DNA sequence)
##  return a character vector of all possible variants
##
makeDel <- function(seq, n) {
# accept only a single value for 'seq'
cseq <- as.character(seq)
cseq <- unlist(strsplit(cseq[1], ""))
nseq <- length(cseq)
# simple argument checks
if (!is(n, "numeric")) stop("'n' must be an integer")
if (n == 0) return(paste(cseq, collapse = ""))
if (n >= nseq) stop("too many deletions for ", nseq, " letters")
# create all possible combinations to be dropped in 'index'
index <- lapply(seq_len(n), function(j) combn(nseq, j, simplify = FALSE))
index <- unlist(index, recursive = FALSE)
# drop base in each possible position and reassemble
ans <- lapply(index, function(idx) cseq[-idx])
ans <- sapply(ans, paste, collapse = "")
ans <- unique(ans) # remove duplicates
return(ans)
}

要在 DNA 字符串中创建插入片段:

##
## makeIns - create 1:n insertions into DNA string (character vector)
##   where each insertion is one of a given IUPAC-specified symbol
##   return a character vector of all possible variants
##
makeIns <- function(seq, n, symbol = "N") {
# IUPAC codes for ambiguous bases
iupac <- c(N = "ACGT", A = "A", C = "C", G = "G", T = "T", M = "AC", R = "AG",
W = "AT", S = "CG", Y = "CT", K = "GT", V = "ACG", H = "ACT",
D = "AGT", B = "CGT")
# only accept single value for 'seq'
cseq <- as.character(seq)
cseq <- unlist(strsplit(cseq[1], ""))
nseq <- length(cseq)
# simple argument checks
if (!is(n, "numeric")) stop("'n' must be an integer")
symbol <- toupper(symbol)
if (nchar(symbol) != 1 | !symbol %in% names(iupac))
stop("'symbol' must be a single valid IUPAC symbol")
if (n == 0) return(paste(cseq, collapse = ""))
if (n >= nseq) stop("seems like too many insertions for ", nseq, " letters")
# which bases are to be inserted?
ACGT <- strsplit(iupac[symbol], "")[[1]]
# create all possible combinations of positions for the insertion 
ipos <- seq_len(nseq - 1) # insert after this position
index <- lapply(1:n, function(i) do.call(expand.grid, rep(list(ipos), i)))
index <- lapply(index, function(v) split(v, seq_len(nrow(v))))
index <- unlist(index, recursive = FALSE)
index <- lapply(index, unlist)
index <- lapply(index, sort)
# place the required number of insertions after each position in index
res <- lapply(index, function(idx) {
tally <- rle(idx)$lengths
breaks <- diff(c(0, idx, nseq))
NN <- Map(base::rep, symbol, tally)
spl <- split(cseq, rep(seq_along(breaks), breaks))
sel <- head(seq_along(spl), -1)
spl[sel] <- Map(base::c, spl[sel], NN)
ans <- unlist(spl)
if (length(ACGT) > 1) { # replicate and replace with appropriate bases
sites <- grep(symbol, ans)
nsites <- length(sites)
nsymbol <- length(ACGT)
bases <- expand.grid(rep(list(ACGT), nsites), stringsAsFactors = FALSE)
bases <- as.matrix(bases)
nvars <- nrow(bases)
ans <- do.call(rbind, rep(list(ans), nvars))
ans[, sites] <- bases
ans <- split(ans, seq_len(nvars))
ans <- lapply(ans, paste, collapse = "")
}
else
ans <- paste(ans, collapse = "")
return(ans)
})
res <- unlist(res)
res <- unique(res)
return(res)
}

要在DNA串中创建替换:

##
## makeSub - create an arbitrary number of substitutions in each 1:n positions
##   with the IUPAC bases specified by 'symbol'
##   return a character vector with all possible variants
##
makeSub <- function(seq, n, symbol = "N")
{
# IUPAC codes for ambiguous bases
iupac <- c(N = "ACGT", A = "A", C = "C", G = "G", T = "T", M = "AC", R = "AG",
W = "AT", S = "CG", Y = "CT", K = "GT", V = "ACG", H = "ACT",
D = "AGT", B = "CGT")
# accept only a single value for 'seq'
cseq <- as.character(seq)
cseq <- unlist(strsplit(cseq[1], ""))
nseq <- length(cseq)
# simple argument checks
if (!is(n, "numeric")) stop("'n' must be an integer")
symbol <- toupper(symbol)
if (nchar(symbol) != 1 | !symbol %in% names(iupac))
stop("'symbol' must be a single valid IUPAC symbol")
if (n == 0) return(paste(cseq, collapse = ""))
if (n > nseq) stop("too many substitutions for ", nseq, " bases")
# which bases are to be used for the substitution?
ACGT <- strsplit(iupac[symbol], "")[[1]]
# create all possible combinations of positions to be changed in 'index'
index <- lapply(seq_len(n), function(j) combn(nseq, j, simplify = FALSE))
index <- unlist(index, recursive = FALSE)
# for each numeric vector in index, create as many variants as
# alternative bases are needed, collect in 'ans'
ans <- lapply(index, function(idx) {
bases <- lapply(cseq[idx], function(v) setdiff(ACGT, v))
bases <- bases[sapply(bases, length) > 0] # defensive 
bases <- expand.grid(bases, stringsAsFactors = FALSE)
bases <- as.matrix(bases)
nvars <- nrow(bases)
vars <- do.call(rbind, rep(list(cseq), nvars))
vars[ ,idx] <- bases
if (!is.null(vars))
return(split(vars, seq_len(nvars)))
})
ans <- unlist(ans, recursive = FALSE)
ans <- sapply(ans, paste, collapse = "")
ans <- unique(ans) # remove duplicates
return(ans)
}

输出示例:

makeDel(seq1, 0)
> [1] "GGCGACTG"
makeDel(seq1, 1)
> [1] "GCGACTG" "GGGACTG" "GGCACTG" "GGCGCTG" "GGCGATG" "GGCGACG" "GGCGACT"
makeDel(seq1, 2)
>  [1] "GCGACTG" "GGGACTG" "GGCACTG" "GGCGCTG" "GGCGATG" "GGCGACG" "GGCGACT"
>  [8] "CGACTG"  "GGACTG"  "GCACTG"  "GCGCTG"  "GCGATG"  "GCGACG"  "GCGACT" 
> [15] "GGGCTG"  "GGGATG"  "GGGACG"  "GGGACT"  "GGCCTG"  "GGCATG"  "GGCACG" 
> [22] "GGCACT"  "GGCGTG"  "GGCGCG"  "GGCGCT"  "GGCGAG"  "GGCGAT"  "GGCGAC" 
makeIns(seq1, 1) # default form
>  [1] "GAGCGACTG" "GCGCGACTG" "GGGCGACTG" "GTGCGACTG" "GGACGACTG" "GGCCGACTG"
>  [7] "GGTCGACTG" "GGCAGACTG" "GGCGGACTG" "GGCTGACTG" "GGCGAACTG" "GGCGCACTG"
> [13] "GGCGTACTG" "GGCGACCTG" "GGCGAGCTG" "GGCGATCTG" "GGCGACATG" "GGCGACGTG"
> [19] "GGCGACTTG" "GGCGACTAG" "GGCGACTCG" "GGCGACTGG"
makeIns(seq1, 1, symbol = "Y") # inserting only "C" or "T"
>  [1] "GCGCGACTG" "GTGCGACTG" "GGCCGACTG" "GGTCGACTG" "GGCTGACTG" "GGCGCACTG"
>  [7] "GGCGTACTG" "GGCGACCTG" "GGCGATCTG" "GGCGACTTG" "GGCGACTCG"
makeSub("AAA", 1)
> [1] "CAA" "GAA" "TAA" "ACA" "AGA" "ATA" "AAC" "AAG" "AAT"
makeSub("AAA", 2)
>  [1] "CAA" "GAA" "TAA" "ACA" "AGA" "ATA" "AAC" "AAG" "AAT" "CCA" "GCA" "TCA"
> [13] "CGA" "GGA" "TGA" "CTA" "GTA" "TTA" "CAC" "GAC" "TAC" "CAG" "GAG" "TAG"
> [25] "CAT" "GAT" "TAT" "ACC" "AGC" "ATC" "ACG" "AGG" "ATG" "ACT" "AGT" "ATT"

这些函数可以与vmatchPattern()一起使用,以创建变体和提取匹配项。一种建议的方法是首先使用max.mismatch = 1找到那些不匹配的序列。接下来,使用带fixed = FALSEvmatchPattern()查找带有删除和插入的序列,默认值 0 表示max.mismatch

或者,可以将帮助程序函数生成的显式模式传递给并行运行的grep进程!下面显示了vmatchPattern的使用,但可能有原因使用不同的工具执行分析。请参阅有关此主题的评论。

# first, allow mismatches to the original pattern
# the result is a "ByPos_MIndex" object of length 1000
m1 <- vmatchPattern(seq1, ref, max.mismatch = 1) # as before...
n1 <- elementNROWS(m1) # counts the number of elements (matches)
which(n1 > 0) # which of the 1000 ref sequence had a match with 0 or 1 mismatches?
>  [1]  14  71  73  76  79  88  90 108 126 129 138 141 150 160 163 179 180 195 200
> [20] 205 212 225 227 239 241 246 247 255 276 277 280 299 310 335 338 345 347 357
> [39] 359 369 378 383 387 390 391 404 409 410 414 418 469 472 479 488 499 509 523
> [58] 531 533 567 571 574 580 588 590 591 594 601 634 636 646 654 667 679 685 694
> [77] 696 713 717 732 734 737 749 750 761 762 783 815 853 854 857 903 929 943 959
> [96] 969 981 986 998
# Second search each of the patterns with lapply
# generates seven lists of objects, each of length 10000
pat2 <- makeDel(seq1, 1)
m2 <- lapply(pat2, function(pat) vmatchPattern(pat, ref))
# generates 22 lists of objects, each of length 10000
pat3 <- makeIns(seq1, 1)
m3 <- lapply(pat3, function(pat) vmatchPattern(pat, ref))

m2m3中的第二个和第三个结果是"ByPos_MIndex"对象列表。下面的示例从m2中提取匹配项的数量,并以缩写形式显示这些匹配项,并带有str()。列表中的每个值标识与相应模式至少有一个匹配项的参考序列。

n2 <- lapply(m2, elementNROWS)
str(sapply(n2, function(n) which(n > 0)))
> List of 7
>  $ : int [1:14] 14 138 179 335 369 391 567 679 713 734 ...
>  $ : int [1:18] 138 200 240 298 310 343 510 594 598 599 ...
>  $ : int [1:15] 21 26 45 60 260 497 541 600 607 642 ...
>  $ : int [1:17] 27 54 120 121 123 132 210 242 244 257 ...
>  $ : int [1:18] 15 33 110 126 154 419 528 539 546 606 ...
>  $ : int [1:12] 24 77 79 139 525 588 601 679 770 850 ...
>  $ : int [1:15] 179 345 378 414 469 571 574 580 591 713 ...

最后一个例子用同样的机制检查了22个"ByPos_MIndex"对象(m3)的第三个列表。它表明 22 个变体中的一些不匹配,一些匹配一次,五个匹配两次。

n3 <- lapply(m3, elementNROWS) # extract all counts
names(n3) <- sprintf("pat_%02d", seq_along(n3)) # for convenience
str(lapply(n3, function(n) which(n > 0)))
> List of 22
>  $ pat_01: int 679
>  $ pat_02: int 391
>  $ pat_03: int(0) 
>  $ pat_04: int 737
>  $ pat_05: int(0) 
>  $ pat_06: int(0) 
>  $ pat_07: int 108
>  $ pat_08: int 276
>  $ pat_09: int 439
>  $ pat_10: int [1:2] 764 773
>  $ pat_11: int(0) 
>  $ pat_12: int [1:2] 22 820
>  $ pat_13: int 795
>  $ pat_14: int [1:2] 914 981
>  $ pat_15: int(0) 
>  $ pat_16: int 112
>  $ pat_17: int 884
>  $ pat_18: int(0) 
>  $ pat_19: int [1:2] 345 378
>  $ pat_20: int [1:2] 571 854
>  $ pat_21: int 574
>  $ pat_22: int(0) 

不用说,为了提取序列信息,仍然存在大量的数据争吵。这可以通过帮助页面进行编码,以便matchPattern,并在一定程度上了解help("lowlevel-matching", package = "Biostrings")中描述的模式匹配逻辑。

尽管Biostrings中的例程使用非常快速且非常节省内存的算法来处理大序列。乔似乎发现在其他情况下原始搜索更快。总有更多的东西要学!

最新更新