在Needleman-Wunsch全局对齐中进行回溯,而不存储指针



我的理解是,虽然我能找到的每一个动态编程讨论基本上都有一个存储指针的矩阵,但在回溯步骤中重新计算之前的单元格会更快。

我有我的动态编程算法来正确地构建矩阵,但我对如何进行回溯计算感到困惑。我也被告知有必要重新计算这些值(而不仅仅是查找它们),但我不知道如何得出不同的数字。

我正在实施的软件版本包括一个选项,用于打开两个序列中的间隙,因此每个矩阵的递归关系有三个选项。下面是我的全局对齐类的当前版本。根据我的手工计算,我相信score_align可以正确地生成矩阵,但显然traceback_col_seq不起作用。

INF = 2147483647 #max size of int32
class global_aligner():
    def __init__(self, subst, open=10, extend=2, double=3):
        self.extend, self.open, self.double, self.subst = extend, open, double, subst
    def __call__(self, row_seq, col_seq):
        #add alphabet error checking?
        score_align(row_seq, col_seq)
        return traceback_col_seq()
    def init_array(self):
        self.M = zeros((self.maxI, self.maxJ), int)
        self.Ic = zeros((self.maxI, self.maxJ), int)
        self.Ir = zeros((self.maxI, self.maxJ), int)
        for i in xrange(self.maxI):
            self.M[i][0], self.Ir[i][0], self.Ic[i][0] = 
                    -INF, -INF, -(self.open+self.extend*i)
        for j in xrange(self.maxJ):
            self.M[0][j], self.Ic[0][j], self.Ir[0][j] = 
                    -INF, -INF, -(self.open+self.extend*j)
        self.M[0][0] = 0
        self.Ic[0][0] = -self.open
    def score_cell(self, i, j, chars):
        thisM = [self.Ic[i-1][j-1]+self.subst[chars], self.M[i-1][j-1]+
                        self.subst[chars], self.Ir[i-1][j-1]+self.subst[chars]]
        thisC = [self.Ic[i][j-1]-self.extend, self.M[i][j-1]-self.open, 
                        self.Ir[i][j-1]-self.double]
        thisR = [self.M[i-1][j]-self.open, self.Ir[i-1][j]-self.extend, 
                        self.Ic[i-1][j]-self.double]
        return max(thisM), max(thisC), max(thisR)
    def score_align(self, row_seq, col_seq):
        self.row_seq, self.col_seq = list(row_seq), list(col_seq)
        self.maxI, self.maxJ = len(self.row_seq)+1, len(self.col_seq)+1
        self.init_array()
        for i in xrange(1, self.maxI):
            row_char = self.row_seq[i-1]
            for j in xrange(1, self.maxJ):
                chars = row_char+self.col_seq[j-1]
                self.M[i][j], self.Ic[i][j], self.Ir[i][j] = 
                        self.score_cell(i, j, chars)
    def traceback_col_seq(self):
        self.traceback = list()
        i, j = self.maxI-1, self.maxJ-1
        while i > 1 and j > 1:
            cell = [self.M[i][j], self.Ic[i][j], self.Ir[i][j]]
            cellMax = max(cell)
            chars = self.row_seq[i-1]+self.col_seq[j-1]
            if cell.index(cellMax) == 0: #M
                diag = [diagM, diagC, diagR] = self.score_cell(i-1, j-1, chars)
                diagMax = max(diag)
                if diag.index(diagMax) == 0: #match
                    self.traceback.append(self.col_seq[j-1])
                elif diag.index(diagMax) == 1: #insert column (open)
                    self.traceback.append('-')
                elif diag.index(diagMax) == 2: #insert row (open other)
                    self.traceback.append(self.col_seq[j-1].lower())
                i, j = i-1, j-1
            elif cell.index(cellMax) == 1: #Ic
                up = [upM, upC, upR] = self.score_cell(i-1, j, chars)
                upMax = max(up)
                if up.index(upMax) == 0: #match (close)
                    self.traceback.append(self.col_seq[j-1])
                elif up.index(upMax) == 1: #insert column (extend)
                    self.traceback.append('-')
                elif up.index(upMax) == 2: #insert row (double)
                    self.traceback.append('-')
                i -= 1
            elif cell.index(cellMax) == 2: #Ir
                left = [leftM, leftC, leftR] = self.score_cell(i, j-1, chars)
                leftMax = max(left)
                if left.index(leftMax) == 0: #match (close)
                    self.traceback.append(self.col_seq[j-1])
                elif left.index(leftMax) == 1: #insert column (double)
                    self.traceback.append('-')
                elif left.index(leftMax) == 2: #insert row (extend other)
                    self.traceback.append(self.col_seq[j-1].lower())
                j -= 1
        for j in xrange(0,j,-1):
            self.traceback.append(self.col_seq[j-1])
        for i in xrange(0,i, -1):
            self.traceback.append('-')
        return ''.join(self.traceback[::-1])    

test = global_aligner(blosumMatrix)
test.score_align('AA','AAA')
test.traceback_col_seq()

我认为主要问题是,在生成可能来自的单元格时,没有考虑到当前所在的矩阵。cell = [self.M[i][j], self.Ic[i][j], self.Ir[i][j]]在while循环中第一次是正确的,但在那之后,你不能只选择得分最高的矩阵。你的选择受到来自哪里的限制。我在执行代码时遇到了一些问题,但我认为您在while循环中的if语句中已经考虑到了这一点。如果是这样的话,那么我认为沿着这些路线的改变就足够了:

 cell = [self.M[i][j], self.Ic[i][j], self.Ir[i][j]]
 cellIndex = cell.index(max(cell))
 while i > 1 and j > 1:
      chars = self.row_seq[i-1]+self.col_seq[j-1]
      if cellIndex == 0: #M
            diag = [diagM, diagC, diagR] = self.score_cell(i-1, j-1, chars)
            diagMax = max(diag)
            ...
            cellIndex = diagMax
            i, j = i-1, j-1
        elif cell.index(cellMax) == 1: #Ic
            up = [upM, upC, upR] = self.score_cell(i-1, j, chars)
            upMax = max(up)
            ...
            cellIndex = upMax
            i -= 1
        elif cell.index(cellMax) == 2: #Ir
            left = [leftM, leftC, leftR] = self.score_cell(i, j-1, chars)
            leftMax = max(left)
            ...
            cellIndex = leftMax
            j -= 1

就像我说的,我不确定我是否正确地遵循了你的代码,但看看这是否有帮助。

最新更新