Perl:用每个选项替换不明确的字母



我正在尝试执行以下应该很容易的任务,但我无法理解它:

我有两个带有字母(核苷酸碱基)的序列,可能含糊不清。我想重写每个序列的所有可能性......

例如,第一个序列是:

 CAGCMGCCGCGGTAAYWC

它包含 M、Y 和 W,分别可以是 [A,C]、[C,T]、[A,T]。因此,上述顺序应重写为以下可能性:

 CAGCAGCCGCGGTAACAC
 CAGCAGCCGCGGTAACTC
 CAGCAGCCGCGGTAATAC
 CAGCAGCCGCGGTAATTC
 CAGCCGCCGCGGTAACAC
 CAGCCGCCGCGGTAACTC
 CAGCCGCCGCGGTAATAC
 CAGCCGCCGCGGTAATTC

到目前为止,我有 MWE:

#!/usr/bin/perl -w
use warnings;
use strict;
my %seqs = (
    "PrefixPE/1"  => "CAGCMGCCGCGGTAAYWC",
    "PrefixPE/2" => "BSCCCGYCAATTYMTKTRAGT"
);
my %ops;
$ops{"R"}{"A"}="";
$ops{"R"}{"G"}="";
$ops{"Y"}{"C"}="";
$ops{"Y"}{"T"}="";
$ops{"M"}{"A"}="";
$ops{"M"}{"C"}="";
$ops{"K"}{"G"}="";
$ops{"K"}{"T"}="";
$ops{"W"}{"A"}="";
$ops{"W"}{"T"}="";
$ops{"B"}{"C"}="";
$ops{"B"}{"G"}="";
$ops{"B"}{"T"}="";
$ops{"S"}{"C"}="";
$ops{"S"}{"G"}="";
foreach my $id(keys %seqs){
    my $seq=$seqs{$id};
    my @nts=(split '', $seq);
    my $i=0;
    foreach my $n(@nts){
        $i++;
        if (exists $ops{$n}){
            my $j=0;
            foreach my $o(keys %{$ops{$n}}){
                $j++;
                print "$id, pos $i.$j = <$o>n";
            }
        }
        else{
            print "$id, pos $i = <$n>n";
        }
    }
}

对于每个字母,展开可能的序列集。

#!/usr/bin/perl
use warnings;
use strict;
use feature qw{ say };
use Syntax::Construct qw{ /r };
my %ambiguous = ( M => [qw[ A C ]],
                  Y => [qw[ C T ]],
                  W => [qw[ A T ]],
                );
my $string = 'CAGCMGCCGCGGTAAYWC';
my $is_ambiguous = '[' . (join q(), keys %ambiguous) . ']';
my @strings = $string;
while ($strings[0] =~ $is_ambiguous) {
    my ($letter) = $strings[0] =~ /($is_ambiguous)/;
    @strings = map {
        my $s = $_;
        map $s =~ s/$letter/$_/r, @{ $ambiguous{$letter} }
        # map { (my $x = $s) =~ s/$letter/$_/; $x } @{ $ambiguous{$letter} }
    } @strings;
}
say for @strings;

在 5.14 之前的 Perl 上,使用注释行而不是上面的行,并删除 Syntax::Construct。

以下是使用递归的解决方案:

use feature qw(say);
use strict;
use warnings;
my %seqs = (
    "PrefixPE/1"  => "CAGCMGCCGCGGTAAYWC",
    "PrefixPE/2" => "BSCCCGYCAATTYMTKTRAGT"
);
my %ops = (
    R => 'AG',
    Y => 'CT',
    M => 'AC',
    K => 'GT',
    W => 'AT',
    B => 'CGT',
    S => 'CG',
);
$ops{$_} = [ split //, $ops{$_} ] for keys %ops;
my $perm = GenPermutations->new( %ops  );
for my $id (keys %seqs) {
    my $seq = $seqs{$id};
    $perm->gen( $seq );
    $perm->print_result();
}
exit;
package GenPermutations;
sub new {
    my ( $class, $ops ) = @_;
    my ($pat) = map qr/$_/, '[' . (join '', keys %$ops) . ']';
    my $info = { ops => $ops, pat => $pat, pos => [], result => [], seq => undef }; 
    return bless $info, $class;
}
sub _init {
    my ( $self, $seq ) = @_;
    @{ $self->{pos} } = ();
    @{ $self->{result} } = ();
    $self->{seq} = $seq;
    while ( $seq =~ /($self->{pat})/g ) {
        push @{ $self->{pos} }, [$-[1], $1];
    }
}
sub print_result {
    my ( $self ) = @_;
    say $self->{seq} . ' : found ' . (scalar @{ $self->{result} }) . ' permutations : ';
    say "  $_" for @{ $self->{result} };
    say "";
}
sub gen {
    my ( $self, $seq ) = @_;
    $self->_init( $seq );
    $self->_gen( $seq, 0 );
}
sub _gen {
    my ( $self, $str, $pos_index ) = @_;
    if ( $pos_index > $#{$self->{pos}} ) {
        push @{ $self->{result} }, $str;
        return;
    }
    my $info = $self->{pos}[$pos_index];
    my ( $index, $letter) = @$info;
    $pos_index++;
    for my $replace ( @{ $self->{ops}{$letter} } ) {
        my $temp = $str;
        substr $temp, $index, 1, $replace; 
        $self->_gen( $temp, $pos_index );
    }
}
1;

最新更新