我正在尝试执行以下应该很容易的任务,但我无法理解它:
我有两个带有字母(核苷酸碱基)的序列,可能含糊不清。我想重写每个序列的所有可能性......
例如,第一个序列是:
CAGCMGCCGCGGTAAYWC
它包含 M、Y 和 W,分别可以是 [A,C]、[C,T]、[A,T]。因此,上述顺序应重写为以下可能性:
CAGCAGCCGCGGTAACAC
CAGCAGCCGCGGTAACTC
CAGCAGCCGCGGTAATAC
CAGCAGCCGCGGTAATTC
CAGCCGCCGCGGTAACAC
CAGCCGCCGCGGTAACTC
CAGCCGCCGCGGTAATAC
CAGCCGCCGCGGTAATTC
到目前为止,我有 MWE:
#!/usr/bin/perl -w
use warnings;
use strict;
my %seqs = (
"PrefixPE/1" => "CAGCMGCCGCGGTAAYWC",
"PrefixPE/2" => "BSCCCGYCAATTYMTKTRAGT"
);
my %ops;
$ops{"R"}{"A"}="";
$ops{"R"}{"G"}="";
$ops{"Y"}{"C"}="";
$ops{"Y"}{"T"}="";
$ops{"M"}{"A"}="";
$ops{"M"}{"C"}="";
$ops{"K"}{"G"}="";
$ops{"K"}{"T"}="";
$ops{"W"}{"A"}="";
$ops{"W"}{"T"}="";
$ops{"B"}{"C"}="";
$ops{"B"}{"G"}="";
$ops{"B"}{"T"}="";
$ops{"S"}{"C"}="";
$ops{"S"}{"G"}="";
foreach my $id(keys %seqs){
my $seq=$seqs{$id};
my @nts=(split '', $seq);
my $i=0;
foreach my $n(@nts){
$i++;
if (exists $ops{$n}){
my $j=0;
foreach my $o(keys %{$ops{$n}}){
$j++;
print "$id, pos $i.$j = <$o>n";
}
}
else{
print "$id, pos $i = <$n>n";
}
}
}
对于每个字母,展开可能的序列集。
#!/usr/bin/perl
use warnings;
use strict;
use feature qw{ say };
use Syntax::Construct qw{ /r };
my %ambiguous = ( M => [qw[ A C ]],
Y => [qw[ C T ]],
W => [qw[ A T ]],
);
my $string = 'CAGCMGCCGCGGTAAYWC';
my $is_ambiguous = '[' . (join q(), keys %ambiguous) . ']';
my @strings = $string;
while ($strings[0] =~ $is_ambiguous) {
my ($letter) = $strings[0] =~ /($is_ambiguous)/;
@strings = map {
my $s = $_;
map $s =~ s/$letter/$_/r, @{ $ambiguous{$letter} }
# map { (my $x = $s) =~ s/$letter/$_/; $x } @{ $ambiguous{$letter} }
} @strings;
}
say for @strings;
在 5.14 之前的 Perl 上,使用注释行而不是上面的行,并删除 Syntax::Construct。
以下是使用递归的解决方案:
use feature qw(say);
use strict;
use warnings;
my %seqs = (
"PrefixPE/1" => "CAGCMGCCGCGGTAAYWC",
"PrefixPE/2" => "BSCCCGYCAATTYMTKTRAGT"
);
my %ops = (
R => 'AG',
Y => 'CT',
M => 'AC',
K => 'GT',
W => 'AT',
B => 'CGT',
S => 'CG',
);
$ops{$_} = [ split //, $ops{$_} ] for keys %ops;
my $perm = GenPermutations->new( %ops );
for my $id (keys %seqs) {
my $seq = $seqs{$id};
$perm->gen( $seq );
$perm->print_result();
}
exit;
package GenPermutations;
sub new {
my ( $class, $ops ) = @_;
my ($pat) = map qr/$_/, '[' . (join '', keys %$ops) . ']';
my $info = { ops => $ops, pat => $pat, pos => [], result => [], seq => undef };
return bless $info, $class;
}
sub _init {
my ( $self, $seq ) = @_;
@{ $self->{pos} } = ();
@{ $self->{result} } = ();
$self->{seq} = $seq;
while ( $seq =~ /($self->{pat})/g ) {
push @{ $self->{pos} }, [$-[1], $1];
}
}
sub print_result {
my ( $self ) = @_;
say $self->{seq} . ' : found ' . (scalar @{ $self->{result} }) . ' permutations : ';
say " $_" for @{ $self->{result} };
say "";
}
sub gen {
my ( $self, $seq ) = @_;
$self->_init( $seq );
$self->_gen( $seq, 0 );
}
sub _gen {
my ( $self, $str, $pos_index ) = @_;
if ( $pos_index > $#{$self->{pos}} ) {
push @{ $self->{result} }, $str;
return;
}
my $info = $self->{pos}[$pos_index];
my ( $index, $letter) = @$info;
$pos_index++;
for my $replace ( @{ $self->{ops}{$letter} } ) {
my $temp = $str;
substr $temp, $index, 1, $replace;
$self->_gen( $temp, $pos_index );
}
}
1;