我有一个制表符分隔(tab-delimited)的文件,其中包含以下信息
>fasta
>ss_23_122_0_1
MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS
>ss_23_167_0_1
WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW
>ss_23_167_0_1
MAASDASDWEPWERIWERIWER
>ss_23_167_0_1
QWEKCKLSDOIEOWIOWEUWWEUWEZURZEWURZUWEUZUQZUWZUE
>ss_45_201_0_1
HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER
>ss_45_201_0_1
ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE
>ss_89_10_0_2
NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP
对于像ss_45_201_0_1
和ss_23_167_0_1
这样的id,有多个条目,我只想保留那些长度最大的条目。我想得到如下输出:
>fasta
>ss_23_122_0_1
MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS
>ss_23_167_0_1
WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW
>ss_45_201_0_1
HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER
>ss_89_10_0_2
NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP
我在R中尝试了以下代码,但失败了
Unique(fasta)
有人能指导我吗?对于那些有多个不同长度的条目的相同id,我怎么能只得到最长的序列呢。
这里有三个选项需要考虑。
选项1:基本R
先用unlist将列表展开,再使用nchar
,然后使用ave
计算要保留的值。
# Length of every sequence; unlist() keeps each id as the element name.
x <- nchar(unlist(l))
# ave() with FUN = max spreads each id's maximum length over all of that id's
# entries, so the comparison is TRUE exactly for the longest entries
# (entries tied for longest are all kept).
l[x == ave(x, names(x), FUN = max)]
# $ss_23_122_0_1
# [1] "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS"
#
# $ss_23_167_0_1
# [1] "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW"
#
# $ss_45_201_0_1
# [1] "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER"
#
# $ss_89_10_0_2
# [1] "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"
选项2:"data.table"
使用"reshape2"包中的melt
创建data.frame
。使用rank
和nchar
进行子集。(我使用了rank而不是==
,这样我就不必使用nchar
两次了——还没有检查比较效率。)
library(data.table)
library(reshape2)
# melt() turns the named list into a table with the columns `value` (the
# sequence) and `L1` (the id), which are the columns used below.
# Ranking the *negated* length gives the longest sequence of each id rank 1
# (ranking the plain length would select the shortest one instead).
# ties.method = "min" keeps every sequence tied for longest; the default
# "average" would give ties a fractional rank that never equals 1.
as.data.table(melt(l))[, Rnk := rank(-nchar(as.character(value)),
                                     ties.method = "min"),
                       by = L1][Rnk == 1]
# value L1 Rnk
# 1: MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS ss_23_122_0_1 1
# 2: WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW ss_23_167_0_1 1
# 3: HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER ss_45_201_0_1 1
# 4: NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP ss_89_10_0_2 1
选项3:"dplyr"
类似于"data.table"的方法。
library(dplyr)
library(reshape2)
# melt() yields the columns `value` (the sequence) and `L1` (the id).
melt(l) %>%
  group_by(L1) %>%
  # desc() reverses the ordering so that the longest sequence of each id gets
  # dense rank 1; without it, rank 1 would be the shortest sequence.
  mutate(Rnk = dense_rank(desc(nchar(as.character(value))))) %>%
  filter(Rnk == 1)
# Source: local data frame [4 x 3]
# Groups: L1
#
# value L1 Rnk
# 1 MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS ss_23_122_0_1 1
# 2 WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW ss_23_167_0_1 1
# 3 HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER ss_45_201_0_1 1
# 4 NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP ss_89_10_0_2 1
也许还有一种更优雅的方式。。。
# Sample input: a named list of sequences in which an id may occur several
# times with sequences of different lengths.
l <- list(
  ss_23_122_0_1 = "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS",
  ss_23_167_0_1 = "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW",
  ss_23_167_0_1 = "MAASDASDWEPWERIWERIWER",
  ss_23_167_0_1 = "QWEKCKLSDOIEOWIOWEUWWEUWEZURZEWURZUWEUZUQZUWZUE",
  ss_45_201_0_1 = "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER",
  ss_45_201_0_1 = "ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE",
  ss_89_10_0_2 = "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"
)

# Bucket the sequences by id.
res <- split(l, names(l))

# For each id, the position (within its bucket) of the longest sequence;
# which.max() returns the first position when several lengths tie.
ind <- lapply(
  split(vapply(l, nchar, integer(1)), names(l)),
  which.max
)

# Pick the chosen entry out of every bucket; single-bracket indexing keeps it
# wrapped in a one-element list, so the result is a list of lists.
Map(`[`, res, ind)
$ss_23_122_0_1
$ss_23_122_0_1$ss_23_122_0_1
[1] "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS"
$ss_23_167_0_1
$ss_23_167_0_1$ss_23_167_0_1
[1] "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW"
$ss_45_201_0_1
$ss_45_201_0_1$ss_45_201_0_1
[1] "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER"
$ss_89_10_0_2
$ss_89_10_0_2$ss_89_10_0_2
[1] "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"