我有一个数据框(subset_df),如下所示:
# Example data: 20 peptide sequences, five per group (BP, EpQ, abc, LbT).
# The dput() output is assigned to `subset_df` so that the snippets below,
# which all refer to `subset_df`, can be run as-is. The redundant `.Names`
# attribute is dropped because the list elements are already named.
subset_df <- structure(
  list(
    sequence = c(
      "CSPPPPSPSPHPRPP", "GEGSPTSPTSPKQPG", "EAGAPAGSGAPPPAD",
      "PAPPKPKESKEPENA", "AKPKQQDEDPDGAAE", "AYATMLKDVQWKVRKS",
      "HEKLVQDIWKKLEAKG", "SCSVKLGLWKNAVNNC", "MAYVCELGPNQGWK",
      "LKDPKQYQSIVDAEWK", "KEAPGATEKDRAKATP", "TAYIMRPLDHGADVTL",
      "CVTQEHFREAMAKTNP", "AGTGFPYREMMPMNAP", "HKKSTEDNDDDAFCAP",
      "RPGGPPGYRTPYTAK", "TQGDRQKIQDAVSAA", "EVKSRYNVDVSQNKR",
      "VIEMTRAFEDDDFDK", "GSADLTPSNLTRPAS"
    ),
    group = rep(c("BP", "EpQ", "abc", "LbT"), each = 5L)
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
最终,我想创建一个新列(subset_df$ID),其中每个 ID 基于 subset_df$sequence 下的对应条目,并遵循以下模式:
- group_number_ 后接 subset_df$sequence 的前四个字符。
为了说明这一点,下面我粘贴了一些示例:
"BP_1_CSPP" "BP_2_GEGS" "BP_3_EAGA" "BP_4_PAPP" "BP_5_AKPK" "EpQ_1_AYAT"
我正在使用 stringr 包中的函数 str_sub 来生成输出(请参阅下面的循环)。
到目前为止,我所做的如下:
# Group labels to index (BP, EpQ, abc, LbT), in the order they are processed.
groups <- c("BP", "EpQ", "abc", "LbT")

# Row positions of every group, as a list named after the group labels.
# An exact comparison is used instead of grep(): grep() treats the label as a
# regular expression and matches substrings, so a label such as "BP" would
# also pick up rows of a group called "BP2".
groups_indexes <- lapply(
  setNames(groups, groups),
  function(label) which(subset_df$group == label)
)
我的列表(groups_indexes)输出如下:
$BP
[1] 1 2 3 4 5
$EpQ
[1] 6 7 8 9 10
$abc
[1] 11 12 13 14 15
$LbT
[1] 16 17 18 19 20
我只能对 groups_indexes 中的每个元素分别使用一个 for 循环来完成这项工作,如下所示(共四个 for 循环):
# Build the IDs for all groups in a single loop. Each ID has the form
# <group>_<position within group>_<first four characters of sequence>.

# Create the full-length column up front so that each iteration only fills in
# the rows that belong to its own group.
subset_df$IDs <- NA_character_

for (group_name in names(groups_indexes)) {
  rows <- groups_indexes[[group_name]]
  subset_df$IDs[rows] <- paste0(
    group_name, "_",
    # Number the rows 1..n inside the group; unlike subtracting the first
    # index, this also works when a group's rows are not contiguous.
    seq_along(rows), "_",
    str_sub(string = subset_df$sequence[rows], start = 1, end = 4)
  )
}
分别运行这四个for loops
后,我得到了以下输出:
> subset_df$IDs
[1] "BP_1_CSPP" "BP_2_GEGS" "BP_3_EAGA" "BP_4_PAPP" "BP_5_AKPK" "EpQ_1_AYAT" "EpQ_2_HEKL" "EpQ_3_SCSV" "EpQ_4_MAYV" "EpQ_5_LKDP"
[11] "abc_1_KEAP" "abc_2_TAYI" "abc_3_CVTQ" "abc_4_AGTG" "abc_5_HKKS" "LbT_1_RPGG" "LbT_2_TQGD" "LbT_3_EVKS" "LbT_4_VIEM" "LbT_5_GSAD"
但是,我希望只用一个循环来完成这项工作。我想过使用 names(groups_indexes[i]) 来取得 for 循环中写在引号里的字符("BP"、"EpQ"、"abc"、"LbT")。
也许我们需要按 "group" 分组,然后用 paste 把 "group"、行序号(row_number())以及 "sequence" 的子字符串(substr)拼接起来:
library(dplyr)
# Number the rows within each group and build the ID as
# <group>_<row number within group>_<first four characters of sequence>.
subset_df %>%
  group_by(group) %>%
  mutate(
    ID = paste(group, row_number(), substr(sequence, 1, 4), sep = "_")
  ) %>%
  # Drop the grouping so later steps do not silently keep operating per group.
  ungroup()
# sequence group ID
# <chr> <chr> <chr>
#1 CSPPPPSPSPHPRPP BP BP_1_CSPP
#2 GEGSPTSPTSPKQPG BP BP_2_GEGS
#3 EAGAPAGSGAPPPAD BP BP_3_EAGA
#4 PAPPKPKESKEPENA BP BP_4_PAPP
#5 AKPKQQDEDPDGAAE BP BP_5_AKPK
#6 AYATMLKDVQWKVRKS EpQ EpQ_1_AYAT
#7 HEKLVQDIWKKLEAKG EpQ EpQ_2_HEKL
#8 SCSVKLGLWKNAVNNC EpQ EpQ_3_SCSV
#9 MAYVCELGPNQGWK EpQ EpQ_4_MAYV
#10 LKDPKQYQSIVDAEWK EpQ EpQ_5_LKDP
#11 KEAPGATEKDRAKATP abc abc_1_KEAP
#12 TAYIMRPLDHGADVTL abc abc_2_TAYI
#13 CVTQEHFREAMAKTNP abc abc_3_CVTQ
#14 AGTGFPYREMMPMNAP abc abc_4_AGTG
#15 HKKSTEDNDDDAFCAP abc abc_5_HKKS
#16 RPGGPPGYRTPYTAK LbT LbT_1_RPGG
#17 TQGDRQKIQDAVSAA LbT LbT_2_TQGD
#18 EVKSRYNVDVSQNKR LbT LbT_3_EVKS
#19 VIEMTRAFEDDDFDK LbT LbT_4_VIEM
#20 GSADLTPSNLTRPAS LbT LbT_5_GSAD
如果编号不需要按组计算,而是基于整列,那么去掉 group_by 步骤并执行:
# Number the rows over the whole data frame (no grouping) and paste together
# the group label, the row number and the first four characters of the
# sequence, separated by underscores.
mutate(
  subset_df,
  ID = paste(group, row_number(), substr(sequence, 1, 4), sep = "_")
)