我需要帮助:如何在 R 的数据框(data frame)上使用正则表达式。
数据框如下:
Names
1 [NW_019417116.1_2_3-HSPs_-__Canis_glabripennis]
2 [scaffold_29690_2-HSPs_-__Lopus_eremicus,scaffold_3037_+__Pig_sd,scaffold_144501_556-844_+__Lopus_eremicus]
3 [UDEW01029642.1_1-357_+__Falis_catus,scaffold_100683_331-631_+__Pig_sud,scaffold_209026_2-HSPs_+__Pig_sud,scaffold_220973_58-331_+__Pig_sud]
4 [scaffold_27087_+__Falis_catus,scaffold_77874_2-HSPs_-__Ratus_griseus]
在 df$Names 中,我想针对含有 HSPs 的条目提取两个部分:
一是物种名称部分(形如 __Content_content),例如在 scaffold_77874_2-HSPs_-__Ratus_griseus 中就是 Ratus_griseus;
二是 scaffold 名称部分,例如在 scaffold_77874_2-HSPs_-__Ratus_griseus 中就是 scaffold_77874。
期望的输出如下:
Names Specie_name Scaffold_name
[NW_019417116.1_2_3-HSPs_-__Canis_glabripennis] Canis_glabripennis NW_019417116.1_2
[scaffold_29690_2-HSPs_-__Lopus_eremicus,scaffold_3037_+__Pig_sd,scaffold_144501_556-844_+__Lopus_eremicus] Lopus_eremicus scaffold_29690
[UDEW01029642.1_1-357_+__Falis_catus,scaffold_100683_331-631_+__Pig_sud,scaffold_209026_2-HSPs_+__Pig_sud,scaffold_220973_58-331_+__Pig_sud] Pig_sud scaffold_209026
[scaffold_27087_+__Falis_catus,scaffold_77874_2-HSPs_-__Ratus_griseus] Ratus_griseus scaffold_77874
到目前为止,我尝试了:
# Asker's attempt: derive `Scaffold_name` and `Specie_name` from the HSPs
# entry inside each bracketed, comma-separated `Names` string.
# Backslashes are doubled because R string literals reject `\d` and `\[`
# as unrecognized escapes (the single-backslash form does not parse).
df %>%
  mutate(
    # Capture group 1 is the text between a literal comma and `_<digits>-HSPs`.
    # NOTE(review): the pattern starts with a comma, so an HSPs entry that is
    # the first item of the list is not matched by this expression.
    Scaffold_name = str_match(Names, regex(',(.*)_\\d+-HSPs'))[, 2],
    # Three alternatives: HSPs entry followed by a comma, HSPs entry closing
    # the list with `]`, and an HSPs entry as the only/first bracketed item.
    Specie_name = str_extract(Names, ',(.*)_\\d+-HSPs_.__.*,|,(.*)_\\d+-HSPs_.__.*]|\\[*.[^,]*_\\d+-HSPs_.__.*\\]'),
    # Strip the list punctuation, then drop everything up to the last `__`
    # (greedy `.*`) so only the trailing text remains.
    Specie_name = gsub(',', '', Specie_name),
    Specie_name = gsub(']', '', Specie_name),
    Specie_name = gsub('\\[', '', Specie_name),
    Specie_name = gsub('.*__', '', Specie_name)
  )
这里是数据
# Example data (dput output): a one-column data frame whose `Names` column is
# a factor of bracketed, comma-separated entries. It is bound to `df` because
# the pipelines in this document read the data under that name.
df <- structure(list(Names = structure(c(1L, 3L, 4L, 2L), .Label = c("[NW_019417116.1_3-HSPs_-__Canis_glabripennis]",
"[scaffold_27087_+__Falis_catus,scaffold_77874_2-HSPs_-__Ratus_griseus]",
"[scaffold_29690_2-HSPs_-__Lopus_eremicus,scaffold_3037_+__Pig_sd,scaffold_144501_556-844_+__Lopus_eremicus]",
"[UDEW01029642.1_1-357_+__Falis_catus,scaffold_100683_331-631_+__Pig_sud,scaffold_209026_2-HSPs_+__Pig_sud,scaffold_220973_58-331_+__Pig_sud]"
), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))
我们可以使用 separate_rows 在 , 处拆分 'Names',然后删除 [ 和 ] 来清理 'Names' 列,
用 filter 只保留含有 HSPs 的行,再用 separate 在 _[-+]__ 处把 'Names' 拆分成两列:
library(dplyr)
library(tidyr)
library(stringr)
# Approach 1: one row per comma-separated entry, keep only the HSPs entries,
# then split each entry into scaffold and species parts.
df %>%
  # `Names` is a factor in the example data; work on plain strings.
  mutate(Names = as.character(Names)) %>%
  separate_rows(Names, sep = ",") %>%
  # Backslashes are doubled: "\[" is not a valid escape in an R string.
  mutate(Names = str_remove_all(Names, "\\[|\\]")) %>%
  filter(str_detect(Names, "HSPs")) %>%
  # Split at `_-__` / `_+__`; remove = FALSE keeps the cleaned entry in `Names`.
  separate(
    Names,
    into = c("Scaffold_name", "Specie_name"),
    sep = "_[-+]__",
    remove = FALSE
  ) %>%
  # Drop the `-HSPs` marker left at the end of the scaffold part.
  mutate(Scaffold_name = str_remove(Scaffold_name, "-HSPs"))
或者,我们可以先用 str_extract 直接提取含有 HSPs 的子字符串,从而避免使用 filter/separate_rows,
然后再用 separate 进行拆分:
# Approach 2: extract the HSPs entry directly from each `Names` string, so the
# original rows are kept and no separate_rows()/filter() step is needed.
df %>%
  # First run of alphanumerics/dots/underscores ending in `-HSPs`, followed by
  # everything up to the next comma (this can include the closing `]`).
  mutate(sub_name = str_extract(Names, "[[:alnum:]._]+-HSPs[^,]+")) %>%
  # Split around the `-HSPs_-__` / `-HSPs_+__` marker; the marker is consumed.
  separate(
    sub_name,
    into = c("Scaffold_name", "Specie_name"),
    sep = "-HSPs_[-+]__"
  ) %>%
  # Remove the closing bracket captured when the HSPs entry is last in the
  # list. Backslash is doubled: "\]" is not a valid escape in an R string.
  mutate(Specie_name = str_remove(Specie_name, "\\]"))
输出结果如下:
#Names
#1 [NW_019417116.1_3-HSPs_-__Canis_glabripennis]
#2 [scaffold_29690_2-HSPs_-__Lopus_eremicus,scaffold_3037_+__Pig_sd,scaffold_144501_556-844_+__Lopus_eremicus]
#3 [UDEW01029642.1_1-357_+__Falis_catus,scaffold_100683_331-631_+__Pig_sud,scaffold_209026_2-HSPs_+__Pig_sud,scaffold_220973_58-331_+__Pig_sud]
#4 [scaffold_27087_+__Falis_catus,scaffold_77874_2-HSPs_-__Ratus_griseus]
# Scaffold_name Specie_name
#1 NW_019417116.1_3 Canis_glabripennis
#2 scaffold_29690_2 Lopus_eremicus
#3 scaffold_209026_2 Pig_sud
#4 scaffold_77874_2 Ratus_griseus