我的数据帧是这样的
| Fasta headers |
|---|
| ab12_P002; ab12_P003; ab12_P005; ab23_P002; ab23_P001 |
| ab45_P001; ab36_P001 |
| ab55_P001; ab55_P002 |
这里有一个稍微不同的方法:
library(stringr)
library(dplyr)
library(tidyr)
# Split every header with the default `sep` of separate_rows(), which breaks
# the text at both ";" and "_", keep only the "ab<digits>" pieces, and drop
# duplicate rows.
without_02473 %>%
  separate_rows(`Fasta headers`) %>%
  # The backslash must be doubled: "\d" is not a valid escape in an R string
  # literal and would stop the script from parsing.
  filter(str_detect(`Fasta headers`, "ab\\d+")) %>%
  distinct()
`Fasta headers`
<chr>
1 ab12
2 ab23
3 ab45
4 ab36
5 ab55
另一个选项是使用 `strsplit` 进行拆分,删除 `_` 之后的所有内容,然后用 `distinct` 去重,如下:
library(dplyr)
library(tidyr)
# Break each header string at ";" into a list of ids, strip the trailing
# "_<suffix>" from every id, then expand the list column to one row per id
# and keep the unique values.
without_02473 %>%
  mutate(
    `Fasta headers` = lapply(
      strsplit(`Fasta headers`, ";"),
      function(ids) sub("_[^_]+$", "", ids)
    )
  ) %>%
  unnest(`Fasta headers`) %>%
  distinct()
#> # A tibble: 5 × 1
#> `Fasta headers`
#> <chr>
#> 1 ab12
#> 2 ab23
#> 3 ab45
#> 4 ab36
#> 5 ab55
由 reprex v2.0.2 创建于 2023-01-03
我们可以使用 `separate_rows` 在 `;` 处拆分 `Fasta headers` 列以创建新行,然后使用 `trimws` 删除从 `_` 开始的后缀部分:
library(dplyr)
library(tidyr)
# One row per ";"-separated header; the custom `whitespace` pattern makes
# trimws() remove everything from the "_" onward, leaving only the prefix.
# The de-duplicated result is stored in `out`.
out <- without_02473 %>%
  separate_rows(`Fasta headers`, sep = ";") %>%
  mutate(
    `Fasta headers` = trimws(`Fasta headers`, whitespace = "_.*")
  ) %>%
  distinct()
输出如下:
out
# A tibble: 5 × 1
`Fasta headers`
<chr>
1 ab12
2 ab23
3 ab45
4 ab36
5 ab55
library(writexl)
# Save the de-duplicated table to an Excel workbook in the working directory.
write_xlsx(x = out, path = "first.xlsx")
或者可以用 `str_extract_all` 只提取 `_` 之前的单词,用 `unnest` 展开 list 列,并获取 `distinct` 行:
library(stringr)
# Extract every run of word characters that is immediately followed by "_"
# (the id prefix), expand the resulting list column to one row per match,
# and keep the unique values.
without_02473 %>%
  mutate(
    # The backslash must be doubled: "\w" is not a valid escape in an R string
    # literal and would stop the script from parsing.
    `Fasta headers` = str_extract_all(`Fasta headers`, "\\w+(?=_)")
  ) %>%
  unnest(`Fasta headers`) %>%
  distinct()
# Data: reproducible input, a one-column data frame whose `Fasta headers`
# strings each hold several ";"-separated ids.
# The section label used to be fused onto the assignment target, which made
# the code define a differently named object than the one the snippets use.
without_02473 <- structure(
  list(
    `Fasta headers` = c(
      "ab12_P002;ab12_P003;ab12_P005;ab23_P002;ab23_P001",
      "ab45_P001;ab36_P001",
      "ab55_P001;ab55_P002"
    )
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)