我的想法可能很天真。但是我想要基于第二个"_"来分割df[1:3]的行,使用tidyr::extract():
library(tidyr)
library(dplyr)
# Regex: group 1 lazily captures everything before the second "_",
# group 2 captures everything after it.
# NOTE(review): tidyr::extract() expects a data frame as its first argument;
# as written `col1` sits in that position, so this is a fragment, not a
# runnable call (e.g. it would need `df %>%` in front).
extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$")
并且df[4:6]的行基于第一个"_":
# Regex: group 1 lazily captures everything before the first "_",
# group 2 captures everything after it.
# NOTE(review): tidyr::extract() expects a data frame as its first argument;
# as written `col1` sits in that position, so this is a fragment, not a
# runnable call (e.g. it would need `df %>%` in front).
extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
我正在考虑类似的东西
# Asker's sketch of the intended logic: rows 1-3 split at the second "_",
# later rows split at the first "_".
# NOTE(review): not runnable as written -- there is no pipe between the two
# mutate() calls, the parentheses are unbalanced, and tidyr::extract()
# operates on a whole data frame rather than returning a vector that
# case_when() could use.
df %>%
mutate(n=row_number())
mutate(col2=case_when
(n<=3 ~ extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$"),
n>3 ~ extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
)
当然,这是大错特错的,但这在某种程度上可能吗?
示例数据:
# Example data: a single character column of underscore-delimited names.
df <- tibble(
  col1 = c(
    "2397_A_run379_CTTGTACT_S119_L004_R1_001",
    "3779_A_run535_TTATAGCC_S91_L003_R1_001",
    "4958_BV_run685_GCGTACGT_S89_L005_R1_001",
    "5126AA_S27_L004_R1_001",
    "5126AF_S32_L004_R1_001",
    "5126AL_S38_L004_R1_001"
  )
)
df
#> # A tibble: 6 × 1
#> col1
#> <chr>
#> 1 2397_A_run379_CTTGTACT_S119_L004_R1_001
#> 2 3779_A_run535_TTATAGCC_S91_L003_R1_001
#> 3 4958_BV_run685_GCGTACGT_S89_L005_R1_001
#> 4 5126AA_S27_L004_R1_001
#> 5 5126AF_S32_L004_R1_001
#> 6 5126AL_S38_L004_R1_001
创建于2022-11-17,reprex v2.0.2
如果模式是通过匹配"_"来提取子字符串,且该"_"后面紧跟一个或多个字母再接数字,
library(dplyr)
library(stringr)
# Add `col2`: the substring that starts right after the first "_" which is
# followed by one or more letters and then digits (e.g. "run379...", "S27...").
# The lookbehind `(?<=_)` keeps the underscore itself out of the match.
# The backslash is doubled because "\d" is not a valid escape in an R string.
df %>%
  mutate(col2 = str_extract(col1, "(?<=_)[A-Za-z]+\\d+.*"))
-输出
# A tibble: 6 × 2
col1 col2
<chr> <chr>
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001 run535_TTATAGCC_S91_L003_R1_001
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 run685_GCGTACGT_S89_L005_R1_001
4 5126AA_S27_L004_R1_001 S27_L004_R1_001
5 5126AF_S32_L004_R1_001 S32_L004_R1_001
6 5126AL_S38_L004_R1_001 S38_L004_R1_001
或使用separate
library(tidyr)
# Split `col1` in two at the first "_" that is preceded by an uppercase letter
# and followed by letters then digits. `extra = "merge"` keeps any later
# matches inside `col2` instead of dropping the remainder.
# The backslash is doubled because "\d" is not a valid escape in an R string.
separate(
  df, col1,
  into = c("col1", "col2"),
  sep = "(?<=[A-Z])_(?=[A-Za-z]+\\d+)",
  extra = "merge"
)
-输出
# A tibble: 6 × 2
col1 col2
<chr> <chr>
1 2397_A run379_CTTGTACT_S119_L004_R1_001
2 3779_A run535_TTATAGCC_S91_L003_R1_001
3 4958_BV run685_GCGTACGT_S89_L005_R1_001
4 5126AA S27_L004_R1_001
5 5126AF S32_L004_R1_001
6 5126AL S38_L004_R1_001
tidyr::extract()接收并返回一个数据框,因此很难在mutate()中使用。我会使用类似stringr::str_match()的函数:
library(dplyr)
library(stringr)
# Pick the split rule by row position. str_match() returns a matrix whose
# second column holds the first capture group, hence `[, 2]`.
df %>%
  # Row index used to choose between the two patterns below.
  mutate(row = row_number()) %>%
  mutate(
    col2 = case_when(
      # Rows 1-3: keep what follows the second "_".
      row < 4 ~ str_match(col1, ".+?_.+?_(.+)")[, 2],
      # Rows 4-6: keep what follows the first "_".
      row < 7 ~ str_match(col1, ".+?_(.+)")[, 2]
    )
  )
# A tibble: 6 × 3
col1 row col2
<chr> <int> <chr>
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 1 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001 2 run535_TTATAGCC_S91_L003_R1_001
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 3 run685_GCGTACGT_S89_L005_R1_001
4 5126AA_S27_L004_R1_001 4 S27_L004_R1_001
5 5126AF_S32_L004_R1_001 5 S32_L004_R1_001
6 5126AL_S38_L004_R1_001 6 S38_L004_R1_001