我有一些数据,看起来像:
grp grp2 figures
1 10016 13567 0.55 - 0.65
2 1270 2090 1.90 - 2
3 8757 12259 6.60 - 7n8.50 - 9.50
4 2589 3982 2.75 - 3.15
5 9099 12573 1.24 - 1.30n1.22 - 1.30n0.08 - 0.10n0.10 - 0.12
6 5211 8334 6.90 - 7.40
7 9994 13542 2.73 - 2.78
8 2685 4113 0.72 - 0.74n2.75 - 2.77n3.10 - 3.18
9 9778 13295 32 - 39n190 - 200n142 - 152n90 - 100
10 772 1317 2.20 - 2.40
我想从figures
列中提取一些信息。我想将列拆分为min
和max
,这可以通过使用类似strsplit(dat$figures, split = "-")
的东西拆分-
来完成(但我希望它分为两列min
和max
)。完成拆分后,我想添加一个新列,根据该观测是否包含小数,给出0
或1
。我想一行一行地做这件事,一些观测包含n
,所以我对以下观测1.24 - 1.30n1.22 - 1.30n0.08 - 0.10n0.10 - 0.12
的预期输出是:
min max containsDecimal
1.24 1.30 1
1.22 1.30 1
0.08 0.10 1
0.10 0.12 1
而对于观测32 - 39n190 - 200n142 - 152n90 - 100
,预期输出如下:
min max containsDecimal
32 39 0
190 200 0
142 152 0
90 100 0
其中,现在列containsDecimal
为0
。我希望将结果保留在该观测所在的单元格内,并以n
分隔,而不是展开成多行。
数据:
# Example data: two identifier columns plus `figures`, a character column in
# which each cell holds one or more "min - max" ranges joined by "n".
dat <- data.frame(
  grp = c(
    "10016", "1270", "8757", "2589", "9099",
    "5211", "9994", "2685", "9778", "772"
  ),
  grp2 = c(
    13567L, 2090L, 12259L, 3982L, 12573L,
    8334L, 13542L, 4113L, 13295L, 1317L
  ),
  figures = c(
    "0.55 - 0.65",
    "1.90 - 2",
    "6.60 - 7n8.50 - 9.50",
    "2.75 - 3.15",
    "1.24 - 1.30n1.22 - 1.30n0.08 - 0.10n0.10 - 0.12",
    "6.90 - 7.40",
    "2.73 - 2.78",
    "0.72 - 0.74n2.75 - 2.77n3.10 - 3.18",
    "32 - 39n190 - 200n142 - 152n90 - 100",
    "2.20 - 2.40"
  ),
  stringsAsFactors = FALSE
)
一个可能的小规模解决方案是
library(dplyr)
library(tidyr)
library(stringr)
# Split each multi-range cell on the "n" separator into one row per range,
# flag whether the range text contains a decimal point, then split the range
# into numeric `min` / `max` columns.
dat %>%
  mutate(figures = str_split(figures, "n")) %>%
  unnest_longer(figures) %>%
  # Test the raw text before `separate(convert = TRUE)`: once converted to
  # numbers, "7.00" becomes 7 and the decimal point can no longer be detected.
  mutate(contains_decimal = if_else(grepl(".", figures, fixed = TRUE), 1, 0)) %>%
  separate(figures, into = c("min", "max"), sep = " - ", convert = TRUE)
# A tibble: 19 x 5
grp grp2 min max contains_decimal
<chr> <int> <dbl> <dbl> <dbl>
1 10016 13567 0.55 0.65 1
2 1270 2090 1.9 2 1
3 8757 12259 6.6 7 1
4 8757 12259 8.5 9.5 1
5 2589 3982 2.75 3.15 1
6 9099 12573 1.24 1.3 1
7 9099 12573 1.22 1.3 1
8 9099 12573 0.08 0.1 1
9 9099 12573 0.1 0.12 1
10 5211 8334 6.9 7.4 1
11 9994 13542 2.73 2.78 1
12 2685 4113 0.72 0.74 1
13 2685 4113 2.75 2.77 1
14 2685 4113 3.1 3.18 1
15 9778 13295 32 39 0
16 9778 13295 190 200 0
17 9778 13295 142 152 0
18 9778 13295 90 100 0
19 772 1317 2.2 2.4 1
library(tidyverse)
# Split each cell on the "n" separator; this produces a list column, i.e. some
# rows of `figs` hold more than one element.
dat <- mutate(dat, figs = stringr::str_split(figures, "n"))
# Expand the list column so that each range gets its own row.
dat <- unnest(dat, cols = figs)
# Split each range at the hyphen into `min` and `max`; pass convert = TRUE to
# turn them into numerics.
dat <- separate(dat, figs, c("min", "max"), "-")
# min and max are still character, so "contains a decimal" means "contains a
# literal period". The backslash must be doubled inside an R string literal.
# Note that `&` flags a row only when BOTH bounds contain a period; use `|`
# if a period in either bound should count.
dat <- mutate(dat, contains_decimal = grepl("\\.", min) & grepl("\\.", max))
# A tibble: 19 x 6
grp grp2 figures min max contains_decimal
<chr> <int> <chr> <chr> <chr> <lgl>
1 10016 13567 "0.55 - 0.65" "0.55… " 0.6… TRUE
2 1270 2090 "1.90 - 2" "1.90… " 2" FALSE
3 8757 12259 "6.60 - 7n8.50 - 9.50" "6.60… " 7" FALSE
4 8757 12259 "6.60 - 7n8.50 - 9.50" "8.50… " 9.5… TRUE
5 2589 3982 "2.75 - 3.15" "2.75… " 3.1… TRUE
6 9099 12573 "1.24 - 1.30n1.22 - 1.30n0.08 -… "1.24… " 1.3… TRUE
7 9099 12573 "1.24 - 1.30n1.22 - 1.30n0.08 -… "1.22… " 1.3… TRUE
8 9099 12573 "1.24 - 1.30n1.22 - 1.30n0.08 -… "0.08… " 0.1… TRUE
9 9099 12573 "1.24 - 1.30n1.22 - 1.30n0.08 -… "0.10… " 0.1… TRUE
10 5211 8334 "6.90 - 7.40" "6.90… " 7.4… TRUE
11 9994 13542 "2.73 - 2.78" "2.73… " 2.7… TRUE
12 2685 4113 "0.72 - 0.74n2.75 - 2.77n3.10 -… "0.72… " 0.7… TRUE
13 2685 4113 "0.72 - 0.74n2.75 - 2.77n3.10 -… "2.75… " 2.7… TRUE
14 2685 4113 "0.72 - 0.74n2.75 - 2.77n3.10 -… "3.10… " 3.1… TRUE
15 9778 13295 "32 - 39n190 - 200n142 - 152n9… "32 " " 39" FALSE
16 9778 13295 "32 - 39n190 - 200n142 - 152n9… "190 " " 200" FALSE
17 9778 13295 "32 - 39n190 - 200n142 - 152n9… "142 " " 152" FALSE
18 9778 13295 "32 - 39n190 - 200n142 - 152n9… "90 " " 100" FALSE
19 772 1317 "2.20 - 2.40" "2.20… " 2.4… TRUE