我有在数据框中完成一次旅程的分钟调查数据,字符类型。有些条目被写成一个范围,例如:"5 - 10"。我想把这些项改为范围的平均值
我的数据是这样的。
[1] "30" "15"
[3] "30" "15 Minutes "
[5] "15" "20 mins "
[7] "30" "half an hour to 40 minutes"
[9] "30" "40"
[11] "20" "30"
[13] "15" "20"
[15] "40" "20"
[17] "40" "30"
[19] "15" "15"
[21] "20" "30mins"
[23] "20" "20"
[25] "15" "40"
[27] "15" "25"
[29] "30" "20"
[31] "Depends where you live" "30-45"
[33] "30 min " "20"
[35] "30" "20"
[37] "60 minutes" "30 mins"
[39] "15" "10"
[41] "20" "40"
[43] "60" "60"
[45] "30" "49"
[47] "50 minutes" "20 minutes"
[49] "90" "7-10 minutes "
[51] "15-20" "25 minutes"
[53] "25" "45"
[55] "60 minutes " "2-4 hours"
[57] "30" "30 min"
[59] "20" "30"
[61] "20" "25"
[63] "2-4hrs" "30"
[65] "45" "45"
[67] "75" "20"
[69] "60" "45mins"
[71] "60" "20"
我尝试了以下代码:
data <- data %>% mutate(
est_time = case_when(
grepl('-', est_time) ~ mean(as.numeric(unlist(str_split(est_time, '-'))))
))
data <- data %>% mutate(
est_time = ifelse(
grepl('-', est_time),
mean(as.numeric(unlist(str_split(est_time, '-')))),
est_time)
)
每次我收到:
Warning message:
Problem while computing `est_time = case_when(...)`.
ℹ NAs introduced by coercion
我怀疑这可能是因为unlist函数将列表数据分散到多行。
我如何解决这个问题并实现我的目标?
而不是使用case_when
或ifelse
,一个选项是只选择具有-
的行,用read.table
读取,获得rowMeans
并将其分配回
i1 <- grepl('^(\d+)-(\d+)$', data$est_time)
data$est_time[i1] <- rowMeans(read.table(text = data$est_time[i1],
sep = '-', header = FALSE), na.rm = TRUE)
在这种情况下,我们想要取所有具有-
的条目的mean
(即除了15-20
之外还包括2-4 hours
或7-10 minutes
等所有条目)
library(stringr)
library(dplyr)
data %>%
mutate(est_time2 = str_replace_all(est_time, "(\d+-\d+)",
function(x) mean(scan(text = x, what = numeric(),
sep = '-', quiet = TRUE))))
与产出
# A tibble: 9 × 2
est_time est_time2
<chr> <chr>
1 "15 Minutes" "15 Minutes"
2 "20 mins" "20 mins"
3 "40" "40"
4 "15" "15"
5 "Depends where you live" "Depends where you live"
6 "7-10 minutes " "8.5 minutes "
7 "15-20" "17.5"
8 "2-4 hours" "3 hours"
9 "30 min" "30 min"
数据data <- structure(list(est_time = c("15 Minutes", "20 mins", "40", "15",
"Depends where you live", "7-10 minutes ", "15-20", "2-4 hours",
"30 min")), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-9L))