我有一个数据帧df有多个列,其中我想清理一些定价列。数据框如下所示:
Col1(char) Col2(char) Col3(char) Col4(char)
CST $ 128,412.00 $ 0.034 +149.628%
FSD $ 138,232.40 $ 0.023 +124.244%
SDD $ 112,234.45 $ 0.023 -123.324%
但是,我希望输出如下所示:
Col1(char) Col2(num) Col3(num) Col4(num)
CST 128412.00 0.034 1.49628
FSD 138232.40 0.023 1.24244
SDD 112234.45 0.023 -1.23324
如何将Col2 - Col4转换为尽可能优雅的数字列?谢谢你!
dat <- structure(list(Col1 = c("CST", "FSD", "SDD"), Col2 = c("$ 128,412.00",
"$ 138,232.40", "$ 112,234.45"), Col3 = c("$ 0.034", "$ 0.023",
"$ 0.023"), Col4 = c("+149.628%", "+124.244%", "-123.324%")),
class = "data.frame", row.names = c(NA, -3L))
# Col1 Col2 Col3 Col4
#1 CST $ 128,412.00 $ 0.034 +149.628%
#2 FSD $ 138,232.40 $ 0.023 +124.244%
#3 SDD $ 112,234.45 $ 0.023 -123.324%
要将除列1以外的所有列转换为数字,可以执行
tonum <- function (x) {
## delete "$", "," and "%" and convert string to numeric
num <- as.numeric(gsub("[$,%]", "", x))
## watch out for "%", that is, 90% should be 90 / 100 = 0.9
if (grepl("%", x[1])) num <- num / 100
## return
num
}
dat[-1] <- lapply(dat[-1], tonum)
dat
# Col1 Col2 Col3 Col4
#1 CST 128412.0 0.034 1.49628
#2 FSD 138232.4 0.023 1.24244
#3 SDD 112234.4 0.023 -1.23324
备注:
我刚从PaulS的回答中得知readr::parse_number()
。这是一个有趣的函数。基本上,它删除了所有不能成为数字有效部分的东西。作为实践,我使用REGEX实现相同的逻辑。这是一个通用的tonum()
。
tonum <- function (x, regex = TRUE) {
## drop everything that is not "+/-", "0-9" or "."
## then convert string to numeric
if (regex) {
num <- as.numeric(stringr::str_remove_all(x, "[^+\-0-9\.]*"))
} else {
num <- readr::parse_number(x)
}
## watch out for "%", that is, 90% should be 90 / 100 = 0.9
ind <- grepl("%", x)
num[ind] <- num[ind] / 100
## return
num
}
下面是一个快速测试:
x <- unlist(dat[-1], use.names = FALSE)
x <- c(x, "euro 300.95", "RMB 888.66", "£1999.98")
# [1] "$ 128,412.00" "$ 138,232.40" "$ 112,234.45" "$ 0.034" "$ 0.023"
# [6] "$ 0.023" "+149.628%" "+124.244%" "-123.324%" "euro 300.95"
#[11] "RMB 888.66" "£1999.98"
tonum(x, regex = TRUE)
# [1] 128412.00000 138232.40000 112234.45000 0.03400 0.02300
# [6] 0.02300 1.49628 1.24244 -1.23324 300.95000
#[11] 888.66000 1999.98000
tonum(x, regex = FALSE)
# [1] 128412.00000 138232.40000 112234.45000 0.03400 0.02300
# [6] 0.02300 1.49628 1.24244 -1.23324 300.95000
#[11] 888.66000 1999.98000
另一种可能的解决方案,基于readr::parse_number
(使用@ ZheyuanLi的数据,我感谢他):
library(tidyverse)
dat %>%
mutate(across(-1, ~ parse_number(.x)),
Col4 = Col4 / 100)
#> Col1 Col2 Col3 Col4
#> 1 CST 128412.0 0.034 1.49628
#> 2 FSD 138232.4 0.023 1.24244
#> 3 SDD 112234.4 0.023 -1.23324
使用tidyverse的另一种方法
library(dplyr)
library(stringr)
# generating Col5, Col6 same as Col4, just for demo
dat <- data.frame(
stringsAsFactors = FALSE,
Col1 = c("CST", "FSD", "SDD"),
Col2 = c("$ 128,412.00", "$ 138,232.40", "$ 112,234.45"),
Col3 = c("$ 0.034", "$ 0.023", "$ 0.023"),
Col4 = c("+149.628%", "+124.244%", "-123.324%"),
Col5 = c("+149.628%", "+124.244%", "-123.324%"),
Col6 = c("+149.628%", "+124.244%", "-123.324%")
)
dat %>%
mutate(
across(Col2:Col6, ~ as.numeric(str_remove_all(.x, pattern = "[$, +%]"))),
across(Col4:Col6, ~ .x/100)
)
#> Col1 Col2 Col3 Col4 Col5 Col6
#> 1 CST 128412.0 0.034 1.49628 1.49628 1.49628
#> 2 FSD 138232.4 0.023 1.24244 1.24244 1.24244
#> 3 SDD 112234.4 0.023 -1.23324 -1.23324 -1.23324
由reprex包(v2.0.1)在2022-07-12创建