在R中,将多列整数转换为因子的最佳方法是什么



这适用于一个玩具示例,但我认为对于较大的数据帧,必须有更好的方法(更快、更少的内存(。欢迎提出任何建议!

library(tidyverse)
library(tictoc)
cyl <- tibble(integer_value = unique(mtcars$cyl), 
as_a_string = paste(unique(mtcars$cyl), " cylinders"))%>%
mutate(variable = "cyl")
gear <- tibble(integer_value = unique(mtcars$gear), 
as_a_string = paste(unique(mtcars$cyl), " gears"))%>%
mutate(variable = "gear")
carb <- tibble(integer_value = unique(mtcars$carb), 
as_a_string = paste(unique(mtcars$carb)," carburetors"))%>%
mutate(variable = "carb")
vs <- tibble(integer_value = unique(mtcars$vs), 
as_a_string = c("V shaped", "straight"))%>%
mutate(variable = "vs")
am <- tibble(integer_value = unique(mtcars$vs), 
as_a_string = c("Automatic", "Manual"))%>%
mutate(variable = "am")
factor_info <- rbind(cyl,gear,carb,vs,am)%>%
select(variable,everything())
df <- mtcars
tic()
for(var in unique(factor_info$variable)){
col <- mtcars%>%
select(all_of(var))%>%
mutate(variable = all_of(var))%>%
rename(integer_value = all_of(var))
fac <- factor_info%>%
filter(variable == all_of(var))
df[[all_of(var)]] <- inner_join(col, fac)%>%
select(as_a_string)%>%
pull()
}
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
df <- df%>%
as_tibble() %>%
mutate(across(where(is.character), factor))
toc()
#> 0.172 sec elapsed

创建于2022-02-25由reprex包(v2.0.1(

我没有足够的rep来添加注释。以下是TarJae的答案下的另一个变体。使用purrr::modify_if:

library(tidyverse)
mtcars %>%
modify_if(is.numeric, as.factor) %>%
str()
#> 'data.frame':    32 obs. of  11 variables:
#>  $ mpg : Factor w/ 25 levels "10.4","13.3",..: 16 16 19 17 13 12 3 20 19 14 ...
#>  $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
#>  $ disp: Factor w/ 27 levels "71.1","75.7",..: 13 13 6 16 23 15 23 12 10 14 ...
#>  $ hp  : Factor w/ 22 levels "52","62","65",..: 11 11 6 11 15 9 20 2 7 13 ...
#>  $ drat: Factor w/ 22 levels "2.76","2.93",..: 16 16 15 5 6 1 7 11 17 17 ...
#>  $ wt  : Factor w/ 29 levels "1.513","1.615",..: 9 12 7 16 18 19 21 15 13 18 ...
#>  $ qsec: Factor w/ 30 levels "14.5","14.6",..: 6 10 22 24 10 29 5 27 30 19 ...
#>  $ vs  : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
#>  $ am  : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
#>  $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
#>  $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ...

创建于2022-02-26由reprex包(v2.0.1(

新答案删除了第一个答案。我想你需要forcats包中的fct_relabel:经过0.04秒:

library(forcats)
library(dplyr)
library(tictoc)
tic()
mtcars %>% 
mutate(cyl = fct_relabel(as.factor(cyl), ~paste(unique(cyl), as.character("cylinder"))),
vs = fct_relabel(as.factor(vs), ~paste(unique(vs), as.character(c("V shaped", "straight")))),
am = fct_relabel(as.factor(am), ~paste(unique(am), as.character(c("Automatic", "Manual")))),
gear = fct_relabel(as.factor(gear), ~paste(unique(gear), as.character("gears"))),
carb = fct_relabel(as.factor(carb), ~paste(unique(carb), as.character("carburetors")))) 
toc()

输出:

mpg cyl         disp    hp  drat    wt  qsec vs         am          gear    carb         
<dbl> <fct>      <dbl> <dbl> <dbl> <dbl> <dbl> <fct>      <fct>       <fct>   <fct>        
1  21   4 cylinder  160    110  3.9   2.62  16.5 0 V shaped 0 Manual    3 gears 3 carburetors
2  21   4 cylinder  160    110  3.9   2.88  17.0 0 V shaped 0 Manual    3 gears 3 carburetors
3  22.8 6 cylinder  108     93  3.85  2.32  18.6 1 straight 0 Manual    3 gears 4 carburetors
4  21.4 4 cylinder  258    110  3.08  3.22  19.4 1 straight 1 Automatic 4 gears 4 carburetors
5  18.7 8 cylinder  360    175  3.15  3.44  17.0 0 V shaped 1 Automatic 4 gears 1 carburetors
6  18.1 4 cylinder  225    105  2.76  3.46  20.2 1 straight 1 Automatic 4 gears 4 carburetors
7  14.3 8 cylinder  360    245  3.21  3.57  15.8 0 V shaped 1 Automatic 4 gears 3 carburetors
8  24.4 6 cylinder  147.    62  3.69  3.19  20   1 straight 1 Automatic 3 gears 1 carburetors
9  22.8 6 cylinder  141.    95  3.92  3.15  22.9 1 straight 1 Automatic 3 gears 1 carburetors
10  19.2 4 cylinder  168.   123  3.92  3.44  18.3 1 straight 1 Automatic 3 gears 3 carburetors
# ... with 22 more rows
> toc()
0.04 sec elapsed

xinteger类型且length(x)较大时,as.factor(x)factor(x)更快且更高效。mtcars中的分类变量为整数值,但存储为double:

nms <- c("cyl", "vs", "am", "gear", "carb")
vapply(mtcars[nms], typeof, "")
##      cyl       vs       am     gear     carb 
## "double" "double" "double" "double" "double"

在这种情况下,您可以强制使用进行有效的因素

dd <- mtcars
dd[nms] <- lapply(dd[nms], function(x) as.factor(as.integer(x)))

然后使用有效地重命名级别

lvs <- list(cyl = paste(levels(dd$cyl), "cylinders"),
vs = c("V-shaped", "straight"),
am = c("automatic", "manual"),
gear = paste(levels(dd$gear), "gears"),
carb = paste(levels(dd$carb), "carburetors"))
for (nm in nms) levels(dd[[nm]]) <- lvs[[nm]]
head(dd[nms])
##                           cyl       vs        am    gear          carb
## Mazda RX4         6 cylinders V-shaped    manual 4 gears 4 carburetors
## Mazda RX4 Wag     6 cylinders V-shaped    manual 4 gears 4 carburetors
## Datsun 710        4 cylinders straight    manual 4 gears 1 carburetors
## Hornet 4 Drive    6 cylinders straight automatic 3 gears 1 carburetors
## Hornet Sportabout 8 cylinders V-shaped automatic 3 gears 2 carburetors
## Valiant           6 cylinders straight automatic 3 gears 1 carburetors
head(mtcars[nms])
##                   cyl vs am gear carb
## Mazda RX4           6  0  1    4    4
## Mazda RX4 Wag       6  0  1    4    4
## Datsun 710          4  1  1    4    1
## Hornet 4 Drive      6  1  0    3    1
## Hornet Sportabout   8  0  0    3    2
## Valiant             6  1  0    3    1

FWIW,在我的机器上,整个操作不到一毫秒。

## Unit: microseconds
##  expr     min     lq     mean median      uq      max neval
##  expr 212.954 217.71 289.4744 230.42 236.693 6116.749   100

相关内容

最新更新