r语言 - 如何处理只有一个值的列?



如何添加一个步骤,以删除取值恒定(只有一个值)的列?

我遇到了一个相关的问题,因此参考了上面那篇帖子。我已经在 recipe 中使用了 step_zv(),但仍然得到如下错误:bake() 报错,列 'X33' 中只有一个因子水平:"TRUE"。

library(tidymodels)  
library(readr)       
library(broom.mixed) 
library(dotwhisker)  
library(skimr)           
library(rpart.plot)  
library(vip)    
library(glmnet)
library(naniar) 
library(tidyr)
library(dplyr)
library(textrecipes)
# Data cleaning ----
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
skool <- read_csv("/Users/riddhimaagupta/Desktop/log1.csv")

# Drop the columns that are excluded from the analysis.
skool_v1 <- select(
  skool,
  -c(
    ...1, id, npsn, public, cert_est, cert_ops, name_clean, name,
    muh1, muh2, muh, chr1, chr2, chr3, chr, hindu,
    nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international
  )
)

# Remove rows whose outcome `afiliasi` is coded 99.
skool_v2 <- filter(skool_v1, afiliasi != 99)

# Recode placeholder entries ("-" and "0") as NA.
skool_v2.1 <- replace_with_na(skool_v2, replace = list(village = "-"))
skool_v2.2 <- replace_with_na(skool_v2.1, replace = list(area = "0"))
skool_v2.3 <- replace_with_na(skool_v2.2, replace = list(date_est = "-"))

skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = "%Y-%m-%d")
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = "%Y-%m-%d")

# Keep only the text between the last "[" and the first "]" of `latlon`.
# Brackets are regex metacharacters, and inside an R string the escaping
# backslash must itself be doubled; a single backslash before a bracket is an
# unrecognized string escape and the script would not even parse.
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)

# Split the "lat,lon" text into two numeric columns.
skool_v2.4 <- skool_v2.3 %>%
  separate(latlon, into = c("latitude", "longitude"), sep = ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)

# Lower-case every character column, then store it as a factor.
skool_v3 <- skool_v2.4 %>%
  mutate(across(where(is.character), function(x) as.factor(tolower(x))))

# Logical columns become factors as well. A logical column that holds a single
# value therefore ends up as a one-level factor.
skool_v4 <- skool_v3 %>%
  mutate(across(where(is.logical), as.factor))
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)

# Data splitting ----
# Hold out a test set, stratified on the outcome.
set.seed(123)
splits <- initial_split(skool_v4, strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)

# Carve the validation set out of the training data only. Resampling the full
# data set here would let rows of `school_test` take part in tuning, so the
# test set would no longer be an untouched hold-out.
# NOTE(review): validation_split() is reported as deprecated in recent rsample
# releases in favour of initial_validation_split()/validation_set(); confirm
# against the installed version before migrating.
set.seed(234)
val_set <- validation_split(
  school_train,
  strata = afiliasi,
  prop = 0.80
)
# Penalised multinomial regression ----
# Elastic-net specification (mixture = 0.5) on the glmnet engine; the penalty
# is left as a tuning parameter.
# NOTE(review): the heading says "multinomial" but the spec is logistic_reg();
# confirm how many classes `afiliasi` has.
lr_mod <- logistic_reg(penalty = tune(), mixture = 0.5) %>%
  set_engine("glmnet")

# Pre-processing. step_dummy() runs before any zero-variance filter here, so a
# nominal predictor with a single level still reaches it -- this ordering is
# what triggers the "Only one factor level" error reported for this script.
lr_recipe <- recipe(afiliasi ~ ., data = school_train) %>%
  step_date(date_est, date_ops) %>%
  step_rm(date_est, date_ops) %>%
  textrecipes::step_clean_levels(village) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())

# Bundle the model and the recipe.
lr_workflow <- workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lr_recipe)

# 30 candidate penalties, log-spaced between 1e-4 and 1e-1.
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
top_n(lr_reg_grid, n = -5) # five smallest penalties
top_n(lr_reg_grid, n = 5) # five largest penalties

# Tune over the grid on the validation resample, scoring by ROC AUC and
# keeping the out-of-sample predictions.
lr_res <- tune_grid(
  lr_workflow,
  resamples = val_set,
  grid = lr_reg_grid,
  control = control_grid(save_pred = TRUE, verbose = TRUE),
  metrics = metric_set(roc_auc)
)

控制台显示

x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column. 

这个错误来自 step_dummy(),因为变量 X33 只有一个因子水平 "TRUE"。针对你的问题,最简单的处理方法是在 step_dummy() 之前,先对名义型预测变量使用 step_zv()。

这样,你的 recipe 就会变成:

# Recipe with a zero-variance filter placed ahead of the dummy encoding.
lr_recipe <- recipe(afiliasi ~ ., data = school_train) %>%
  # Derive date features, then drop the raw date columns.
  step_date(date_est, date_ops) %>%
  step_rm(date_est, date_ops) %>%
  textrecipes::step_clean_levels(village) %>%
  # Remove single-level nominal predictors so step_dummy() never sees them.
  step_zv(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  # Second pass: drop any predictor that is constant after encoding.
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())

下面的可复现示例展示了实际发生的情况:

library(recipes)

# Two extra columns: `fac1` holds a single value, `fac2` alternates "a"/"b".
mtcars[["fac1"]] <- "h"
mtcars[["fac2"]] <- rep(c("a", "b"), length.out = nrow(mtcars))

# Without a zero-variance filter, step_dummy() fails on the one-level column.
recipe(mpg ~ ., data = mtcars) %>%
  step_dummy(all_nominal_predictors()) %>%
  prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h

# Filtering the nominal predictors first removes `fac1` before encoding.
recipe(mpg ~ ., data = mtcars) %>%
  step_zv(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  prep()
#> Recipe
#> 
#> Inputs:
#> 
#>       role #variables
#>    outcome          1
#>  predictor         12
#> 
#> Training data contained 32 data points and no missing data.
#> 
#> Operations:
#> 
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]

以下是用 base R 删除常量列的做法,以 mtcars 为例:

# Add a column with only one value
mtcars$constant_col <- 1

# Remove every column that holds exactly one distinct value.
# vapply() pins the result to a logical vector; sapply() would return an empty
# list for a data frame without columns, which cannot be used as an index.
mtcars[
  vapply(mtcars, function(col) length(unique(col)) == 1L, logical(1))
] <- NULL

相关内容

  • 没有找到相关文章

最新更新