R语言 - 使用 {tidymodels} 运行预测时报错:无法对不存在的列进行子集化 (Can't subset columns that don't exist)



我试图在 R 中用 tidymodels 预测房地产价格。我参照了这个教程。一切都很顺利,直到我尝试对测试数据进行预测。

请参阅下面的代码示例和最后的错误。

我看了两个类似的问题(这里和这里),但我似乎已经定义了变量角色(variable roles),并且提供给工作流(workflow)的也是一个未经 prep 的配方(recipe)。

# libraries ---------------------------------------------------------------
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom     0.7.3      ✓ recipes   0.1.15
#> ✓ dials     0.0.9      ✓ rsample   0.0.8 
#> ✓ dplyr     1.0.3      ✓ tibble    3.0.5 
#> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2 
#> ✓ infer     0.5.4      ✓ tune      0.1.2 
#> ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
#> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
#> ✓ purrr     0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()
library(data.table)

library(purrr)


# data --------------------------------------------------------------------
# 're' is short for real estate.
# The rows are written with tribble for readability, then converted to a
# data.table, which is what the rest of this script works with.
real_estate_data <- as.data.table(
  tibble::tribble(
    ~re_id,     ~price_per_sqm_huf_mil, ~district, ~num_room,
    "30876343", 0.534722222222222,      1,         3,
    "31914489", 0.476119402985075,      1,         1,
    "30972289", 0.507352941176471,      1,         2,
    "31739730", 0.472972972972973,      1,         3,
    "31783137", 0.49875,                2,         3,
    "31809435", 0.439705882352941,      2,         2,
    "31943408", 0.469117647058824,      2,         3,
    "31944348", 0.56231884057971,       2,         1,
    "31961146", 0.472972972972973,      3,         3,
    "24314388", 0.649550561797753,      3,         2,
    "29840270", 0.719178082191781,      3,         3,
    "29840429", 0.719178082191781,      3,         3,
    "30873484", 0.822857142857143,      4,         3,
    "30969673", 0.533802816901408,      4,         3,
    "31333120", 0.741511627906977,      4,         3,
    "31788730", 0.527142857142857,      4,         2,
    "31948441", 0.734848484848485,      5,         2,
    "31962350", 0.8,                    5,         3,
    "31962779", 0.670454545454545,      5,         3,
    "31979128", 0.689054054054054,      5,         1
  )
)

# Convert district to a factor in place (data.table by-reference update).
real_estate_data[, district := factor(district)]

# train/test split --------------------------------------------------------
# Fix the RNG seed so the random partition is reproducible.
set.seed(123)
re_split <- real_estate_data %>% initial_split()
re_train <- re_split %>% training()
re_test <- re_split %>% testing()

# workflow (w/ recipe) ----------------------------------------------------
# Recipe built on the training split: price_per_sqm_huf_mil is the outcome and
# every other column starts out as a predictor. re_id is then given the "ID"
# role (see the roles printed by summary() below).
re_rec <- recipe(re_train,
formula = price_per_sqm_huf_mil ~ .) %>%
update_role(re_id, new_role = "ID") %>%
# NOTE(review): all_numeric() is not limited to predictors, so this step also
# selects the numeric outcome price_per_sqm_huf_mil. Transforming the outcome
# is what makes predict() fail on new data that lacks that column.
step_center(all_numeric(), - district) %>%
# NOTE(review): comma-separated selectors are presumably combined as a union,
# in which case the outcome is scaled here too - confirm against the recipes
# selection documentation.
step_scale(all_predictors(), all_numeric(), - district) %>%
# Dummy-encode the district factor, then drop zero-variance predictors.
step_dummy(district) %>%
step_zv(all_predictors())

# Print the variable/type/role table of the (unprepped) recipe.
summary(re_rec)
#> # A tibble: 4 x 4
#>   variable              type    role      source  
#>   <chr>                 <chr>   <chr>     <chr>   
#> 1 re_id                 nominal ID        original
#> 2 district              nominal predictor original
#> 3 num_room              numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome   original
# Linear regression specification, fitted through the "lm" engine.
lr_model <- set_engine(linear_reg(), "lm")

# Bundle the model specification and the unprepped recipe into one workflow.
re_wflow <- workflow() %>%
  add_model(lr_model) %>%
  add_recipe(re_rec)

# model training and prediction -------------------------------------------
# Fit the workflow (recipe + linear model) on the training split.
re_fit <-
re_wflow %>%
fit(data = re_train)

# Predict on the test split; this is the call that raises the error below.
re_pred <- predict(re_fit, re_test)
#> Error: Can't subset columns that don't exist.
#> x Column `price_per_sqm_huf_mil` doesn't exist.

由reprex包(v0.3.0)在2021-01-25创建

多谢!

这里的问题在于,您用 step_center() 转换了结果变量(price_per_sqm_huf_mil),而预测时的新数据中并没有结果变量这一列。您可以改为只对同时满足 all_predictors() 和 all_numeric() 的列进行中心化,像这样:

library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom     0.7.3      ✓ recipes   0.1.15
#> ✓ dials     0.0.9      ✓ rsample   0.0.8 
#> ✓ dplyr     1.0.3      ✓ tibble    3.0.5 
#> ✓ ggplot2   3.3.3      ✓ tidyr     1.1.2 
#> ✓ infer     0.5.4      ✓ tune      0.1.2 
#> ✓ modeldata 0.1.0      ✓ workflows 0.2.1 
#> ✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
#> ✓ purrr     0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()
library(dplyr)
# Toy real-estate listings: id, price per square metre (million HUF),
# district number and room count.
real_estate_data <- tibble::tribble(
  ~re_id,     ~price_per_sqm_huf_mil, ~district, ~num_room,
  "30876343", 0.534722222222222,      1,         3,
  "31914489", 0.476119402985075,      1,         1,
  "30972289", 0.507352941176471,      1,         2,
  "31739730", 0.472972972972973,      1,         3,
  "31783137", 0.49875,                2,         3,
  "31809435", 0.439705882352941,      2,         2,
  "31943408", 0.469117647058824,      2,         3,
  "31944348", 0.56231884057971,       2,         1,
  "31961146", 0.472972972972973,      3,         3,
  "24314388", 0.649550561797753,      3,         2,
  "29840270", 0.719178082191781,      3,         3,
  "29840429", 0.719178082191781,      3,         3,
  "30873484", 0.822857142857143,      4,         3,
  "30969673", 0.533802816901408,      4,         3,
  "31333120", 0.741511627906977,      4,         3,
  "31788730", 0.527142857142857,      4,         2,
  "31948441", 0.734848484848485,      5,         2,
  "31962350", 0.8,                    5,         3,
  "31962779", 0.670454545454545,      5,         3,
  "31979128", 0.689054054054054,      5,         1
)

# district is a category label, not a quantity, so store it as a factor.
real_estate_data <- mutate(real_estate_data, district = factor(district))

# Seed the RNG so the random train/test partition is reproducible.
set.seed(123)
re_split <- real_estate_data %>% initial_split()
re_train <- re_split %>% training()
re_test <- re_split %>% testing()
# Preprocessing recipe: price_per_sqm_huf_mil is the outcome and every other
# column is a predictor, except re_id, which is moved to an "ID" role.
re_rec <-
  recipe(price_per_sqm_huf_mil ~ ., data = re_train) %>%
  update_role(re_id, new_role = "ID") %>%
  # Intersect the selectors so that only numeric *predictors* are centered and
  # scaled; the outcome is left untouched.
  step_center(all_predictors() & all_numeric()) %>%
  step_scale(all_predictors() & all_numeric()) %>%
  # Dummy-encode the district factor, then drop zero-variance predictors.
  step_dummy(district) %>%
  step_zv(all_predictors())

# Show the variable/type/role table of the unprepped recipe.
summary(re_rec)
#> # A tibble: 4 x 4
#>   variable              type    role      source  
#>   <chr>                 <chr>   <chr>     <chr>   
#> 1 re_id                 nominal ID        original
#> 2 district              nominal predictor original
#> 3 num_room              numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome   original
# Linear regression specification using the "lm" engine.
lr_model <- linear_reg() %>% set_engine("lm")

# Workflow combining the model specification with the unprepped recipe.
re_wflow <- add_recipe(add_model(workflow(), lr_model), re_rec)
# Fit the workflow (recipe + model) on the training split.
re_fit <- fit(re_wflow, data = re_train)

# Predict on the test split; this now succeeds.
predict(re_fit, new_data = re_test)
#> # A tibble: 5 x 1
#>   .pred
#>   <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768

由reprex包(v0.3.0)在2021-01-25创建

被这个问题绊倒的远不止您一个人,所以我们正在添加一组新的选择器,很快就会合并。如果您确实想对结果变量进行转换,另一种可以考虑的做法是在该步骤中设置 skip = TRUE。

相关内容

  • 没有找到相关文章

最新更新