在学校使用R几个月后,我最近学习了tidymodels。
我试图在Kaggle上使用泰坦尼克号数据集制作我的第一个模型,但在拟合模型时遇到了一些问题。有人能帮我吗?
# Preprocessing recipe from the question: predicts Survived from six predictors.
titanic_rec <- recipe(Survived ~ Sex + Age + Pclass + Embarked + Family_Size + Name, data = titanic_train) %>%
# NOTE(review): this is the step the traceback below fails in
# (prep.step_impute_knn); the accompanying answer says the neighbor count
# must be passed as `neighbors`, not `k`.
step_impute_knn(all_predictors(), k = 3) %>%
# NOTE(review): the answer advises against using Name as a predictor, since
# dummy-encoding it yields one column per distinct name.
step_dummy(Sex, Pclass, Embarked, Family_Size, Name) %>%
# NOTE(review): per the answer, Sex and Pclass no longer exist as columns after
# step_dummy(), so these interaction terms cannot be resolved as written.
step_interact(~ Sex:Age + Sex:Pclass + Pclass:Age)
# Logistic regression specification using the "glm" engine in classification mode.
log_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
# Bundle the model and the recipe into a workflow, fit it on the training data,
# then pull out the underlying model fit and pass it to tidy().
fitted_log_model <- workflow() %>%
add_model(log_model) %>%
add_recipe(titanic_rec) %>%
fit(data = titanic_train) %>%
pull_workflow_fit() %>%
tidy()
除了Age和Survived是double类型之外,每个特征都是因子(factor)类型。当我加入fit(data = ...)时,错误似乎就出现了。
Error: Can't rename variables in this context. Run `rlang::last_error()` to see where the error occurred.
24.
stop(fallback)
23.
signal_abort(cnd)
22.
abort("Can't rename variables in this context.")
21.
eval_select_recipes(to_impute, training, info)
20.
impute_var_lists(to_impute = x$terms, impute_using = x$impute_with, training = training, info = info)
19.
prep.step_impute_knn(x$steps[[i]], training = training, info = x$term_info)
18.
prep(x$steps[[i]], training = training, info = x$term_info)
17.
prep.recipe(blueprint$recipe, training = data, fresh = blueprint$fresh)
16.
recipes::prep(blueprint$recipe, training = data, fresh = blueprint$fresh)
15.
blueprint$mold$process(blueprint = blueprint, data = data)
14.
run_mold.recipe_blueprint(blueprint, data)
13.
run_mold(blueprint, data)
12.
mold.recipe(recipe, data, blueprint = blueprint)
11.
hardhat::mold(recipe, data, blueprint = blueprint)
10.
fit.action_recipe(action, workflow = workflow, data = data)
9.
fit(action, workflow = workflow, data = data)
8.
.fit_pre(workflow, data)
7.
fit.workflow(., data = titanic_train)
6.
fit(., data = titanic_train)
5.
is_workflow(x)
4.
validate_is_workflow(x)
3.
pull_workflow_fit(.)
2.
tidy(.)
1.
workflow() %>% add_model(log_model) %>% add_recipe(titanic_rec) %>% fit(data = titanic_train) %>% pull_workflow_fit() %>% tidy()
所贴出的错误来自`step_impute_knn()`,其中邻居的数量应通过参数`neighbors`指定(而不是`k`)。其次,我建议不要使用`Name`作为预测变量,因为它会为每个不同的名字创建一个单独的虚拟变量(dummy variable),这会影响拟合。
最后一个错误出现在`step_interact()`中。不能在`step_dummy(Sex)`之后使用`step_interact(~ Sex:Age)`,因为在`step_dummy()`之后不会再有任何名为`Sex`的列;取而代之的是`Sex_male`列(因为`female`是截距的一部分)。捕获所有新创建的虚拟变量的一种方法是在`step_interact()`中使用`starts_with()`。
library(tidymodels)

# Read the training data and derive the modelling columns in one pipeline:
# Survived and Pclass become factors, and Family_Size is SibSp + Parch + 1.
titanic_train <- readr::read_csv("your/path/to/data/train.csv") %>%
  mutate(
    Survived = factor(Survived),
    Pclass = factor(Pclass),
    Family_Size = SibSp + Parch + 1
  )

# Model specification: logistic regression, "glm" engine, classification mode.
log_model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# Preprocessing recipe. Name is intentionally not a predictor here.
titanic_rec <- recipe(
  Survived ~ Sex + Age + Pclass + Embarked + Family_Size,
  data = titanic_train
) %>%
  # The number of neighbours is passed via `neighbors`.
  step_impute_knn(all_predictors(), neighbors = 3) %>%
  step_dummy(Sex, Pclass, Embarked) %>%
  # After step_dummy() the original Sex/Pclass columns are gone, so the
  # interactions select the generated dummy columns by prefix instead.
  step_interact(
    ~ starts_with("Sex_"):Age +
      starts_with("Sex_"):starts_with("Pclass_") +
      starts_with("Pclass_"):Age
  )

# Combine model and recipe, fit on the training data, then extract the
# underlying fit and tidy it into a table of terms.
fitted_log_model <- workflow() %>%
  add_model(log_model) %>%
  add_recipe(titanic_rec) %>%
  fit(data = titanic_train) %>%
  pull_workflow_fit() %>%
  tidy()

fitted_log_model
#> # A tibble: 13 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 3.85 0.921 4.18 0.0000289
#> 2 Age 0.0117 0.0226 0.516 0.606
#> 3 Family_Size -0.226 0.0671 -3.36 0.000769
#> 4 Sex_male -2.22 0.886 -2.50 0.0124
#> 5 Pclass_X2 1.53 1.16 1.31 0.189
#> 6 Pclass_X3 -2.42 0.884 -2.74 0.00615
#> 7 Embarked_Q -0.0461 0.368 -0.125 0.900
#> 8 Embarked_S -0.548 0.243 -2.26 0.0241
#> 9 Sex_male_x_Age -0.0488 0.0199 -2.46 0.0140
#> 10 Sex_male_x_Pclass_X2 -1.28 0.879 -1.46 0.144
#> 11 Sex_male_x_Pclass_X3 1.48 0.699 2.11 0.0347
#> 12 Age_x_Pclass_X2 -0.0708 0.0263 -2.69 0.00714
#> 13 Age_x_Pclass_X3 -0.0341 0.0209 -1.63 0.103
由reprex软件包(v2.0.0)于2021-07-01创建