r语言 - 创建Recipes并动态传递列名



我有一个函数,它只创建一对食谱对象。问题是,在函数内部,我必须重命名通过的data.frame/tibble的列,以便我可以使recipes

我不想这样做,因为显而易见的原因,主要是,列名必须是data.frame本身中的内容,否则它们将无法工作。

简单的例子:

library(tidyverse)
data_tbl <- tibble(
visit_date = seq(
from = as.Date("2021-01-01"), 
to   = as.Date("2021-10-15"),
by = 7,
),
visits = rnbinom(
n = 42,
size = 100,
mu = 66
)
)
ts_auto_recipe <- function(.data, .date_col, .pred_col){

# * Tidyeval ----
date_col_var <- rlang::enquo(.date_col)
pred_col_var <- rlang::enquo(.pred_col)

# * Checks ----
if(!is.data.frame(.data)){
stop(call. = FALSE, "You must supply a data.frame/tibble.")
}

if(rlang::quo_is_missing(date_col_var)){
stop(call. = FALSE, "The (.date_col) must be supplied.")
}

if(rlang::quo_is_missing(pred_col_var)){
stop(call. = FALSE, "The (.pred_col) must be supplied.")
}

# * Data ----
data_tbl <- tibble::as_tibble(.data)

data_tbl <- data_tbl %>%
dplyr::select(
{{ date_col_var }}, {{ pred_col_var }}, dplyr::everything()
) %>%
dplyr::rename(
date_col    = {{ date_col_var }}
, value_col = {{ pred_col_var }}
)

# * Recipe Objects ----
# ** Base recipe ----
rec_base_obj <- recipes::recipe(
formula = date_col ~ . # I have to do the above so I can do this, which I don't like
, data = data_tbl
)

# * Add Steps ----
# ** ts signature and normalize ----
rec_date_obj <- rec_base_obj %>%
timetk::step_timeseries_signature(date_col) %>%
recipes::step_normalize(
dplyr::contains("index.num")
, dplyr::contains("date_col_year")
)

# * Recipe List ----
rec_lst <- list(
rec_base = rec_base_obj,
rec_date = rec_date_obj
)

# * Return ----
return(rec_lst)

}
rec_objs <- ts_auto_recipe(data_tbl, visit_date, visits)

我这样做的原因是因为我不能在配方函数本身内部使用动态名称,所以像rlang::sym(names(data_tbl)[[1]])这样的东西不起作用,也不会像data_tbl[[1]]这样的东西。我正在考虑使用step_rename()之类的东西,但这需要您提前知道名称,并且它不能是配方步骤中的变量。但是你可以将变量传递给timetk::step_time_series_signature

我能想到的唯一一件事就是强迫用户使用特定的列名,就像Facebook的先知R库dsy

我还注意到,当我运行rec_objs时,我得到了一些奇怪的输出到终端,我得到以下内容:

> rec_objs
$rec_base
Recipe
Inputs:
role #variables
outcome          1
predictor          1
$rec_date
Recipe
Inputs:
role #variables
outcome          1
predictor          1
Operations:
Timeseries signature features from date_col
Centering and scaling for dplyr::contains("ÿþindex.numÿþ"), dplyr::contains("ÿþdate_col...

然而当我这样做的时候:

> rec_objs[[2]]
Recipe
Inputs:
role #variables
outcome          1
predictor          1
Operations:
Timeseries signature features from date_col
Centering and scaling for dplyr::contains("index.num"), dplyr::contains("date_col_year")

不可能。

谢谢你,

我想我已经找到了解决这个问题的方法,参见以下自定义函数:

ts_auto_recipe_b <- function(.data
, .date_col
, .pred_col
, .step_ts_sig = TRUE
, .step_ts_rm_misc = TRUE
, .step_ts_dummy = TRUE
, .step_ts_fourier = TRUE
, .step_ts_fourier_period = 1
, .K = 1
, .step_ts_yeo = TRUE
, .step_ts_nzv = TRUE) {

# * Tidyeval ----
date_col_var_expr      <- rlang::enquo(.date_col)
pred_col_var_expr      <- rlang::enquo(.pred_col)
step_ts_sig            <- .step_ts_sig
step_ts_rm_misc        <- .step_ts_rm_misc
step_ts_dummy          <- .step_ts_dummy
step_ts_fourier        <- .step_ts_fourier
step_ts_fourier_k      <- .K
step_ts_fourier_period <- .step_ts_fourier_period
step_ts_yeo            <- .step_ts_yeo
step_ts_nzv            <- .step_ts_nzv

# * Checks ----
if(!is.data.frame(.data)){
stop(call. = FALSE, "You must supply a data.frame/tibble.")
}

if(rlang::quo_is_missing(date_col_var_expr)){
stop(call. = FALSE, "The (.date_col) must be supplied.")
}

if(rlang::quo_is_missing(pred_col_var_expr)){
stop(call. = FALSE, "The (.pred_col) must be supplied.")
}

# * Data ----
data_tbl <- tibble::as_tibble(.data)

data_tbl <- data_tbl %>%
dplyr::select(
{{ date_col_var_expr }}
, {{ pred_col_var_expr }}
, dplyr::everything()
) 
# %>%
#   dplyr::rename(
#     date_col    = {{ date_col_var_expr }}
#     , value_col = {{ pred_col_var_expr }}
#   )

# Original Col names ----
ds <- rlang::sym(names(data_tbl)[[1]])
v  <- rlang::sym(names(data_tbl)[[2]])
f <- as.formula(paste(v, " ~ ."))

# * Recipe Objects ----
# ** Base recipe ----
rec_base_obj <- recipes::recipe(
formula = f
, data = data_tbl
)

# * Add Steps ----
# ** ts signature and normalize ----
if(step_ts_sig){
rec_date_obj <- rec_base_obj %>%
timetk::step_timeseries_signature(ds) %>%
recipes::step_normalize(
dplyr::contains("index.num")
, dplyr::contains("date_col_year")
)
}

# ** Step rm ----
if(step_ts_rm_misc){
rec_date_obj <- rec_date_obj %>%
recipes::step_rm(dplyr::matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)"))
}

# ** Step Dummy ----
if(step_ts_dummy){
rec_date_obj <- rec_date_obj %>%
recipes::step_dummy(recipes::all_nominal_predictors(), one_hot = TRUE)
}

# ** Step Fourier ----
if(step_ts_fourier){
rec_date_fourier_obj <- rec_date_obj %>%
timetk::step_fourier(
ds
, period = 1#step_ts_fourier_period
, K      = 1#step_ts_fourier_k
)
}
# ** Step YeoJohnson ----
if(step_ts_yeo){
rec_date_fourier_obj <- rec_date_fourier_obj %>%
recipes::step_YeoJohnson(!!v, limits = c(0, 1))
}

# ** Step NZV ----
if(step_ts_nzv){
rec_date_fourier_nzv_obj <- rec_date_fourier_obj %>%
recipes::step_nzv(recipes::all_predictors())
}

# * Recipe List ----
rec_lst <- list(
rec_base             = rec_base_obj,
rec_date             = rec_date_obj,
rec_date_fourier     = rec_date_fourier_obj,
rec_date_fourier_nzv = rec_date_fourier_nzv_obj
)

# * Return ----
return(rec_lst)

}

然后运行如下命令:

> rec_objs <- ts_auto_recipe_b(.data = data_tbl, .date_col = visit_date, .pred_col = visits)
> rec_objs[[1]] %>% prep() %>% juice() %>% names()
[1] "visit_date" "visits"    
> rec_objs[[2]] %>% prep() %>% juice() %>% names()
[1] "visit_date"              "visits"                  "visit_date_index.num"   
[4] "visit_date_year"         "visit_date_half"         "visit_date_quarter"     
[7] "visit_date_month"        "visit_date_day"          "visit_date_wday"        
[10] "visit_date_mday"         "visit_date_qday"         "visit_date_yday"        
[13] "visit_date_mweek"        "visit_date_week"         "visit_date_week2"       
[16] "visit_date_week3"        "visit_date_week4"        "visit_date_mday7"       
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1"   "visit_date_wday.lbl_2"   "visit_date_wday.lbl_3"  
[34] "visit_date_wday.lbl_4"   "visit_date_wday.lbl_5"   "visit_date_wday.lbl_6"  
[37] "visit_date_wday.lbl_7"  
> rec_objs[[3]] %>% prep() %>% juice() %>% names()
[1] "visit_date"              "visits"                  "visit_date_index.num"   
[4] "visit_date_year"         "visit_date_half"         "visit_date_quarter"     
[7] "visit_date_month"        "visit_date_day"          "visit_date_wday"        
[10] "visit_date_mday"         "visit_date_qday"         "visit_date_yday"        
[13] "visit_date_mweek"        "visit_date_week"         "visit_date_week2"       
[16] "visit_date_week3"        "visit_date_week4"        "visit_date_mday7"       
[19] "visit_date_month.lbl_01" "visit_date_month.lbl_02" "visit_date_month.lbl_03"
[22] "visit_date_month.lbl_04" "visit_date_month.lbl_05" "visit_date_month.lbl_06"
[25] "visit_date_month.lbl_07" "visit_date_month.lbl_08" "visit_date_month.lbl_09"
[28] "visit_date_month.lbl_10" "visit_date_month.lbl_11" "visit_date_month.lbl_12"
[31] "visit_date_wday.lbl_1"   "visit_date_wday.lbl_2"   "visit_date_wday.lbl_3"  
[34] "visit_date_wday.lbl_4"   "visit_date_wday.lbl_5"   "visit_date_wday.lbl_6"  
[37] "visit_date_wday.lbl_7"   "visit_date_sin1_K1"      "visit_date_cos1_K1"     
> rec_objs[[4]] %>% prep() %>% juice() %>% names()
[1] "visit_date"              "visits"                  "visit_date_index.num"   
[4] "visit_date_half"         "visit_date_quarter"      "visit_date_month"       
[7] "visit_date_day"          "visit_date_mday"         "visit_date_qday"        
[10] "visit_date_yday"         "visit_date_mweek"        "visit_date_week"        
[13] "visit_date_week2"        "visit_date_week3"        "visit_date_week4"       
[16] "visit_date_mday7"        "visit_date_month.lbl_01" "visit_date_month.lbl_02"
[19] "visit_date_month.lbl_03" "visit_date_month.lbl_04" "visit_date_month.lbl_05"
[22] "visit_date_month.lbl_06" "visit_date_month.lbl_07" "visit_date_month.lbl_08"
[25] "visit_date_month.lbl_09" "visit_date_month.lbl_10" "visit_date_sin1_K1"     
[28] "visit_date_cos1_K1"   

将显示visit_datevisits通过使用!!v作为食谱函数来传递给函数,而timetk允许传递变量。

相关内容

  • 没有找到相关文章

最新更新