如何在 R 中将训练好的 xgboost 模型应用于新的数据集?



我这样训练了一个xgb模型:

# Expand the predictors into a numeric design matrix (factors become dummy
# columns; `0 +` means no intercept column) and attach a 0/1 label where
# 1 marks "Interested".
candidates_var_train <- model.matrix(
  job_change ~ 0 + .,
  data = candidates_train
)
candidates_train_xgb <- xgb.DMatrix(
  data = candidates_var_train,
  label = as.numeric(candidates_train$job_change == "Interested")
)

# Same encoding for the hold-out set.
candidates_var_test <- model.matrix(
  job_change ~ 0 + .,
  data = candidates_test
)
candidates_test_xgb <- xgb.DMatrix(
  data = candidates_var_test,
  label = as.numeric(candidates_test$job_change == "Interested")
)

得到了一个不错的AUC,想把它应用到我的新数据集上。新数据保存为数据帧,除目标变量"job_change"外,与测试/训练数据具有相同的列。我试着把它转换成这样的稀疏矩阵:

# Attempt from the question (kept as-is to reproduce the problem): convert the
# raw data frame straight to a sparse matrix, without model.matrix().
# As reported below, this introduces NAs and keeps only the 10 raw columns
# instead of the dummy-expanded features, so predict() rejects it with a
# feature-name mismatch.
candidates_predict_sparse <- as(as.matrix(candidates_predict), "sparseMatrix")
candidates_predict_xgb <- xgb.DMatrix(data = candidates_predict_sparse)

但是NAs被引入到稀疏矩阵中,当我尝试使用predict()进行预测时,会出现以下错误:

Error in predict.xgb.Booster(xgb_model, newdata = candidates_predict_sparse,  : 
Feature names stored in `object` and `newdata` are different!

编辑:可复制的例子

最小数据集:

candidates_predict(我想要预测的数据集)

structure(list(enrollee_id = c(23427, 17605, 20912, 13948, 15205, 
15140, 21736, 19800, 23755, 12148), city_development_index = c(0.698, 
0.896, 0.754, 0.926, 0.92, 0.878, 0.926, 0.767, 0.689, 0.92), 
gender = structure(c(4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), levels = c("Female", "Male", "Other", "keine Angabe"
), class = "factor"), enrolled_university = structure(c(4L, 
2L, 1L, 2L, 1L, 3L, 3L, 2L, 2L, 2L), levels = c("Full time course", 
"no_enrollment", "Part time course", "keine Angabe"), class = "factor"), 
company_size = structure(c(9L, 9L, 9L, 5L, 3L, 9L, 3L, 6L, 
2L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999", 
"10000+", "50-99", "500-999", "5000-9999", "keine Angabe"
), class = "factor"), company_type = structure(c(7L, 7L, 
7L, 6L, 6L, 7L, 6L, 6L, 6L, 7L), levels = c("Early Stage Startup", 
"Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd", 
"keine Angabe"), class = "factor"), last_new_job = structure(c(6L, 
6L, 6L, 1L, 1L, 1L, 1L, 1L, 5L, 5L), levels = c("1", "2", 
"3", "4", ">4", "never", "keine Angabe"), class = "factor"), 
training_hours = c(63, 10, 46, 18, 55, 4, 324, 26, 140, 158
), education_detail = structure(c(8L, 7L, 7L, 21L, 8L, 22L, 
7L, 7L, 7L, 19L), levels = c("Graduate Arts", "Graduate Business Degree", 
"Graduate Humanities", "Graduate No Major", "Graduate no major discipline", 
"Graduate Other", "Graduate STEM", "High School", "keine Angabe", 
"Masters Arts", "Masters Business Degree", "Masters Humanities", 
"Masters No Major", "Masters no major discipline", "Masters Other", 
"Masters STEM", "Phd Arts", "Phd Business Degree", "Phd Humanities", 
"Phd Other", "Phd STEM", "Primary School"), class = "factor"), 
experience_detail = structure(c(23L, 23L, 23L, 23L, 23L, 
21L, 23L, 17L, 10L, 23L), levels = c("<1", ">20", "1", "10", 
"11", "12", "13", "14", "15", "16", "17", "18", "19", "2", 
"20", "3", "4", "5", "6", "7", "8", "9", "no relevant experience"
), class = "factor")), row.names = c(NA, -10L), class = c("tbl_df", 
"tbl", "data.frame"))

candidates_train(我用来训练xgboost模型的数据集)

structure(list(enrollee_id = c(26270, 3166, 20087, 8518, 8899, 
25403, 14514, 3300, 10364, 5220), city_development_index = c(0.92, 
0.887, 0.698, 0.92, 0.92, 0.92, 0.624, 0.84, 0.926, 0.754), gender = structure(c(1L, 
2L, 2L, 2L, 4L, 2L, 2L, 4L, 4L, 2L), levels = c("Female", "Male", 
"Other", "keine Angabe"), class = "factor"), enrolled_university = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L), levels = c("Full time course", 
"no_enrollment", "Part time course", "keine Angabe"), class = "factor"), 
company_size = structure(c(7L, 9L, 1L, 9L, 9L, 3L, 9L, 2L, 
5L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999", 
"10000+", "50-99", "500-999", "5000-9999", "keine Angabe"
), class = "factor"), company_type = structure(c(2L, 7L, 
2L, 7L, 7L, 6L, 7L, 6L, 4L, 7L), levels = c("Early Stage Startup", 
"Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd", 
"keine Angabe"), class = "factor"), last_new_job = structure(c(3L, 
1L, 1L, 1L, 6L, 1L, 6L, 3L, 5L, 4L), levels = c("1", "2", 
"3", "4", ">4", "never", "keine Angabe"), class = "factor"), 
training_hours = c(127, 36, 7, 39, 53, 168, 111, 52, 107, 
46), job_change = c("Interested", "Not interested", "Not interested", 
"Not interested", "Not interested", "Not interested", "Not interested", 
"Not interested", "Not interested", "Not interested"), education_detail = structure(c(3L, 
7L, 16L, 22L, 22L, 3L, 8L, 7L, 8L, 6L), levels = c("Graduate Arts", 
"Graduate Business Degree", "Graduate Humanities", "Graduate No Major", 
"Graduate no major discipline", "Graduate Other", "Graduate STEM", 
"High School", "keine Angabe", "Masters Arts", "Masters Business Degree", 
"Masters Humanities", "Masters No Major", "Masters no major discipline", 
"Masters Other", "Masters STEM", "Phd Arts", "Phd Business Degree", 
"Phd Humanities", "Phd Other", "Phd STEM", "Primary School"
), class = "factor"), experience_detail = structure(c(17L, 
5L, 18L, 23L, 23L, 14L, 23L, 8L, 5L, 2L), levels = c("<1", 
">20", "1", "10", "11", "12", "13", "14", "15", "16", "17", 
"18", "19", "2", "20", "3", "4", "5", "6", "7", "8", "9", 
"no relevant experience"), class = "factor")), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`505` = 505L, 
`688` = 688L, `1355` = 1355L, `1498` = 1498L, `1594` = 1594L, 
`3607` = 3607L, `4897` = 4897L, `5743` = 5743L, `5863` = 5863L, 
`5908` = 5908L, `6377` = 6377L, `7449` = 7449L, `7578` = 7578L
), class = "omit"))  

candidates_test(我用来测试xgboost模型的数据集)

structure(list(enrollee_id = c(402, 27107, 8722, 6588, 4167, 
19061, 17139, 14928, 10164, 8612), city_development_index = c(0.762, 
0.92, 0.624, 0.926, 0.92, 0.926, 0.624, 0.92, 0.926, 0.92), gender = structure(c(2L, 
2L, 4L, 2L, 4L, 2L, 4L, 2L, 2L, 4L), levels = c("Female", "Male", 
"Other", "keine Angabe"), class = "factor"), enrolled_university = structure(c(2L, 
2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 2L), levels = c("Full time course", 
"no_enrollment", "Part time course", "keine Angabe"), class = "factor"), 
company_size = structure(c(1L, 6L, 9L, 2L, 6L, 3L, 7L, 3L, 
3L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999", 
"10000+", "50-99", "500-999", "5000-9999", "keine Angabe"
), class = "factor"), company_type = structure(c(6L, 6L, 
7L, 6L, 6L, 6L, 6L, 6L, 6L, 7L), levels = c("Early Stage Startup", 
"Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd", 
"keine Angabe"), class = "factor"), last_new_job = structure(c(5L, 
1L, 6L, 5L, 6L, 2L, 1L, 3L, 4L, 4L), levels = c("1", "2", 
"3", "4", ">4", "never", "keine Angabe"), class = "factor"), 
training_hours = c(18, 46, 26, 18, 106, 50, 148, 40, 42, 
50), job_change = c("Interested", "Interested", "Not interested", 
"Not interested", "Not interested", "Not interested", "Interested", 
"Not interested", "Interested", "Not interested"), education_detail = structure(c(7L, 
7L, 8L, 7L, 7L, 16L, 7L, 7L, 21L, 7L), levels = c("Graduate Arts", 
"Graduate Business Degree", "Graduate Humanities", "Graduate No Major", 
"Graduate no major discipline", "Graduate Other", "Graduate STEM", 
"High School", "keine Angabe", "Masters Arts", "Masters Business Degree", 
"Masters Humanities", "Masters No Major", "Masters no major discipline", 
"Masters Other", "Masters STEM", "Phd Arts", "Phd Business Degree", 
"Phd Humanities", "Phd Other", "Phd STEM", "Primary School"
), class = "factor"), experience_detail = structure(c(7L, 
20L, 23L, 10L, 3L, 5L, 8L, 2L, 2L, 23L), levels = c("<1", 
">20", "1", "10", "11", "12", "13", "14", "15", "16", "17", 
"18", "19", "2", "20", "3", "4", "5", "6", "7", "8", "9", 
"no relevant experience"), class = "factor")), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`531` = 531L, 
`615` = 615L, `715` = 715L, `1000` = 1000L, `1148` = 1148L, `1318` = 1318L, 
`1416` = 1416L), class = "omit"))
使用的库:
library(Matrix)
library(xgboost)
library(dplyr)
library(readr)

xgboost模型是在一个有73个特征的数据集上训练的。这是因为 model.matrix 会把因子扩展为一组虚拟变量(因子的每个水平对应一列),而 "candidates_predict_sparse" 只有10个特征,因为其中的因子没有被转换成虚拟变量。

> colnames(candidates_train)
[1] "enrollee_id"            "city_development_index" "gender"                 "enrolled_university"    "company_size"          
[6] "company_type"           "last_new_job"           "training_hours"         "job_change"             "education_detail"      
[11] "experience_detail"     
> colnames(candidates_var_train)
[1] "enrollee_id"                                  "city_development_index"                      
[3] "genderFemale"                                 "genderMale"                                  
[5] "genderOther"                                  "genderkeine Angabe"                                                 
..... 
[69] "experience_detail6"                           "experience_detail7"                          
[71] "experience_detail8"                           "experience_detail9"                          
[73] "experience_detailno relevant experience"
> colnames(candidates_predict_sparse)
[1] "enrollee_id"            "city_development_index" "gender"                 "enrolled_university"    "company_size"          
[6] "company_type"           "last_new_job"           "training_hours"         "education_detail"       "experience_detail"   

可以看到,xgboost模型在预测时期望73个特征,但只得到了10个。要让预测正常工作,传入的数据必须具有与训练时相同的特征。所以你需要对 candidates_predict 做同样的虚拟变量转换。幸运的是,这很容易:

# Encode the new data the same way the training data was encoded. A one-sided
# formula is enough: model.matrix() does not need a response, so no
# placeholder `job_change` column has to be added to candidates_predict.
candidates_predict_dummied <- model.matrix(~ 0 + ., data = candidates_predict)

# Fail early with a readable message if the dummy columns do not line up with
# the matrix the model was trained on (the cause of the original error).
if (!identical(
  colnames(candidates_predict_dummied),
  colnames(candidates_var_train)
)) {
  stop(
    "Feature columns of the new data differ from the training matrix.",
    call. = FALSE
  )
}

# Now you have the same structure and you can use it to predict:
predict(xgb_model, candidates_predict_dummied)
#> [1]  0.3696896434  0.1225184500  0.0037288326 -0.0001312745 -0.1928645670 -0.0001312745 -0.2914776802  0.1280405670  0.3696896434
#> [10] -0.0001312745

最新更新