如何在newdata上使用Ranger模型

  • 本文关键字:Ranger 模型 newdata r
  • 更新时间 :
  • 英文 :


我正试图从一系列房地产租赁变量中预测price。目标是获得低RMSE,但每当我将ranger模型应用于newdata而不是仅应用于data时,我都会得到以下错误:Error in predict.ranger(model_forest5, newdata = train) : Error: Argument 'data' is required for non-quantile prediction.

以下是我能提供的、仍能说明上下文的最少代码(遗憾的是,我不确定如何提供可复现的数据样本):


# Read the labelled analysis set and the scoring set from CSV, keeping text
# columns as character vectors instead of converting them to factors.
analysisData <- read.csv("analysisData.csv", stringsAsFactors = FALSE)
scoringData <- read.csv("scoringData.csv", stringsAsFactors = FALSE)
# Store the scoring-set zipcode as text.
# NOTE(review): presumably this matches the column type in analysisData so the
# later bind_rows() can stack both frames -- confirm against the data.
scoringData$zipcode <- as.character(scoringData$zipcode)
library(ggplot2)
library(tidyr)
library(dplyr)
library(caret)
library(leaps)
library(tidyverse)
library(leaps)
library(ggthemes)
library(glmnet)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(ranger)
library(randomForest)
library(xgboost)
library(vtreat)
library(fastDummies)




####################  SPLITTING AND PIPING PART ONE  ###################     STEP 1
# Fix the RNG seed so the partition is reproducible.
set.seed(5656)
# Draw row indices for the training portion (p = 0.7) based on price;
# list = FALSE makes caret return the indices as a matrix rather than a list.
ksplit <- createDataPartition(
  y = analysisData$price,
  p = 0.7,
  list = FALSE,
  groups = 50
)
train <- analysisData[ksplit, ]
test <- analysisData[-ksplit, ]

# Tag every row with the set it came from, then stack the three sets so the
# same preprocessing can be applied to all of them in one pass.
train[["train_test_score"]] <- "train"
test[["train_test_score"]] <- "test"
scoringData[["train_test_score"]] <- "score"
baseData <- bind_rows(train, test, scoringData)


####################  INITIAL FACTORING  ####################      STEP 3 
# Columns holding categorical values; each one is replaced by its factor
# version in place (column positions are unchanged).
factor_cols <- c(
  "host_response_time",
  "host_is_superhost",
  "host_has_profile_pic",
  "host_identity_verified",
  "state",
  "market",
  "country_code",
  "is_location_exact",
  "property_type",
  "room_type",
  "bed_type",
  "has_availability",
  "requires_license",
  "instant_bookable",
  "is_business_travel_ready",
  "require_guest_profile_picture",
  "require_guest_phone_verification",
  "cancellation_policy"
)
baseData[factor_cols] <- lapply(baseData[factor_cols], factor)




####################  CLEANING DATA  ####################       STEP 4
# dates to years: keep the first four characters of each date string and
# convert them to a number.
# NOTE(review): assumes the dates start with a four-digit year -- confirm.
baseData$host_since <- as.numeric(substr(baseData$host_since, 1, 4))
baseData$first_review <- as.numeric(substr(baseData$first_review, 1, 4))
baseData$last_review <- as.numeric(substr(baseData$last_review, 1, 4))
# fix zipcode: coerce to numeric; entries that are not plain numbers become NA
baseData$zipcode <- as.numeric(baseData$zipcode)
# analysisData$cancellation_strict <- ifelse(grepl("strict", analysisData$cancellation_policy), "yes", "no")
# length of rules: number of characters in the house_rules text
baseData$house_rules_length <- nchar(baseData$house_rules)
# indicating if N/A in case trend tells a specific story: for each column
# below, add a "<column>_exists" factor that is "no" where the value is
# missing and "yes" otherwise. These flags are created before imputation so
# they still reflect the original missingness.
na_flag_cols <- c(
  "host_listings_count",
  "square_feet",
  "host_total_listings_count",
  "beds",
  "weekly_price",
  "monthly_price",
  "security_deposit",
  "cleaning_fee",
  "reviews_per_month"
)
for (col in na_flag_cols) {
  baseData[[paste0(col, "_exists")]] <-
    factor(ifelse(is.na(baseData[[col]]), "no", "yes"))
}
# Keep only the leading run of digits of the response rate and convert it to a
# number. The backreference must be written "\\1": inside an R string a bare
# "\1" is the control character 0x01, so every matched value was replaced by
# that character and then converted to NA.
baseData$host_response_rate <-
  as.numeric(gsub("([0-9]+).*$", "\\1", baseData$host_response_rate))
# numeric version of variables for use in Ranger (trying to fix error).
# For the factor columns (room_type, host_response_time) as.numeric() yields
# the integer level codes, not the labels.
baseData$num_accommodates <- as.numeric(baseData$accommodates)
baseData$num_room_type <- as.numeric(baseData$room_type)
baseData$num_review_scores_rating <- as.numeric(baseData$review_scores_rating)
baseData$num_minimum_nights <- as.numeric(baseData$minimum_nights)
baseData$num_host_response_time <- as.numeric(baseData$host_response_time)




####################  IMPUTATION  ####################       STEP 5
# Count the missing values in every column before imputing.
vapply(baseData, function(x) sum(is.na(x)), integer(1))

# Return `values` with each NA entry replaced by `fill`.
fill_missing <- function(values, fill) {
  ifelse(is.na(values), fill, values)
}

# Columns whose missing entries are replaced by the column mean. The mean is
# taken over every non-missing row of baseData (train, test and score rows).
mean_imputed_cols <- c(
  "host_listings_count",
  "host_total_listings_count",
  "beds",
  "square_feet",
  "weekly_price",
  "monthly_price",
  "security_deposit",
  "cleaning_fee",
  "reviews_per_month",
  "host_since",
  "first_review",
  "last_review",
  "host_response_rate"
)
for (col in mean_imputed_cols) {
  observed <- baseData[[col]]
  baseData[[col]] <- fill_missing(observed, mean(observed, na.rm = TRUE))
}

# zipcode is filled with the median rather than the mean.
baseData[["zipcode"]] <- fill_missing(
  baseData[["zipcode"]],
  median(baseData[["zipcode"]], na.rm = TRUE)
)

####################  PIPING PART TWO  ####################        STEP 8
# Recover the three subsets from the stacked frame using the origin tag.
train <- filter(baseData, train_test_score == "train")
test <- filter(baseData, train_test_score == "test")
score <- filter(baseData, train_test_score == "score")
# Row counts of the original labelled data and of each recovered subset.
nrow(analysisData)
nrow(train)
nrow(test)
nrow(score)


####################  MODELING - RANGER  ####################        STEP 13
# Fix the RNG seed, then fit a 1000-tree ranger forest predicting price from
# the five numeric predictor columns.
set.seed(617)
model_forest5 <- ranger(
  price ~ num_minimum_nights + num_host_response_time +
    num_review_scores_rating + num_room_type + num_accommodates,
  data = train,
  num.trees = 1000
)

# Predict on the test rows; the new rows are passed through `data`.
pred_ranger <- predict(model_forest5, data = test, num.trees = 1000)
# Root-mean-squared error between the test prices and the predictions.
rmse_ranger <- sqrt(mean((test$price - pred_ranger$predictions)^2))
rmse_ranger

####################  SCORING  ####################        STEP 13
# predict() on a ranger model dispatches to predict.ranger(), whose argument
# for the rows to score is `data` (not `newdata`, as in predict.lm()). Passing
# `newdata =` leaves `data` unset and raises
# "Argument 'data' is required for non-quantile prediction".
# The returned object keeps the predicted values in its `predictions` element,
# which is what postResample() needs as `pred`.
pred_train <- predict(model_forest5, data = train)
caret::postResample(pred = pred_train$predictions, obs = train$price)
# Model Testing: compare the test-set predictions with the test-set prices.
pred_test <- predict(model_forest5, data = test)
caret::postResample(pred = pred_test$predictions, obs = test$price)

如能提供任何帮助,我将不胜感激。谢谢!

我写这个答案是为了进一步解释如何找到输入参数以及为什么参数不同。

执行predict(model_forest5, data=train)时,实际调用哪个函数取决于对象model_forest5的类。由于它属于ranger类:

# Fit a ranger model on mtcars and show the class of the fitted object.
mdl <- ranger(mpg ~ ., data = mtcars)
class(mdl)
[1] "ranger"

因此调用的函数是predict.ranger。如果您查看predict.ranger的帮助手册:

Arguments:
object: Ranger ‘ranger’ object.
data: New test data of class ‘data.frame’ or ‘gwaa.data’ (GenABEL).

所以输入应该是data=

如果使用lm():

# Fit a linear model on mtcars and show the class of the fitted object.
mdl <- lm(mpg ~ ., data = mtcars)
class(mdl)
[1] "lm"

调用的函数是predict.lm,而参数实际上是您所使用的newdata=。

最新更新