在 R 中将数据集拆分为训练、交叉验证和测试数据集. ifelse 返回意外结果



我想编写一个函数来拆分数据帧以进行训练、交叉验证和测试集。

我的代码如下,以一个小数据集为例:

    library(ISLR)
    library(data.table)
    data <- Auto
    seed <- 12
    train <- 0.7
    test <- 0.6
    # Function_split_test_train_regression <- function(data, train, test, seed){
      set.seed(seed)
      setDT(data)
      data[, index := row.names(data)]
      train_index <- sample(data$index, train * nrow(data))
      test_index <- ifelse(test == 1, setdiff(data$index, train_index), 
                                      sample(setdiff(data$index, train_index),  test * length(setdiff(data$index, train_index))))  
    # etc
    #}

在这一点上,我做了一些检查,我得到了一个令我惊讶的结果:

       > test == 1
        [1] FALSE
        > sample(setdiff(data$index, train_index), 
                 test * length(setdiff(data$index, train_index)))
         [1] "225" "186" "41"  "381" "356" "178" "147" "158" "21"  "259" "207" "159" "250" "167" "128" "218" "271" "197" "376" "19"  "77" 
        [22] "205" "46"  "3"   "212" "238" "61"  "11"  "68"  "130" "200" "274" "127" "305" "201" "32"  "48"  "184" "290" "349" "155" "370"
        [43] "366" "333" "243" "161" "108" "65"  "125" "306" "357" "189" "337" "118" "364" "6"   "149" "87"  "252" "194" "362" "383" "93" 
        [64] "38"  "18"  "322" "220" "307" "60"  "353"
        > test_index <- ifelse(test == 1, setdiff(data$index, train_index), 
    sample(setdiff(data$index, train_index), 
          test * length(setdiff(data$index, train_index))))
        > test_index
        [1] "219"

为什么 iflese 返回 219 而不是第二个参数的值(因为条件测试 == 1 的计算结果为 FALSE(?

您的建议将不胜感激。

================================================================================================================================================================================================================================================

==

编辑

按照评论中提出的建议,我更改了代码,将名称测试替换为名称test_fraction但问题仍然存在。 新代码:

library(ISLR)
library(data.table)
data <- Auto
seed <- 12
train_fraction <- 0.7
test_fraction <- 0.6
# Function_split_test_crossval_train_regr <- function(data, train, test, seed){
  set.seed(seed)
  setDT(data)
  data[, index := row.names(data)]
  train_index <- sample(data$index, train_fraction * nrow(data))
  test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index), 
                                                     test_fraction * length(setdiff(data$index, train_index))))  
#}

结果:

> train_index
  [1] "119" "118" "143" "344" "293" "341" "305" "95"  "82"  "58"  "226" "35"  "363" "111" "84"  "137" "24"  "151" "381" "110" "93" 
 [22] "198" "133" "6"   "112" "228" "62"  "36"  "165" "353" "271" "385" "322" "291" "316" "268" "333" "37"  "377" "176" "343" "281"
 [43] "245" "75"  "238" "183" "215" "68"  "274" "64"  "224" "391" "26"  "83"  "66"  "308" "1"   "372" "161" "170" "300" "52"  "30" 
 [64] "15"  "57"  "148" "312" "311" "194" "367" "27"  "342" "260" "181" "163" "171" "193" "210" "327" "248" "172" "263" "47"  "351"
 [85] "166" "292" "278" "61"  "116" "204" "309" "200" "96"  "330" "383" "346" "249" "368" "41"  "38"  "235" "4"   "77"  "273" "191"
[106] "212" "99"  "31"  "286" "79"  "184" "284" "267" "374" "355" "358" "124" "114" "335" "70"  "203" "379" "14"  "287" "67"  "34" 
[127] "340" "127" "91"  "222" "240" "387" "357" "242" "310" "347" "142" "103" "105" "117" "189" "361" "177" "126" "392" "5"   "317"
[148] "174" "352" "87"  "234" "147" "202" "261" "277" "214" "290" "339" "109" "43"  "120" "169" "318" "56"  "94"  "115" "314" "320"
[169] "276" "237" "296" "307" "23"  "186" "360" "146" "313" "152" "206" "328" "60"  "195" "69"  "107" "97"  "92"  "325" "20"  "362"
[190] "157" "101" "10"  "192" "134" "251" "259" "2"   "29"  "265" "331" "144" "63"  "384" "81"  "338" "364" "213" "380" "150" "48" 
[211] "54"  "354" "187" "283" "356" "389" "72"  "32"  "121" "376" "33"  "359" "349" "239" "241" "232" "196" "74"  "156" "201" "390"
[232] "326" "285" "51"  "131" "304" "85"  "45"  "336" "280" "178" "128" "98"  "275" "246" "65"  "39"  "188" "55"  "90"  "197" "9"  
[253] "173" "40"  "295" "149" "230" "140" "135" "236" "21"  "369" "301" "220" "122" "253" "208" "388" "159" "282" "88"  "158" "167"
[274] "257"
> sample(setdiff(data$index, train_index), 
+                                                      test_fraction * length(setdiff(data$index, train_index)))
 [1] "337" "378" "164" "225" "16"  "44"  "221" "179" "25"  "28"  "324" "175" "139" "154" "17"  "252" "211" "155" "233" "162" "130"
[22] "216" "255" "190" "365" "373" "73"  "207" "42"  "3"   "348" "227" "49"  "12"  "53"  "315" "199" "256" "129" "375" "205" "18" 
[43] "289" "168" "264" "160" "145" "382" "136" "302" "185" "323" "100" "270" "113" "294" "247" "345" "209" "104" "321" "7"   "138"
[64] "78"  "386" "366" "298" "231" "86"  "19" 
> test_fraction == 1
[1] FALSE
> test_index <- ifelse(test_fraction == 1, setdiff(data$index, train_index), sample(setdiff(data$index, train_index), 
+                                                      test_fraction * length(setdiff(data$index, train_index))))
> test_index
[1] "28"
我不知道

为什么会这样,我希望有人来解释一下。

但是我找到了解决您问题的方法。您需要将参数传递给ifelse()中的对象:

ifelse(
    test_fraction == 1,
    test_index <- setdiff(data$index, train_index),
    test_index <- sample(setdiff(data$index, train_index),test_fraction * length(setdiff(data$index, train_index)))
  )  

我不知道这是否是不好的做法,但它有效。它还可用于在条件中分配多个条件,例如我在这里的答案。

最新更新