r-生成用于无监督学习的合成数据



我想用随机森林为无监督学习准备数据。程序如下:

  • 获取数据并将值为1的属性"class"添加到所有示例中
  • 从原始数据生成合成数据:
    • 在合成示例的数量达到原始数据的示例数量之前,重复构建新示例:
      • 从原始数据中该属性的所有值中采样新属性值
      • 对所有属性执行此操作,并将它们组合到新的示例中
  • 将合成数据的属性"class"赋值为2
  • 将两个数据绑定在一起

最后看起来是这样的:

        ...      Class
                |1
     Original   |1
     Data       |1
                |1
    --------------
                |2
     Synthetic  |2
     Data       |2
                |2

我的R代码如下:

library(gtools) # provides smartbind()

# Draw a sample with replacement from X (as many draws as X has elements).
sample1 <- function(X) {
  sample(X, replace = TRUE)
}

# Resample every column of `dat` independently of the other columns.
# NOTE(review): apply() converts a data frame to a matrix before calling
# sample1, so the result is a matrix and per-column types are not kept.
g1 <- function(dat) {
  apply(dat, MARGIN = 2, FUN = sample1)
}

# Original examples are labelled with class 1.
data$class <- rep(1, times = nrow(data))

# Synthetic examples: column-wise resampling of the original data.
synthData <- data.frame(g1(data[, 1:ncol(data)]))

# Synthetic examples are labelled with class 2; reuse the original names.
synthData$class <- rep(2, times = nrow(synthData))
colnames(synthData) <- colnames(data)

# Stack original and synthetic examples into a single data set.
newData <- smartbind(data, synthData)

很明显,我对R真的很陌生,但它确实有效——只有一个问题:合成数据中的属性类型与原始数据中的不同。如果在最初它们是数字,那么现在它们就变成了因子。如何在生成合成数据时保留相同的类型?

谢谢!

数据1(数字变为因子):

structure(list(V2 = c(1.51793, 1.51711, 1.51645, 1.51916, 1.51131), V3 = c(13.21, 12.89, 13.44, 14.15, 13.69), V4 = c(3.48, 3.62, 3.61, 0, 3.2), V5 = c(1.41, 1.57, 1.54, 2.09, 1.81), V6 = c(72.64, 72.96, 72.39, 72.74, 72.81), V7 = c(0.59, 0.61, 0.66, 0, 1.76), V8 = c(8.43, 8.11, 8.03, 10.88, 5.43), V9 = c(0, 0, 0, 0, 1.19), V10 = c(0, 0, 0, 0, 0), realClass = structure(c(1L, 2L, 2L, 5L, 6L), .Label = c("1", "2", "3", "5", "6", "7"), class = "factor")), .Names = c("V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "realClass"), row.names = c(27L, 138L, 77L, 183L, 186L), class = "data.frame")

数据2(因子变为字符型chr):

structure(list(realClass=structure。Label=c("e","p"),class="factor"),V2=结构(c(6L,3L,4L,6L,6L)。标签=c("b","c","f","k","s","x"),class="factor"),V3=结构(c(4L,4L,3L,1L,1L)。Label=c("f","g","s","y"),class="factor"),V4=结构(c(5L,5L,5L,3L,4L)。Label=c("b","c","e"、"g"、"n"、"p"、"r"、"u"、"w"、"y"),class="factor"),V5=结构(c(1L,1L,2L,1L)。Label=c("f","t"),class="factor"),V6=结构(c(3L,9L,3L,6L,3L)。Label=c("a"、"c"、"f"、"l"、"m"、"n"、"p"、"s"、"y"),class="factor"),V7=结构)。Label=c("a","f"),class="factor"),V8=结构(c(1L,1L,1L,1L,1L)。Label=c("c","w"),class="factor"),V9=结构(c(2L,2L,3L,1L)。Label=c("b","n"),class="factor"),V10=结构(c(1L,1L,1R,10L,4L)。Label=c("b"、"e"、"g"、"h"、"k"、"n"、"o"、"p"、"r","u","w","y"),class="factor"),V11=结构(c(2L,2L,2L,2L,1L)。Label=c("e","t"),class="factor"),V12=结构(c(NA,NA,NA、1L、1L)。Label=c("b","c","e","r"),class="factor"),V13=结构(c(3L,2L,3L,3L、2L)。Label=c("f","k","s","y"),class="factor"),V14=结构(c(3L,3L,2L,3L,2L)。Label=c("f","k","s","y"),class="factor"),V15=结构(c(7L,8L,7L,4L、7L)。Label=c("b"、"c"、"e"、"g"、"n"、"o"、"p"、"w","y"),class="factor"),V16=结构(c(7L,7L,8L,4L,1L)。Label=c("b"、"c"、"e"、"g"、"n"、"o"、"p"、"w"、"y"),class="factor"),V17=结构)。Label="p",class="factor"),V18=结构(c(3L,3L,3L,3L,3L)。Label=c("n","o","w","y"),class="factor"),V19=结构(c(2L,2L,3L,2L)。Label=c("n","o","t"),class="factor"),V20=结构(c(1L,1L,5L,3L)。Label=c("e"、"f"、"l"、"n"、"p"),class="factor"),V21=结构(c(8L,8L,8k,4L,2L)。Label=c("b","h","k","n","o","r","u","w","y"),class="factor"),V22=结构(c(5L,5L,5L,5L,6L)。Label=c("a"、"c"、"n"、"s"、"v"、"y"),class="factor"),V23=结构(c(3L,3L,5L,1L,2L)。Label=c("d","g","l","m","p","u","w"),class="factor"))。Names=c("realClass","V2"、"V3"、"V4"、"V5"、"V6"、"v27"、"V88"、"V9"、"V10"、"V11","V12"V13"V14"V15"V16"V17"V18"V19"V20","V21"、"V22"one_answers"V23"),行名称=c(4105L、6207L、6696L、2736L,3756L),class="data.frame")

您始终可以用下面这个技巧把因子列转换为数值列

# as.numeric() applied directly to a factor returns its internal integer codes,
# so convert to character first to recover the numbers shown as labels.
numcol <- as.numeric(as.character(factcol))

但我怀疑您的data.frame中存在因子变量。由于apply会先把data.frame转换为矩阵,而矩阵的所有元素必须是同一类型,所以只要数据中有一个因子列,所有数值变量也会被强制转换为字符,再由data.frame()变成因子(在stringsAsFactors = TRUE时,即R 4.0之前的默认设置)。

下面是一个例子,使用玩具数据集

# Toy data with one integer, one numeric and one factor column.
# stringsAsFactors = TRUE is spelled out because the data.frame() default
# changed to FALSE in R 4.0.0; without it C would be a character column and
# the outputs below would no longer show factors.
# The sampled values shown in the outputs may differ between R versions.
set.seed(123)
toydat <- data.frame(
  A = 1:10,
  B = rnorm(10),
  C = LETTERS[1:10],
  stringsAsFactors = TRUE
)
str(toydat)
## 'data.frame':    10 obs. of  3 variables:
##  $ A: int  1 2 3 4 5 6 7 8 9 10
##  $ B: num  -0.5605 -0.2302 1.5587 0.0705 0.1293 ...
##  $ C: Factor w/ 10 levels "A","B","C","D",..: 1 2 3 4 5 6 7 8 9 10

# Numeric columns only: apply() works on a numeric matrix, so the resampled
# columns stay numeric.
set.seed(1)
str(data.frame(apply(toydat[, 1:2], 2, sample, replace = TRUE)))
## 'data.frame':    10 obs. of  2 variables:
##  $ A: num  3 4 6 10 3 9 10 7 7 1
##  $ B: num  1.5587 -0.2302 0.4609 0.0705 -1.2651 ...

# With the factor column C included: apply() now works on a character matrix,
# so every column, including A and B, comes back as a factor.
set.seed(2)
str(data.frame(
  apply(toydat[, 1:3], 2, sample, replace = TRUE),
  stringsAsFactors = TRUE
))
## 'data.frame':    10 obs. of  3 variables:
##  $ A: Factor w/ 6 levels "10"," 2"," 5",..: 2 5 4 2 1 1 2 6 3 4
##  $ B: Factor w/ 8 levels " 0.129288","-0.230177",..: 8 7 6 2 1 5 3 7 1 4
##  $ C: Factor w/ 6 levels "B","D","E","G",..: 4 2 5 1 2 3 1 2 6 1

这就是plyr包变得有用的地方,因为您可以控制输出(使用**ply)。但在这种情况下,colwise函数是足够的

# library() stops with an error if plyr is not installed, whereas require()
# only returns FALSE and lets the script fail later at colwise().
library(plyr)
set.seed(2)
# colwise() turns a function that works on a vector into one that is applied
# to each column of a data frame; the result is again a data frame, so each
# resampled column keeps its own type (see the output below).
mysamplingfun <- colwise(function(x) sample(x, replace = TRUE))
str(mysamplingfun(toydat[, 1:3]))
## 'data.frame':    10 obs. of  3 variables:
##  $ A: int  2 8 6 2 10 10 2 9 5 6
##  $ B: num  1.715 1.559 -1.265 -0.23 0.129 ...
##  $ C: Factor w/ 10 levels "A","B","C","D",..: 7 4 9 2 4 5 2 4 10 2

最新更新