如何在列名动态变化的情况下，将数据重塑为多层次、多顺序的宽格式

如果这个问题的答案显而易见，还请见谅。我想对数据做一次重塑（reshape）。Stack Overflow 上已有很多相关答案，但它们只适用于仅涉及一列、或列名可以硬编码的情形；而我需要的是一种动态的解决方案，即在 ordered.cols 和 unique.cols 这两个向量事先未知的情况下也能正常工作。

# The two column sets below have to be treated as dynamic inputs:
# they could be any two sets of columns.
ordered.cols <- c("cyl", "gear")
unique.cols <- c("am", "vs")
# Neither character vector is known ahead of time.

# Example starting data set: just the columns of interest from mtcars.
x <- mtcars[, c(ordered.cols, unique.cols)]
# The desired output has as many records as the rows printed here:
unique(x[, ordered.cols])
# What is unclear is the smartest way to add the extra columns.

# For *each* unique level of *each* variable named in `unique.cols`,
# one additional column is added to the final output. For a given
# `ordered.cols` combination, the cell holds the value if it exists
# for that combination and NA otherwise.
desired.output <- data.frame(
  cyl  = c(4L, 4L, 4L, 6L, 6L, 6L, 8L, 8L),
  gear = c(3L, 4L, 5L, 3L, 4L, 5L, 3L, 5L),
  am1  = c(0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L),
  am2  = c(NA, 1L, NA, NA, 1L, NA, NA, NA),
  vs1  = c(1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L),
  vs2  = c(NA, NA, 1L, NA, 1L, NA, NA, NA)
)

desired.output

我并不在意新列叫 am1、am2、vs1、vs2 还是其他更方便的名字。但如果数据中 am 有两个不同的取值，最终输出中就需要有两列来保存这些值；如果某个组合不包含其中某个值，则对应的那一列应为缺失值（NA）。

# Second example ----
ordered.cols <- c("a", "b")
unique.cols <- "d"
# Starting data set.
y <- data.frame(
  a = c(1, 1, 1, 2),
  b = c(1, 1, 2, 2),
  d = c("z", "y", "x", "x")
)
# The desired output has as many rows as the rows printed here:
unique(y[, ordered.cols])
# The contents of every column named in `unique.cols` (in this case only
# column `d`) are appended as a widened data set.
second.desired.output <- data.frame(
  a  = c(1, 1, 2),
  b  = c(1, 2, 2),
  d1 = c("z", "x", "x"),
  d2 = c("y", NA, NA)
)
second.desired.output

谢谢！！

library(data.table)
#' Widen a data set to one row per combination of `ordered.cols`
#'
#' For every distinct value found in each `unique.cols` column (taken over
#' the whole of `x`), one output column is produced. Within each
#' `ordered.cols` group, the cell holds that value when it occurs in the
#' group and a classed `NA` otherwise.
#'
#' Note: the per-group values are flattened with `unlist()`, which yields a
#' single atomic vector, so `unique.cols` columns of different types are
#' coerced to one common type in the result.
#'
#' @param x A data.frame, or anything `data.table::as.data.table()` accepts.
#' @param unique.cols Character vector naming the columns whose distinct
#'   values become the widened output columns.
#' @param ordered.cols Grouping columns, passed to data.table's `keyby`; the
#'   result has one row per combination of their values.
#' @param NA_class Name of the class the filler `NA` is coerced to with
#'   `as()`. Being explicit avoids type clashes between groups, which can
#'   otherwise surface as an error depending on where the first `NA` shows up.
#' @return A data.table holding the grouping columns followed by one column
#'   per distinct value of each `unique.cols` column.
reshapeMyData <- function(x, unique.cols, ordered.cols, NA_class="integer") {
  # Fail early, with a readable message, when a requested value column is
  # absent. Skipped when the column names of `x` cannot be determined here.
  known.cols <- if (is.null(dim(x))) names(x) else colnames(x)
  if (is.character(unique.cols) && !is.null(known.cols)) {
    absent <- setdiff(unique.cols, known.cols)
    if (length(absent) > 0L) {
      stop(
        "unique.cols not found in x: ", paste(absent, collapse = ", "),
        call. = FALSE
      )
    }
  }

  DT <- data.table::as.data.table(x)
  # Distinct values of each value column, computed once over all rows; their
  # order fixes the order (and numeric suffix) of the widened columns.
  unique.values <- lapply(DT[, unique.cols, with = FALSE], unique)
  NA.classed <- as(NA, NA_class)

  # Evaluated once per `ordered.cols` combination; `.SD` holds that group's
  # `unique.cols` columns.
  DT[, {
    # For each value column, keep a distinct value if the group contains it
    # and substitute the classed NA otherwise.
    present <- mapply(
      function(v, C) ifelse(v %in% C, v, NA.classed),
      v = unique.values, C = .SD, SIMPLIFY = FALSE
    )
    # Flatten to a named vector (names such as am1, am2, vs1, ...) and turn
    # every element into its own one-row column.
    data.table::setDT(as.list(unlist(present)))
  }, keyby = ordered.cols, .SDcols = unique.cols]
} ## // end function reshapeMyData

输出

reshapeMyData(x, unique.cols, ordered.cols)
   cyl gear am1 am2 vs1 vs2
1:   4    3  NA   0  NA   1
2:   4    4   1   0  NA   1
3:   4    5   1  NA   0   1
4:   6    3  NA   0  NA   1
5:   6    4   1   0   0   1
6:   6    5   1  NA   0  NA
7:   8    3  NA   0   0  NA
8:   8    5   1  NA   0  NA
reshapeMyData(y, "d", c("a", "b"), NA_class="character")
   a b d1 d2 d3
1: 1 1  z  y NA
2: 1 2 NA NA  x
3: 2 2 NA NA  x

就我的需求而言，下面这个解决方案似乎效果很好：

# Collapse the distinct values of every `unique.cols` column into one
# comma-separated string per `ordered.cols` combination.
# `drop = FALSE` keeps both selections as data frames even when a single
# column is named: without it `[` returns a bare vector, so the value column
# loses its name and a one-column `by` is no longer the list that
# aggregate() requires.
aggregate(
  x[, unique.cols, drop = FALSE],
  by = x[, ordered.cols, drop = FALSE],
  FUN = function(w) paste(sort(unique(w)), collapse = ",")
)
aggregate(
  y[, unique.cols, drop = FALSE],
  by = y[, ordered.cols, drop = FALSE],
  FUN = function(w) paste(sort(unique(w)), collapse = ",")
)

有时（我不确定原因，但我认为这是因子（factor）类型强制转换的问题）nrow( unique( x[ , ordered.cols ] ) ) 与上述命令输出的行数（nrow）不相等。在这种情况下，下面这个变通方法似乎可以奏效：

# Workaround: group on a single pasted key instead of the raw columns, then
# re-attach the original grouping columns to the aggregated values.
group.cols <- x[, ordered.cols, drop = FALSE]
# One key per row. The unit-separator character keeps distinct combinations
# from collapsing into the same key (with an empty separator, 1 and 12 would
# collide with 11 and 2).
group.key <- do.call(paste, c(unname(as.list(group.cols)), sep = "\037"))
first.row <- !duplicated(group.key)
halfway <- aggregate(
  x[, unique.cols, drop = FALSE],
  by = list(group.key),
  FUN = function(w) paste(sort(unique(w)), collapse = ",")
)
# aggregate() returns its groups sorted by key, whereas the distinct rows of
# `group.cols` are in first-appearance order; look each aggregated key up so
# both sides of the cbind() refer to the same combination.
row.order <- match(as.character(halfway[[1]]), group.key[first.row])
cbind(
  group.cols[first.row, , drop = FALSE][row.order, , drop = FALSE],
  halfway[, -1, drop = FALSE]
)

相关内容

最新更新

热门标签：