R转换事件数据


WANT=data.frame(STUDENT=c(1,1,1,2,2,2,3,3,3,4,4,4),
X1=c(0,0,0,1,1,1,1,1,1,0,0,0),
X2=c(1,1,1,0,0,0,1,1,1,1,1,1),
CAT=c(9,6,8,8,9,8,5,9,8,8,7,7),
TIME=c(1,2,3,1,2,3,1,2,3,1,2,3),
EVENT=c(0,0,0,0,1,1,1,1,1,0,0,1))

HAVE=data.frame(STUDENT=c(1,2,3,4),
X1=c(0,1,1,0),
X2=c(1,0,1,1),
CAT1=c(9,8,5,8),
CAT2=c(6,9,9,7),
CAT3=c(8,8,8,7),
TIME=c(NA,2,1,3),
EVENT=c(0,1,1,1))

我有数据"have",希望数据"WANT"在数据"HAVE"中,每个学生都有一行。X1和X2是固定的,CAT#是一个随时间变化的变量,time表示事件发生的时间,如果没有发生,则time为NA,event表示event是否发生。

我希望将"HAVE"转换为"WANT",每个学生都得到必要的行数,时间从1-3;EVENT填充,如图所示。

我试着用dcast和重拍2来做到这一点,但没有成功。请告知!

以下是使用rep的基本R选项

inds <- grep("CAT", names(HAVE))
WANT <- `row.names<-`(transform(
HAVE[rep(1:nrow(HAVE), each = length(inds)), ],
CAT = c(t(HAVE[inds]))
)[-inds], NULL)

它给出

> WANT
STUDENT X1 X2 TIME EVENT CAT
1        1  0  1   NA     0   9
2        1  0  1   NA     0   6
3        1  0  1   NA     0   8
4        2  1  0    2     1   8
5        2  1  0    2     1   9
6        2  1  0    2     1   8
7        3  1  1    1     1   5
8        3  1  1    1     1   9
9        3  1  1    1     1   8
10       4  0  1    3     1   8
11       4  0  1    3     1   7
12       4  0  1    3     1   7

我们可以从data.table使用melt。使用melt将数据集转换为长格式,按"STUDENT"分组,如果"TIME"值大于或等于行序列,我们可以将"EVENT"列值替换为0

library(data.table)
melt(setDT(HAVE), measure = paste0('CAT',1:3), value.name = 'CAT')[,
variable := NULL][, c('EVENT', 'TIME') :=
.(seq_len(.N) >= TIME, seq_len(.N)), STUDENT][is.na(EVENT), 
EVENT := 0][order(STUDENT)]
#   STUDENT X1 X2 TIME EVENT CAT
# 1:       1  0  1    1     0   9
# 2:       1  0  1    2     0   6
# 3:       1  0  1    3     0   8
# 4:       2  1  0    1     0   8
# 5:       2  1  0    2     1   9
# 6:       2  1  0    3     1   8
# 7:       3  1  1    1     1   5
# 8:       3  1  1    2     1   9
# 9:       3  1  1    3     1   8
#10:       4  0  1    1     0   8
#11:       4  0  1    2     0   7
#12:       4  0  1    3     1   7

或使用tidyverse

library(dplyr)
library(tidyr)
HAVE %>%
pivot_longer(cols = CAT1:CAT3, values_to = 'CAT') %>%
select(-name) %>% 
group_by(STUDENT) %>% 
mutate(EVENT = +(replace_na(row_number() >= TIME, 0)), TIME = row_number())
# A tibble: 12 x 6
# Groups:   STUDENT [4]
#   STUDENT    X1    X2  TIME EVENT   CAT
#     <dbl> <dbl> <dbl> <int> <dbl> <dbl>
# 1       1     0     1     1     0     9
# 2       1     0     1     2     0     6
# 3       1     0     1     3     0     8
# 4       2     1     0     1     0     8
# 5       2     1     0     2     1     9
# 6       2     1     0     3     1     8
# 7       3     1     1     1     1     5
# 8       3     1     1     2     1     9
# 9       3     1     1     3     1     8
#10       4     0     1     1     0     8
#11       4     0     1     2     0     7
#12       4     0     1     3     1     7

这很棘手。我不得不把一些基本的R加入到这个有趣的解决方案中:

library(dplyr)
library(tidyr)
HAVE %>% 
pivot_longer(cols = tidyselect::starts_with("CAT")) %>% 
mutate(EVENT = do.call(c, lapply(split(TIME, STUDENT), 
function(x) if(any(is.na(x))) rep(0, length(x)) 
else cumsum(`[<-`(rep(0, 3), median(x), 1)))), 
TIME = rep(seq(length(unique(name))), length.out = length(TIME))) %>% 
select(-name)
#> # A tibble: 12 x 6
#>    STUDENT    X1    X2  TIME EVENT value
#>      <dbl> <dbl> <dbl> <int> <dbl> <dbl>
#>  1       1     0     1     1     0     9
#>  2       1     0     1     2     0     6
#>  3       1     0     1     3     0     8
#>  4       2     1     0     1     0     8
#>  5       2     1     0     2     1     9
#>  6       2     1     0     3     1     8
#>  7       3     1     1     1     1     5
#>  8       3     1     1     2     1     9
#>  9       3     1     1     3     1     8
#> 10       4     0     1     1     0     8
#> 11       4     0     1     2     0     7
#> 12       4     0     1     3     1     7

由reprex包于2020-09-11创建(v0.3.0(

最新更新