WANT=data.frame(STUDENT=c(1,1,1,2,2,2,3,3,3,4,4,4),
X1=c(0,0,0,1,1,1,1,1,1,0,0,0),
X2=c(1,1,1,0,0,0,1,1,1,1,1,1),
CAT=c(9,6,8,8,9,8,5,9,8,8,7,7),
TIME=c(1,2,3,1,2,3,1,2,3,1,2,3),
EVENT=c(0,0,0,0,1,1,1,1,1,0,0,1))
HAVE=data.frame(STUDENT=c(1,2,3,4),
X1=c(0,1,1,0),
X2=c(1,0,1,1),
CAT1=c(9,8,5,8),
CAT2=c(6,9,9,7),
CAT3=c(8,8,8,7),
TIME=c(NA,2,1,3),
EVENT=c(0,1,1,1))
我有数据"have",希望数据"WANT"在数据"HAVE"中,每个学生都有一行。X1和X2是固定的,CAT#是一个随时间变化的变量,time表示事件发生的时间,如果没有发生,则time为NA,event表示event是否发生。
我希望将"HAVE"转换为"WANT",每个学生都得到必要的行数,时间从1-3;EVENT填充,如图所示。
我试着用dcast和重拍2来做到这一点,但没有成功。请告知!
以下是使用rep
的基本R选项
inds <- grep("CAT", names(HAVE))
WANT <- `row.names<-`(transform(
HAVE[rep(1:nrow(HAVE), each = length(inds)), ],
CAT = c(t(HAVE[inds]))
)[-inds], NULL)
它给出
> WANT
STUDENT X1 X2 TIME EVENT CAT
1 1 0 1 NA 0 9
2 1 0 1 NA 0 6
3 1 0 1 NA 0 8
4 2 1 0 2 1 8
5 2 1 0 2 1 9
6 2 1 0 2 1 8
7 3 1 1 1 1 5
8 3 1 1 1 1 9
9 3 1 1 1 1 8
10 4 0 1 3 1 8
11 4 0 1 3 1 7
12 4 0 1 3 1 7
我们可以从data.table
使用melt
。使用melt
将数据集转换为长格式,按"STUDENT"分组,如果"TIME"值大于或等于行序列,我们可以将"EVENT"列值替换为0
library(data.table)
melt(setDT(HAVE), measure = paste0('CAT',1:3), value.name = 'CAT')[,
variable := NULL][, c('EVENT', 'TIME') :=
.(seq_len(.N) >= TIME, seq_len(.N)), STUDENT][is.na(EVENT),
EVENT := 0][order(STUDENT)]
# STUDENT X1 X2 TIME EVENT CAT
# 1: 1 0 1 1 0 9
# 2: 1 0 1 2 0 6
# 3: 1 0 1 3 0 8
# 4: 2 1 0 1 0 8
# 5: 2 1 0 2 1 9
# 6: 2 1 0 3 1 8
# 7: 3 1 1 1 1 5
# 8: 3 1 1 2 1 9
# 9: 3 1 1 3 1 8
#10: 4 0 1 1 0 8
#11: 4 0 1 2 0 7
#12: 4 0 1 3 1 7
或使用tidyverse
library(dplyr)
library(tidyr)
HAVE %>%
pivot_longer(cols = CAT1:CAT3, values_to = 'CAT') %>%
select(-name) %>%
group_by(STUDENT) %>%
mutate(EVENT = +(replace_na(row_number() >= TIME, 0)), TIME = row_number())
# A tibble: 12 x 6
# Groups: STUDENT [4]
# STUDENT X1 X2 TIME EVENT CAT
# <dbl> <dbl> <dbl> <int> <dbl> <dbl>
# 1 1 0 1 1 0 9
# 2 1 0 1 2 0 6
# 3 1 0 1 3 0 8
# 4 2 1 0 1 0 8
# 5 2 1 0 2 1 9
# 6 2 1 0 3 1 8
# 7 3 1 1 1 1 5
# 8 3 1 1 2 1 9
# 9 3 1 1 3 1 8
#10 4 0 1 1 0 8
#11 4 0 1 2 0 7
#12 4 0 1 3 1 7
这很棘手。我不得不把一些基本的R加入到这个有趣的解决方案中:
library(dplyr)
library(tidyr)
HAVE %>%
pivot_longer(cols = tidyselect::starts_with("CAT")) %>%
mutate(EVENT = do.call(c, lapply(split(TIME, STUDENT),
function(x) if(any(is.na(x))) rep(0, length(x))
else cumsum(`[<-`(rep(0, 3), median(x), 1)))),
TIME = rep(seq(length(unique(name))), length.out = length(TIME))) %>%
select(-name)
#> # A tibble: 12 x 6
#> STUDENT X1 X2 TIME EVENT value
#> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
#> 1 1 0 1 1 0 9
#> 2 1 0 1 2 0 6
#> 3 1 0 1 3 0 8
#> 4 2 1 0 1 0 8
#> 5 2 1 0 2 1 9
#> 6 2 1 0 3 1 8
#> 7 3 1 1 1 1 5
#> 8 3 1 1 2 1 9
#> 9 3 1 1 3 1 8
#> 10 4 0 1 1 0 8
#> 11 4 0 1 2 0 7
#> 12 4 0 1 3 1 7
由reprex包于2020-09-11创建(v0.3.0(