我希望将事件开始和结束日期的列表转换为状态向量,其中开始和结束之间的任何一天都是 1,外部是 0(例如 2,4 -> c(0,1,1,1,0,0))
每个主题(以 id 为键)可能在不同的行中有多个开始和结束日期,需要合并。
我有一个非常依赖lapply的解决方案(如果需要,我可以访问超级计算机,因此可以将它们切换到mclapply),但希望尽可能矢量化,因为输入数据可能很大(~250MB)。
谁能看到减少这里任何步骤的途径?
require(data.table)
#The days that will be assessed for state
period = as.integer(1:8)
#Indices for days (they are not necessarily sequential)
dayInds = as.integer(1:length(period))
#Events for same ID will never overlap
dt = data.table(id = c("a","a","b","c","d","d","e"),
start = c(1,6,3,3,3,5,5),
end = c(4,7,6,7,4,6,5))
# setkeyv(dt,colnames(dt))
setkeyv(dt,c("start","end"))
#Setup output table
stateData = data.table(id = dt$id)
#Remove "-" from days before index, they could get confusing, and initialise
#columns with zero
dayStrings = paste("d",gsub("-", "m", period),sep="")
stateData[,(dayStrings) := 0L]
#Find whether there is an overlap between a specified day in period and a
#subject's events
getStateOnDay = function(dayInd) {
#Get day
day = period[dayInd]
#Create a table with the same number of rows as input dt, with a one day long
#event on the input day
overlapDays = unlist(foverlaps(data.table(start = day,end = day),
dt,
which=TRUE,
nomatch = 0L)$yid)
#Set those days to 1 in the state table
set(stateData,overlapDays,dayInd+1L,1L)
}
#Get states for each row
lapply(dayInds,getStateOnDay)
#Create table for data with one row for each unique ID
reducedStateData = data.table(id = unique(stateData$id))
reducedStateData[,(dayStrings) := 0L]
#Sum a vector of logicals using OR
orSum = function(inputVec) {
return(Reduce("|", c(inputVec)))
}
#Function for finding for each ID if they were in the state on a given day
reduceStatesByID = function(dayInd) {
set(reducedStateData,
NULL,
dayInd+1L,
stateData[,c(1,dayInd+1),with=FALSE][,as.integer(lapply(.SD, orSum)), by=id][,V1])
return(NA)
}
#Apply reduction and sort
lapply(dayInds,reduceStatesByID)
setkey(reducedStateData,id)
下面是使用Map
和序列的尝试,然后dcast
-ed 为宽格式:
dcast(
dt[, .(d=unlist(Map(seq, start, end)), val=1), by=id],
id ~ d, value.var="val", fun.aggregate=sum, na.rm=TRUE
)
# id 1 2 3 4 5 6 7
#1: a 1 1 1 1 0 1 1
#2: b 0 0 1 1 1 1 0
#3: c 0 0 1 1 1 1 1
#4: d 0 0 1 1 1 1 0
#5: e 0 0 0 0 1 0 0
@Frank在评论中的建议似乎更快,可能主要是由于避免by=
:
dt[
, .(t = unlist(L <- Map(seq, start, end)), id = rep(id, lengths(L)))
][, dcast(.SD, id ~ t, fun.agg = length)]
这是一个使用 data.table极其高效的set
函数的方法,在构造一个具有正确维度 (res) 的空 data.table 以及原始矩阵的行和新矩阵中的行 (resRows) 的映射后。
# construct empty data.table (ids and appropriate number of variables with 0s)
res <- data.table(id=unique(dt$id), matrix(0L, dt[, uniqueN(id)], max(dt$end)))
# get values for rows from id variable for placement into final data.table
resRows <- dt[, cumsum(rowid(id) == 1L)]
# fill in appropriate elements in data.table with 1s using set
for(i in seq_along(resRows)) set(res, resRows[i], dt[i, seq(start, end)] + 1L, 1L)
这返回
res
id V1 V2 V3 V4 V5 V6 V7
1: a 1 1 1 1 0 1 1
2: b 0 0 1 1 1 1 0
3: c 0 0 1 1 1 1 1
4: d 0 0 1 1 1 1 0
5: e 0 0 0 0 1 0 0