我有一个问题,一直不知道如何有效地解决。对于每个类型为"B"的传感器观测,我想找出其前后±5分钟内所有类型为"A"的传感器观测,并对识别出的观测做一些汇总。必须对数据中每个个体的所有"B"观测都这样做。数据集很大,所以我的for循环解决方案非常慢。
我将试着通过下面的例子来解释它:
我有带时间戳的传感器数据(两种类型:"A"和"B"):
# Example data: timestamped sensor readings for two individuals,
# with two sensor types ("A" and "B") and a measured value.
Individual <- rep(c("Anna", "Mark"), times = c(7, 8))
datetime <- as.POSIXct(
  c("2021-11-18 07:02:12", "2021-11-18 07:10:25", "2021-11-18 07:22:18",
    "2021-11-18 07:24:04", "2021-11-18 07:27:43", "2021-11-18 07:29:01",
    "2021-11-18 07:50:01", "2021-11-21 12:19:28", "2021-11-21 10:55:58",
    "2021-11-21 11:14:22", "2021-11-21 11:18:05", "2021-11-21 11:22:01",
    "2021-11-21 11:26:09", "2021-11-21 11:27:38", "2021-11-21 12:15:50"),
  format = "%Y-%m-%d %H:%M:%S"
)
# type of sensor reading ("A" or "B")
datatype <- factor(c("A", "A", "A", "B", "A", "A", "A", "A", "A", "B",
                     "A", "B", "A", "A", "A"))
# measured sensor value
value <- c(7.85, 6.54, 7.82, 5.43, 7.34, 8.93, 2.54, 5.62, 7.84, 5.56,
           8.95, 6.85, 5.88, 4.95, 8.46)
df <- data.frame(Individual, datetime, datatype, value)
我想做的是:对于"datatype"列中的每一个"B"观测,找出距离该"B"观测±5分钟内的所有"A"观测。然后计算这些已识别的"A"观测值的平均值,以及"A"观测的数量。最后,把这些信息添加到对应的"B"观测行上。这一切必须针对数据中每个个体的每一个"B"观测分别进行。
我构造了一个非常复杂的for循环,它可以做到这一点,但是它非常慢,而且由于原始数据有几十万行,所以运行起来并不可行(需要几个小时…)。
library(dplyr)
library(plyr)

# Split readings by sensor type once up front.
df_A <- df %>% filter(datatype == "A")
df_B <- df %>% filter(datatype == "B")

name_list <- list()
ind <- unique(df_B$Individual)

for (i in seq_along(ind)) {  # seq_along is safe if ind is empty
  # Hoist per-individual subsets out of the inner loop; the original
  # re-subset df_A / df_B on every inner iteration, which dominated runtime.
  A_i <- df_A[df_A$Individual == ind[i], ]
  B_i <- df_B[df_B$Individual == ind[i], ]

  # Reset per individual. The original reused df_list across individuals,
  # so stale rows from a previous individual could leak into the result
  # whenever a later individual had fewer "B" observations.
  df_list <- vector("list", nrow(B_i))

  for (j in seq_len(nrow(B_i))) {
    # +/- 5 minutes == 300 seconds, compared explicitly in seconds.
    # This fixes the original near(..., tol = 5): subtracting POSIXct
    # values yields a difftime with auto-chosen units, so tol = 5 did
    # NOT reliably mean 5 minutes.
    in_window <- abs(as.numeric(A_i$datetime) -
                       as.numeric(B_i$datetime[j])) <= 300

    mean_for_A <- mean(A_i$value[in_window])  # mean of matching "A" values
    number_of_A <- sum(in_window)             # count of matching "A" rows

    # Attach the summary columns to this "B" observation.
    df_list[[j]] <- cbind(B_i[j, ], mean_for_A, number_of_A)
  }
  name_list[[i]] <- ldply(df_list)  # one data.frame per individual
}
ldply(name_list)  # final product: all "B" rows with mean/count of nearby "A"
有没有更有效的方法?
谢谢!
这是一个data.table方法,应该比for循环快得多:
library(data.table)
# convert df to data.table format (by reference)
setDT(df)
# create a unique row id so the join can line results up with the right
# rows (the column is removed again at the end)
df[, id := .I]
setkey(df, id)
# Self-join the datatype == "B" subset onto itself. For each "B" row
# (by = .EACHI), look up all "A" rows of the same Individual whose datetime
# lies within +/- 300 seconds (i.e. 5 minutes) of that "B" row's datetime,
# then attach the mean of their values and their count as new columns.
# NOTE(review): the result is built on a temporary subset and only printed
# by the trailing []; assign it to a variable if you need to keep it.
df[datatype == "B", ][df[datatype == "B", ],c("mean_for_A", "number_of_A") := {
temp <- df[Individual == i.Individual & datatype == "A" &
datetime >= (i.datetime - 300) & datetime <= (i.datetime + 300), ]
list(mean(temp$value), nrow(temp))
}, by = .EACHI][, id := NULL][]
Individual datetime datatype value mean_for_A number_of_A
1: Anna 2021-11-18 07:24:04 B 5.43 8.030 3
2: Mark 2021-11-21 11:14:22 B 5.56 8.950 1
3: Mark 2021-11-21 11:22:01 B 6.85 7.415 2
这是另一种data.table方法:
library(data.table)
# convert df to data.table format (by reference)
setDT(df)
# split into one table per datatype ("A" and "B")
L <- split(df, by = "datatype")
# cartesian left join: every "A" row of an Individual paired with every
# "B" row of the same Individual ("B" columns arrive prefixed with i.)
ans <- L$A[L$B, on = .(Individual), allow.cartesian = TRUE]
# keep only "A" rows within +/- 300 seconds (5 minutes) of the "B" row
ans <- ans[abs(as.numeric(datetime) - as.numeric(i.datetime)) <= 300, ]
# summarise per "B" observation: carry the B row's datatype/value through,
# and add mean and count of the matching "A" values.
# (Fixed: the grouping column was misspelled "dateime" in the original.)
ans[, .(datatype = i.datatype[1], value = i.value[1],
        mean_for_A = mean(value), number_of_A = .N),
    by = .(Individual, datetime = i.datetime)]
Individual dateime datatype value mean_for_A number_of_A
1: Anna 2021-11-18 07:24:04 B 5.43 8.030 3
2: Mark 2021-11-21 11:14:22 B 5.56 8.950 1
3: Mark 2021-11-21 11:22:01 B 6.85 7.415 2
哪种方法运行更快取决于你的实际数据结构。但在一般的系统上,假设有足够的内存容纳数据集,这两个解决方案的运行时间都不应超过几秒钟(最多1分钟)。
使用data.table包可能快得多。例如:
## Interval join with foverlaps(): build a +/- 5 minute window around each
## "B" observation and find every "A" observation that falls inside it.

## "B" rows define the lookup windows
datB <- as.data.table(df[which(df$datatype == "B"), ])
datB$start <- datB$datetime - (5 * 60)  # window start: 5 min before
datB$end <- datB$datetime + (5 * 60)    # window end: 5 min after

## "A" rows become zero-length intervals (start == end) that probe the windows
datA <- as.data.table(df[which(df$datatype == "A"), ])
datA$datetime2 <- datA$datetime  # duplicate so there is a start and end time

## foverlaps() requires keyed interval columns on both sides
setkeyv(datA, c("Individual", "datetime", "datetime2"))
setkeyv(datB, c("Individual", "start", "end"))

## find the overlaps; nomatch = 0L drops "B" windows containing no "A" row
overlaps <- foverlaps(datA, datB, type = "any", nomatch = 0L)

## Summarise per "B" observation in a single grouped pass.
## (Fixed: the original round-tripped the grouping keys through
## paste0(..., "_") / strsplit, which coerced datetime and value to
## character, needed two aggregate() calls plus a merge(), and would break
## if any key ever contained "_". Grouping directly keeps the types.)
## i.value holds the "A" values; Individual/datetime/value identify the "B" row.
out <- overlaps[, .(meanTimes = mean(i.value), countTimes = .N),
                keyby = .(Individual, datetime, value)]
out <- as.data.frame(out)
out
输出为:
Individual datetime value meanTimes countTimes
1 Anna 2021-11-18 07:24:04 5.43 8.030 3
2 Mark 2021-11-21 11:14:22 5.56 8.950 1
3 Mark 2021-11-21 11:22:01 6.85 7.415 2