r-查找时间序列数据在指定间隔内的最小值



我有一组实验室检验值,想按时间把它们与入院日期关联起来。每个患者的检验/随访时间条目各不相同。我的目标是在入院日期(df 中的 Date_One 列)之后的不同时间间隔内(即第0-30天、第31-90天、1-2年、2-3年、3-4年等,直到最后一次随访),确定该检验的最小值,以帮助我识别高于基线某个阈值的异常值。由于这个检验值会随时间自然变化,我想找到这些最小值来建立新的基线。由于每个患者的随访时长不一,可能长达20年,我很难找到一个函数来求这些局部最小值,而不必用 filter 和 mutate 为我想要的每个间隔分别创建一个新列。我的 dput 输出如下,如果格式不正确,请告诉我!

structure(list(lab_date = structure(c(10006, 10007, 10008, 10009, 
10010, 10011, 10012, 10013, 10014, 10015, 10016, 10018, 10019, 
10020, 10021, 10022, 10023, 10024, 10025, 10026, 10099, 10225, 
10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857, 10858, 
10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199, 
11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226, 
11228, 11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281, 
11282, 11282, 11309, 11310, 11338, 11339, 11372, 11373, 11401, 
11499, 11536, 11564, 11582, 11597, 11598, 11625, 11660, 11663, 
11664, 11665, 11666, 11667, 11668, 11695, 11696, 11697, 11698, 
11699, 11700, 11701, 11723, 11729, 11730, 11731, 11732, 11733, 
11734, 11735, 11736, 11737, 11765, 11828), class = "Date"), lab_value = c(1.1, 
1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2, 1.2, 1.5, 1.3, 1.1, 
1.1, 1.1, 1, 1, 1, 1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2, 
1.2, 1.7, 1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7, 2.1, 
2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8, 1.9, 2, 1.6, 1.8, 2, 
2.1, 1.9, 1.8, 1.7, 1.8, 1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1, 
2, 1.9, 2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2, 2.4, 2.2, 2.2, 
2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3, 2.3, 2.5, 2.6, 3.1, 3.2, 3.4, 
3.6, 3.3, 3.1, 3), ID = c(182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182), Date_One = structure(c(10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
10856, 10856, 10856, 10856, 10856, 10856), class = "Date")), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
ID = 182, .rows = structure(list(1:100), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE))

以下是使用 tidyverse 的一种可能做法(但我不确定您希望的输出格式):

library(tidyverse)
# Minimum lab value per patient within each interval AFTER admission.
# `years` is the time elapsed from admission (Date_One) to the lab draw, so
# labs taken before admission are negative and are dropped by the filter.
# NOTE: the earlier version of this snippet computed Date_One - lab_date
# (i.e. time BEFORE admission) and used a 60-day break, so any output printed
# from that version will not match what this code returns.
df %>%
  mutate(
    years = as.numeric(difftime(lab_date, Date_One, units = "days")) / 365
  ) %>%
  filter(years >= 0) %>%
  group_by(
    # Interval edges in years: day 30, day 90, then yearly up to year 20;
    # the trailing Inf keeps any later follow-up out of an NA bin.
    gr = cut(years, breaks = c(-Inf, 30 / 365, 90 / 365, seq(1, 20, by = 1), Inf)),
    ID
  ) %>%
  # .groups = "drop" returns an ungrouped tibble so later steps are not
  # silently evaluated per interval.
  summarise(lab_value = min(lab_value), .groups = "drop")

输出

gr               ID lab_value
<fct>         <dbl>     <dbl>
1 (-Inf,0.0822]   182       1.2
2 (0.164,1]       182       1.2
3 (1,2]           182       1.2
4 (2,3]           182       1  

这样的做法怎么样?它让您以天为单位指定各分段的断点(也可以很容易地改成月或其他时间单位,但需要相应修改其余代码),然后对每一个分段,筛选出日期落在相应断点范围内的行,再找出其中的最小值。如果某个分段内没有值,则返回 NA。这应该适用于您提供的数据;如果您想将其应用于包含多个 ID 的数据框,请告诉我,应该只需要额外加一个小循环。

#Convert object to dataframe
Data=data.frame(structure(list(lab_date = structure(c(10006, 10007, 10008, 10009, 
10010, 10011, 10012, 10013, 10014, 10015, 10016, 10018, 10019, 
10020, 10021, 10022, 10023, 10024, 10025, 10026, 10099, 10225, 
10242, 10361, 10575, 10729, 10785, 10849, 10856, 10857, 10858, 
10859, 10872, 10975, 11071, 11151, 11179, 11197, 11198, 11199, 
11201, 11202, 11203, 11204, 11206, 11207, 11208, 11210, 11226, 
11228, 11229, 11230, 11254, 11256, 11257, 11258, 11270, 11281, 
11282, 11282, 11309, 11310, 11338, 11339, 11372, 11373, 11401, 
11499, 11536, 11564, 11582, 11597, 11598, 11625, 11660, 11663, 
11664, 11665, 11666, 11667, 11668, 11695, 11696, 11697, 11698, 
11699, 11700, 11701, 11723, 11729, 11730, 11731, 11732, 11733, 
11734, 11735, 11736, 11737, 11765, 11828), class = "Date"), lab_value = c(1.1, 
                                                          1, 1.1, 1.8, 2.3, 2.4, 1.3, 1.3, 1.2, 1.2, 1.2, 1.5, 1.3, 1.1, 
                                                          1.1, 1.1, 1, 1, 1, 1, 1.2, 1.2, 1.2, 1.2, 1.2, 1.2, 1.3, 1.2, 
                                                          1.2, 1.7, 1.7, 1.7, 1.8, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6, 1.7, 2.1, 
                                                          2.1, 2.5, 2.6, 2.7, 2.6, 2.3, 2, 2, 1.8, 1.9, 2, 1.6, 1.8, 2, 
                                                          2.1, 1.9, 1.8, 1.7, 1.8, 1.9, 1.8, 2.1, 1.9, 1.9, 1.9, 2.1, 2.1, 
                                                          2, 1.9, 2.1, 2, 2, 2, 2.1, 2, 1.8, 1.8, 2, 2.2, 2.4, 2.2, 2.2, 
                                                          2.1, 1.9, 2.1, 2.2, 2.4, 2.4, 2.3, 2.3, 2.5, 2.6, 3.1, 3.2, 3.4, 
                                                          3.6, 3.3, 3.1, 3), ID = c(182, 182, 182, 182, 182, 182, 182, 
                                                                                    182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
                                                                                    182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
                                                                                    182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
                                                                                    182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
                                                                                    182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
                                                                                    182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
                                                                                    182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
                                                                                    182, 182), Date_One = structure(c(10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 10856, 
                                                                                                                      10856, 10856, 10856, 10856, 10856, 10856), class = "Date")), class = c("grouped_df", 
                                                                                                                                                                                             "tbl_df", "tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
                                                                                                                                                                                               ID = 182, .rows = structure(list(1:100), ptype = integer(0), class = c("vctrs_list_of", 
                                                                                                                                                                                                                                                                      "vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df", 
                                                                                                                                                                                                                                                                                                                                 "tbl", "data.frame"), .drop = TRUE)))
# Segment start offsets, expressed in days
SegmentBreaks <- c(0, 30, 90, 365, 730)
# Find the minimum lab value per ID within consecutive time segments.
#
# Arguments:
#   Data          Data frame with columns `ID`, `lab_date` (class Date) and
#                 `lab_value` (numeric).
#   SegmentBreaks Numeric vector of segment start offsets in days, measured
#                 from each ID's baseline date. Segment i covers
#                 [break i, break i + 1); the last segment runs from the
#                 final break up to (but excluding) today's date.
#   BaselineCol   Optional name of a Date column in `Data` (for example
#                 "Date_One") to use as each ID's baseline. When NULL (the
#                 default) the ID's earliest `lab_date` is the baseline.
#
# Returns a data frame with one row per unique ID: column `ID` followed by
# one `MinVals` column per segment (named MinVals.1, MinVals.2, ... when
# there is more than one segment). A cell is NA when the ID has no
# non-missing lab value in that segment.
MinAtSegments <- function(Data, SegmentBreaks, BaselineCol = NULL) {
  required_cols <- c("ID", "lab_date", "lab_value", BaselineCol)
  missing_cols <- setdiff(required_cols, names(Data))
  if (length(missing_cols) > 0) {
    stop(
      "Data is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  unique_ids <- unique(Data$ID)
  n_segments <- length(SegmentBreaks)
  # NA_real_ keeps every output column numeric even when a segment is empty
  # for all IDs.
  min_vals <- matrix(NA_real_, nrow = length(unique_ids), ncol = n_segments)
  today <- Sys.Date()

  for (j in seq_along(unique_ids)) {
    # which() drops rows whose ID comparison is NA instead of returning
    # all-NA rows.
    id_rows <- Data[which(Data$ID == unique_ids[j]), ]

    baseline_dates <- if (is.null(BaselineCol)) {
      id_rows$lab_date
    } else {
      id_rows[[BaselineCol]]
    }
    baseline_dates <- baseline_dates[!is.na(baseline_dates)]
    if (length(baseline_dates) == 0) {
      # No usable baseline for this ID: leave its row as NA.
      next
    }

    # Breaks are anchored per ID so that patients with different timelines
    # are not all measured from one global start date.
    date_breaks <- c(min(baseline_dates) + SegmentBreaks, today)

    # Iterate over segments (columns), not over every cell of the matrix.
    for (i in seq_len(n_segments)) {
      in_segment <- which(
        id_rows$lab_date >= date_breaks[i] &
          id_rows$lab_date < date_breaks[i + 1]
      )
      lab_vals <- id_rows$lab_value[in_segment]
      lab_vals <- lab_vals[!is.na(lab_vals)]
      if (length(lab_vals) > 0) {
        min_vals[j, i] <- min(lab_vals)
      }
    }
  }

  data.frame("ID" = unique_ids, "MinVals" = min_vals)
}
# Apply the function to the example data using the breaks defined above
MinAtSegments(Data = Data, SegmentBreaks = SegmentBreaks)

最新更新