R - data.table 中 POSIXct 的高效比较



你好,我正在寻找一种高效的方法,从 data.table 中选出一天中的时间早于 12:00:00 的 POSIXct 行(注意,不需要毫秒,因此例如可以使用 ITime)。

# Reproducible sample: N timestamps scattered around the epoch, stored in GMT.
set.seed(1)
N <- 1e7
DT <- data.table(dts = .POSIXct(1e5 * rnorm(N), tz = "GMT"))
DT
                               dts
#       1: 1969-12-31 06:35:54.618925
#       2: 1970-01-01 05:06:04.332422
#     ---                           
# 9999999: 1970-01-03 00:37:00.035565
#10000000: 1969-12-30 08:30:23.624506

一种解决方案(这里的问题是,如果 N 很大,类型转换的开销可能很高):

# f: logical mask marking the elements of `t` whose time of day lies in
# [st, et], both ends inclusive.
#   t      - values convertible with as.ITime()
#   st, et - lower / upper time-of-day bounds, convertible with as.ITime()
# Returns a logical vector with one element per element of `t`.
f <- function(t, st, et) {
  time_of_day <- as.ITime(t)
  lower <- as.ITime(st)
  upper <- as.ITime(et)
  time_of_day >= lower & time_of_day <= upper
}
# P: logical mask selecting the elements of `t` that fall inside the range
# described by the ISO8601-style string `s` (geekTrader solution).
#   t - timestamps (compared directly, or reduced to seconds within the day)
#   s - range string handed to .parseISO8601(); a pure time-of-day range has
#       the form 'Thh:mm:ss/Thh:mm:ss'
# Returns a logical vector with one element per element of `t`.
P <- function(t, s) {
  bounds <- .parseISO8601(s)
  is_time_of_day <- grepl('T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}', s)

  if (!is_time_of_day) {
    # Date / datetime range: compare the timestamps with the parsed bounds.
    return(t >= bounds$first.time & t <= bounds$last.time)
  }

  # Time-of-day range: compare seconds within the day (86400 s per day).
  lower <- as.double(bounds$first.time)
  # 31449600 = 364 * 86400. NOTE(review): presumably .parseISO8601() places the
  # end of a time-only range 364 days after its start -- confirm against xts.
  upper <- as.double(bounds$last.time) - 31449600
  sec_of_day <- as.double(t) %% 86400
  sec_of_day >= lower & sec_of_day <= upper
}

快速查看性能

system.time(resf <- DT[f(dts,'00:00:00','11:59:59')])
   user  system elapsed 
   1.01    0.28    1.29
system.time(resP <- DT[P(dts,'T00:00:00/T11:59:59')])
   user  system elapsed 
   0.64    0.13    0.76 
identical(resf,resP)
[1] TRUE
 # P: xts-style range filter. Returns a logical vector, TRUE where the
 # corresponding element of `t` lies inside the range string `s`.
 #   t - timestamps to test
 #   s - range string parsed by .parseISO8601(); 'Thh:mm:ss/Thh:mm:ss' is
 #       treated as a time-of-day window, anything else as an absolute range
 P <- function(t, s) {
   parsed <- .parseISO8601(s)
   time_only <- grepl('T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}', s)

   if (!time_only) {
     # Absolute range: direct comparison against the parsed endpoints.
     return(t >= parsed$first.time & t <= parsed$last.time)
   }

   # Time-of-day window: work in seconds since midnight (86400 s per day).
   window_start <- as.double(parsed$first.time)
   # 31449600 = 364 * 86400. NOTE(review): looks like the parsed end of a
   # time-only range sits 364 days after the start -- confirm against xts.
   window_end <- as.double(parsed$last.time) - 31449600
   secs <- as.double(t) %% 86400
   secs >= window_start & secs <= window_end
 }
# F: ITime-based filter. Returns a logical vector, TRUE where the time of day
# of `t` lies in [st, et], both ends inclusive.
#   t      - values convertible with as.ITime()
#   st, et - lower / upper bounds, convertible with as.ITime()
# NOTE(review): defining `F` masks R's built-in `F` shorthand for FALSE.
F <- function(t, st, et) {
  tod <- as.ITime(t)
  from <- as.ITime(st)
  to <- as.ITime(et)
  tod >= from & tod <= to
}

 # Run the session in GMT, then build the reproducible sample table:
 # N timestamps scattered around the epoch, stored in GMT.
 Sys.setenv(TZ = "GMT")
 N <- 1e7
 set.seed(1)
 DT <- data.table(dts = .POSIXct(1e5 * rnorm(N), tz = "GMT"))

 system.time(resP <- DT[P(dts, 'T00:00:00/T12:00:00'), ])
##   user  system elapsed 
##   1.11    0.11    1.22 
 system.time(resF <- DT[F(dts,'00:00:00','12:00:00')])
##   user  system elapsed 
##   2.22    0.29    2.51 
 resP
##                         dts
##      1: 1969-12-31 06:35:54
##      2: 1970-01-01 05:06:04
##      3: 1969-12-31 00:47:17
##      4: 1970-01-01 09:09:10
##      5: 1969-12-31 01:12:33
##     ---                    
##5000672: 1970-01-01 06:08:15
##5000673: 1970-01-01 05:02:27
##5000674: 1969-12-31 02:25:24
##5000675: 1970-01-03 00:37:00
##5000676: 1969-12-30 08:30:23
 resF
##                         dts
##      1: 1969-12-31 06:35:54
##      2: 1970-01-01 05:06:04
##      3: 1969-12-31 00:47:17
##      4: 1970-01-01 09:09:10
##      5: 1969-12-31 01:12:33
##     ---                    
##5000672: 1970-01-01 06:08:15
##5000673: 1970-01-01 05:02:27
##5000674: 1969-12-31 02:25:24
##5000675: 1970-01-03 00:37:00
##5000676: 1969-12-30 08:30:23
 # Check the correctness: show the latest timestamp kept for each date.
 # The column holds max(dts), so it is labelled maxdts (it was mislabelled mindts).
 resP[, list(maxdts = max(dts)), by = list(as.Date(dts))]
##       as.Date              maxdts
## 1: 1969-12-31 1969-12-31 12:00:00
## 2: 1970-01-01 1970-01-01 12:00:00
## 3: 1969-12-29 1969-12-29 12:00:00
## 4: 1970-01-02 1970-01-02 12:00:00
## 5: 1969-12-30 1969-12-30 12:00:00
## 6: 1970-01-03 1970-01-03 12:00:00
## 7: 1970-01-04 1970-01-04 11:59:59
## 8: 1970-01-05 1970-01-05 11:59:45
## 9: 1969-12-28 1969-12-28 12:00:00
##10: 1969-12-27 1969-12-27 11:59:21
##11: 1970-01-06 1970-01-06 10:53:21
##12: 1969-12-26 1969-12-26 10:15:03
##13: 1970-01-07 1970-01-07 08:21:55
 # Latest timestamp kept for each date in resF. The column holds max(dts),
 # so it is labelled maxdts (it was mislabelled mindts).
 resF[, list(maxdts = max(dts)), by = list(as.Date(dts))]
##       as.Date              maxdts
## 1: 1969-12-31 1969-12-31 12:00:00
## 2: 1970-01-01 1970-01-01 12:00:00
## 3: 1969-12-29 1969-12-29 12:00:00
## 4: 1970-01-02 1970-01-02 12:00:00
## 5: 1969-12-30 1969-12-30 12:00:00
## 6: 1970-01-03 1970-01-03 12:00:00
## 7: 1970-01-04 1970-01-04 11:59:59
## 8: 1970-01-05 1970-01-05 11:59:45
## 9: 1969-12-28 1969-12-28 12:00:00
##10: 1969-12-27 1969-12-27 11:59:21
##11: 1970-01-06 1970-01-06 10:53:21
##12: 1969-12-26 1969-12-26 10:15:03
##13: 1970-01-07 1970-01-07 08:21:55

现在展示一些漂亮的xts风格的子集

 DT[P(dts, '1970')]
##                         dts
##      1: 1970-01-01 05:06:04
##      2: 1970-01-02 20:18:48
##      3: 1970-01-01 09:09:10
##      4: 1970-01-01 13:32:22
##      5: 1970-01-01 20:30:32
##     ---                    
##5001741: 1970-01-02 15:51:12
##5001742: 1970-01-03 01:41:31
##5001743: 1970-01-01 06:08:15
##5001744: 1970-01-01 05:02:27
##5001745: 1970-01-03 00:37:00
 DT[P(dts, '197001')]
##                         dts
##      1: 1970-01-01 05:06:04
##      2: 1970-01-02 20:18:48
##      3: 1970-01-01 09:09:10
##      4: 1970-01-01 13:32:22
##      5: 1970-01-01 20:30:32
##     ---                    
##5001741: 1970-01-02 15:51:12
##5001742: 1970-01-03 01:41:31
##5001743: 1970-01-01 06:08:15
##5001744: 1970-01-01 05:02:27
##5001745: 1970-01-03 00:37:00
 DT[P(dts, '19700102')]
##                         dts
##      1: 1970-01-02 20:18:48
##      2: 1970-01-02 17:59:38
##      3: 1970-01-02 07:14:53
##      4: 1970-01-02 02:13:03
##      5: 1970-01-02 01:31:37
##     ---                    
##1519426: 1970-01-02 11:25:24
##1519427: 1970-01-02 10:00:21
##1519428: 1970-01-02 05:21:25
##1519429: 1970-01-02 05:11:26
##1519430: 1970-01-02 15:51:12
 DT[P(dts, '19700102 00:00:00/19700103 12:00:00')]
##                         dts
##      1: 1970-01-02 20:18:48
##      2: 1970-01-02 17:59:38
##      3: 1970-01-02 07:14:53
##      4: 1970-01-02 02:13:03
##      5: 1970-01-02 01:31:37
##     ---                    
##1785762: 1970-01-02 05:21:25
##1785763: 1970-01-02 05:11:26
##1785764: 1970-01-02 15:51:12
##1785765: 1970-01-03 01:41:31
##1785766: 1970-01-03 00:37:00
 #Check the correctness again
 DT[P(dts, '19700102 00:00:00/19700103 12:00:00'), max(dts)]
##[1] "1970-01-03 12:00:00 GMT"
 DT[P(dts, '19700102 00:00:00/19700103 12:00:00'), min(dts)]
##[1] "1970-01-02 00:00:00 GMT"

执行此操作的规范方法是转换为POSIXlt并提取小时组件。

# Logical mask: TRUE where the hour of day of DT$dts, converted to POSIXlt in
# the GMT time zone, is below 12.
# NOTE(review): hour() is not base R; presumably data.table::hour -- confirm
# which package provides it in this session.
hour(as.POSIXlt(DT$dts, "GMT")) < 12

这似乎在性能上与所讨论的其他技术相当(并且更容易理解)。

以下是一种使用 xts 中的一些功能来实现您想要的功能的方法。这不是一个很好的解决方案,因为 xts 对象必须按时间排序,而 data.table 对象不必按时间排序。此外,由于 xts 和 data.table 做了一些重复的工作,所以速度可能不会太快。尽管如此,我觉得这可能很有趣。

library(data.table)
library(xts)
set.seed(1)
N <- 1e5
# The timestamps carry an explicit GMT zone so the example reproduces in
# other local timezones.
DT <- data.table(dts = .POSIXct(1e5 * rnorm(N), tz = "GMT"))
# Sort on time first so that the xts object created next has the same row
# order as DT.
setkey(DT, dts)
# Add a dummy xts object (all NA, indexed by dts) as a column.
DT[, XTS := xts(rep(NA, .N), dts)]
# which = TRUE asks the xts subset for row positions; those positions then
# index DT, and only the dts column is kept.
DT[XTS["T00:00:00/T11:59:59.999999", which = TRUE]][, list(dts)]
                       dts
    1: 1969-12-27 00:28:41
    2: 1969-12-27 00:34:00
    3: 1969-12-27 03:11:21
    4: 1969-12-27 04:20:27
    5: 1969-12-28 00:00:21
   ---                    
49825: 1970-01-05 08:05:22
49826: 1970-01-05 09:35:32
49827: 1970-01-05 09:49:49
49828: 1970-01-05 09:50:27
49829: 1970-01-05 11:07:32

上面使用 xts 风格的取子集字符串来获取每天时间在 00:00:00 到 12:00:00 之间的行。使用 which=TRUE 返回行号,而不是该行的数据,这样我们就可以用这些行号对 data.table 取子集。

您可以使用类似 "1970-01-01" 的字符串来获取当天的所有数据,或使用 "1970-01" 来获取 1970 年 1 月的所有数据;或使用 "1970-01-01/1970-01-02" 来获取这两天的所有行。

这是一个较晚的回答,但我认为 as.POSIXlt 方案会创建一个由命名向量组成的列表,而您只需要其中的小时。

我会以 ITime 列作为键(key),然后使用二分查找选出中午 12 点之前的时间。

中午 12 点之前共有 60*60*12 - 1 = 43199 秒,因此 seq_len(43199) 会返回直到(但不包括)中午 12 点的所有秒数。注意 seq_len(43199) 从 1 开始,因此不包含恰好为午夜 00:00:00(即第 0 秒)的时间。

# Create IDate and ITime columns from dts, then key the table by time of day
# so the subset below is a binary-search join on Time.
DT[, c("Date", "Time") := IDateTime(dts)]
setkey(DT, Time)
# Subset times before 12pm. ITime counts whole seconds since midnight, so the
# range has to start at 0 (00:00:00); seq_len(43199) started at 1 and silently
# dropped rows at exactly midnight.
# nomatch = 0L omits seconds that have no row instead of returning NA rows.
DT[.(0:43199), nomatch = 0L]

最新更新