R-通过data.table进行循环的性能



hi im当前正在进行Web使用挖掘。为此,我需要循环浏览所有数据条目(204002行((每行都是一个包含时间戳和访问的页面的Web会话(,并对它们进行一些工作。这是数据的数据:

structure(list(cookie = "1", 
    paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "SYSTEM", "SYSTEM", "SYSTEM")), time = list(c("2017-05-01T00:00:00.000Z", 
    "2017-05-01T00:00:10.000Z", "2017-05-01T00:00:41.000Z", "2017-05-01T00:00:48.000Z", 
    "2017-05-01T00:03:28.000Z", "2017-05-01T00:03:40.000Z", "2017-05-01T00:03:53.000Z", 
    "2017-05-01T00:04:09.000Z", "2017-05-01T00:04:17.000Z", "2017-05-01T00:04:26.000Z", 
    "2017-05-01T00:04:30.000Z", "2017-05-01T00:04:34.000Z", "2017-05-01T00:04:40.000Z", 
    "2017-05-01T00:05:36.000Z", "2017-05-01T00:05:46.000Z", "2017-05-01T00:05:52.000Z", 
    "2017-05-01T00:06:00.000Z", "2017-05-01T00:06:38.000Z", "2017-05-01T00:06:57.000Z", 
    "2017-05-01T00:07:01.000Z")), length = 20L, durationInMin = 7.01666666666667), .Names = c("cookie", 
"paths", "time", "length", "durationInMin"), class = c("data.table", 
"data.frame"), row.names = c(NA, -1L), .internal.selfref = <pointer: 0x00000000001f0788>)

我看看是否需要将会话分为两个或多个会话。为此,我查看了会话中的每个时间戳,并将它们与本届会议中的先前时间戳进行比较。如果衍射越过边界,则会分为两个会话。结果是一个新数据。该代码有效,但非常慢(多小时(。随着时间的流逝,速度越慢。首先,我认为这是循环内的增长列表,但是我通过无需结果列表进行循环进行了检查。我的代码如下:

function(sessions) {
      durationCalc <- function(timeList) {
        last <-
          strptime(timeList[[1]][length(timeList[[1]])], format = "%Y-%m-%dT%H:%M:%S")
        first <-
          strptime(timeList[[1]][length(1)], format = "%Y-%m-%dT%H:%M:%S")
        res <- as.numeric(difftime(last, first, units = 'mins'))
      }


      id <- 1
      border <- 30
      maxCount <- nrow(sessions)
      # list for the final sessions
      finalSessions <- vector("list", maxCount)
      # iterate over every session to break down into smaller sessions
      for (i in 1:maxCount) {
        print(paste("working on session", i, "of", maxCount))
        currentStartPosition <- 1
        row <- sessions[i, ]
        sessionLength <- length(row$time[[1]])
        # if the session containts only one path/timestamp, there is no further processing required
    # if it contains two or more, each timestamp has to be checked.
        if (sessionLength < 2) {
          finalSessions[[id]] <- row
          id <- id + 1
        }
        else{
          currentTime <-
            strptime(row$time[[1]][1], format = "%Y-%m-%dT%H:%M:%S")
          for (j in 2:sessionLength) {
            nextTime = strptime(row$time[[1]][j], format = "%Y-%m-%dT%H:%M:%S")
            diff <-
              as.numeric(difftime(nextTime, currentTime, units = 'mins'))
        # if the timestamp is 30 minutes or more later the current sessions (row) gets split 
            if (diff > border) {
        # make a copy of the original row and modify values, then add the modified row to the finalSessions
        # the currentStartposition gets the currentTimestamp and the loop continues
              currentSession <- row
              currentSession$cookie = id
              currentSession$time[[1]] <-
                list(row$time[[1]][currentStartPosition:j - 1])
              currentSession$paths[[1]] <-
                list(row$paths[[1]][currentStartPosition:j - 1])
              currentSession$durationInMin <-
                durationCalc(currentSession$time)
              currentSession$length <- length(currentSession$paths[[1]])
              currentStartPosition = j
              finalSessions[[id]] <- currentSession
              id <- id + 1
            }
            # at last the currentTimestamp gets the next Time stamp, it iterates over the whole timestamp list
            currentTime = nextTime
          }
      # after the loop the final session gets built. copy the original row, modify the values and add it to the finalSessions
          currentSession <- row
          currentSession$cookie = id
          currentSession$time[[1]] <-
            list(row$time[[1]][currentStartPosition:sessionLength])
          currentSession$paths[[1]] <-
            list(row$paths[[1]][currentStartPosition:sessionLength])
          currentSession$durationInMin <-
            durationCalc(currentSession$time)
          currentSession$length <- length(currentSession$paths[[1]])
          finalSessions[[id]] <- currentSession
          id <- id + 1
        }
      }
      finalSessions <- rbindlist(finalSessions)

    }  

尝试以下:

sessions <- structure(list(cookie = "1", 
               paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
                              "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
                              "LMCash", 
                              "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
                              "LMCash", 
                              "SYSTEM", "SYSTEM", "SYSTEM")),
               time = list(c(
"2017-05-01T00:00:00.000Z",
"2017-05-01T00:00:10.000Z", 
"2017-05-01T00:00:41.000Z", 
"2017-05-01T00:00:48.000Z", 
"2017-05-01T00:03:28.000Z", 
"2017-05-01T00:03:40.000Z",
"2017-05-01T00:03:53.000Z", 
"2017-05-01T00:04:09.000Z", 
"2017-05-01T00:04:17.000Z", 
"2017-05-01T00:04:26.000Z", 
"2017-05-01T00:04:30.000Z", 
"2017-05-01T00:04:34.000Z", 
"2017-05-01T00:04:40.000Z", 
"2017-05-01T00:05:36.000Z", 
"2017-05-01T00:05:46.000Z",
"2017-05-01T00:05:52.000Z", 
"2017-05-01T00:06:00.000Z", 
"2017-05-01T00:06:38.000Z", 
"2017-05-01T00:06:57.000Z", 
"2017-05-01T00:40:01.000Z")),
length = 20L,
durationInMin = 7.01666666666667), .Names = c("cookie",
"paths", "time", "length", "durationInMin"),
class = c("data.table", "data.frame"),
row.names = c(NA, -1L))

s <- replicate(1000, sessions, simplify = F)
# str(s)
s <- rbindlist(s)

ff <- function(s) {
  dFormat <- "%Y-%m-%dT%H:%M:%S"
  durationCalc2 <- function(timeList) {
    tt <- timeList
    (tt[length(tt)] - tt[1]) / 60
  }
  id <- 1
  border <- 30
  maxCount <- nrow(s)
  finalSessions <- vector("list", maxCount)
  for (i in 1:maxCount) {
    # print(paste("working on session", i, "of", maxCount))
    cSP <- 1
    row <- s[i, ]
    TIME <- row$time[[1]]
    PATHS <- row$paths[[1]]
    sessionLength <- length(TIME)
    TIMES <- strptime(TIME, format = dFormat)
    TIMES <- as.numeric(TIMES)
    if (sessionLength < 2) {
      finalSessions[[id]] <- row
      id <- id + 1
    } else{
      # currentTime <- strptime(TIME[1], format = dFormat)
      cT2 <- TIMES[1]
      for (j in 2:sessionLength) {
        # nextTime <-  strptime(TIME[j], format = dFormat)
        nT2 <- TIMES[j]
        # diff <- as.numeric(difftime(nextTime, currentTime, units = 'mins'))
        diff <- (nT2 - cT2) /60
        if (diff > border) {
          cS <- row
          cS$cookie = id
          index <- cSP:j - 1
          cS$time[[1]] <- list(TIME[index])
          cS$paths[[1]] <- list(PATHS[index])
          cS$durationInMin <- durationCalc2(TIMES[index])
          cS$length <- length(cS$paths[[1]])
          cSP <- j
          finalSessions[[id]] <- cS
          id <- id + 1
        }
        cT2 <- nT2
      }
      cS <- row
      cS$cookie = id
      cS$time[[1]] <- list(TIME[cSP:sessionLength])
      cS$paths[[1]] <- list(PATHS[cSP:sessionLength])
      newTIMES <- TIMES[cSP:sessionLength]
      cS$durationInMin <- durationCalc2(newTIMES)
      cS$length <- length(cS$paths[[1]])
      finalSessions[[id]] <- cS
      id <- id + 1
    }
  }
  finalSessions <- rbindlist(finalSessions)
  finalSessions
}  

应该快2倍:

system.time(rez1 <- yourFunction(s)) #5.81
system.time(rez2 <- ff(s)) # 2.74
2.58 / 5.81
all.equal(rez1, rez2)

下次尝试介绍您的代码(如果您愿意的话,那么您会发现difftime很慢,并且可能会加快速度(。您提供的示例数据也很糟糕,它不包含所有代码执行的示例!

最新更新