hi im当前正在进行Web使用挖掘。为此,我需要循环浏览所有数据条目(204002行((每行都是一个包含时间戳和访问的页面的Web会话(,并对它们进行一些工作。这是数据的数据:
structure(list(cookie = "1",
paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"SYSTEM", "SYSTEM", "SYSTEM")), time = list(c("2017-05-01T00:00:00.000Z",
"2017-05-01T00:00:10.000Z", "2017-05-01T00:00:41.000Z", "2017-05-01T00:00:48.000Z",
"2017-05-01T00:03:28.000Z", "2017-05-01T00:03:40.000Z", "2017-05-01T00:03:53.000Z",
"2017-05-01T00:04:09.000Z", "2017-05-01T00:04:17.000Z", "2017-05-01T00:04:26.000Z",
"2017-05-01T00:04:30.000Z", "2017-05-01T00:04:34.000Z", "2017-05-01T00:04:40.000Z",
"2017-05-01T00:05:36.000Z", "2017-05-01T00:05:46.000Z", "2017-05-01T00:05:52.000Z",
"2017-05-01T00:06:00.000Z", "2017-05-01T00:06:38.000Z", "2017-05-01T00:06:57.000Z",
"2017-05-01T00:07:01.000Z")), length = 20L, durationInMin = 7.01666666666667), .Names = c("cookie",
"paths", "time", "length", "durationInMin"), class = c("data.table",
"data.frame"), row.names = c(NA, -1L), .internal.selfref = <pointer: 0x00000000001f0788>)
我看看是否需要将会话分为两个或多个会话。为此,我查看了会话中的每个时间戳,并将它们与本届会议中的先前时间戳进行比较。如果衍射越过边界,则会分为两个会话。结果是一个新数据。该代码有效,但非常慢(多小时(。随着时间的流逝,速度越慢。首先,我认为这是循环内的增长列表,但是我通过无需结果列表进行循环进行了检查。我的代码如下:
function(sessions) {
durationCalc <- function(timeList) {
last <-
strptime(timeList[[1]][length(timeList[[1]])], format = "%Y-%m-%dT%H:%M:%S")
first <-
strptime(timeList[[1]][length(1)], format = "%Y-%m-%dT%H:%M:%S")
res <- as.numeric(difftime(last, first, units = 'mins'))
}
id <- 1
border <- 30
maxCount <- nrow(sessions)
# list for the final sessions
finalSessions <- vector("list", maxCount)
# iterate over every session to break down into smaller sessions
for (i in 1:maxCount) {
print(paste("working on session", i, "of", maxCount))
currentStartPosition <- 1
row <- sessions[i, ]
sessionLength <- length(row$time[[1]])
# if the session containts only one path/timestamp, there is no further processing required
# if it contains two or more, each timestamp has to be checked.
if (sessionLength < 2) {
finalSessions[[id]] <- row
id <- id + 1
}
else{
currentTime <-
strptime(row$time[[1]][1], format = "%Y-%m-%dT%H:%M:%S")
for (j in 2:sessionLength) {
nextTime = strptime(row$time[[1]][j], format = "%Y-%m-%dT%H:%M:%S")
diff <-
as.numeric(difftime(nextTime, currentTime, units = 'mins'))
# if the timestamp is 30 minutes or more later the current sessions (row) gets split
if (diff > border) {
# make a copy of the original row and modify values, then add the modified row to the finalSessions
# the currentStartposition gets the currentTimestamp and the loop continues
currentSession <- row
currentSession$cookie = id
currentSession$time[[1]] <-
list(row$time[[1]][currentStartPosition:j - 1])
currentSession$paths[[1]] <-
list(row$paths[[1]][currentStartPosition:j - 1])
currentSession$durationInMin <-
durationCalc(currentSession$time)
currentSession$length <- length(currentSession$paths[[1]])
currentStartPosition = j
finalSessions[[id]] <- currentSession
id <- id + 1
}
# at last the currentTimestamp gets the next Time stamp, it iterates over the whole timestamp list
currentTime = nextTime
}
# after the loop the final session gets built. copy the original row, modify the values and add it to the finalSessions
currentSession <- row
currentSession$cookie = id
currentSession$time[[1]] <-
list(row$time[[1]][currentStartPosition:sessionLength])
currentSession$paths[[1]] <-
list(row$paths[[1]][currentStartPosition:sessionLength])
currentSession$durationInMin <-
durationCalc(currentSession$time)
currentSession$length <- length(currentSession$paths[[1]])
finalSessions[[id]] <- currentSession
id <- id + 1
}
}
finalSessions <- rbindlist(finalSessions)
}
尝试以下:
sessions <- structure(list(cookie = "1",
paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash",
"LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash",
"SYSTEM", "SYSTEM", "SYSTEM")),
time = list(c(
"2017-05-01T00:00:00.000Z",
"2017-05-01T00:00:10.000Z",
"2017-05-01T00:00:41.000Z",
"2017-05-01T00:00:48.000Z",
"2017-05-01T00:03:28.000Z",
"2017-05-01T00:03:40.000Z",
"2017-05-01T00:03:53.000Z",
"2017-05-01T00:04:09.000Z",
"2017-05-01T00:04:17.000Z",
"2017-05-01T00:04:26.000Z",
"2017-05-01T00:04:30.000Z",
"2017-05-01T00:04:34.000Z",
"2017-05-01T00:04:40.000Z",
"2017-05-01T00:05:36.000Z",
"2017-05-01T00:05:46.000Z",
"2017-05-01T00:05:52.000Z",
"2017-05-01T00:06:00.000Z",
"2017-05-01T00:06:38.000Z",
"2017-05-01T00:06:57.000Z",
"2017-05-01T00:40:01.000Z")),
length = 20L,
durationInMin = 7.01666666666667), .Names = c("cookie",
"paths", "time", "length", "durationInMin"),
class = c("data.table", "data.frame"),
row.names = c(NA, -1L))
s <- replicate(1000, sessions, simplify = F)
# str(s)
s <- rbindlist(s)
ff <- function(s) {
dFormat <- "%Y-%m-%dT%H:%M:%S"
durationCalc2 <- function(timeList) {
tt <- timeList
(tt[length(tt)] - tt[1]) / 60
}
id <- 1
border <- 30
maxCount <- nrow(s)
finalSessions <- vector("list", maxCount)
for (i in 1:maxCount) {
# print(paste("working on session", i, "of", maxCount))
cSP <- 1
row <- s[i, ]
TIME <- row$time[[1]]
PATHS <- row$paths[[1]]
sessionLength <- length(TIME)
TIMES <- strptime(TIME, format = dFormat)
TIMES <- as.numeric(TIMES)
if (sessionLength < 2) {
finalSessions[[id]] <- row
id <- id + 1
} else{
# currentTime <- strptime(TIME[1], format = dFormat)
cT2 <- TIMES[1]
for (j in 2:sessionLength) {
# nextTime <- strptime(TIME[j], format = dFormat)
nT2 <- TIMES[j]
# diff <- as.numeric(difftime(nextTime, currentTime, units = 'mins'))
diff <- (nT2 - cT2) /60
if (diff > border) {
cS <- row
cS$cookie = id
index <- cSP:j - 1
cS$time[[1]] <- list(TIME[index])
cS$paths[[1]] <- list(PATHS[index])
cS$durationInMin <- durationCalc2(TIMES[index])
cS$length <- length(cS$paths[[1]])
cSP <- j
finalSessions[[id]] <- cS
id <- id + 1
}
cT2 <- nT2
}
cS <- row
cS$cookie = id
cS$time[[1]] <- list(TIME[cSP:sessionLength])
cS$paths[[1]] <- list(PATHS[cSP:sessionLength])
newTIMES <- TIMES[cSP:sessionLength]
cS$durationInMin <- durationCalc2(newTIMES)
cS$length <- length(cS$paths[[1]])
finalSessions[[id]] <- cS
id <- id + 1
}
}
finalSessions <- rbindlist(finalSessions)
finalSessions
}
应该快2倍:
system.time(rez1 <- yourFunction(s)) #5.81
system.time(rez2 <- ff(s)) # 2.74
2.58 / 5.81
all.equal(rez1, rez2)
下次尝试介绍您的代码(如果您愿意的话,那么您会发现difftime
很慢,并且可能会加快速度(。您提供的示例数据也很糟糕,它不包含所有代码执行的示例!