r-在数据表中对n个或多个观测值进行分组,而不交叉计算连续值的序列



在R中,我有以下示例数据表:

library(data.table)
type <- c("d1", "d1", "d2", "d3", "d3", "d3", "d4", "d4", "d4", "d4", "d4", "d5", "d5", "d6", "d6")
DT <- data.table(type)
DT[, id := seq(.N), by = .(type)]

看起来像这样:

# Input:
#
type id
1:   d1  1
2:   d1  2
3:   d2  1
4:   d3  1
5:   d3  2
6:   d3  3
7:   d4  1
8:   d4  2
9:   d4  3
10:   d4  4
11:   d4  5
12:   d5  1
13:   d5  2
14:   d6  1
15:   d6  2

我想通过添加一个新列,将type列中的观察结果分组为五个组块,每个组块包含一个唯一的ID。然而,type列中具有相同值的序列被而不是分配给不同的组ID,这意味着一个块可能包含五个以上的元素。换句话说,我试图实现的是添加一列chunk,该列具有一个计数器,该计数器在计数了五个元素后增加+1,并且来自type列的最后一个连续相同值序列完成。因此,所需输出为:

# Desired output
type id chunk
1:   d1  1     1
2:   d1  2     1
3:   d2  1     1
4:   d3  1     1
5:   d3  2     1
6:   d3  3     1
7:   d4  1     2
8:   d4  2     2
9:   d4  3     2
10:   d4  4     2
11:   d4  5     2
12:   d5  1     3
13:   d5  2     3
14:   d6  1     3
15:   d6  2     3

欢迎任何建议和帮助,尤其是矢量化解决方案。事先非常感谢。

DT[, grp := .GRP, type]

i <- 1
DT[1:5, chunk := i] # set chunk = i for first five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
while((last.I <- DT[, last(.I[!is.na(chunk)])]) < nrow(DT)){
i <- i + 1
DT[last.I + seq(min(c(5, nrow(DT) - last.I))), chunk := i] # set chunk = i for next five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
}
DT[, grp := NULL][]
#     type id chunk
#  1:   d1  1     1
#  2:   d1  2     1
#  3:   d2  1     1
#  4:   d3  1     1
#  5:   d3  2     1
#  6:   d3  3     1
#  7:   d4  1     2
#  8:   d4  2     2
#  9:   d4  3     2
# 10:   d4  4     2
# 11:   d4  5     2
# 12:   d5  1     3
# 13:   d5  2     3
# 14:   d6  1     3
# 15:   d6  2     3

@Frank在评论中发布了一个更简单的解决方案

gDT = DT[, .N, by=type][, g := 1L]
s = first(gDT$N)
gg = 1L
for (ii in 1:nrow(gDT)){
if (s >= 5){ 
s = 0 
gg = gg + 1L
gDT[ii:.N, g := gg][]
} 
else s = s + gDT$N[ii]
}
DT[gDT, on=.(type), g := i.g]

添加Rcpp方法和一些定时:

样本数据:

library(Rcpp)
library(data.table)
library(microbenchmark)
set.seed(0L)
ntypes <- 1e4L
x <- unlist(mapply(rep, 1:ntypes, sample(1:10, ntypes, replace=TRUE)))
DT <- data.table(type=x)
DT1 <- copy(DT)

函数defn:

system.time(cppFunction(
'NumericVector lumpGrp(NumericVector x) {
int counter = 1, grp = 1;
NumericVector ret(x.size());
ret[0] = grp;
for (int n = 1; n < x.size(); n++) {
if (counter >= 5 && x[n] != x[n-1]) {
counter = 1;
grp += 1;
} else {
counter += 1;
}
ret[n] = grp;
}
return ret;
}'))

mtd0 <- function() {
gDT = DT1[, .N, by=type][, g := 1L]
s = first(gDT$N)
gg = 1L
for (ii in 1:nrow(gDT)){
if (s >= 5){
s = 0
gg = gg + 1L
gDT[ii:.N, g := gg][]
}
else s = s + gDT$N[ii]
}
DT1[gDT, on=.(type), g := i.g]
DT1
}
mtd1 <- function() {
DT[, grp := .GRP, type][]
i <- 1
DT[1:5, chunk := i] # set chunk = i for first five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
while((last.I <- DT[, last(.I[!is.na(chunk)])]) < nrow(DT)){
i <- i + 1
DT[last.I + seq(min(c(5, nrow(DT) - last.I))), chunk := i] # set chunk = i for next five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
}
DT[, grp := NULL][]
}

检查:

#identical(mtd0()$g, mtd1()$chunk)
#identical(mtd0()$g, lumpGrp(x))
head(x, 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 3 3 3 3 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 7 7 7 7 7 7
#[42] 7 7 7 8 8 8 8 8 8

head(mtd0()$g, 50)
# [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4
#[42] 4 4 4 5 5 5 5 5 5
head(mtd1()$chunk, 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5
#[42] 5 5 5 6 6 6 6 6 6
head(lumpGrp(x), 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5
#[42] 5 5 5 6 6 6 6 6 6

tail(mtd1()$chunk, 20)
#[1] 6807 6807 6807 6808 6808 6808 6808 6808 6809 6809 6809 6809 6809 6809    1    1    1    1    1    1
tail(lumpGrp(x), 20)
#[1] 6807 6807 6807 6808 6808 6808 6808 6808 6809 6809 6809 6809 6809 6809 6810 6810 6810 6810 6810 6810

定时码:

mtd0 <- compiler::cmpfun(mtd0)
mtd1 <- compiler::cmpfun(mtd1)
microbenchmark(mtd0(), mtd1(), lumpGrp(x), times=3L)

时间:

Unit: microseconds
expr         min           lq         mean      median          uq         max neval
mtd0() 2930021.679 2952683.7490 2972758.5293 2975345.819 2994126.954 3012908.090     3
mtd1()    7306.673    7573.0050    7763.6367    7839.337    7992.119    8144.900     3
lumpGrp(x)     431.032     431.3635     476.6073     431.695     499.395     567.095     3

一个选项是

DT[, chunk := cumsum(shift(!cumsum(id != shift(id, fill = id[2])) %% 5,
fill = TRUE))]
DT
#    type id chunk
# 1:   d1  1     1
# 2:   d1  2     1
# 3:   d2  1     1
# 4:   d3  1     1
# 5:   d3  2     1
# 6:   d3  3     1
# 7:   d4  1     2
# 8:   d4  2     2
# 9:   d4  3     2
#10:   d4  4     2
#11:   d4  5     2
#12:   d5  1     3
#13:   d5  2     3
#14:   d6  1     3
#15:   d6  2     3

最新更新