在R中,我有以下示例数据表:
library(data.table)
type <- c("d1", "d1", "d2", "d3", "d3", "d3", "d4", "d4", "d4", "d4", "d4", "d5", "d5", "d6", "d6")
DT <- data.table(type)
DT[, id := seq(.N), by = .(type)]
看起来像这样:
# Input:
#
type id
1: d1 1
2: d1 2
3: d2 1
4: d3 1
5: d3 2
6: d3 3
7: d4 1
8: d4 2
9: d4 3
10: d4 4
11: d4 5
12: d5 1
13: d5 2
14: d6 1
15: d6 2
我想通过添加一个新列,将type
列中的观察结果分组为五个组块,每个组块包含一个唯一的ID。然而,type
列中具有相同值的序列被而不是分配给不同的组ID,这意味着一个块可能包含五个以上的元素。换句话说,我试图实现的是添加一列chunk
,该列具有一个计数器,该计数器在计数了五个元素后增加+1,并且来自type
列的最后一个连续相同值序列完成。因此,所需输出为:
# Desired output
type id chunk
1: d1 1 1
2: d1 2 1
3: d2 1 1
4: d3 1 1
5: d3 2 1
6: d3 3 1
7: d4 1 2
8: d4 2 2
9: d4 3 2
10: d4 4 2
11: d4 5 2
12: d5 1 3
13: d5 2 3
14: d6 1 3
15: d6 2 3
欢迎任何建议和帮助,尤其是矢量化解决方案。事先非常感谢。
DT[, grp := .GRP, type]
i <- 1
DT[1:5, chunk := i] # set chunk = i for first five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
while((last.I <- DT[, last(.I[!is.na(chunk)])]) < nrow(DT)){
i <- i + 1
DT[last.I + seq(min(c(5, nrow(DT) - last.I))), chunk := i] # set chunk = i for next five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
}
DT[, grp := NULL][]
# type id chunk
# 1: d1 1 1
# 2: d1 2 1
# 3: d2 1 1
# 4: d3 1 1
# 5: d3 2 1
# 6: d3 3 1
# 7: d4 1 2
# 8: d4 2 2
# 9: d4 3 2
# 10: d4 4 2
# 11: d4 5 2
# 12: d5 1 3
# 13: d5 2 3
# 14: d6 1 3
# 15: d6 2 3
@Frank在评论中发布了一个更简单的解决方案
gDT = DT[, .N, by=type][, g := 1L]
s = first(gDT$N)
gg = 1L
for (ii in 1:nrow(gDT)){
if (s >= 5){
s = 0
gg = gg + 1L
gDT[ii:.N, g := gg][]
}
else s = s + gDT$N[ii]
}
DT[gDT, on=.(type), g := i.g]
添加Rcpp
方法和一些定时:
样本数据:
library(Rcpp)
library(data.table)
library(microbenchmark)
set.seed(0L)
ntypes <- 1e4L
x <- unlist(mapply(rep, 1:ntypes, sample(1:10, ntypes, replace=TRUE)))
DT <- data.table(type=x)
DT1 <- copy(DT)
函数defn:
system.time(cppFunction(
'NumericVector lumpGrp(NumericVector x) {
int counter = 1, grp = 1;
NumericVector ret(x.size());
ret[0] = grp;
for (int n = 1; n < x.size(); n++) {
if (counter >= 5 && x[n] != x[n-1]) {
counter = 1;
grp += 1;
} else {
counter += 1;
}
ret[n] = grp;
}
return ret;
}'))
mtd0 <- function() {
gDT = DT1[, .N, by=type][, g := 1L]
s = first(gDT$N)
gg = 1L
for (ii in 1:nrow(gDT)){
if (s >= 5){
s = 0
gg = gg + 1L
gDT[ii:.N, g := gg][]
}
else s = s + gDT$N[ii]
}
DT1[gDT, on=.(type), g := i.g]
DT1
}
mtd1 <- function() {
DT[, grp := .GRP, type][]
i <- 1
DT[1:5, chunk := i] # set chunk = i for first five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
while((last.I <- DT[, last(.I[!is.na(chunk)])]) < nrow(DT)){
i <- i + 1
DT[last.I + seq(min(c(5, nrow(DT) - last.I))), chunk := i] # set chunk = i for next five rows
DT[grp == last(grp[!is.na(chunk)]), chunk := i] # make chunk = i for any rows with same type
}
DT[, grp := NULL][]
}
检查:
#identical(mtd0()$g, mtd1()$chunk)
#identical(mtd0()$g, lumpGrp(x))
head(x, 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 3 3 3 3 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 7 7 7 7 7 7
#[42] 7 7 7 8 8 8 8 8 8
head(mtd0()$g, 50)
# [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4
#[42] 4 4 4 5 5 5 5 5 5
head(mtd1()$chunk, 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5
#[42] 5 5 5 6 6 6 6 6 6
head(lumpGrp(x), 50)
# [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5
#[42] 5 5 5 6 6 6 6 6 6
tail(mtd1()$chunk, 20)
#[1] 6807 6807 6807 6808 6808 6808 6808 6808 6809 6809 6809 6809 6809 6809 1 1 1 1 1 1
tail(lumpGrp(x), 20)
#[1] 6807 6807 6807 6808 6808 6808 6808 6808 6809 6809 6809 6809 6809 6809 6810 6810 6810 6810 6810 6810
定时码:
mtd0 <- compiler::cmpfun(mtd0)
mtd1 <- compiler::cmpfun(mtd1)
microbenchmark(mtd0(), mtd1(), lumpGrp(x), times=3L)
时间:
Unit: microseconds
expr min lq mean median uq max neval
mtd0() 2930021.679 2952683.7490 2972758.5293 2975345.819 2994126.954 3012908.090 3
mtd1() 7306.673 7573.0050 7763.6367 7839.337 7992.119 8144.900 3
lumpGrp(x) 431.032 431.3635 476.6073 431.695 499.395 567.095 3
一个选项是
DT[, chunk := cumsum(shift(!cumsum(id != shift(id, fill = id[2])) %% 5,
fill = TRUE))]
DT
# type id chunk
# 1: d1 1 1
# 2: d1 2 1
# 3: d2 1 1
# 4: d3 1 1
# 5: d3 2 1
# 6: d3 3 1
# 7: d4 1 2
# 8: d4 2 2
# 9: d4 3 2
#10: d4 4 2
#11: d4 5 2
#12: d5 1 3
#13: d5 2 3
#14: d6 1 3
#15: d6 2 3