我有如下示例数据:
library(data.table)
sample <- fread("
1,0,2,NA,cat X, type 1
3,4,3,1,cat X, type 2
1,0,2,2,cat X, type 3
3,4,3,0,cat X, type 4
1,0,2,NA,cat Y, type 1
3,4,3,NA,cat Y, type 2
1,0,2,2,cat Y, type 3
3,4,3,35,cat Y, type 4
1,0,2,NA,cat X, type 1
3,4,3,1,cat X, type 2
1,0,2,2,cat X, type 3
3,4,3,NA,cat X, type 4
1,0,2,NA,cat Y, type 1
3,4,3,NA,cat Y, type 2
1,0,2,2,cat Y, type 3
3,4,3,1,cat Y, type 4
1,0,2,4,cat X, type 1
3,4,3,1,cat X, type 2
1,0,2,2,cat X, type 3
3,4,3,2,cat X, type 4
1,0,2,NA,cat Y, type 1
3,4,3,NA,cat Y, type 2
1,0,2,2,cat Y, type 3
3,4,3,2,cat Y, type 4
")
names(sample) <- c("A","B","C", "D", "cat", "type")
sample <- sample[, observations := sum(!is.na(D)), by = c("cat", "type")]
A B C D cat type observations
1: 1 0 2 NA cat X type 1 1
2: 3 4 3 1 cat X type 2 3
3: 1 0 2 2 cat X type 3 3
4: 3 4 3 0 cat X type 4 2
5: 1 0 2 NA cat Y type 1 0
6: 3 4 3 NA cat Y type 2 0
7: 1 0 2 2 cat Y type 3 3
8: 3 4 3 35 cat Y type 4 3
9: 1 0 2 NA cat X type 1 1
10: 3 4 3 1 cat X type 2 3
...
24: 3 4 3 0 cat Y type 4 3
如果相邻组type
的观测值少于两个,我想将它们相加。
例如:将只有1个观测值的type 1
组添加到第2组的观测值中(请参见所需输出的第一行(。
需要将类型汇集在一起,直到所有剩余类别至少有2个观测值。因此category Y
的type 1
和type 2
需要与type 3
合并。
我很难想出为此编写代码的方法。
有人能提出一个自动创建新类型的好方法吗?
我意识到,在某些情况下,可能会有两种可能的解决方案来汇集团队。然而,只要添加在一起的组是相邻组(因此type 1
不添加到type 4
,则哪些组添加在一起并不重要
期望输出:
A B C D cat type new_type observations
1: 1 0 2 NA cat X type 1 type 2 4
2: 3 4 3 1 cat X type 2 type 2 4
3: 1 0 2 2 cat X type 3 type 3 3
4: 3 4 3 0 cat X type 4 type 4 2
5: 1 0 2 2 cat Y type 1 type 3 3
6: 3 4 3 NA cat Y type 2 type 3 3
7: 1 0 2 2 cat Y type 3 type 3 3
8: 3 4 3 0 cat Y type 4 type 4 3
9: 1 0 2 NA cat X type 1 type 2 4
10: 3 4 3 1 cat X type 2 type 2 4
...
24: 3 4 3 0 cat Y type 4 type 4 3
解决方案不必使用数据。表
带Reduce
和accumulate = T
选项:
sample[,`:=`(type = last(type),observations=sum(observations)),
.(cat,sapply(Reduce(f = function(x,y) {
grp= x$grp
if (x$nxtgrp) {grp=grp+1; x$cumsum=0}
nxtgrp=!((x$cumsum+y)<2)
list(grp = grp,
cumsum=x$cumsum + y,
nxtgrp = nxtgrp)},
x = observations,
init = list(grp = 0, cumsum=0, nxtgrp = F),
accumulate = T),
function(x) x$grp)[-1])
][]
A B C D cat type observations
<int> <int> <int> <int> <char> <char> <int>
1: 1 0 2 NA cat X type 2 4
2: 3 4 3 1 cat X type 2 4
3: 1 0 2 2 cat X type 3 3
4: 3 4 3 0 cat X type 4 2
5: 1 0 2 NA cat Y type 3 3
6: 3 4 3 NA cat Y type 3 3
7: 1 0 2 2 cat Y type 3 3
8: 3 4 3 35 cat Y type 4 3
9: 1 0 2 NA cat X type 2 4
10: 3 4 3 1 cat X type 2 4
11: 1 0 2 2 cat X type 3 3
12: 3 4 3 NA cat X type 4 2
13: 1 0 2 NA cat Y type 3 3
14: 3 4 3 NA cat Y type 3 3
15: 1 0 2 2 cat Y type 3 3
16: 3 4 3 1 cat Y type 4 3
17: 1 0 2 4 cat X type 2 4
18: 3 4 3 1 cat X type 2 4
19: 1 0 2 2 cat X type 3 3
20: 3 4 3 2 cat X type 4 2
21: 1 0 2 NA cat Y type 3 3
22: 3 4 3 NA cat Y type 3 3
23: 1 0 2 2 cat Y type 3 3
24: 3 4 3 2 cat Y type 4 3
A B C D cat type observations
这个想法是用生成一个累积列表
- 当前组:
grp
- 当前累计数:
cumsum
- 为下一行递增组的标志:
nxtgrp
一旦观测次数超过2,就会设置递增组的标志
设置标志后,在下一行,cumsum
重置为零,grp
递增。
CCD_ 16列表元素然后可以用作CCD_ 18中的CCD_ 17参数。
另一种可能性是在R或Rcpp:中使用for-loop
函数实现相同的分组
observations_grp <- function(x) {
cumsum_i <- 0
nxtgrp <- F
n <- length(x)
grp <- rep(0,n)
grp_i <- 0;
for (i in 1:n) {
if (nxtgrp) {grp_i <- grp_i + 1; cumsum_i <- 0;}
nxtgrp <- !((cumsum_i + x[i]) < 2)
cumsum_i <- cumsum_i + x[i]
grp[i] <- grp_i
}
grp
}
sample[,`:=`(type = last(type), observations=sum(observations)),
.(cat,observations_grp(observations))
][]
性能比较表明,Reduce
并不比R环路快:
Unit: milliseconds
expr min lq mean median uq max neval
Reduce 1.3458 1.45025 1.732185 1.56405 1.73740 6.3339 100
Loop 1.3374 1.44175 1.685722 1.53120 1.67665 3.7091 100
如果你需要速度,Rcpp肯定会帮你很多忙。
也许您可以创建一个helper
函数,如下所示
helper <- function(v) {
s <- grp <- 0
y <- vector("numeric", length(v))
for (i in seq_along(v)) {
y[i] <- grp
s <- s + v[i]
if (s >= 2) {
s <- 0
grp <- grp + 1
}
}
y
}
然后运行
dt <- sample[
,
c(.(grp = helper(observations)), .SD),
.(id = rleid(cat))
][
,
`:=`(type = last(type), observations = sum(observations)),
.(id, grp)
][, -(1:2)]
您将获得
> dt
A B C D type observations
1: 1 0 2 NA type 2 4
2: 3 4 3 1 type 2 4
3: 1 0 2 2 type 3 3
4: 3 4 3 0 type 4 2
5: 1 0 2 NA type 3 3
6: 3 4 3 NA type 3 3
7: 1 0 2 2 type 3 3
8: 3 4 3 35 type 4 3
9: 1 0 2 NA type 2 4
10: 3 4 3 1 type 2 4
11: 1 0 2 2 type 3 3
12: 3 4 3 NA type 4 2
13: 1 0 2 NA type 3 3
14: 3 4 3 NA type 3 3
15: 1 0 2 2 type 3 3
16: 3 4 3 1 type 4 3
17: 1 0 2 4 type 2 4
18: 3 4 3 1 type 2 4
19: 1 0 2 2 type 3 3
20: 3 4 3 2 type 4 2
21: 1 0 2 NA type 3 3
22: 3 4 3 NA type 3 3
23: 1 0 2 2 type 3 3
24: 3 4 3 2 type 4 3
A B C D type observations
这里有一个tidyverse解决方案。当观测<2,其他类型会更高一个。
library(dplyr)
sample %>%
mutate(
new_type = as.numeric(factor(type)),
new_type = paste0(
"type ",
ifelse(observations<2,
ifelse(new_type != max(new_type), new_type + 1, new_type - 1),
new_type)
)
) %>%
group_by(cat, new_type) %>%
mutate(observations = sum(!is.na(D))) %>%
ungroup()