我正在寻找一种优化 data.table 聚合的方法,我有数百万个数据,而且我目前的实现很慢。
可重现的示例:
# Reproducible toy data: 3 factor levels (A, B, C) repeated 3 times,
# paired with the integer values 1..9.
library(data.table)

df <- data.table(
  Factor = as.factor(rep(LETTERS[1:3], 3)),
  Variable = 1:9
)
当前实现:
# Current (slow) implementation:
# 1) aggregate to one row per Factor, dividing each group sum by the grand
#    total -- note sum(df$Variable) is re-evaluated inside every group;
# 2) update-join the per-group shares back onto df by Factor.
aux <- df[, .(sumVar = sum(Variable)/sum(df$Variable)), by = .(Factor)]
df[aux, sumVar := sumVar, on = .(Factor = Factor)]
期望输出:
> df
Factor Variable sumVar
1: A 1 0.2666667
2: B 2 0.3333333
3: C 3 0.4000000
4: A 4 0.2666667
5: B 5 0.3333333
6: C 6 0.4000000
7: A 7 0.2666667
8: B 8 0.3333333
9: C 9 0.4000000
我认为我的问题出在 merge 这一步,但我不知道如何改进它。我不熟悉 dplyr,也没有找到任何用 data.table 一步到位完成此操作的方法。
任何帮助不胜感激!
您的示例中有很多重复,因此我不确定自己是否正确理解了你的需求。尽管如此,似乎最好只计算一次分母,并利用 GForce 优化的 gsum:
# Compute the grand total once, outside the grouping.
BigTotal <- df[, sum(Variable)]
# sum(Variable) by group is eligible for data.table's GForce (gsum)
# optimisation; dividing by the scalar BigTotal is then a single
# vectorised pass over the whole column, done by reference.
df[, sumVar1 := sum(Variable), by = .(Factor)][, propVar := sumVar1 / BigTotal]
大约是 Ben 最快解决方案的一半时间。
# 100M-row benchmark data: 26 factor levels, integer values in 1..1000.
# Use TRUE, not T: T is an ordinary (reassignable) binding, TRUE is the literal.
df <- data.table(
  Factor = as.factor(sample(LETTERS, size = 10^8, replace = TRUE)),
  Variable = sample(10^3, size = 10^8, replace = TRUE)
)
# Benchmark: original aggregate-then-join (dt1) vs. precomputed
# denominator + GForce grouped sum (dt2). times = 2 keeps total runtime
# manageable since each iteration takes several seconds on 100M rows.
microbenchmark::microbenchmark(dt1 = {
aux <- df[, .(sumVar = sum(Variable)/sum(df$Variable)), keyby = .(Factor)]
df[aux, sumVar := sumVar, on = .(Factor = Factor)]
},
dt2 = {
BigTotal <- df[, sum(Variable)]
df[, sumVar1 := sum(Variable), by = .(Factor)][, propVar := sumVar1 / BigTotal]
},
times = 2)
Unit: seconds
expr min lq mean median uq max neval cld
dt1 9.523696 9.523696 9.567555 9.567555 9.611414 9.611414 2 b
dt2 3.996581 3.996581 4.521274 4.521274 5.045967 5.045967 2 a
类似这样:
df[ , ':='(sumVar = sum(Variable)/sum(df$Variable)), by = .(Factor)]
您拥有什么样的数据和期望的时间?在以下具有 100M 行的示例中,我得到以下计时
library(data.table)

# 100M rows, 26 groups. Use TRUE rather than T (T is reassignable,
# TRUE is the reserved literal).
df <- data.table(
  Factor = as.factor(sample(LETTERS, size = 10^8, replace = TRUE)),
  Variable = sample(10^3, size = 10^8, replace = TRUE)
)
# data.table solution 1
# Aggregate to a 26-row table, then update-join the shares back by Factor.
system.time({
aux <- df[, .(sumVar = sum(Variable)/sum(df$Variable)), by = .(Factor)]
df[aux, sumVar := sumVar, on = .(Factor = Factor)]
}) # ~10.5 seconds
# data.table solution 2
# Single grouped update by reference. sum(df$Variable) refers to the whole
# column (ignoring the grouping), so each group's sum is divided by the
# grand total -- but it is recomputed once per group.
system.time({
df[, sumVar := sum(Variable)/sum(df$Variable), by = Factor]
}) # ~8.3 seconds
# dplyr solution 1
# NOTE: %>% is not base R; attach dplyr (which re-exports magrittr's pipe)
# so the snippet runs as shown, even though the verbs are namespaced.
library(dplyr)
system.time({
df %>% dplyr::group_by(Factor) %>% dplyr::mutate(A=sum(Variable)/sum(df$Variable))
}) # ~10.0 seconds
请注意,随着因子基数的增加,data.table 的加速变得更加令人印象深刻。
# High-cardinality benchmark: up to 10^6 distinct factor levels.
# BUG in the original: sample(as.character(10^6), ...) samples from the
# single string "1e+06" (sample()'s 1:x shortcut applies only to numeric
# length-1 x), yielding ONE level. Sample the integers first, then convert.
df <- data.table(
  Factor = as.factor(as.character(sample(10^6, size = 10^8, replace = TRUE))),
  Variable = as.numeric(sample(10^3, size = 10^8, replace = TRUE))
)
# data.table solution 1
# Same aggregate-then-join approach, now with ~10^6 groups instead of 26.
system.time({
aux <- df[, .(sumVar = sum(Variable)/sum(df$Variable)), by = .(Factor)]
df[aux, sumVar := sumVar, on = .(Factor = Factor)]
}) # ~5.0 seconds
# data.table solution 2
# One-step grouped update by reference; the relative advantage over the
# join-based approach grows with group cardinality.
system.time({
df[, sumVar := sum(Variable)/sum(df$Variable), by = Factor]
}) # ~3.1 seconds
# dplyr solution 1
# NOTE: %>% is not base R; attach dplyr (which re-exports magrittr's pipe)
# so the snippet runs as shown, even though the verbs are namespaced.
library(dplyr)
system.time({
df %>% dplyr::group_by(Factor) %>% dplyr::mutate(A=sum(Variable)/sum(df$Variable))
}) # ~6.9 seconds