如何在R中使用混合数据类型制作汇总数据框架



充分意识到这种类型的问题已经被问过几百次了。
对于我所描述的具体问题,我仍然找不到答案,这个问题是关于:

  • 性能(即我知道如何做我需要的,但在某些情况下它太慢了,所以我正在寻找一个更快的解决方案)
  • 良好的编程实践(即我质疑我选择的方法是否"干净",而不是迂回或低效的其他原因)

我有一个数据框架与数字和字符列。我需要从中创建一个data.frame摘要,由字符列(ID)之一分组,并报告1)关于每个组内的一些数值列的一些统计数据, 2)一些字符连接(即报告具有混合数据类型-这就是使它变得棘手的原因,至少对我来说,这就是为什么我在征求意见)。

下面是R脚本:

# Simulate original data.frame
set.seed(384092)
N <- 10000
d <- data.frame("ID" = paste0(sample(LETTERS, N, replace = T),  sprintf("%03.0f", sample(1:floor(sqrt(N)), N, replace = T )) ), stringsAsFactors = F)
d["set"] <- sample(LETTERS, N, replace = T)
d["P"] <- runif(N, -20, 120)
d["K"] <- rnorm(N, 10, 0.5)
# Make summary
# For each unique ID, report: ID, number of rows of d, mean of P, sd of P, comma-separated list of unique set's
# Method 1: rbind data.frames from 'by'
time.1 <- system.time({
  d_summary.1 <- do.call(rbind, by(d, d$ID, function(dd) {
    data.frame("ID" = dd$ID[1], "N" = nrow(dd), "P_mean" = mean(dd$P), "P_sd" = sd(dd$P), "sets" = paste(unique(dd$set), collapse = ","))
  })
  )
})
cat("ntime.1 =",time.1,"n")
print(sapply(d_summary.1, class))
# Method 2: create a list of lists and combine them at the end
# https://stackoverflow.com/a/68162050/6376297
time.2 <- system.time({
  time.2.1 <- system.time({d_summary.2 <- by(d, d$ID, function(dd) {
    list("ID" = dd$ID[1], "N" = nrow(dd), "P_mean" = mean(dd$P), "P_sd" = sd(dd$P), "sets" = paste(unique(dd$set), collapse = ","))
  })
  })
  d_summary.2 <- do.call(rbind, lapply(d_summary.2, data.frame))
})
cat("ntime.2.1 =",time.2.1)
cat("ntime.2 =",time.2,"n")
print(sapply(d_summary.2, class))

,在我的PC上产生以下输出:

time.1 = 1.72 0 1.72 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character" 
time.2.1 = 0.3 0 0.29 NA NA
time.2 = 1.79 0 1.82 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character"

链接的帖子https://stackoverflow.com/a/68162050/6376297特别提到,方法2中使用的那种处理是必要的,以避免将所有列强制为单一数据类型。
事实上,我所尝试的任何解决方案都依赖于制作一个中间矩阵,正如完全预期的那样,结果是强制转换为字符。

这真的很不幸,因为如time.2.1所示,包含所需信息的列表的列表的初始形成(仍然保留所有原始数据类型)只需要总时间的1/6 - 1/5。
你需要想象一下,我在d上做这个,它至少比这个例子大10-100倍。

有人能建议/建议一个更快的方法来做到这一点吗?

谢谢!


编辑:跟进用户反馈

dplyr(4)和data.table(5)方法的试验,加上几个更基本的R方法(使用aggregate,(6)和(7)),这些方法更复杂,但可能与这两个方法有一定的竞争。

# Method 4: dplyr
require(dplyr)
time.4 <- system.time({
  d %>% 
    group_by(ID) %>% 
    summarise(N = n(),
              P_mean = mean(P),
              P_sd = sd(P),
              sets = paste(unique(set), collapse = ",")) -> d_summary.4
})
cat("ntime.4 =",time.4,"n")
print(sapply(d_summary.4, class))
# Method 5: data.table
require(data.table)
time.5 <- system.time({
  setDT(d)
  
  d_summary.5 <- d[, .(N = .N, 
        P_mean = mean(P), 
        P_sd = sd(P), 
        sets = toString(unique(set))), ID]
  
  d_summary.5 <- as.data.frame(d_summary.5)
  
})
cat("ntime.5 =",time.5,"n")
print(sapply(d_summary.5, class))
# Method 6: aggregate each column separately and merge
time.6 <- system.time({
  
  d_summary.6 <- setNames(as.data.frame(table(d$ID), stringsAsFactors = F),c("ID","N"))
  d_summary.6 <- merge(d_summary.6, setNames(aggregate(P ~ ID, data = d, FUN = mean),c("ID","P_mean")), by = "ID")
  d_summary.6 <- merge(d_summary.6, setNames(aggregate(P ~ ID, data = d, FUN = sd),c("ID","P_sd")), by = "ID")
  d_summary.6 <- merge(d_summary.6, setNames(aggregate(set ~ ID, data = d, FUN = function(x) {paste(unique(x),collapse=",")}),c("ID","sets")), by = "ID")
  
})
cat("ntime.6 =",time.6,"n")
print(sapply(d_summary.6, class))
# Method 7: aggregate each column separately and cbind (this assumes that both table and aggregate will report all values of ID, sorted)
time.7 <- system.time({
  
  d_summary.7 <- setNames(as.data.frame(table(d$ID), stringsAsFactors = F),c("ID","N"))
  d_summary.7 <- cbind(d_summary.7, "P_mean" = aggregate(P ~ ID, data = d, FUN = mean)[,2])
  d_summary.7 <- cbind(d_summary.7, "P_sd" = aggregate(P ~ ID, data = d, FUN = sd)[,2])
  d_summary.7 <- cbind(d_summary.7, "sets" = aggregate(set ~ ID, data = d, FUN = function(x) {paste(unique(x),collapse=",")})[,2])
  
})
cat("ntime.7 =",time.7,"n")
print(sapply(d_summary.7, class))

时间:

time.1 = 1.73 0.02 1.77 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character" 
time.2.1 = 0.29 0 0.3 NA NA
time.2 = 1.83 0.01 1.84 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character" 
time.4 = 0.13 0 0.13 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character" 
time.5 = 0.08 0 0.08 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character" 
time.6 = 0.25 0 0.25 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character" 
time.7 = 0.25 0 0.25 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character" 

您可以使用dplyr来完成此任务:

library(dplyr)
d %>% 
  group_by(ID) %>% 
  summarise(N = n(),
            P_mean = mean(P),
            P_sd = sd(P),
            sets = paste(unique(set), collapse = ","))

返回
# A tibble: 2,553 x 5
   ID        N P_mean  P_sd sets     
   <chr> <int>  <dbl> <dbl> <chr>    
 1 A001      4   27.4  42.1 N,Z,C    
 2 A002      3   46.6  40.6 Z,R,L    
 3 A003      5   31.8  28.0 S,F,X,H,U
 4 A004      5   46.4  36.0 H,W,U,P,R
 5 A005      3   53.6  24.7 I,Y,B    
 6 A006      2   58.9  61.9 V,J      
 7 A007      5   68.2  53.8 Y,X,W,N,F
 8 A008      4   64.5  14.0 X,I,V,D  
 9 A009      1   61.4  NA   L        
10 A010      2   95.5  30.0 S,L      
# ... with 2,543 more rows

(在我的机器上)与您的其他方法比较:

time.1 = 1.02 0 1.02 NA NA 
time.2.1 = 0.17 0 0.17 NA NA
time.2 = 1.11 0 1.11 NA NA 
# dplyr-method
time.3 = 0.07 0 0.08 NA NA 
         ID           N      P_mean        P_sd        sets 
"character"   "integer"   "numeric"   "numeric" "character" 

您可以尝试data.table方法-

library(data.table)
setDT(d)
d[, .(N = .N, 
      P_mean = mean(P), 
      P_sd = sd(P), 
      sets = toString(unique(set))), ID]
#        ID N P_mean P_sd             sets
#   1: M074 6  66.30 32.1 I, O, K, S, W, Y
#   2: E016 4  60.23 25.3       E, Y, I, L
#   3: W043 3  46.62 46.2          Q, U, L
#   4: Y059 5  93.59 26.8    G, T, L, O, S
#   5: R073 7  61.16 44.1    N, P, M, I, S
#  ---                                    
#2549: B012 2   6.68 27.7             Z, G
#2550: H088 1  -4.08   NA                X
#2551: T052 1  27.65   NA                E
#2552: C087 1  74.33   NA                M
#2553: Q021 1  30.29   NA                P

考虑使用collapse

library(collapse)
fpaste <- function(x) toString(funique(x))
out <- collap(d, ~ ID, custom = list(fnobs = "set",
      fmean = "P", fsd = "P", fpaste = "set"))

与产出

head(out)
    ID fnobs.set    fpaste.set  fmean.P    fsd.P
1 A001         4       N, Z, C 27.43196 42.10786
2 A002         3       Z, R, L 46.57773 40.55696
3 A003         5 S, F, X, H, U 31.84874 27.96048
4 A004         5 H, W, U, P, R 46.37885 36.03823
5 A005         3       I, Y, B 53.62615 24.67470
6 A006         2          V, J 58.91548 61.88600

基准
 N <- 1000000
system.time({
out <- collap(d, ~ ID, custom = list(fnobs = "set",
      fmean = "P", fsd = "P", fpaste = "set"))
})
# user  system elapsed 
#  0.513   0.015   0.526 
system.time({
setDT(d)
d[, .(N = .N, 
      P_mean = mean(P), 
      P_sd = sd(P), 
      sets = toString(unique(set))), ID]
}) 
# user  system elapsed 
#  0.646   0.015   0.659 

最新更新