通过更高效、更快的功能减少数据清理

  • 本文关键字:数据 功能 高效 r
  • 更新时间 :
  • 英文 :


我有一个巨大的df,有1000万个观测值和50个变量作为x。目前,我正在使用"grepl"、"str_replace"one_answers"gsub"函数进行数据清理,这些函数非常耗时(每行5分钟(。

有没有更有效的函数或方法来重写代码以减少运行时间?

x <-x[!grepl("A",x$ITEM_1, perl=TRUE,]
x <-x[!grepl("B",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("C",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("D",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("E",x$ITEM_2),perl=TRUE,]
x <- x %>% mutate_at(vars(2:50), funs(gsub("\?", "", .,perl=TRUE)))
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"@","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"~","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\(","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\)","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"&","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\\","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"/","")

问候,

下面显示了问题中OP代码的比较时序以及该代码的简化
使用n = 10000行和50字符列向量的数据帧对其进行了测试。加速是值得的。

library(dplyr)
library(stringr)
library(stringi)
library(microbenchmark)
fun.OP <- function(x){
x <- x[!grepl("A", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("B", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("C", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("D", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("E", x$ITEM_2, perl = TRUE), ]
x <- x %>% mutate_at(vars(2:ncol(x)), list(~gsub("\?", "", .,perl=TRUE)))
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"@","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"~","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\(","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\)","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"&","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"/","")
x
}
fun.Rui <- function(x){
x <- x[!grepl('[A-D]', x$ITEM_1, perl = TRUE), ]
x <- x[!grepl('E', x$ITEM_2, perl = TRUE), ]
x[2:ncol(x)] <- lapply(x[2:ncol(x)], function(y) stri_replace_all_fixed(y, '?', ''))
x$SUBNAMEZ <- stri_replace_all_regex(x$SUBNAMEZ, '#|@|~|\(|\)|&|/|', '')
x$SUBNAMEZ <- stri_replace_all_regex(x$SUBNAMEZ, '\\', '')
row.names(x) <- NULL
x
}
y1 <- fun.OP(x)
y2 <- fun.Rui(x)
dim(y1)
dim(y2)
identical(y1, y2)
mb <- microbenchmark(
OP = fun.OP(x),
Rui = fun.Rui(x)
)
print(mb, order = 'median')
#Unit: milliseconds
# expr      min       lq     mean   median       uq       max neval cld
#  Rui 17.05596 17.21667 21.41270 17.30466 17.44592  62.58906   100  a 
#   OP 42.88685 43.25211 54.68897 43.53331 43.98865 501.98495   100   b

数据创建代码

makeString <- function(col, N){
y <- character(N)
if(col == 1){
L <- LETTERS
}else if(col == 2){
L <- c(LETTERS, '?')
} else{
L <- c(LETTERS, '@', '#', '~', '(', ')', '\', '/')
}
for(i in seq_len(N)){
y[i] <- paste(sample(L, sample(50, 1), TRUE), collapse = '')
}
y
}
set.seed(1234)
n <- 1e4
x <- lapply(1:50, function(i) makeString(i, n))
names(x) <- sprintf("V%02d", seq_along(x))
x <- do.call(cbind.data.frame, x)
names(x)[1:3] <- c('ITEM_1', 'ITEM_2', 'SUBNAMEZ')

最新更新