对数据帧中的元素对进行操作

我有两个数据帧，x和weights，其中列是成对的。下面是一些示例数据帧:

x = read.table(text = "
  yr1  yr2  yr3  yr4
   10   15    6    8
   10   20   30   NA
   NA    5    2    3
  100  100   NA   NA", 
sep = "", header = TRUE)
weights = read.table(text = "
  yr1  yr2  yr3  yr4
    2    4    1    3
    2    2    4    2
    3    2    2    3
    4    2    2    4", 
sep = "", header = TRUE)

列yr1和yr2是一对，列yr3和yr4是另一对。我的实际数据列上升到yr100，有50对列。

如果yr1或yr2在x中缺失，我想填充缺失的观测值，例如:

(5 / 2) * 3

同理yr3或yr4:

(30 / 4) * 2

其中5(或30)是x列中给定一对元素中不缺失的元素。第一个示例中的值2和3(第二个示例中的值4和2)是x数据帧中给定的一对元素在weights数据帧中的对应元素。如果一对中的两个元素在x中都缺失，我希望将它们保留为缺失。

下面是R代码，它使用嵌套的for loops执行上述操作。然而，在我的实际数据集中有2000或3000行，嵌套的for loops现在已经运行了>10小时。

for(i in 1: (ncol(x)/2)) {
  for(j in 1: nrow(x)) {
    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)] 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  NA 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  NA
 }
}

我已经意识到第三和第四if语句可能是不必要的。如果我简单地删除这两个if语句，也许运行这段代码的时间将大大减少。

然而，我也提出了以下替代解决方案，使用reshape而不是嵌套的for loops:

n.years <- 4
x2  <- reshape(x      , direction="long", varying = list(seq(1,(n.years-1),2), seq(2,n.years,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
wt2 <- reshape(weights, direction="long", varying = list(seq(1,(n.years-1),2), seq(2,n.years,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
x2$yr1  <- ifelse(is.na(x2$yr1), (x2$yr2 / wt2$yr2) * wt2$yr1, x2$yr1)
x2$yr2  <- ifelse(is.na(x2$yr2), (x2$yr1 / wt2$yr1) * wt2$yr2, x2$yr2)
x3  <- reshape(x2, direction="wide", varying = list(seq(1,3,2), seq(2,4,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
x3

在我关闭当前R会话并尝试上述方法之前，请建议可能更有效的替代方法。我已经稍微使用了microbenchmark，但还没有尝试在这里这样做，部分原因是为每个可能的解决方案编写函数对我来说有点吓人。我还尝试使用apply函数族提出解决方案，但无法提出一个。

我的reshape解决方案来自这个问题:

用多个度量变量重塑数据帧

除了计算时间，我还担心可能的内存耗尽。

我尽量坚持base R，但会考虑使用其他选项来获得所需的输出。谢谢你的建议。

这对你有用吗?

注意，我没有使用你的替换函数，因为我发现它有点混乱，所以你必须修正如何用公式替换yr1和yr2变量。此外，如果您需要能够将结果附加到原始数据框上，则可能需要reshape。

newx <- 
reshape(x, direction="long",varying=list(1:50*2-1,1:50*2), v.names=c("v1","v2"))
newwt <- 
reshape(weights, direction="long",varying=list(1:50*2-1,1:50*2), v.names=c("w1","w2"))
condwtmean <- function(x,y,wtx,wty){
    if(xor(is.na(x),is.na(y))){
        if(is.na(x))
            x <- y # replacement function
        if(is.na(y))
            y <- x # replacement function
        return(weighted.mean(c(x,y),c(wtx,wty)))
    }
    else if(!is.na(x) & !is.na(y))
        return(weighted.mean(c(x,y),c(wtx,wty)))
    else
        return(NA)  
}
newx$wtmean <- mapply(condwtmean, newx$v1, newx$v2, newwt$w1, newwt$w2)

Thomas的答案比我尝试的三种方法中的任何一种都要好得多。这里我将四种方法与microbenchmark进行比较。我还没有用实际数据尝试Thomas的答案。我原来的嵌套for循环方法在22小时后仍然运行。

Unit: milliseconds
             expr       min        lq   median       uq      max neval
 fn.1(x, weights)  98.69133  99.47574 100.5313 101.7315 108.8757    20
 fn.2(x, weights) 755.51583 758.12175 762.3775 776.0558 801.9615    20
 fn.3(x, weights) 564.21423 567.98822 568.5322 571.0975 575.1809    20
 fn.4(x, weights) 367.05862 370.52657 371.7439 373.7367 395.0423    20
#########################################################################################
# create data
set.seed(1234)
n.rows <- 40
n.cols <- 40
n.sample <- n.rows * n.cols
x <- sample(20, n.sample, replace=TRUE)
x.NA <- sample(n.rows*n.cols, 10*(n.sample / n.rows), replace=FALSE)
x[x.NA] <- NA
x <- as.data.frame(matrix(x, nrow = n.rows))
weights <- sample(4, n.sample, replace=TRUE)
weights <- as.data.frame(matrix(weights, nrow = n.rows))
weights
#########################################################################################
# Thomas's function
fn.1 <- function(x, weights){
newx <- reshape(x, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names=c("v1", "v2"))
newwt <- reshape(weights, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names=c("w1", "w2"))
condwtmean <- function(x,y,wtx,wty){
    if(xor(is.na(x),is.na(y))){
        if(is.na(x))
            x <- (y / wty) * wtx # replacement function
        if(is.na(y))
            y <- (x / wtx) * wty # replacement function
        return(weighted.mean(c(x,y),c(wtx,wty)))
    }
    else if(!is.na(x) & !is.na(y))
        return(weighted.mean(c(x,y),c(wtx,wty)))
    else
        return(NA)  
}
newx$wtmean <- mapply(condwtmean, newx$v1, newx$v2, newwt$w1, newwt$w2)
newx2 <- reshape(newx[,c(1,4:5)], v.names = "wtmean", timevar = "time", direction = "wide")
newx2 <- newx2[,2:(n.cols/2+1)]
names(newx2) <- paste('X', 1:(n.cols/2), sep = "")
return(newx2)
}
fn.1.output <- fn.1(x, weights)
#########################################################################################
# nested for-loops with 4 if statements
fn.2 <- function(x, weights){
for(i in 1: (ncol(x)/2)) {
  for(j in 1: nrow(x)) {
    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)] 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  NA 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  NA
 }
}
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
  apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
  apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
  for(j in 1:   nrow(x)      ) {
    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  NA 
 }
}
return(weighted.x)
}
fn.2.output <- fn.2(x, weights)
fn.2.output <- as.data.frame(fn.2.output)
names(fn.2.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
# nested for-loops with 2 if statements
fn.3 <- function(x, weights){
for(i in 1: (ncol(x)/2)) {
  for(j in 1: nrow(x)) {
    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] =  (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] =  (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)] 
 }
}
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
  apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
  apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
  for(j in 1:   nrow(x)      ) {
    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  NA 
 }
}
return(weighted.x)
}
fn.3.output <- fn.3(x, weights)
fn.3.output <- as.data.frame(fn.3.output)
names(fn.3.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
# my reshape solution
fn.4 <- function(x, weights){
new.x    <- reshape(x      , direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names = c("v1", "v2"))
wt       <- reshape(weights, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names = c("w1", "w2"))
new.x$v1 <- ifelse(is.na(new.x$v1), (new.x$v2 / wt$w2) * wt$w1, new.x$v1)
new.x$v2 <- ifelse(is.na(new.x$v2), (new.x$v1 / wt$w1) * wt$w2, new.x$v2)
x2  <- reshape(new.x, direction="wide", varying = list(seq(1,3,2), seq(2,4,2)), v.names = c("v1", "v2")) 
x <- x2[,2:(n.cols+1)]
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
  apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
  apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
  for(j in 1:   nrow(x)      ) {
    if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if(!is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE) 
    if( is.na(x[j,(1 + (i-1)*2)]) &  is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] =  NA 
 }
}
return(weighted.x)
}
fn.4.output <- fn.4(x, weights)
fn.4.output <- as.data.frame(fn.4.output)
names(fn.4.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
rownames(fn.1.output) <- NULL
rownames(fn.2.output) <- NULL
rownames(fn.3.output) <- NULL
rownames(fn.4.output) <- NULL
all.equal(fn.1.output, fn.2.output)
all.equal(fn.1.output, fn.3.output)
all.equal(fn.1.output, fn.4.output)
all.equal(fn.2.output, fn.3.output)
all.equal(fn.2.output, fn.4.output)
all.equal(fn.3.output, fn.4.output)
library(microbenchmark)
microbenchmark(fn.1(x, weights), fn.2(x, weights), fn.3(x, weights), fn.4(x, weights), times=20)
#########################################################################################

相关内容

最新更新

热门标签：