在不使用循环的情况下对数据进行滤波和求和以填充矩阵



我正在处理一个大约230k行和11列的数据集,需要创建一组矩阵(或3D阵列(。给定矩阵的元素是根据3个条件(比如NAME、DATE和NUMBER(过滤时数据集的某一列(比如COST(的总和。以下是我目前的设置方式。

for(k in 1:length(names)) {
x <- matrix(0, nrow = length(dates), ncol = 20)
for(i in 1:length(dates)) {
for(j in 1:20) {
x[i, j] <- sum(claims[as.logical((data$NAME == names[k]) * (data$DATE == dates[i]) * (data$NUMBER == j)), 4])
}
}
assign(names[k], x)
}

有没有一种方法可以在不使用循环的情况下实现这一点?如果没有,至少可以更有效地实现?

谢谢!

编辑:这是我根据要求编造的一些测试数据。我只包含了这部分代码所必需的列。出于测试目的,我还在上面的代码中将矩阵的列数从40减少到20。日期和名称是for循环所需的向量。test_input是测试数据。test_output_market1是将在循环中创建的第一个矩阵。

名称:

c("Market1"  "Market2"   "Market3"   "Market4"   "Market5")

日期:

structure(c(18993   18994   18995   18996   18997   18998   18999       
19000   19001   19002   19003   19004   19005   19006   19007   19008
19009   19010   19011   19012   19013   19014   19015   19016   19017
19018   19019   19020   19021   19022    19023)  class = "Date")

test_input:

structure(list(DATE = structure(c(19021 19007   18994   19022                           
19010   19007   19007   18996   19012    19007)  class = "Date")     NUMBER = c(4           
2   8   11  5   5   7   12  15   5)  NAME = c("Market1"  "Market3"
Market1  "Market1"   "Market5"   "Market4"   "Market2"   "Market3"                  
Market4  "Market5")  COST = c(9384.3    93.23   6354.12 346.98                  
5743.48 845.73  3948.37 264.34  38790.24     8723.2))    class = "data.frame"    row.names = c(NA           
-10L))  

test_output_market1:

structure(c(0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0               
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   9384.3
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   6354.12 0   0   0   0   0   0       
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   346.98  0   0   0   0   0   0   0   0   0       
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0    0)  .Dim = c(31L    
20L))   
  

我需要指出的是,我不确定您对所提供的structure()元素做了什么。它们不是dput()的输出。例如,每个元素之间都有逗号。您是否手动创建或修改了它们?在未来,请查看此资源:制作R可重复的问题。

还有一件事。。。您使用了对象名称namesdates。这两个都是关键词。我已将对象更改为nmdts。这里又是对象——从dput的输出中看,它们的名称是我使用过的。我重新创建了names,而不是修复它。(这不是dput的输出。(

dts <- structure(c(18993, 18994, 18995, 18996, 18997, 18998, 18999, 19000, 19001, 
19002, 19003, 19004, 19005, 19006, 19007, 19008, 19009, 19010, 
19011, 19012, 19013, 19014, 19015, 19016, 19017, 19018, 19019, 
19020, 19021, 19022, 19023),  class = "Date")
test_input <- structure(list(
DATE = structure(c(19021, 19007, 18994, 19022, 19010, 19007, 19007, 18996, 19012,  
19007),  class = "Date"),
NUMBER = c(4, 2, 8, 11, 5, 5, 7, 12, 15, 5),
NAME = c("Market1", "Market3", "Market1", "Market1", "Market5",
"Market4", "Market2", "Market3", "Market4", "Market5"),
COST = c(9384.3, 93.23, 6354.12, 346.98, 5743.48, 845.73, 
3948.37, 264.34, 38790.24, 8723.2)), 
row.names = c(NA, -10L),
class = "data.frame")

# create name vector
nm <- sort(unique(test_input$NAME))

在创建了可用的对象之后,我创建了一个空列表来存储将来自循环的每个矩阵。

library(tidyverse) # really just for %>%
# create a list to store each of the matrices
v <- vector(mode = "list",
length = nm) # length unique NAME
for(k in 1:length(nm)){.  # for each unique NAME
x <- matrix(0, 
nrow = length(dts),  # creating the empty matrix
ncol = 20)
for(i in 1:length(dts)){         # for each DATE in vector
for(j in 1:20){                # for each NUMBER in vector
x[i, j] <- test_input[which(
test_input$NAME == nm[k] &     # NAME == name
test_input$DATE == dts[i] &  # DATE == date
test_input$NUMBER == j), "COST"] %>% # NUMBER = number and select cost
unlist(use.names = F) %>%      # make any values obtained a vector
{sum(.)}                   # add the vector of values
}
}
v[[k]] <- x                     # add the matrix to the list of matrices
}

如果执行v[[1]],您将看到nm中第一个值的输出,即"0";Market1;

#       [,1] [,2] [,3]   [,4] [,5] [,6] [,7]    [,8] [,9] [,10]  [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
#  [1,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
#  [2,]    0    0    0    0.0    0    0    0 6354.12    0     0   0.00     0     0     0     0     0     0     0     0     0
#  [3,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
#  [4,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
#  [5,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
#  [6,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
#  [7,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
#  [8,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
#  [9,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [10,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [11,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [12,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [13,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [14,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [15,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [16,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [17,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [18,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [19,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [20,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [21,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [22,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [23,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [24,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [25,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [26,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [27,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [28,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [29,]    0    0    0 9384.3    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0
# [30,]    0    0    0    0.0    0    0    0    0.00    0     0 346.98     0     0     0     0     0     0     0     0     0
# [31,]    0    0    0    0.0    0    0    0    0.00    0     0   0.00     0     0     0     0     0     0     0     0     0 

这与test_output_market1中的数据相匹配。

all.equal(v3[[1]], test_output_market1)
# [1] TRUE 

最新更新