我正在处理一个大约230k行和11列的数据集,需要创建一组矩阵(或3D阵列(。给定矩阵的元素是根据3个条件(比如NAME、DATE和NUMBER(过滤时数据集的某一列(比如COST(的总和。以下是我目前的设置方式。
for(k in 1:length(names)) {
x <- matrix(0, nrow = length(dates), ncol = 20)
for(i in 1:length(dates)) {
for(j in 1:20) {
x[i, j] <- sum(claims[as.logical((data$NAME == names[k]) * (data$DATE == dates[i]) * (data$NUMBER == j)), 4])
}
}
assign(names[k], x)
}
有没有一种方法可以在不使用循环的情况下实现这一点?如果没有,至少可以更有效地实现?
谢谢!
编辑:这是我根据要求编造的一些测试数据。我只包含了这部分代码所必需的列。出于测试目的,我还在上面的代码中将矩阵的列数从40减少到20。日期和名称是for循环所需的向量。test_input是测试数据。test_output_market1是将在循环中创建的第一个矩阵。
名称:
c("Market1" "Market2" "Market3" "Market4" "Market5")
日期:
structure(c(18993 18994 18995 18996 18997 18998 18999
19000 19001 19002 19003 19004 19005 19006 19007 19008
19009 19010 19011 19012 19013 19014 19015 19016 19017
19018 19019 19020 19021 19022 19023) class = "Date")
test_input:
structure(list(DATE = structure(c(19021 19007 18994 19022
19010 19007 19007 18996 19012 19007) class = "Date") NUMBER = c(4
2 8 11 5 5 7 12 15 5) NAME = c("Market1" "Market3"
Market1 "Market1" "Market5" "Market4" "Market2" "Market3"
Market4 "Market5") COST = c(9384.3 93.23 6354.12 346.98
5743.48 845.73 3948.37 264.34 38790.24 8723.2)) class = "data.frame" row.names = c(NA
-10L))
test_output_market1:
structure(c(0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9384.3
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 6354.12 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 346.98 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) .Dim = c(31L
20L))
我需要指出的是,我不确定您对所提供的structure()
元素做了什么。它们不是dput()
的输出。例如,每个元素之间都有逗号。您是否手动创建或修改了它们?在未来,请查看此资源:制作R可重复的问题。
还有一件事。。。您使用了对象名称names
和dates
。这两个都是关键词。我已将对象更改为nm
和dts
。这里又是对象——从dput
的输出中看,它们的名称是我使用过的。我重新创建了names
,而不是修复它。(这不是dput
的输出。(
dts <- structure(c(18993, 18994, 18995, 18996, 18997, 18998, 18999, 19000, 19001,
19002, 19003, 19004, 19005, 19006, 19007, 19008, 19009, 19010,
19011, 19012, 19013, 19014, 19015, 19016, 19017, 19018, 19019,
19020, 19021, 19022, 19023), class = "Date")
test_input <- structure(list(
DATE = structure(c(19021, 19007, 18994, 19022, 19010, 19007, 19007, 18996, 19012,
19007), class = "Date"),
NUMBER = c(4, 2, 8, 11, 5, 5, 7, 12, 15, 5),
NAME = c("Market1", "Market3", "Market1", "Market1", "Market5",
"Market4", "Market2", "Market3", "Market4", "Market5"),
COST = c(9384.3, 93.23, 6354.12, 346.98, 5743.48, 845.73,
3948.37, 264.34, 38790.24, 8723.2)),
row.names = c(NA, -10L),
class = "data.frame")
# create name vector
nm <- sort(unique(test_input$NAME))
在创建了可用的对象之后,我创建了一个空列表来存储将来自循环的每个矩阵。
library(tidyverse) # really just for %>%
# create a list to store each of the matrices
v <- vector(mode = "list",
length = nm) # length unique NAME
for(k in 1:length(nm)){. # for each unique NAME
x <- matrix(0,
nrow = length(dts), # creating the empty matrix
ncol = 20)
for(i in 1:length(dts)){ # for each DATE in vector
for(j in 1:20){ # for each NUMBER in vector
x[i, j] <- test_input[which(
test_input$NAME == nm[k] & # NAME == name
test_input$DATE == dts[i] & # DATE == date
test_input$NUMBER == j), "COST"] %>% # NUMBER = number and select cost
unlist(use.names = F) %>% # make any values obtained a vector
{sum(.)} # add the vector of values
}
}
v[[k]] <- x # add the matrix to the list of matrices
}
如果执行v[[1]]
,您将看到nm
中第一个值的输出,即"0";Market1;
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
# [1,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [2,] 0 0 0 0.0 0 0 0 6354.12 0 0 0.00 0 0 0 0 0 0 0 0 0
# [3,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [4,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [5,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [6,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [7,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [8,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [9,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [10,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [11,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [12,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [13,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [14,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [15,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [16,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [17,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [18,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [19,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [20,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [21,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [22,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [23,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [24,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [25,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [26,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [27,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [28,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [29,] 0 0 0 9384.3 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
# [30,] 0 0 0 0.0 0 0 0 0.00 0 0 346.98 0 0 0 0 0 0 0 0 0
# [31,] 0 0 0 0.0 0 0 0 0.00 0 0 0.00 0 0 0 0 0 0 0 0 0
这与test_output_market1
中的数据相匹配。
all.equal(v3[[1]], test_output_market1)
# [1] TRUE