我有一个宽格式的data.table
,如下所示:
library(data.table)
dt_wide <- data.table(
"id" = seq(1:10),
"yw_1001" = trunc( runif(10,0,100) ),
"yw_1002" = trunc( runif(10,0,100) ),
"yw_1003" = trunc( runif(10,0,100) ),
"yw_1004" = trunc( runif(10,0,100) ),
"yw_1005" = trunc( runif(10,0,100) ),
"yw_1006" = trunc( runif(10,0,100) ),
"yw_1007" = trunc( runif(10,0,100) ),
"yw_1008" = trunc( runif(10,0,100) ),
"yw_1009" = trunc( runif(10,0,100) ),
"yw_1010" = trunc( runif(10,0,100) ),
"yw_1011" = trunc( runif(10,0,100) ),
"yw_1012" = trunc( runif(10,0,100) ),
"yw_1013" = trunc( runif(10,0,100) ),
"yw_1014" = trunc( runif(10,0,100) ),
"yw_1015" = trunc( runif(10,0,100) ),
"yw_1016" = trunc( runif(10,0,100) ),
"yw_1017" = trunc( runif(10,0,100) ),
"yw_1018" = trunc( runif(10,0,100) ),
"yw_1019" = trunc( runif(10,0,100) ),
"yw_1020" = trunc( runif(10,0,100) ),
"yw_1021" = trunc( runif(10,0,100) ),
"yw_1022" = trunc( runif(10,0,100) ),
"yw_1023" = trunc( runif(10,0,100) ),
"yw_1024" = trunc( runif(10,0,100) ),
"yw_1025" = trunc( runif(10,0,100) ),
"yw_1026" = trunc( runif(10,0,100) ),
"yw_1027" = trunc( runif(10,0,100) ),
"yw_1028" = trunc( runif(10,0,100) ),
"yw_1029" = trunc( runif(10,0,100) ),
"yw_1030" = trunc( runif(10,0,100) ),
"yw_1031" = trunc( runif(10,0,100) ),
"yw_1032" = trunc( runif(10,0,100) ),
"yw_1033" = trunc( runif(10,0,100) ),
"yw_1034" = trunc( runif(10,0,100) ),
"yw_1035" = trunc( runif(10,0,100) ),
"yw_1036" = trunc( runif(10,0,100) ),
"yw_1037" = trunc( runif(10,0,100) ),
"yw_1038" = trunc( runif(10,0,100) ),
"yw_1039" = trunc( runif(10,0,100) ),
"yw_1040" = trunc( runif(10,0,100) ),
"yw_1041" = trunc( runif(10,0,100) ),
"yw_1042" = trunc( runif(10,0,100) ),
"yw_1043" = trunc( runif(10,0,100) ),
"yw_1044" = trunc( runif(10,0,100) ),
"yw_1045" = trunc( runif(10,0,100) ),
"yw_1046" = trunc( runif(10,0,100) ),
"yw_1047" = trunc( runif(10,0,100) ),
"yw_1048" = trunc( runif(10,0,100) ),
"yw_1049" = trunc( runif(10,0,100) ),
"yw_1050" = trunc( runif(10,0,100) ),
"yw_1051" = trunc( runif(10,0,100) ),
"yw_1052" = trunc( runif(10,0,100) )
)
cols对应年份(前两位(和周数(后两位(。
在我的实际数据集中(nrow=550000,ncol=1400(,我不能data.table::melt
,因为它会创建一个超过行限制的data.table。
事实上,我只需要特定周数的值。获取以下数据。表
dt2 <- data.table(
"id" = seq(1:10),
"date" = sample(seq(as.Date('2010/01/01'), as.Date('2010/12/31'), by="day"), 10)
)
对于每个唯一的id
,我需要将dt_wide中日期后5、10和15周的值保留在dt2
中。理想情况下,希望减少dt_wide中的col大小,这样我就可以融化为长格式。
有什么建议吗?
这里有一种简单的方法,可以利用dplyr、tidyr、stringr和lubridate包来处理一些突变:
# Calling required libraries
library(data.table)
library(dplyr)
# Creating dataframe
dt_wide <- data.table(
"id" = seq(1:10),
"yw_1001" = trunc( runif(10,0,100) ),
"yw_1002" = trunc( runif(10,0,100) ),
"yw_1003" = trunc( runif(10,0,100) ),
"yw_1004" = trunc( runif(10,0,100) ),
"yw_1005" = trunc( runif(10,0,100) ),
"yw_1006" = trunc( runif(10,0,100) ),
"yw_1007" = trunc( runif(10,0,100) ),
"yw_1008" = trunc( runif(10,0,100) ),
"yw_1009" = trunc( runif(10,0,100) ),
"yw_1010" = trunc( runif(10,0,100) ),
"yw_1011" = trunc( runif(10,0,100) ),
"yw_1012" = trunc( runif(10,0,100) ),
"yw_1013" = trunc( runif(10,0,100) ),
"yw_1014" = trunc( runif(10,0,100) ),
"yw_1015" = trunc( runif(10,0,100) ),
"yw_1016" = trunc( runif(10,0,100) ),
"yw_1017" = trunc( runif(10,0,100) ),
"yw_1018" = trunc( runif(10,0,100) ),
"yw_1019" = trunc( runif(10,0,100) ),
"yw_1020" = trunc( runif(10,0,100) ),
"yw_1021" = trunc( runif(10,0,100) ),
"yw_1022" = trunc( runif(10,0,100) ),
"yw_1023" = trunc( runif(10,0,100) ),
"yw_1024" = trunc( runif(10,0,100) ),
"yw_1025" = trunc( runif(10,0,100) ),
"yw_1026" = trunc( runif(10,0,100) ),
"yw_1027" = trunc( runif(10,0,100) ),
"yw_1028" = trunc( runif(10,0,100) ),
"yw_1029" = trunc( runif(10,0,100) ),
"yw_1030" = trunc( runif(10,0,100) ),
"yw_1031" = trunc( runif(10,0,100) ),
"yw_1032" = trunc( runif(10,0,100) ),
"yw_1033" = trunc( runif(10,0,100) ),
"yw_1034" = trunc( runif(10,0,100) ),
"yw_1035" = trunc( runif(10,0,100) ),
"yw_1036" = trunc( runif(10,0,100) ),
"yw_1037" = trunc( runif(10,0,100) ),
"yw_1038" = trunc( runif(10,0,100) ),
"yw_1039" = trunc( runif(10,0,100) ),
"yw_1040" = trunc( runif(10,0,100) ),
"yw_1041" = trunc( runif(10,0,100) ),
"yw_1042" = trunc( runif(10,0,100) ),
"yw_1043" = trunc( runif(10,0,100) ),
"yw_1044" = trunc( runif(10,0,100) ),
"yw_1045" = trunc( runif(10,0,100) ),
"yw_1046" = trunc( runif(10,0,100) ),
"yw_1047" = trunc( runif(10,0,100) ),
"yw_1048" = trunc( runif(10,0,100) ),
"yw_1049" = trunc( runif(10,0,100) ),
"yw_1050" = trunc( runif(10,0,100) ),
"yw_1051" = trunc( runif(10,0,100) ),
"yw_1052" = trunc( runif(10,0,100) )
)
# Creating dataframe with point of interest
dt2 <- data.table(
"id" = seq(1:10),
"date" = sample(seq(as.Date('2010/01/01'), as.Date('2010/12/31'), by="day"), 10)
)
# Mutating data to get only required columns
columns_to_select <-
dt2 %>%
# Getting dates after 5/10/15 weeks
mutate(after5 = date + (7 * 5),
after10 = date + (7 * 10),
after15 = date + (7 * 15)) %>%
# Converting dates from wide format to long format
tidyr::gather(key = "key", value = req_date, -c(id, date)) %>%
# Converting date into respective column name in dt_wide dataframe
mutate(year = format(as.Date(req_date), "%y"),
week = stringr::str_pad(lubridate::week(req_date), 2, "left", "0"),
select_date = paste0("yw_", year, week)) %>%
# Selecting only required column into a vector
select(select_date) %>%
pull()
# Choosing from the wide dataframe only required columns
dt_wide %>%
select(id, contains(columns_to_select))
# id yw_1024 yw_1044 yw_1017 yw_1014 yw_1045 yw_1031 yw_1035 yw_1029 yw_1049 yw_1022 yw_1019 yw_1050 yw_1036 yw_1040 yw_1034 yw_1027 yw_1041
# 1: 1 59 7 11 7 93 19 83 48 75 94 19 9 93 41 6 26 18
# 2: 2 84 22 18 70 29 53 63 26 23 12 93 84 17 57 96 93 98
# 3: 3 4 72 56 35 65 73 58 91 27 65 58 5 62 13 36 79 26
# 4: 4 36 5 26 56 34 27 60 64 79 27 40 64 32 0 96 56 19
# 5: 5 44 82 78 23 71 78 36 43 63 95 91 37 21 87 63 73 25
# 6: 6 46 45 81 89 59 0 85 3 68 23 90 82 93 42 28 67 32
# 7: 7 56 32 7 26 49 31 79 93 14 45 25 79 39 64 64 86 91
# 8: 8 82 99 46 79 81 56 39 10 20 27 83 29 30 30 35 96 24
# 9: 9 10 87 28 40 51 41 95 43 62 59 44 19 72 76 27 65 36
# 10: 10 81 19 44 55 22 53 98 54 16 29 30 28 20 2 5 39 23
这里有一个使用data.table
(以及用于日期的lubridate
包(的简单解决方案。由于您的问题似乎是根据内存限制构建的,因此只需使用必要的列为(小(ID表中的每一行调用melt()
即可。还要注意,我们必须检查dt_wide
中是否存在给定的日期,因为有些日期不存在。
rbindlist(lapply(1:nrow(dt2), function(x) {
rowid <- dt2[x, id]
rowdate <- dt2[x, date]
dates <- rowdate + lubridate::weeks(c(5, 10, 15))
cols <- paste0("yw_", strftime(dates, "%y%W"))
# Because some dates aren't in the table
cols <- intersect(cols, colnames(dt_wide))
if (!length(cols)) return(NULL)
melt(dt_wide[id == rowid, c("id", cols), with = F], id.vars = "id")
}))