r-基于data.table中的长格式日期列,减少宽格式data.table的列



我有一个宽格式的data.table,如下所示:

library(data.table)
dt_wide <- data.table(
  "id" = seq(1:10),
  "yw_1001" = trunc( runif(10,0,100) ),
  "yw_1002" = trunc( runif(10,0,100) ),
  "yw_1003" = trunc( runif(10,0,100) ),
  "yw_1004" = trunc( runif(10,0,100) ),
  "yw_1005" = trunc( runif(10,0,100) ),
  "yw_1006" = trunc( runif(10,0,100) ),
  "yw_1007" = trunc( runif(10,0,100) ),
  "yw_1008" = trunc( runif(10,0,100) ),
  "yw_1009" = trunc( runif(10,0,100) ),
  "yw_1010" = trunc( runif(10,0,100) ),
  "yw_1011" = trunc( runif(10,0,100) ),
  "yw_1012" = trunc( runif(10,0,100) ),
  "yw_1013" = trunc( runif(10,0,100) ),
  "yw_1014" = trunc( runif(10,0,100) ),
  "yw_1015" = trunc( runif(10,0,100) ),
  "yw_1016" = trunc( runif(10,0,100) ),
  "yw_1017" = trunc( runif(10,0,100) ),
  "yw_1018" = trunc( runif(10,0,100) ),
  "yw_1019" = trunc( runif(10,0,100) ),
  "yw_1020" = trunc( runif(10,0,100) ),
  "yw_1021" = trunc( runif(10,0,100) ),
  "yw_1022" = trunc( runif(10,0,100) ),
  "yw_1023" = trunc( runif(10,0,100) ),
  "yw_1024" = trunc( runif(10,0,100) ),
  "yw_1025" = trunc( runif(10,0,100) ),
  "yw_1026" = trunc( runif(10,0,100) ),
  "yw_1027" = trunc( runif(10,0,100) ),
  "yw_1028" = trunc( runif(10,0,100) ),
  "yw_1029" = trunc( runif(10,0,100) ),
  "yw_1030" = trunc( runif(10,0,100) ),
  "yw_1031" = trunc( runif(10,0,100) ),
  "yw_1032" = trunc( runif(10,0,100) ),
  "yw_1033" = trunc( runif(10,0,100) ),
  "yw_1034" = trunc( runif(10,0,100) ),
  "yw_1035" = trunc( runif(10,0,100) ),
  "yw_1036" = trunc( runif(10,0,100) ),
  "yw_1037" = trunc( runif(10,0,100) ),
  "yw_1038" = trunc( runif(10,0,100) ),
  "yw_1039" = trunc( runif(10,0,100) ),
  "yw_1040" = trunc( runif(10,0,100) ),
  "yw_1041" = trunc( runif(10,0,100) ),
  "yw_1042" = trunc( runif(10,0,100) ),
  "yw_1043" = trunc( runif(10,0,100) ),
  "yw_1044" = trunc( runif(10,0,100) ),
  "yw_1045" = trunc( runif(10,0,100) ),
  "yw_1046" = trunc( runif(10,0,100) ),
  "yw_1047" = trunc( runif(10,0,100) ),
  "yw_1048" = trunc( runif(10,0,100) ),
  "yw_1049" = trunc( runif(10,0,100) ),
  "yw_1050" = trunc( runif(10,0,100) ),
  "yw_1051" = trunc( runif(10,0,100) ),
  "yw_1052" = trunc( runif(10,0,100) )
  )

cols对应年份(前两位(和周数(后两位(。

在我的实际数据集中(nrow=550000,ncol=1400(,我不能data.table::melt,因为它会创建一个超过行限制的data.table。

事实上,我只需要特定周数的值。获取以下数据。表

dt2 <- data.table(
  "id" = seq(1:10),
  "date" = sample(seq(as.Date('2010/01/01'), as.Date('2010/12/31'), by="day"), 10)
)

对于每个唯一的id,我需要将dt_wide中日期后5、10和15周的值保留在dt2中。理想情况下,希望减少dt_wide中的col大小,这样我就可以融化为长格式。

有什么建议吗?

这里有一种简单的方法,可以利用dplyr、tidyr、stringr和lubridate包来处理一些突变

# Calling required libraries
library(data.table)
library(dplyr)
# Creating dataframe
dt_wide <- data.table(
  "id" = seq(1:10),
  "yw_1001" = trunc( runif(10,0,100) ),
  "yw_1002" = trunc( runif(10,0,100) ),
  "yw_1003" = trunc( runif(10,0,100) ),
  "yw_1004" = trunc( runif(10,0,100) ),
  "yw_1005" = trunc( runif(10,0,100) ),
  "yw_1006" = trunc( runif(10,0,100) ),
  "yw_1007" = trunc( runif(10,0,100) ),
  "yw_1008" = trunc( runif(10,0,100) ),
  "yw_1009" = trunc( runif(10,0,100) ),
  "yw_1010" = trunc( runif(10,0,100) ),
  "yw_1011" = trunc( runif(10,0,100) ),
  "yw_1012" = trunc( runif(10,0,100) ),
  "yw_1013" = trunc( runif(10,0,100) ),
  "yw_1014" = trunc( runif(10,0,100) ),
  "yw_1015" = trunc( runif(10,0,100) ),
  "yw_1016" = trunc( runif(10,0,100) ),
  "yw_1017" = trunc( runif(10,0,100) ),
  "yw_1018" = trunc( runif(10,0,100) ),
  "yw_1019" = trunc( runif(10,0,100) ),
  "yw_1020" = trunc( runif(10,0,100) ),
  "yw_1021" = trunc( runif(10,0,100) ),
  "yw_1022" = trunc( runif(10,0,100) ),
  "yw_1023" = trunc( runif(10,0,100) ),
  "yw_1024" = trunc( runif(10,0,100) ),
  "yw_1025" = trunc( runif(10,0,100) ),
  "yw_1026" = trunc( runif(10,0,100) ),
  "yw_1027" = trunc( runif(10,0,100) ),
  "yw_1028" = trunc( runif(10,0,100) ),
  "yw_1029" = trunc( runif(10,0,100) ),
  "yw_1030" = trunc( runif(10,0,100) ),
  "yw_1031" = trunc( runif(10,0,100) ),
  "yw_1032" = trunc( runif(10,0,100) ),
  "yw_1033" = trunc( runif(10,0,100) ),
  "yw_1034" = trunc( runif(10,0,100) ),
  "yw_1035" = trunc( runif(10,0,100) ),
  "yw_1036" = trunc( runif(10,0,100) ),
  "yw_1037" = trunc( runif(10,0,100) ),
  "yw_1038" = trunc( runif(10,0,100) ),
  "yw_1039" = trunc( runif(10,0,100) ),
  "yw_1040" = trunc( runif(10,0,100) ),
  "yw_1041" = trunc( runif(10,0,100) ),
  "yw_1042" = trunc( runif(10,0,100) ),
  "yw_1043" = trunc( runif(10,0,100) ),
  "yw_1044" = trunc( runif(10,0,100) ),
  "yw_1045" = trunc( runif(10,0,100) ),
  "yw_1046" = trunc( runif(10,0,100) ),
  "yw_1047" = trunc( runif(10,0,100) ),
  "yw_1048" = trunc( runif(10,0,100) ),
  "yw_1049" = trunc( runif(10,0,100) ),
  "yw_1050" = trunc( runif(10,0,100) ),
  "yw_1051" = trunc( runif(10,0,100) ),
  "yw_1052" = trunc( runif(10,0,100) )
)
# Creating dataframe with point of interest
dt2 <- data.table(
  "id" = seq(1:10),
  "date" = sample(seq(as.Date('2010/01/01'), as.Date('2010/12/31'), by="day"), 10)
)
# Mutating data to get only required columns
columns_to_select <-
  dt2 %>%
  # Getting dates after 5/10/15 weeks
  mutate(after5 = date + (7 * 5),
         after10 = date + (7 * 10),
         after15 = date + (7 * 15)) %>%
  # Converting dates from wide format to long format
  tidyr::gather(key = "key", value = req_date, -c(id, date)) %>%
  # Converting date into respective column name in dt_wide dataframe
  mutate(year = format(as.Date(req_date), "%y"),
         week = stringr::str_pad(lubridate::week(req_date), 2, "left", "0"),
         select_date = paste0("yw_", year, week)) %>%
  # Selecting only required column into a vector
  select(select_date) %>%
  pull()
# Choosing from the wide dataframe only required columns
dt_wide %>%
  select(id, contains(columns_to_select))
# id yw_1024 yw_1044 yw_1017 yw_1014 yw_1045 yw_1031 yw_1035 yw_1029 yw_1049 yw_1022 yw_1019 yw_1050 yw_1036 yw_1040 yw_1034 yw_1027 yw_1041
# 1:  1      59       7      11       7      93      19      83      48      75      94      19       9      93      41       6      26      18
# 2:  2      84      22      18      70      29      53      63      26      23      12      93      84      17      57      96      93      98
# 3:  3       4      72      56      35      65      73      58      91      27      65      58       5      62      13      36      79      26
# 4:  4      36       5      26      56      34      27      60      64      79      27      40      64      32       0      96      56      19
# 5:  5      44      82      78      23      71      78      36      43      63      95      91      37      21      87      63      73      25
# 6:  6      46      45      81      89      59       0      85       3      68      23      90      82      93      42      28      67      32
# 7:  7      56      32       7      26      49      31      79      93      14      45      25      79      39      64      64      86      91
# 8:  8      82      99      46      79      81      56      39      10      20      27      83      29      30      30      35      96      24
# 9:  9      10      87      28      40      51      41      95      43      62      59      44      19      72      76      27      65      36
# 10: 10      81      19      44      55      22      53      98      54      16      29      30      28      20       2       5      39      23

这里有一个使用data.table(以及用于日期的lubridate包(的简单解决方案。由于您的问题似乎是根据内存限制构建的,因此只需使用必要的列为(小(ID表中的每一行调用melt()即可。还要注意,我们必须检查dt_wide中是否存在给定的日期,因为有些日期不存在。

rbindlist(lapply(1:nrow(dt2), function(x) {
  rowid <- dt2[x, id]
  rowdate <- dt2[x, date]
  
  dates <- rowdate + lubridate::weeks(c(5, 10, 15))
  cols <- paste0("yw_", strftime(dates, "%y%W"))
  
  # Because some dates aren't in the table
  cols <- intersect(cols, colnames(dt_wide))
  if (!length(cols)) return(NULL)
  
  melt(dt_wide[id == rowid, c("id", cols), with = F], id.vars = "id")
}))

最新更新