R - 在日期序列之间扩展值,并作为列添加到 data.table



更新:akrun 提供的建议解决方案对我有用,但我的问题是value.var = RATING中定义的值转移到相应的日期列。请注意,定义为RATING_DATEVALID_THRU_DATE之间的时间段的所有月份都不会填充。

到目前为止我尝试过但失败了:而不是像这样定义 dcast 操作

dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE, 
value.var = 'RATING')

我确实试过

dt1 <- dcast(setDT(ratings.dt), 
ISSUE_ID + RATING_TYPE ~ (VALID_THRU_DATE - RATING_DATE), 
value.var = 'RATING')      

dt1 <- dcast(setDT(ratings.dt), 
ISSUE_ID + RATING_TYPE ~ as.yearmon(seq(
RATING_DATE, VALID_THRU_DATE), frac = 1), 
value.var = 'RATING')

dt1 <- dcast(setDT(ratings.dt), 
ISSUE_ID + RATING_TYPE ~ (RATING_DATE:VALID_THRU_DATE), 
value.var = 'RATING')

我以为我可以使用定义每个评级有效期的 2 列,因为两者都是dcast()函数调用中的日期列,但显然该任务背后的逻辑概念化起来更加复杂。

现在,我手动概念化了此任务,首先构建一个"骨架 data.table",然后通过以长格式逐行循环原始评级 data.table 来填充该表,并将定义的评级分布在框架表中的两个日期之间。(我将 RATING 重命名为 RATING_NUM 以区别于"原始"字母数字评级)

# (0) Filter only the most recent rating within a given month
ratings_num.dt <- ratings_num.dt[, 
.SD[.N], 
by = .(ISSUE_ID, RATING_TYPE, RATING_DATE)] 
# (1) Defining start and end date for the rating time series
start_date    <- as.Date("1990-01-01", "%Y-%m-%d")
end_date      <- as.Date("2021-01-31", "%Y-%m-%d")
# (2) Define the dates as new columns for a skeleton data.table
new_cols      <- seq(from = start_date, 
to = end_date,
by = "month")
new_cols      <- date_ymd_to_m_end(new_cols)
new_col_names <- as.character(new_cols, "%Y-%m-%d")
# (3) Determine how many months the rating time series spans 
N_months <- elapsed_months_lubri(start_date, end_date) + 1 
# some function to do just what the name implies
MONTH_ID <- c(1:N_months)
# (4) Define the layout of the new skeleton table
# Note: The new table should contain the 3 rows per issue ID, namely the rating time series of each issue ID for every considered rating ageny 
rating_type.vec <- c("FR", "MR", "SPR")    
df_skeleton <- data.frame(rep(issue_IDs.vec, each = 3), rating_type.vec)
someInitialValue <- 0
# Credit to Jonas
to_Add <- setNames(data.frame(matrix(rep(
someInitialValue, nrow(df_skeleton)*length(new_col_names)), 
ncol = length(new_col_names), 
nrow = NROW(df))), 
new_col_names)
ratings_num_ts.df <- cbind(df_skeleton, to_Add)
ratings_num_ts.dt <- setDT(ratings_num_ts.df)
setnames(ratings_num_ts.dt, 
c("rep.issue_IDs.vec..each...3.", "rating_type.vec"),
c("ISSUE_ID", "RATING_TYPE"))
# (5) Create a data.table to join on ratings_num.dt to add month IDs to use for assigning ratings
seq_dates.dt <- setDT(data.frame(new_cols, MONTH_ID))
seq_dates.dt <- setnames(seq_dates.dt, c("new_cols"), c("RATING_DATE"))
ratings_num.dt <- ratings_num.dt[seq_dates.dt, 
on = .(RATING_DATE = RATING_DATE)]
ratings_num.dt <- ratings_num.dt[seq_dates.dt, 
on = .(RATING_VAL_THRU = RATING_DATE)]
# (6) If for the joined MONTH_IDs there is no corresponding RATING_DATE or RATING_VAL_THRU entry, the join will write NA values for these values in the joined table and can be filtered out accordingly
ratings_num.dt <- ratings_num.dt[!is.na(ISSUE_ID)]
# (7) Rename column of second MONTH_ID
setnames(ratings_num.dt,
c("MONTH_ID", "i.MONTH_ID"),
c("MONTH_ID_START", "MONTH_ID_END"))
# (8) Sort table by setting keys 
setkey(ratings_num.dt, ISSUE_ID, RATING_TYPE, RATING_DATE)
# (9) Defining logic as loop 
tic()
i <- 1
j <- nrow(ratings_num.dt)

id.vec             <- ratings_num.dt[, ISSUE_ID] 
rating_type.vec    <- ratings_num.dt[, RATING_TYPE]
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2)  
month_ID_end.vec   <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec     <- ratings_num.dt[, RATING_NUM]
total <- j
pb <- progress_bar$new(format = "[:bar] :current/:total 
(:percent) eta: :eta", total = total)

spread_ratings_to_ts <- function(dt_source, dt_ts) {
pb$tick(0)
for (i in 1:j) {
id             <- id.vec[i]  # alternatively ROW_ID == i
rating_type    <- rating_type.vec[i]
month_ID_start <- month_ID_start.vec[i]  # change to right value
month_ID_end   <- month_ID_end.vec[i]
rating_num     <- rating_num.vec[i]

dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type, 
(month_ID_start:month_ID_end) := rating_num]

if (i %% 50 == 0) {
pb$tick()
}  

i <- i + 1
}
}
spread_ratings_to_ts(ratings_num.dt, ratings_num_ts.dt)
toc() 
## ~ 3,600 sec for ~ 250k rows to loop through ##

# (10) Compute rating means
# Substitute all pre-filled zeros in the table with NA as there is simply no 
# rating available at this point in time
ratings_num_ts.dt <- ratings_num_ts.dt %>% 
na_if(0)
ratings_num_ts.dt <- rbind(ratings_num_ts.dt, 
ratings_num_ts.dt[, 
c(.(RATING_TYPE = 'Mean'), 
lapply(.SD, mean, na.rm=TRUE)), 
by = .(ISSUE_ID), 
.SDcols = -(1:2)])
setkey(ratings_num_ts.dt, ISSUE_ID, RATING_TYPE)

我尝试使用如下所示foreach(...) %dopar% function(...)并行化此循环,但到目前为止它不起作用。这主要是由上面非常低效的循环的运行时驱动的 - 尽管工作得很好并完成了我想要的。在处理foreach函数调用时,我特别不确定如何编写一个合适的组合函数,我可以将其放入foreach调用中,该函数将根据需要包装结果。

i <- 1
j <- nrow(ratings_num.dt)
id.vec             <- ratings_num.dt[, ISSUE_ID]
rating_type.vec    <- ratings_num.dt[, RATING_TYPE]
# col 1+2 not rating but ISSUE_ID and RATING_TYPE
month_ID_start.vec <- (ratings_num.dt[, MONTH_ID_START] + 2) 
month_ID_end.vec   <- (ratings_num.dt[, MONTH_ID_END] + 2)
rating_num.vec     <- ratings_num.dt[, RATING_NUM]
spread_ratings_to_ts <- function(dt_source, dt_ts) {
id             <- id.vec[i]
rating_type    <- rating_type.vec[i]
month_ID_start <- month_ID_start.vec[i]
month_ID_end   <- month_ID_end.vec[i]
rating_num     <- rating_num.vec[i]

dt_ts[ISSUE_ID == id & RATING_TYPE == rating_type][, 
(month_ID_start:month_ID_end) := rating_num]
}   
myCluster <- makeCluster(((detectCores()/2) - 1), type = "PSOCK")
registerDoParallel(myCluster)
clusterEvalQ(cl = myCluster, {
setMKLthreads(1)
})
foreach(i = 1:j, .combine = 'rbind') %dopar% 
spread_ratings_to_ts(dt_source = ratings_num.dt,
dt_ts = ratings_num_ts.dt)
stopCluster(myCluster)
>背景/数据:理论上这很容易,即使是 3 岁的孩子也可以手动完成这项任务,但即使在解决这个问题近一周后,我也没有进一步的解决方案。

问题: 我正在使用一个大型财务数据集。它包含由ISSUE_ID确定的债券发行及其相应的RATING,由惠誉,穆迪和标准普尔定义为RATING_TYPE的3家评级机构提供。我为每个评级确定了一个发布日期和一个定义为RATING_DATEVALID_THRU_DATE的有效日期,两者都是DATE类型。所有日期都按 yearmonth() 格式化为给定月份的最后一天,因为它们的评级用于确定索引包含,其规则在月底进行评估。

ISSUE_ID属于numeric

RATING属于character类型

RATING_TYPE属于character

我的数据设置为名为ratings.dt的 data.table,我需要为开始日期和结束日期之间的序列添加列。然后,我的目标是为每个问题 ID 设置三行,其中一行用于每个评级机构各自评级历史的时间序列。

data.table 的键设置为ISSUE_ID、RATING_TYPE 和 RATING_DATE。

数据现在如下所示:

ISSUE_ID  RATING_TYPE  RATING   RATING_DATE   VALID_THRU_DATE RATING_DATE_SEQ
123       FR           3.33   2000-01-31    2000-04-31             1
123       FR           4.00   2000-05-31    2000-02-28             2
123       FR           3.66   2001-03-31    2001-04-31             3
123       FR           2.00   2001-05-31    2001-04-30             4
123       FR           2.33   2001-04-30    2003-12-31             5
123       FR           3.00   2004-01-31    2004-06-30             6
123       MR           2.33   1999-04-31    1999-12-31             1
123       MR           2.66   2000-01-31    2000-04-31             2
123       MR           3.00   2001-03-31    2001-04-30             3
123       MR           3.33   2001-05-31    2003-01-31             4
123       MR           3.00   2003-02-28    2003-07-31             5
123       SP           3.33   1999-04-31    2002-03-31             1
123       SP           3.00   2002-04-31    2003-05-31             2 
244       ...

现在,我想基本上将RATING中定义的评级分散到一系列日期中。 我想像这样去:

ISSUE_ID  RATING_TYPE   1999-04-30  1999-05-31  ...   2000-01-31  2000-02-28    ...  2004-06-30 
123        FR                                 ...      3.33         2.33      ...     3.00
123        MR            2.33         2.33    ...      2.66         2.66      ...
123        SP            3.33         3.33    ...      3.33         2.66      ...
244       ...

这样我就可以做到:

ISSUE_ID  RATING_TYPE   1999-04-30  1999-05-31  ...   2000-01-31  2000-02-28    ...  2004-06-30 
123       FR                                  ...      3.33         2.33      ...     3.00
123       MR            2.33         2.33     ...      2.66         2.66      ...
123       SP            3.33         3.33     ...      3.33         2.66      ...
123      Mean           2.83         2.83     ...      3.11         2.55      ... 

然后,我可以通过 data.table 语法计算每个问题 ID 每月的平均评级,如下所示

ratings.dt[, 
lapply(.SD, mean),
.SDcols = x:y,       # col indexes of added date sequence columns
by = .(ISSUE_ID)]

使用我的映射表将字母数字评级(例如 AAA、B+、C- 等)转换为数值以允许基于数字的算术计算(例如平均值),我可以将数字评级平均值转换回字母数字平均值。那就意味着任务完成了!

另外,我现在不确定这个问题是否可以更有效地概念化。将不胜感激任何指示!

我们使用pivot_wider转换宽格式,按summarise进行分组,通过将另一个观察值与mean值连接来创建"平均值"行。 使用dplyrversion >=1.0时,summarise可以返回每组多行

library(dplyr)
library(tidyr)
ratings.dt %>%
select(-VALID_THRU_DATE, -RATING_DATE_SEQ) %>% 
pivot_wider(names_from = RATING_DATE, values_from = RATING) %>% 
group_by(ISSUE_ID) %>% 
summarise(RATING_TYPE = c(RATING_TYPE, "Mean"), 
across(where(is.numeric), ~ c(., mean(., na.rm = TRUE))), .groups = 'drop')

-输出

# A tibble: 4 x 11
#  ISSUE_ID RATING_TYPE `2000-01-31` `2000-05-31` `2001-03-31` `2001-05-31` `2001-04-30` `2004-01-31` `1999-04-31`
#     <int> <chr>              <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
#1      123 FR                  3.33            4         3.66         2            2.33            3        NA   
#2      123 MR                  2.66           NA         3            3.33        NA              NA         2.33
#3      123 SP                 NA              NA        NA           NA           NA              NA         3.33
#4      123 Mean                3.00            4         3.33         2.66         2.33            3         2.83
# … with 2 more variables: `2003-02-28` <dbl>, `2002-04-31` <dbl>

<小时 />

或使用data.table

library(data.table)
dt1 <- dcast(setDT(ratings.dt), ISSUE_ID + RATING_TYPE ~ RATING_DATE, 
value.var = 'RATING')
rbind(dt1, dt1[, c(.(RATING_TYPE = 'Mean'), lapply(.SD, mean, na.rm = TRUE)), .(ISSUE_ID), .SDcols = -(1:2)])
#   ISSUE_ID RATING_TYPE 1999-04-31 2000-01-31 2000-05-31 2001-03-31 2001-04-30 2001-05-31 2002-04-31 2003-02-28
#1:      123          FR         NA      3.330          4       3.66       2.33      2.000         NA         NA
#2:      123          MR       2.33      2.660         NA       3.00         NA      3.330         NA          3
#3:      123          SP       3.33         NA         NA         NA         NA         NA          3         NA
#4:      123        Mean       2.83      2.995          4       3.33       2.33      2.665          3          3
#   2004-01-31
#1:          3
#2:         NA
#3:         NA
#4:          3

数据

ratings.dt <- structure(list(ISSUE_ID = c(123L, 123L, 123L, 123L, 123L, 123L, 
123L, 123L, 123L, 123L, 123L, 123L, 123L), RATING_TYPE = c("FR", 
"FR", "FR", "FR", "FR", "FR", "MR", "MR", "MR", "MR", "MR", "SP", 
"SP"), RATING = c(3.33, 4, 3.66, 2, 2.33, 3, 2.33, 2.66, 3, 3.33, 
3, 3.33, 3), RATING_DATE = c("2000-01-31", "2000-05-31", "2001-03-31", 
"2001-05-31", "2001-04-30", "2004-01-31", "1999-04-31", "2000-01-31", 
"2001-03-31", "2001-05-31", "2003-02-28", "1999-04-31", "2002-04-31"
), VALID_THRU_DATE = c("2000-04-31", "2000-02-28", "2001-04-31", 
"2001-04-30", "2003-12-31", "2004-06-30", "1999-12-31", "2000-04-31", 
"2001-04-30", "2003-01-31", "2003-07-31", "2002-03-31", "2003-05-31"
), RATING_DATE_SEQ = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 
5L, 1L, 2L)), class = "data.frame", row.names = c(NA, -13L))

最新更新