使用r在数据帧中查找借款人最后一次违约的开始



我正试图使用r来识别借款人默认事件的第一次出现。在下面的示例中,我在名为ID的数据集中有一个借款人。最后一列名为STAET_LAST_default_ID,是作为数据中的新列所需的输出。我希望得到一些帮助。

ACCOUNT_ID  PERIOD  Arrears_Days    DPD90   DEFAULT_DATE    START_LAST_DEFAULT_ID
#100100     202001  0               0                       0
#100100     202002  12              0                       0
#100100     202003  0               0                       0
#100100     202004  25              0                       0
#100100     202005  31              0                       0
#100100     202006  65              0                       0
#100100     202007  91              1       202007          0
#100100     202008  120             1                       0
#100100     202009  90              1                       0
#100100     202010  89              0                       0
#100100     202011  0               0                       0
#100100     202012  0               0                       0
#100100     202101  0               0                       0
#100100     202102  31              0                       0
#100100     202103  61              0                       0
#100100     202104  121             1       202104          1 #This is the answer
#100100     202105  130             1                       0
#100100     202106  130             1                       0
#100100     202107  150             1                       0
#100100     202108  120             1                       0
#100100     202109  0               0                       0

一个可能的解决方案:

library(tidyverse)
df <- data.frame(
ACCOUNT_ID = c(100100L,
100100L,100100L,100100L,100100L,100100L,
100100L,100100L,100100L,100100L,100100L,
100100L,100100L,100100L,100100L,100100L,
100100L,100100L,100100L,100100L,100100L),
PERIOD = c(202001L,
202002L,202003L,202004L,202005L,202006L,
202007L,202008L,202009L,202010L,202011L,
202012L,202101L,202102L,202103L,202104L,
202105L,202106L,202107L,202108L,202109L),
Arrears_Days = c(0L,12L,
0L,25L,31L,65L,91L,120L,90L,89L,0L,0L,
0L,31L,61L,121L,130L,130L,150L,120L,
0L),
DPD90 = c(0L,0L,
0L,0L,0L,0L,1L,1L,1L,0L,0L,0L,0L,0L,
0L,1L,1L,1L,1L,1L,0L),
DEFAULT_DATE = c(NA,NA,
NA,NA,NA,NA,202007L,NA,NA,NA,NA,NA,NA,
NA,NA,202104L,NA,NA,NA,NA,NA),
START_LAST_DEFAULT_ID = c(0L,0L,
0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
0L,1L,0L,0L,0L,0L,0L)
)
df %>% 
mutate(aux = data.table::rleid(DPD90)) %>% 
group_by(ACCOUNT_ID, aux) %>% 
mutate(DEFDATE = if_else(DPD90 == 1, first(PERIOD), NA_integer_)) %>% 
group_by(aux) %>%
mutate(DEFDATE = if_else(row_number() == 1, DEFDATE, NA_integer_)) %>%
group_by(ACCOUNT_ID) %>% 
mutate(Z=if_else(!is.na(DEFDATE) & DEFDATE==max(DEFDATE, na.rm = T),1,0)) %>%
ungroup %>% select(-aux) 
#>    ACCOUNT_ID PERIOD Arrears_Days DPD90 DEFAULT_DATE START_LAST_DEFAULT_ID
#> 1      100100 202001            0     0           NA                     0
#> 2      100100 202002           12     0           NA                     0
#> 3      100100 202003            0     0           NA                     0
#> 4      100100 202004           25     0           NA                     0
#> 5      100100 202005           31     0           NA                     0
#> 6      100100 202006           65     0           NA                     0
#> 7      100100 202007           91     1       202007                     0
#> 8      100100 202008          120     1           NA                     0
#> 9      100100 202009           90     1           NA                     0
#> 10     100100 202010           89     0           NA                     0
#> 11     100100 202011            0     0           NA                     0
#> 12     100100 202012            0     0           NA                     0
#> 13     100100 202101            0     0           NA                     0
#> 14     100100 202102           31     0           NA                     0
#> 15     100100 202103           61     0           NA                     0
#> 16     100100 202104          121     1       202104                     1
#> 17     100100 202105          130     1           NA                     0
#> 18     100100 202106          130     1           NA                     0
#> 19     100100 202107          150     1           NA                     0
#> 20     100100 202108          120     1           NA                     0
#> 21     100100 202109            0     0           NA                     0
#>    DEFDATE Z
#> 1       NA 0
#> 2       NA 0
#> 3       NA 0
#> 4       NA 0
#> 5       NA 0
#> 6       NA 0
#> 7   202007 0
#> 8       NA 0
#> 9       NA 0
#> 10      NA 0
#> 11      NA 0
#> 12      NA 0
#> 13      NA 0
#> 14      NA 0
#> 15      NA 0
#> 16  202104 1
#> 17      NA 0
#> 18      NA 0
#> 19      NA 0
#> 20      NA 0
#> 21      NA 0

相关内容

最新更新