基于r中的几个条件创建新列



我有一个由三列组成的数据框架,状态的唯一值如下"X"0";"C"1";"2";"3";"4";"5"。一开始,我不知道如何根据每个id分组并根据条件创建多个列,例如,如果状态为2,3,4,5,则目标列为1,否则为0。

month_balance表示(提取数据的月份为起始点,向后,0为当前月份,-1为前一个月,依此类推)

状态表示(0:逾期1-29天,1:逾期30-59天,2:逾期60-89天,3:逾期90-119天,4:逾期120-149天,5:逾期或坏账冲销超过150天C:当月付清,X:当月无贷款)

df <- data.frame (id  = c("5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805"),
month_balance = c("0","-1","-2","-3","-4","-5","-6","-7","-8","-9","-10","-11","-12","-13","-14","-15","0","-1","-2","-3","-4","-5","-6","-7","-8","-9","-10","-11","-12","-13","-14"),
status = c("C","C","C","C","C","C","C","C","C","C","C","C","C","1","0","X","C","C","C","C","C","C","C","C","C","C","C","C","1","0","X")
)

最后,我希望得到如下输出:

df1 <- data.frame (id  = c("5008804","5008805"),
month_begin = c("16","15"),
paid_off = c("13","12"),
num_of_pastdues = c("2","2"),
no_loan = c("1","1"),
target = c("0","0"))

base R的解决方案可以是创建一个自定义函数并将其应用于每个组,即

MyFunction <- function(x){
month_begin = length(x)
paid_off = sum(x == 'C')
num_of_pastdues = sum(x %in% 0:5)
no_loan = sum(x == 'X')
target = ifelse(any(x %in% 2:5), 1, 0)
return(c(month_begin=month_begin, paid_off=paid_off, num_of_pastdues=num_of_pastdues, no_loan=no_loan, target=target))
}
res <- t(sapply(split(df$status, df$id), MyFunction))
month_begin paid_off num_of_pastdues no_loan target
#    5008804 16          13       2               1       0     
#    5008805 15          12       2               1       0

要使它成为一个列id为then的数据帧,

res_df <- data.frame(res)
res_df$id <- rownames(res_df)
rownames(res_df) <- NULL
res_df
#month_begin paid_off num_of_pastdues no_loan target      id
#1          16       13               2       1      0 5008804
#2          15       12               2       1      0 5008805

不太确定如何为target编码,因为每个id的状态都出现了目标0和1多次。

下面是我如何构造其他变量的:

df %>% 
group_by(id) %>% 
summarise(
month_begin=max(abs(as.numeric(month_balance)))+1, 
paid_off=sum(status=="C"), 
num_of_pastdues=sum(status %in% 0:5), 
no_loan=sum(status=="X"))
# A tibble: 2 x 5
id      month_begin paid_off num_of_pastdues no_loan
<chr>         <dbl>    <int>           <int>   <int>
1 5008804          16       13               2       1
2 5008805          15       12               2       1
library(tidyverse)
df <- data.frame (id  = c("5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008804","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805","5008805"),
month_balance = c("0","-1","-2","-3","-4","-5","-6","-7","-8","-9","-10","-11","-12","-13","-14","-15","0","-1","-2","-3","-4","-5","-6","-7","-8","-9","-10","-11","-12","-13","-14"),
status = c("C","C","C","C","C","C","C","C","C","C","C","C","C","1","0","X","C","C","C","C","C","C","C","C","C","C","C","C","1","0","X")
) %>% 
as_tibble()
df %>%  
mutate(target = case_when(status %in% c(2, 3, 4, 5) ~ 1, 
TRUE ~ 0), 
paid_off = case_when(status == "C" ~ 1, 
TRUE ~ 0), 
no_loan = case_when(status == "X" ~ 1,
TRUE ~ 0)) %>%  

group_by(id) %>%  
summarise(month_begin = n(), 
across(c(paid_off, no_loan, target), sum))
#> # A tibble: 2 x 5
#>   id      month_begin paid_off no_loan target
#>   <chr>         <int>    <dbl>   <dbl>  <dbl>
#> 1 5008804          16       13       1      0
#> 2 5008805          15       12       1      0

由reprex包(v2.0.1)创建于2022-06-29

您可以尝试使用dplyr。首先,您可以创建具有所需条件的变量,然后可以使用summary来计算每个组满足条件的次数。

df <- df %>%
mutate(num_of_pastdues = case_when(
status %in% c(2,3,4,5) ~ 1,
TRUE ~ 0
)) %>%
mutate(no_loan  = case_when(
status == "X" ~ 1,
TRUE ~ 0
)) %>%
mutate(paid_off  = case_when(
status == "C" ~ 1,
TRUE ~ 0
)) %>%
group_by(id) %>% 
summarise(num_of_pastdues = sum(num_of_pastdues), no_loan = sum(no_loan), paid_off = sum(paid_off))

最新更新