r-有条件地清理数据帧



我正试图通过删除错误添加的行来清理数据帧。

这是伪数据:

temp <- structure(list(Date = c("24/06/2002", "24/06/2002", "25/06/2002","25/06/2002", "26/06/2002", 
"02/07/2002","03/07/2002","24/07/2002", "08/07/2002",
"08/07/2002", "15/07/2002", "17/07/2002", 
"22/07/2002", "22/07/2002", "28/07/2002", "29/07/2002"), 
payment = c(200, 1000,-1000, -1000, 1000,
-1000,-1000,-1000, 1200,
-1200, 1200, 1200,
200, 56700, -56700, -200), 
Code = c("ABC", "M567", "M567","M567", "XYZ", "M567", "ABX" ,
"M567","M567", "M567", 
"M567", "M567", "M300", 
"M678", "M678", "ABC"), 
ID = c(NA, "98","187","187","12ee","M11","M13",
NA,"K999", 
"K999", "111", "111", "11",
"12345", NA, NA)), row.names = c(NA, -16L), class = "data.frame")

数据帧看起来像这个

Date payment Code    ID
1  24/06/2002     200  ABC  <NA>
2  24/06/2002    1000 M567    98
3  25/06/2002   -1000 M567   187
4  25/06/2002   -1000 M567   187
5  26/06/2002    1000  XYZ  12ee
6  02/07/2002   -1000 M567   M11
7  03/07/2002   -1000  ABX   M13
8  24/07/2002   -1000 M567  <NA>
9  08/07/2002    1200 M567  K999
10 08/07/2002   -1200 M567  K999
11 15/07/2002    1200 M567   111
12 17/07/2002    1200 M567   111
13 22/07/2002     200 M300    11
14 22/07/2002   56700 M678 12345
15 28/07/2002  -56700 M678  <NA>
16 29/07/2002    -200  ABC  <NA>

正如你所看到的,数据中有一些积极和消极的支付。负付款是错误添加的交易或退款。

例如,+1200将根据代码和ID-1200取消,而第14行和第15行相似,但ID为NA-因此我必须用其正付款行和viseversa的ID来填充。这样我就可以删除这两行了。

我在StackOverflow程序员的帮助下尝试的代码(以前问过(

library(dplyr)
library(data.table)
library(tidyr)
Final_df <- df1 %>% 
group_by(Code) %>%
mutate(ind = rowid(payment)) %>%
group_by(ind, .add = TRUE) %>% 
fill(ID, .direction = 'downup') #%>% 
ungroup %>%
mutate(absPayment = abs(payment)) %>% 
arrange(ID, Code, absPayment) %>%
group_by(Code, ID, absPayment) %>%
mutate(grp = rowid(sign(payment))) %>% 
group_by(grp, .add = TRUE) %>%
filter(n() == 1) %>% 
ungroup %>%   
select(names(df1)) 

但这里的问题是第8行-2002年7月24日-1000 M567应该由第2行填充,因为代码和正付款是匹配的-这样以后我就可以取消这两行了。由于该行远离第8行。direction='downup'不起作用。

我认为有一种更好的方法来填充NA,而不是使用方向(因为它不会应用于遥远的类似行(

预期输出为:

Date payment Code    ID
1  25/06/2002   -1000 M567   187
2  25/06/2002   -1000 M567   187
3  26/06/2002    1000  XYZ  12ee
4  02/07/2002   -1000 M567   M11
5  03/07/2002   -1000  ABX   M13
6  15/07/2002    1200 M567   111
7  17/07/2002    1200 M567   111
8  22/07/2002     200 M300    11

我从5天起就对此感到震惊。任何解决方案都会非常有帮助。

提前感谢

另一种可能的伪数据:

temp_2 <-  structure(list(Date = c("22/06/2002", "23/06/2002","24/06/2002", "25/06/2002","25/06/2002", "26/06/2002", 
"02/07/2002","03/07/2002","24/07/2002", "08/07/2002",
"08/07/2002", "15/07/2002", "17/07/2002", 
"22/07/2002", "22/07/2002", "28/07/2002", "29/07/2002"), 
payment = c(200,-1000, 1000,-1000, -1000, 1000,
-1000,-1000,-1000, 1200,
-1200, 1200, 1200,
200, 56700, -56700, -200), 
Code = c("ABC", "M567","M567", "M567","M567", "XYZ", "M567", "ABX" ,
"M567","M567", "M567", 
"M567", "M567", "M300", 
"M678", "M678", "ABC"), 
ID = c(NA,"187", "98","187","187","12ee",NA,NA,
NA,"K999", 
"K999", "111", "111", "11",
"12345", NA, NA)), row.names = c(NA, -17L), class = "data.frame")

temp_2的预期输出:

Date payment Code    ID
1  23/06/2002   -1000 M567   187
2  25/06/2002   -1000 M567   187
3  25/06/2002   -1000 M567   187
4  26/06/2002    1000  XYZ  12ee
5  03/07/2002   -1000  ABX  <NA>
6  24/07/2002   -1000 M567   98
7 15/07/2002    1200 M567   111
8 17/07/2002    1200 M567   111
9 22/07/2002     200 M300    11

我们可以使用

library(dplyr)
library(data.table)
f1 <- function(dat) {
i1 <- is.na(dat$ID) & nrow(dat) > 1
if(any(i1)) {
dat$ID[i1] <- dat$ID[!i1][match(dat$payment[i1], 
-dat$payment[!i1])]
}
return(dat)
}

temp %>%
mutate(rn = row_number()) %>%
group_by(Code, absPayment = abs(payment)) %>%
filter(sum(payment) != 0) %>%
group_modify(~ f1(.x)) %>%
group_by(ID, .add = TRUE) %>%
mutate(grp = rowid(sign(payment))) %>% 
group_by(grp, .add = TRUE) %>%
filter(n() == 1) %>% 
ungroup %>%
arrange(rn) %>%
select(names(temp)) 

-输出

# A tibble: 8 × 4
Date       payment Code  ID   
<chr>        <dbl> <chr> <chr>
1 25/06/2002   -1000 M567  187  
2 25/06/2002   -1000 M567  187  
3 26/06/2002    1000 XYZ   12ee 
4 02/07/2002   -1000 M567  M11  
5 03/07/2002   -1000 ABX   M13  
6 15/07/2002    1200 M567  111  
7 17/07/2002    1200 M567  111  
8 22/07/2002     200 M300  11   

对于第二种情况

temp_2 %>%
mutate(rn = row_number()) %>% 
group_by(Code, absPayment = abs(payment)) %>%
filter(sum(payment) != 0) %>%
group_modify(~ f1(.x)) %>%
group_by(ID, .add = TRUE) %>%
mutate(grp = rowid(sign(payment))) %>% 
group_by(grp, .add = TRUE) %>%
filter(n() == 1) %>% 
ungroup %>%
arrange(rn) %>%
select(names(temp_2))

-输出

# A tibble: 9 × 4
Date       payment Code  ID   
<chr>        <dbl> <chr> <chr>
1 23/06/2002   -1000 M567  187  
2 25/06/2002   -1000 M567  187  
3 25/06/2002   -1000 M567  187  
4 26/06/2002    1000 XYZ   12ee 
5 03/07/2002   -1000 ABX   <NA> 
6 24/07/2002   -1000 M567  98   
7 15/07/2002    1200 M567  111  
8 17/07/2002    1200 M567  111  
9 22/07/2002     200 M300  11   

这是我解决它的尝试,诀窍是正确替换NA。

# fill NAs according to their values 
temp <- temp %>% 
mutate(abs_payment = abs(payment)) %>% 
group_by(abs_payment, ID, Code) %>% 
# should consider replacement only if ID has only one row or if it is NA
mutate(is_candidate = (n() == 1) | is.na(ID)) %>%
group_by(abs_payment, Code) %>% 
# we do not want to replace IDs for non-na IDs 
mutate(new_ID = case_when(is_candidate & is.na(ID) ~ na.omit(ID)[1],
TRUE ~ ID))

# remove if sum equal to 0 
temp <- temp %>% 
group_by(Code, new_ID, abs_payment) %>% 
mutate(total = sum(payment)) %>% 
filter(total != 0 )

我不确定这是否正确,但这是我的尝试。我看不出你是如何得到预期输出的。他们的其他筛选标准是什么?您的原始数据超过8行。

library(tidyverse)
temp |>
mutate(Date = lubridate::dmy(Date)) |>
arrange(Code, abs(payment)) |>
group_by(Code, abs(payment), ID) |>
mutate(n = n()) |>
ungroup()|>
group_by(Code, abs(payment), n) |>
fill(ID, .direction = "updown") |>
ungroup()|>
select(names(temp)) |>
arrange(Date, abs(payment))
#> # A tibble: 16 x 4
#>    Date       payment Code  ID   
#>    <date>       <dbl> <chr> <chr>
#>  1 2002-06-24     200 ABC   <NA> 
#>  2 2002-06-24    1000 M567  98   
#>  3 2002-06-25   -1000 M567  187  
#>  4 2002-06-25   -1000 M567  187  
#>  5 2002-06-26    1000 XYZ   12ee 
#>  6 2002-07-02   -1000 M567  M11  
#>  7 2002-07-03   -1000 ABX   M13  
#>  8 2002-07-08    1200 M567  K999 
#>  9 2002-07-08   -1200 M567  K999 
#> 10 2002-07-15    1200 M567  111  
#> 11 2002-07-17    1200 M567  111  
#> 12 2002-07-22     200 M300  11   
#> 13 2002-07-22   56700 M678  12345
#> 14 2002-07-24   -1000 M567  M11  
#> 15 2002-07-28  -56700 M678  12345
#> 16 2002-07-29    -200 ABC   <NA>

相关内容

  • 没有找到相关文章

最新更新