我正试图通过删除错误添加的行来清理数据帧。
这是伪数据:
temp <- structure(list(Date = c("24/06/2002", "24/06/2002", "25/06/2002","25/06/2002", "26/06/2002",
"02/07/2002","03/07/2002","24/07/2002", "08/07/2002",
"08/07/2002", "15/07/2002", "17/07/2002",
"22/07/2002", "22/07/2002", "28/07/2002", "29/07/2002"),
payment = c(200, 1000,-1000, -1000, 1000,
-1000,-1000,-1000, 1200,
-1200, 1200, 1200,
200, 56700, -56700, -200),
Code = c("ABC", "M567", "M567","M567", "XYZ", "M567", "ABX" ,
"M567","M567", "M567",
"M567", "M567", "M300",
"M678", "M678", "ABC"),
ID = c(NA, "98","187","187","12ee","M11","M13",
NA,"K999",
"K999", "111", "111", "11",
"12345", NA, NA)), row.names = c(NA, -16L), class = "data.frame")
数据帧看起来像这个
Date payment Code ID
1 24/06/2002 200 ABC <NA>
2 24/06/2002 1000 M567 98
3 25/06/2002 -1000 M567 187
4 25/06/2002 -1000 M567 187
5 26/06/2002 1000 XYZ 12ee
6 02/07/2002 -1000 M567 M11
7 03/07/2002 -1000 ABX M13
8 24/07/2002 -1000 M567 <NA>
9 08/07/2002 1200 M567 K999
10 08/07/2002 -1200 M567 K999
11 15/07/2002 1200 M567 111
12 17/07/2002 1200 M567 111
13 22/07/2002 200 M300 11
14 22/07/2002 56700 M678 12345
15 28/07/2002 -56700 M678 <NA>
16 29/07/2002 -200 ABC <NA>
正如你所看到的,数据中有一些积极和消极的支付。负付款是错误添加的交易或退款。
例如,+1200将根据代码和ID与-1200取消,而第14行和第15行相似,但ID为NA-因此我必须用其正付款行和viseversa的ID来填充。这样我就可以删除这两行了。
我在StackOverflow程序员的帮助下尝试的代码(以前问过(:
library(dplyr)
library(data.table)
library(tidyr)
Final_df <- df1 %>%
group_by(Code) %>%
mutate(ind = rowid(payment)) %>%
group_by(ind, .add = TRUE) %>%
fill(ID, .direction = 'downup') #%>%
ungroup %>%
mutate(absPayment = abs(payment)) %>%
arrange(ID, Code, absPayment) %>%
group_by(Code, ID, absPayment) %>%
mutate(grp = rowid(sign(payment))) %>%
group_by(grp, .add = TRUE) %>%
filter(n() == 1) %>%
ungroup %>%
select(names(df1))
但这里的问题是第8行-2002年7月24日-1000 M567应该由第2行填充,因为代码和正付款是匹配的-这样以后我就可以取消这两行了。由于该行远离第8行。direction='downup'不起作用。
我认为有一种更好的方法来填充NA,而不是使用方向(因为它不会应用于遥远的类似行(
预期输出为:
Date payment Code ID
1 25/06/2002 -1000 M567 187
2 25/06/2002 -1000 M567 187
3 26/06/2002 1000 XYZ 12ee
4 02/07/2002 -1000 M567 M11
5 03/07/2002 -1000 ABX M13
6 15/07/2002 1200 M567 111
7 17/07/2002 1200 M567 111
8 22/07/2002 200 M300 11
我从5天起就对此感到震惊。任何解决方案都会非常有帮助。
提前感谢
另一种可能的伪数据:
temp_2 <- structure(list(Date = c("22/06/2002", "23/06/2002","24/06/2002", "25/06/2002","25/06/2002", "26/06/2002",
"02/07/2002","03/07/2002","24/07/2002", "08/07/2002",
"08/07/2002", "15/07/2002", "17/07/2002",
"22/07/2002", "22/07/2002", "28/07/2002", "29/07/2002"),
payment = c(200,-1000, 1000,-1000, -1000, 1000,
-1000,-1000,-1000, 1200,
-1200, 1200, 1200,
200, 56700, -56700, -200),
Code = c("ABC", "M567","M567", "M567","M567", "XYZ", "M567", "ABX" ,
"M567","M567", "M567",
"M567", "M567", "M300",
"M678", "M678", "ABC"),
ID = c(NA,"187", "98","187","187","12ee",NA,NA,
NA,"K999",
"K999", "111", "111", "11",
"12345", NA, NA)), row.names = c(NA, -17L), class = "data.frame")
temp_2的预期输出:
Date payment Code ID
1 23/06/2002 -1000 M567 187
2 25/06/2002 -1000 M567 187
3 25/06/2002 -1000 M567 187
4 26/06/2002 1000 XYZ 12ee
5 03/07/2002 -1000 ABX <NA>
6 24/07/2002 -1000 M567 98
7 15/07/2002 1200 M567 111
8 17/07/2002 1200 M567 111
9 22/07/2002 200 M300 11
我们可以使用
library(dplyr)
library(data.table)
f1 <- function(dat) {
i1 <- is.na(dat$ID) & nrow(dat) > 1
if(any(i1)) {
dat$ID[i1] <- dat$ID[!i1][match(dat$payment[i1],
-dat$payment[!i1])]
}
return(dat)
}
temp %>%
mutate(rn = row_number()) %>%
group_by(Code, absPayment = abs(payment)) %>%
filter(sum(payment) != 0) %>%
group_modify(~ f1(.x)) %>%
group_by(ID, .add = TRUE) %>%
mutate(grp = rowid(sign(payment))) %>%
group_by(grp, .add = TRUE) %>%
filter(n() == 1) %>%
ungroup %>%
arrange(rn) %>%
select(names(temp))
-输出
# A tibble: 8 × 4
Date payment Code ID
<chr> <dbl> <chr> <chr>
1 25/06/2002 -1000 M567 187
2 25/06/2002 -1000 M567 187
3 26/06/2002 1000 XYZ 12ee
4 02/07/2002 -1000 M567 M11
5 03/07/2002 -1000 ABX M13
6 15/07/2002 1200 M567 111
7 17/07/2002 1200 M567 111
8 22/07/2002 200 M300 11
对于第二种情况
temp_2 %>%
mutate(rn = row_number()) %>%
group_by(Code, absPayment = abs(payment)) %>%
filter(sum(payment) != 0) %>%
group_modify(~ f1(.x)) %>%
group_by(ID, .add = TRUE) %>%
mutate(grp = rowid(sign(payment))) %>%
group_by(grp, .add = TRUE) %>%
filter(n() == 1) %>%
ungroup %>%
arrange(rn) %>%
select(names(temp_2))
-输出
# A tibble: 9 × 4
Date payment Code ID
<chr> <dbl> <chr> <chr>
1 23/06/2002 -1000 M567 187
2 25/06/2002 -1000 M567 187
3 25/06/2002 -1000 M567 187
4 26/06/2002 1000 XYZ 12ee
5 03/07/2002 -1000 ABX <NA>
6 24/07/2002 -1000 M567 98
7 15/07/2002 1200 M567 111
8 17/07/2002 1200 M567 111
9 22/07/2002 200 M300 11
这是我解决它的尝试,诀窍是正确替换NA。
# fill NAs according to their values
temp <- temp %>%
mutate(abs_payment = abs(payment)) %>%
group_by(abs_payment, ID, Code) %>%
# should consider replacement only if ID has only one row or if it is NA
mutate(is_candidate = (n() == 1) | is.na(ID)) %>%
group_by(abs_payment, Code) %>%
# we do not want to replace IDs for non-na IDs
mutate(new_ID = case_when(is_candidate & is.na(ID) ~ na.omit(ID)[1],
TRUE ~ ID))
# remove if sum equal to 0
temp <- temp %>%
group_by(Code, new_ID, abs_payment) %>%
mutate(total = sum(payment)) %>%
filter(total != 0 )
我不确定这是否正确,但这是我的尝试。我看不出你是如何得到预期输出的。他们的其他筛选标准是什么?您的原始数据超过8行。
library(tidyverse)
temp |>
mutate(Date = lubridate::dmy(Date)) |>
arrange(Code, abs(payment)) |>
group_by(Code, abs(payment), ID) |>
mutate(n = n()) |>
ungroup()|>
group_by(Code, abs(payment), n) |>
fill(ID, .direction = "updown") |>
ungroup()|>
select(names(temp)) |>
arrange(Date, abs(payment))
#> # A tibble: 16 x 4
#> Date payment Code ID
#> <date> <dbl> <chr> <chr>
#> 1 2002-06-24 200 ABC <NA>
#> 2 2002-06-24 1000 M567 98
#> 3 2002-06-25 -1000 M567 187
#> 4 2002-06-25 -1000 M567 187
#> 5 2002-06-26 1000 XYZ 12ee
#> 6 2002-07-02 -1000 M567 M11
#> 7 2002-07-03 -1000 ABX M13
#> 8 2002-07-08 1200 M567 K999
#> 9 2002-07-08 -1200 M567 K999
#> 10 2002-07-15 1200 M567 111
#> 11 2002-07-17 1200 M567 111
#> 12 2002-07-22 200 M300 11
#> 13 2002-07-22 56700 M678 12345
#> 14 2002-07-24 -1000 M567 M11
#> 15 2002-07-28 -56700 M678 12345
#> 16 2002-07-29 -200 ABC <NA>