我正在分析上诉中的捐赠者数据。假设我们有这个数据帧,它有一个ID,给出的金额,以及上诉卡的请求金额:
dfgive <- data.frame(id = c(1, 2, 3, 4, 5),
gift = c(20, 1000, 1500, 300, 500),
ask1 = c(50, 5000, 5000, 100, 2500),
ask2 = c(100, 3750, 3750, 250, 5000),
ask3 = c(250, 2500, 2500, 500, 10000),
gift_diff_1 = c(-30, -4000, -3500, 200, -2000),
gift_diff_2 = c(-80, -2750, -2250, 50, -4500),
gift_diff_3 = c(-230, -1500, -1000, -200, -9500),
mindiff = c(-230, -4000, -3500, -200, -9500))
每个请求金额的礼物差额和最小差额是用下面的代码创建的。
mutate(gift_diff_1 = gift - ask1) %>%
mutate(gift_diff_2 = gift - ask2) %>%
mutate(gift_diff_3 = gift - ask3) %>%
mutate(mindiff=min(gift_diff_1, gift_diff_2, gift_diff_3))
我接下来希望做的是创建一个名为gift_closest
的新列,该列将确定哪一个请求金额(ask1、ask2或ask3(的绝对值最接近礼物。我们可以使用派生的gift_diff列,也可以完全在方程中进行。我不知道该怎么做。
新列的值将是"0"中的一个;ask1"ask2";,或";ask3";。理想情况下,在dplyr
链中,因为这是我创建整个数据集的地方,其中将包括一堆人口统计字段。
谢谢。。。
如果有许多不同的ask
列,您可以考虑重新整形数据,以减少代码重复。如果您想在该数据帧中保留其他列,则可以将下面的结果连接回原始dfgive。
library(tidyverse)
dfgive <- data.frame(
id = c(1, 2, 3, 4, 5),
gift = c(20, 1000, 1500, 300, 500),
ask1 = c(50, 5000, 5000, 100, 2500),
ask2 = c(100, 3750, 3750, 250, 5000),
ask3 = c(250, 2500, 2500, 500, 10000)
)
dfgive %>%
pivot_longer(
cols = starts_with("ask"),
names_to = "ask_num",
values_to = "ask"
) %>%
mutate(gift_diff = gift - ask) %>%
group_by(id) %>%
summarise(
mindiff = min(gift_diff),
gift_closest = ask_num[which.min(abs(gift_diff))]
)
#> # A tibble: 5 x 3
#> id mindiff gift_closest
#> <dbl> <dbl> <chr>
#> 1 1 -230 ask1
#> 2 2 -4000 ask3
#> 3 3 -3500 ask3
#> 4 4 -200 ask2
#> 5 5 -9500 ask1
首先:我想你的mindiff
-部分不会达到你期望的效果。用pmin
替换min
。
library(dplyr)
dfgive %>%
mutate(abs = pmin(abs(gift - ask1), abs(gift - ask2), abs(gift - ask3)),
gift_close = case_when(abs(gift - ask1) == abs ~ "ask1",
abs(gift - ask2) == abs ~ "ask2",
abs(gift - ask3) == abs ~ "ask3",
TRUE ~ NA_character_))
返回
id gift ask1 ask2 ask3 gift_diff_1 gift_diff_2 gift_diff_3 mindiff abs gift_close
1 1 20 50 100 250 -30 -80 -230 -230 30 ask1
2 2 1000 5000 3750 2500 -4000 -2750 -1500 -4000 1500 ask3
3 3 1500 5000 3750 2500 -3500 -2250 -1000 -3500 1000 ask3
4 4 300 100 250 500 200 50 -200 -200 50 ask2
5 5 500 2500 5000 10000 -2000 -4500 -9500 -9500 2000 ask1
您还可以使用以下解决方案,该解决方案主要基于purrr
包函数,以及您收到的非常好的答案:
library(dplyr)
library(purrr)
dfgive %>%
select(2) %>%
map2_dfc(., dfgive %>% select(starts_with("gift_diff")),
~ .x - abs(.y)) %>%
set_names(~ paste("ask", 1:3, sep = "_")) %>%
rowwise() %>%
mutate(gift_close = colnames(.)[max.col(cur_data())]) %>%
select(gift_close) %>%
bind_cols(dfgive) %>%
relocate(gift_close, .after = last_col())
# A tibble: 5 x 10
# Rowwise:
id gift ask1 ask2 ask3 gift_diff_1 gift_diff_2 gift_diff_3 mindiff gift_close
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 20 50 100 250 -30 -80 -230 -230 ask_1
2 2 1000 5000 3750 2500 -4000 -2750 -1500 -4000 ask_3
3 3 1500 5000 3750 2500 -3500 -2250 -1000 -3500 ask_3
4 4 300 100 250 500 200 50 -200 -200 ask_2
5 5 500 2500 5000 10000 -2000 -4500 -9500 -9500 ask_1
使用cur_data
直接变异
dfgive %>%
mutate(gift_closest = {tmp <- str_detect(names(.), 'ask'); names(.[tmp])[max.col(-1 * abs(gift - cur_data()[tmp]))]})
id gift ask1 ask2 ask3 gift_diff_1 gift_diff_2 gift_diff_3 mindiff gift_closest
1 1 20 50 100 250 -30 -80 -230 -230 ask1
2 2 1000 5000 3750 2500 -4000 -2750 -1500 -4000 ask3
3 3 1500 5000 3750 2500 -3500 -2250 -1000 -3500 ask3
4 4 300 100 250 500 200 50 -200 -200 ask2
5 5 500 2500 5000 10000 -2000 -4500 -9500 -9500 ask1
即使您可以继续生成所有新列,而不需要对每个请求值重复它
dfgive <- structure(list(id = c(1, 2, 3, 4, 5), gift = c(20, 1000, 1500,
300, 500), ask1 = c(50, 5000, 5000, 100, 2500), ask2 = c(100,
3750, 3750, 250, 5000), ask3 = c(250, 2500, 2500, 500, 10000)), class = "data.frame", row.names = c(NA,
-5L))
dfgive
#> id gift ask1 ask2 ask3
#> 1 1 20 50 100 250
#> 2 2 1000 5000 3750 2500
#> 3 3 1500 5000 3750 2500
#> 4 4 300 100 250 500
#> 5 5 500 2500 5000 10000
library(tidyverse)
dfgive %>%
mutate(gift_closest = {tmp <- str_detect(names(.), 'ask'); names(.[tmp])[max.col(-1 * abs(gift - cur_data()[tmp]))]}) %>%
mutate(across(starts_with('ask'), ~ gift - .,
.names = 'gift_diff_{str_remove(.col,"ask")}')) %>%
rowwise() %>%
mutate(mindiff = min(c_across(starts_with('gift_diff'))))
#> # A tibble: 5 x 10
#> # Rowwise:
#> id gift ask1 ask2 ask3 gift_closest gift_diff_1 gift_diff_2 gift_diff_3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 20 50 100 250 ask1 -30 -80 -230
#> 2 2 1000 5000 3750 2500 ask3 -4000 -2750 -1500
#> 3 3 1500 5000 3750 2500 ask3 -3500 -2250 -1000
#> 4 4 300 100 250 500 ask2 200 50 -200
#> 5 5 500 2500 5000 10000 ask1 -2000 -4500 -9500
#> # ... with 1 more variable: mindiff <dbl>
创建于2021-06-10由reprex包(v2.0.0(