在R中,使用值最接近目标的列的名称创建新列



我正在分析上诉中的捐赠者数据。假设我们有这个数据帧,它有一个ID,给出的金额,以及上诉卡的请求金额:

dfgive <- data.frame(id = c(1, 2, 3, 4, 5),
gift = c(20, 1000, 1500, 300, 500),
ask1 = c(50, 5000, 5000, 100, 2500),
ask2 = c(100, 3750, 3750, 250, 5000),
ask3 = c(250, 2500, 2500, 500, 10000),
gift_diff_1 = c(-30, -4000, -3500, 200, -2000),
gift_diff_2 = c(-80, -2750, -2250, 50, -4500),
gift_diff_3 = c(-230, -1500, -1000, -200, -9500),
mindiff = c(-230, -4000, -3500, -200, -9500))

每个请求金额的礼物差额和最小差额是用下面的代码创建的。

mutate(gift_diff_1 = gift - ask1) %>%
mutate(gift_diff_2 = gift - ask2) %>%
mutate(gift_diff_3 = gift - ask3) %>%
mutate(mindiff=min(gift_diff_1, gift_diff_2, gift_diff_3))

我接下来希望做的是创建一个名为gift_closest的新列,该列将确定哪一个请求金额(ask1、ask2或ask3(的绝对值最接近礼物。我们可以使用派生的gift_diff列,也可以完全在方程中进行。我不知道该怎么做。

新列的值将是"0"中的一个;ask1"ask2";,或";ask3";。理想情况下,在dplyr链中,因为这是我创建整个数据集的地方,其中将包括一堆人口统计字段。

谢谢。。。

如果有许多不同的ask列,您可以考虑重新整形数据,以减少代码重复。如果您想在该数据帧中保留其他列,则可以将下面的结果连接回原始dfgive。

library(tidyverse)
dfgive <- data.frame(
id = c(1, 2, 3, 4, 5),
gift = c(20, 1000, 1500, 300, 500),
ask1 = c(50, 5000, 5000, 100, 2500),
ask2 = c(100, 3750, 3750, 250, 5000),
ask3 = c(250, 2500, 2500, 500, 10000)
)
dfgive %>%
pivot_longer(
cols = starts_with("ask"),
names_to = "ask_num",
values_to = "ask"
) %>%
mutate(gift_diff = gift - ask) %>%
group_by(id) %>%
summarise(
mindiff = min(gift_diff),
gift_closest = ask_num[which.min(abs(gift_diff))]
)
#> # A tibble: 5 x 3
#>      id mindiff gift_closest
#>   <dbl>   <dbl> <chr>       
#> 1     1    -230 ask1        
#> 2     2   -4000 ask3        
#> 3     3   -3500 ask3        
#> 4     4    -200 ask2        
#> 5     5   -9500 ask1

首先:我想你的mindiff-部分不会达到你期望的效果。用pmin替换min

library(dplyr)
dfgive %>%
mutate(abs = pmin(abs(gift - ask1), abs(gift - ask2), abs(gift - ask3)),
gift_close = case_when(abs(gift - ask1) == abs ~ "ask1",
abs(gift - ask2) == abs ~ "ask2",
abs(gift - ask3) == abs ~ "ask3",
TRUE ~ NA_character_))

返回

id gift ask1 ask2  ask3 gift_diff_1 gift_diff_2 gift_diff_3 mindiff  abs gift_close
1  1   20   50  100   250         -30         -80        -230    -230   30       ask1
2  2 1000 5000 3750  2500       -4000       -2750       -1500   -4000 1500       ask3
3  3 1500 5000 3750  2500       -3500       -2250       -1000   -3500 1000       ask3
4  4  300  100  250   500         200          50        -200    -200   50       ask2
5  5  500 2500 5000 10000       -2000       -4500       -9500   -9500 2000       ask1

您还可以使用以下解决方案,该解决方案主要基于purrr包函数,以及您收到的非常好的答案:

library(dplyr)
library(purrr)
dfgive %>% 
select(2) %>%
map2_dfc(., dfgive %>% select(starts_with("gift_diff")), 
~ .x - abs(.y)) %>%
set_names(~ paste("ask", 1:3, sep = "_")) %>%
rowwise() %>% 
mutate(gift_close = colnames(.)[max.col(cur_data())]) %>%
select(gift_close) %>%
bind_cols(dfgive) %>%
relocate(gift_close, .after = last_col())

# A tibble: 5 x 10
# Rowwise: 
id  gift  ask1  ask2  ask3 gift_diff_1 gift_diff_2 gift_diff_3 mindiff gift_close
<dbl> <dbl> <dbl> <dbl> <dbl>       <dbl>       <dbl>       <dbl>   <dbl> <chr>     
1     1    20    50   100   250         -30         -80        -230    -230 ask_1     
2     2  1000  5000  3750  2500       -4000       -2750       -1500   -4000 ask_3     
3     3  1500  5000  3750  2500       -3500       -2250       -1000   -3500 ask_3     
4     4   300   100   250   500         200          50        -200    -200 ask_2     
5     5   500  2500  5000 10000       -2000       -4500       -9500   -9500 ask_1  

使用cur_data直接变异

dfgive %>% 
mutate(gift_closest = {tmp <- str_detect(names(.), 'ask'); names(.[tmp])[max.col(-1 * abs(gift - cur_data()[tmp]))]})
id gift ask1 ask2  ask3 gift_diff_1 gift_diff_2 gift_diff_3 mindiff gift_closest
1  1   20   50  100   250         -30         -80        -230    -230         ask1
2  2 1000 5000 3750  2500       -4000       -2750       -1500   -4000         ask3
3  3 1500 5000 3750  2500       -3500       -2250       -1000   -3500         ask3
4  4  300  100  250   500         200          50        -200    -200         ask2
5  5  500 2500 5000 10000       -2000       -4500       -9500   -9500         ask1

即使您可以继续生成所有新列,而不需要对每个请求值重复它

dfgive <- structure(list(id = c(1, 2, 3, 4, 5), gift = c(20, 1000, 1500, 
300, 500), ask1 = c(50, 5000, 5000, 100, 2500), ask2 = c(100, 
                                            3750, 3750, 250, 5000), ask3 = c(250, 2500, 2500, 500, 10000)), class = "data.frame", row.names = c(NA, 
                                                                                                                                                -5L))
dfgive
#>   id gift ask1 ask2  ask3
#> 1  1   20   50  100   250
#> 2  2 1000 5000 3750  2500
#> 3  3 1500 5000 3750  2500
#> 4  4  300  100  250   500
#> 5  5  500 2500 5000 10000
library(tidyverse)
dfgive %>% 
mutate(gift_closest = {tmp <- str_detect(names(.), 'ask'); names(.[tmp])[max.col(-1 * abs(gift - cur_data()[tmp]))]}) %>%
mutate(across(starts_with('ask'), ~ gift - .,
.names = 'gift_diff_{str_remove(.col,"ask")}')) %>%
rowwise() %>%
mutate(mindiff = min(c_across(starts_with('gift_diff'))))
#> # A tibble: 5 x 10
#> # Rowwise: 
#>      id  gift  ask1  ask2  ask3 gift_closest gift_diff_1 gift_diff_2 gift_diff_3
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <chr>              <dbl>       <dbl>       <dbl>
#> 1     1    20    50   100   250 ask1                 -30         -80        -230
#> 2     2  1000  5000  3750  2500 ask3               -4000       -2750       -1500
#> 3     3  1500  5000  3750  2500 ask3               -3500       -2250       -1000
#> 4     4   300   100   250   500 ask2                 200          50        -200
#> 5     5   500  2500  5000 10000 ask1               -2000       -4500       -9500
#> # ... with 1 more variable: mindiff <dbl>

创建于2021-06-10由reprex包(v2.0.0(

最新更新