我有这个数据框架:
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9), year = c("2017", "2018",
"2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026",
"2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024",
"2025", "2026", "2017", "2018", "2019", "2020", "2021", "2022",
"2023", "2024", "2025", "2026", "2017", "2018", "2019", "2020",
"2021", "2022", "2023", "2024", "2025", "2026", "2017", "2018",
"2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026",
"2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024",
"2025", "2026", "2017", "2018", "2019", "2020", "2021", "2022",
"2023", "2024", "2025", "2026", "2017", "2018", "2019", "2020",
"2021", "2022", "2023", "2024", "2025", "2026", "2017", "2018",
"2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026"
), volume = c(0.0013, 0.0013, 0.0012579, 0.0011895, 0.0011421,
0.0010842, 0.0010211, 0.0010158, 0.00099474, 0.00092632, 0.07878,
0.078791, 0.077295, 0.076638, 0.075538, 0.074468, 0.074776, 0.074051,
0.071706, 0.068056, 0.023269, 0.023011, 0.022374, 0.021962, 0.021408,
0.020949, 0.020811, 0.020354, 0.019309, 0.018042, 0.0004, 0.0004,
0.00038421, 0.00035263, 0.00033158, 0.00032105, 0.00026842, 0.00028421,
0.00026842, 0.00024211, 0.0002, 0.0001, 0.00011579, 0, 0, 0,
0, 0, 0, 0, 0.028422, 0.028361, 0.027768, 0.027501, 0.027029,
0.02651, 0.026588, 0.026209, 0.025094, 0.023391, 0.0001, 0.0001,
0, 0, 0, 0, 0, 0, 0, 0, 0.0047, 0.0047158, 0.0048368, 0.0048316,
0.0049263, 0.0049737, 0.0049947, 0.0051684, 0.0052526, 0.0051842,
0.0106, 0.010389, 0.010279, 0.010005, 0.0098421, 0.0096368, 0.0094053,
0.0093368, 0.0092526, 0.0089316)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -90L))
它看起来像这样:
# A tibble: 6 × 3
id year volume
<dbl> <chr> <dbl>
1 1 2017 0.0013
2 1 2018 0.0013
3 1 2019 0.00126
4 1 2020 0.00119
5 1 2021 0.00114
6 1 2022 0.00108
Id有9个不同的Id,每个Id有10行。现在我想找到列volume
的最大值,然后过滤掉组(或者只是创建一个额外的列,如inTop3
),突出显示那些在前3个最高音量值中的id。
这可能意味着最大的3个值在ID = 2的组中。但是我只想比较每组的最大值与其他组的最大值。
获得每组的最大值是微不足道的:
df %>%
group_by(id) %>%
mutate(
m = max(volume)
)
但是我有点不知道如何继续下去。特别是我想知道如何创建一个布尔列来指示一个组是否在前3名中。
另一个可能的解决方案:
library(dplyr)
df %>%
group_by(id) %>%
summarise(m = max(volume)) %>%
slice_max(m, n = 3)
#> # A tibble: 3 × 2
#> id m
#> <dbl> <dbl>
#> 1 2 0.0788
#> 2 6 0.0284
#> 3 3 0.0233
获取3个max-values中每一个的整个组:
library(tidyverse)
df %>%
group_by(id) %>%
summarise(m = max(volume)) %>%
slice_max(m, n = 3) %>%
group_split(id) %>%
map(~ inner_join(df, .x, by = "id"))
#> [[1]]
#> # A tibble: 10 × 4
#> id year volume m
#> <dbl> <chr> <dbl> <dbl>
#> 1 2 2017 0.0788 0.0788
#> 2 2 2018 0.0788 0.0788
#> 3 2 2019 0.0773 0.0788
#> 4 2 2020 0.0766 0.0788
#> 5 2 2021 0.0755 0.0788
#> 6 2 2022 0.0745 0.0788
#> 7 2 2023 0.0748 0.0788
#> 8 2 2024 0.0741 0.0788
#> 9 2 2025 0.0717 0.0788
#> 10 2 2026 0.0681 0.0788
#>
#> [[2]]
#> # A tibble: 10 × 4
#> id year volume m
#> <dbl> <chr> <dbl> <dbl>
#> 1 3 2017 0.0233 0.0233
#> 2 3 2018 0.0230 0.0233
#> 3 3 2019 0.0224 0.0233
#> 4 3 2020 0.0220 0.0233
#> 5 3 2021 0.0214 0.0233
#> 6 3 2022 0.0209 0.0233
#> 7 3 2023 0.0208 0.0233
#> 8 3 2024 0.0204 0.0233
#> 9 3 2025 0.0193 0.0233
#> 10 3 2026 0.0180 0.0233
#>
#> [[3]]
#> # A tibble: 10 × 4
#> id year volume m
#> <dbl> <chr> <dbl> <dbl>
#> 1 6 2017 0.0284 0.0284
#> 2 6 2018 0.0284 0.0284
#> 3 6 2019 0.0278 0.0284
#> 4 6 2020 0.0275 0.0284
#> 5 6 2021 0.0270 0.0284
#> 6 6 2022 0.0265 0.0284
#> 7 6 2023 0.0266 0.0284
#> 8 6 2024 0.0262 0.0284
#> 9 6 2025 0.0251 0.0284
#> 10 6 2026 0.0234 0.0284
您可以使用dplyr::top_n
df %>%
group_by(id) %>%
arrange(id, desc(volume)) %>%
top_n(3)
id year volume
<dbl> <chr> <dbl>
1 1 2017 0.0013
2 1 2018 0.0013
3 1 2019 0.00126
4 2 2018 0.0788
5 2 2017 0.0788
6 2 2019 0.0773
7 3 2017 0.0233
8 3 2018 0.0230
9 3 2019 0.0224
10 4 2017 0.0004
# … with 24 more rows
<标题>超越/nottop3 h1> div class="one_answers">对我来说,这是最接近我想要的:
df %>%
group_by(id) %>%
summarise(m = max(volume)) %>%
arrange(desc(m)) %>%
mutate(top3 = if_else(row_number() %in% c(1, 2, 3), T, F)) %>%
inner_join(., df, by = c("id")) -> top3
标题>