R语言 - 按两列分组计算数据框中的百分位数



也许这是一个简单的问题,但我卡住了。

我的数据框(来自年度调查)包含了按年份和拖网(haul)记录的若干物种的体长数据。我想得到每一年中每个物种体长的第95百分位数。下面是我的数据框的一个示例:

# Sample of the survey data, as produced by dput(). Columns: year, cod_haul
# (haul code), haul, name (species), length and number.
# The value is bound to `df`, the name the answers on this page operate on.
# Every species label is written on a single line: a string literal that wraps
# across lines would embed a newline character in the factor level.
df <- structure(list(year = c(2015L, 2015L, 2015L, 2015L, 2014L, 2016L,
2015L, 2016L, 2014L, 2016L, 2015L, 2015L, 2016L, 2016L, 2014L, 2014L,
2014L, 2015L, 2016L, 2016L), cod_haul = structure(c(72L, 51L, 77L,
43L, 20L, 92L, 75L, 93L, 9L, 103L, 65L, 63L, 85L, 102L, 27L, 24L,
14L, 55L, 114L, 105L), .Label = c("N14_02", "N14_03", "N14_04",
"N14_06", "N14_07", "N14_08", "N14_10", "N14_13", "N14_16", "N14_17",
"N14_19", "N14_21", "N14_24", "N14_25", "N14_26", "N14_27", "N14_28",
"N14_29", "N14_30", "N14_32", "N14_33", "N14_35", "N14_37", "N14_39",
"N14_40", "N14_41", "N14_42", "N14_44", "N14_51", "N14_54", "N14_55",
"N14_56", "N14_57", "N14_58", "N14_61", "N14_62", "N14_64", "N14_66",
"N14_67", "N15_01", "N15_03", "N15_07", "N15_11", "N15_12", "N15_14",
"N15_16", "N15_18", "N15_19", "N15_20", "N15_22", "N15_23", "N15_24",
"N15_25", "N15_26", "N15_27", "N15_28", "N15_29", "N15_30", "N15_31",
"N15_32", "N15_36", "N15_37", "N15_39", "N15_41", "N15_44", "N15_46",
"N15_47", "N15_48", "N15_52", "N15_55", "N15_56", "N15_58", "N15_59",
"N15_60", "N15_62", "N15_63", "N15_64", "N15_66", "N15_67", "N16_04",
"N16_06", "N16_07", "N16_08", "N16_11", "N16_12", "N16_13", "N16_15",
"N16_17", "N16_18", "N16_20", "N16_22", "N16_23", "N16_25", "N16_28",
"N16_29", "N16_30", "N16_31", "N16_32", "N16_33", "N16_34", "N16_35",
"N16_37", "N16_40", "N16_41", "N16_45", "N16_46", "N16_47", "N16_48",
"N16_49", "N16_50", "N16_51", "N16_52", "N16_53", "N16_54", "N16_56",
"N16_58", "N16_60", "N16_61", "N16_62", "N16_63", "N16_64", "N16_66"),
class = "factor"), haul = c(58L, 23L, 64L, 11L, 32L, 23L, 62L, 25L,
16L, 40L, 44L, 39L, 12L, 37L, 42L, 39L, 25L, 27L, 54L, 45L), name =
structure(c(2L, 23L, 11L, 2L, 19L, 15L, 18L, 16L, 3L, 21L, 16L, 21L,
20L, 19L, 3L, 18L, 16L, 11L, 7L, 13L), .Label = c(
"Argentina sphyraena", "Arnoglossus laterna", "Blennius ocellaris",
"Boops boops", "Callionymus lyra", "Callionymus maculatus", "Capros aper",
"Cepola macrophthalma", "Chelidonichthys cuculus",
"Chelidonichthys lucerna", "Conger conger", "Eutrigla gurnardus",
"Gadiculus argenteus", "Galeus melastomus", "Helicolenus dactylopterus",
"Lepidorhombus boscii", "Lepidorhombus whiffiagonis",
"Merluccius merluccius", "Microchirus variegatus",
"Micromesistius poutassou", "Phycis blennoides", "Raja clavata",
"Scyliorhinus canicula", "Solea solea", "Trachurus trachurus",
"Trisopterus luscus"), class = "factor"),
length = c(9L, 18L, 50L, 12L, 14L, 12L, 31L, 19L, 15L,
16L, 26L, 48L, 23L, 10L, 16L, 24L, 12L, 46L, 75L, 13L), number =
c(5L, 4L, 1L, 2L, 29L, 5L, 2L, 14L, 1L, 1L, 4L, 1L, 29L, 21L, 2L,
1L, 2L, 1L, 2L, 14L)), row.names = c(NA, 20L), class =
"data.frame")

尽管我尝试了几种方法,但都没能找到解决这个问题的方法。

非常感谢任何意见或建议。

谢谢!

Ps:虽然这不是绝对必要的,但如果百分位数可以作为一个新列添加到数据框中,那就太好了。

# 95th percentile of `length` for every year/species combination.
# Returns one row per (year, name) group with the percentile in `species.95`.
# Grouping uses both `year` and `name` because the percentile is wanted per
# species within each year; `length` is the measured column in the data.
df %>%
  group_by(year, name) %>%
  summarize(species.95 = quantile(length, probs = 0.95)) %>%
  ungroup()  # drop the remaining `year` grouping so later steps are ungrouped

我无法下载您的数据框,但是您可以使用 quantile() 函数求出每个物种的第95百分位数。

如果我理解正确的话:

library(tidyverse)

# Add the per-group 95th percentile of `length` as a new column `q95`.
# mutate() keeps every original row; each row receives the percentile of its
# own (year, name) group.
df %>%
  group_by(year, name) %>%
  mutate(q95 = quantile(length, probs = 0.95)) %>%
  ungroup()  # return an ungrouped result so later verbs are not per-group

library(data.table)

# Turn `df` into a data.table in place.
setDT(df)

# Add column `q95`: the 95th percentile of `length` within each
# (year, name) group, assigned by reference.
df[, q95 := quantile(length, 0.95), by = .(year, name)]

# Display the rows sorted by species name, then year.
df[order(name, year)]

最新更新