R:计算百分位数的快捷方式



我正在使用R编程语言。

我有以下数据集:

set.seed(123)
library(dplyr)
# Build the example data set: four normally distributed numeric columns and
# two two-level factors, all drawn at the same length.
n_obs <- 10000
var1 <- rnorm(n_obs, 100, 100)
var2 <- rnorm(n_obs, 100, 100)
var3 <- rnorm(n_obs, 100, 100)
var4 <- rnorm(n_obs, 100, 100)
# The factors are sampled at full length; with only 1000 values, data.frame()
# would silently recycle them ten times to match the numeric columns.
var5 <- factor(sample(c("Yes", "No"), n_obs, replace = TRUE, prob = c(0.4, 0.6)))
var6 <- factor(sample(c("Yes", "No"), n_obs, replace = TRUE, prob = c(0.4, 0.6)))
my_data <- data.frame(var1, var2, var3, var4, var5, var6)

我想根据分类变量,对此数据集中的不同列计算"分组百分位数"(例如,按任意的分位水平)。

最初,我试着用一个函数来做这件事——但这一直给我带来很多困难(例如R: difficulty calculation Percentiles?)。

因此,我试着"手动"完成这件事。例如,假设:

  • 基于var5和var6的分组
  • 我想创建变量"class3",把var3按百分位数分成10组
  • 我想创建变量"class4",把var4按百分位数分成20组

例如,下面是我尝试过的两种不同方法:

方法1:会产生一些NA?

library(dplyr)
# Attempt 1: within each (var5, var6) group, label every row with its ntile()
# bin plus a "lower to upper" range built from quantile().
final = my_data %>% group_by(var5, var6) %>%
# class3: one case_when() branch per decile of var3 (ntile bins 1..10).
mutate(class3 = case_when(ntile(var3, 10) == 1 ~ paste0(round(min(var3), 2), " to ", round(quantile(var3, 0.1), 2), " decile 1"),
ntile(var3, 10) == 2 ~ paste0(round(quantile(var3, 0.1), 2), " to ", round(quantile(var3, 0.2), 2), " decile 2"),
ntile(var3, 10) == 3 ~ paste0(round(quantile(var3, 0.2), 2), " to ", round(quantile(var3, 0.3), 2), " decile 3"),
ntile(var3, 10) == 4 ~ paste0(round(quantile(var3, 0.3), 2), " to ", round(quantile(var3, 0.4), 2), " decile 4"),
ntile(var3, 10) == 5 ~ paste0(round(quantile(var3, 0.4), 2), " to ", round(quantile(var3, 0.5), 2), " decile 5"),
ntile(var3, 10) == 6 ~ paste0(round(quantile(var3, 0.5), 2), " to ", round(quantile(var3, 0.6), 2), " decile 6"),
ntile(var3, 10) == 7 ~ paste0(round(quantile(var3, 0.6), 2), " to ", round(quantile(var3, 0.7), 2), " decile 7"),
ntile(var3, 10) == 8 ~ paste0(round(quantile(var3, 0.7), 2), " to ", round(quantile(var3, 0.8), 2), " decile 8"),
ntile(var3, 10) == 9 ~ paste0(round(quantile(var3, 0.8), 2), " to ", round(quantile(var3, 0.9), 2), " decile 9"),
ntile(var3, 10) == 10 ~ paste0(round(quantile(var3, 0.9), 2), " to ", round(max(var3), 2), " decile 10"))) %>%
# class4: ntile(var4, 20) assigns bins 1..20, but only bins 1..5 have a branch
# below; case_when() yields NA when no branch matches, so rows in bins 6..20
# end up NA.
# NOTE(review): the range bounds step by 0.1 although 20 bins would each be
# 0.05 wide -- confirm the intended number of bins.
mutate(class4 = case_when(ntile(var4, 20) == 1 ~ paste0(round(min(var4), 2), " to ", round(quantile(var4, 0.1), 2), " pcile 1"),
ntile(var4, 20) == 2 ~ paste0(round(quantile(var4, 0.1), 2), " to ", round(quantile(var4, 0.2), 2), " pcile 2"),
ntile(var4, 20) == 3 ~ paste0(round(quantile(var4, 0.2), 2), " to ", round(quantile(var4, 0.3), 2), " pcile 3"),
ntile(var4, 20) == 4 ~ paste0(round(quantile(var4, 0.3), 2), " to ", round(quantile(var4, 0.4), 2), " pcile 4"),
ntile(var4, 20) == 5 ~ paste0(round(quantile(var4, 0.4), 2), " to ", round(quantile(var4, 0.5), 2), " pcile 5")))

方法2:产生的NA更少?

# Attempt 2: bin a column with cut() at the given quantile probabilities and
# append a "(min to max)" suffix to each bin label.
# NOTE(review): the suffix uses min(x)/max(x) of the whole group, not the
# bounds of the individual bin -- confirm that this is the intended label.
label_ptile_bins <- function(x, probs) {
  bin_names <- paste0("ptile ", seq_len(length(probs) + 1))
  bins <- cut(x, breaks = c(-Inf, quantile(x, probs), Inf), labels = bin_names)
  paste0(bins, " (", round(min(x), 2), " to ", round(max(x), 2), ")")
}

final <- my_data %>%
  group_by(var5, var6) %>%
  mutate(
    # var3 is split at the nine decile cut points (10 bins).
    class3 = label_ptile_bins(var3, c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)),
    # var4 is split at the four quintile cut points (5 bins).
    class4 = label_ptile_bins(var4, c(0.2, 0.4, 0.6, 0.8))
  )

我认为方法2可能更正确,因为它产生的NA值更少。不过,是否有人可以帮我验证方法2的做法是否正确?如果不正确,我该如何改正?

谢谢!

我们可以使用quantile()函数在R中轻松计算百分位数,该函数使用以下语法:

quantile(x, probs = seq(0, 1, 0.25))

x:要计算百分位数的数值向量;probs:取值在[0,1]内的概率数值向量,表示要计算的百分位数

# Draw 100 random values, uniformly distributed between 0 and 500.
data <- runif(n = 100, min = 0, max = 500)

# Quartiles: the 25th, 50th and 75th percentiles.
# (The sample outputs shown below depend on the random draw.)
quantile(data, probs = c(0.25, 0.50, 0.75))
#      25%       50%       75% 
# 97.78961 225.07593 356.47943 

# Deciles: the 10th, 20th, ..., 90th percentiles.
quantile(data, probs = seq(from = 0.1, to = 0.9, by = 0.1))
#      10%       20%       30%       40%       50%       60%       70%       80% 
# 45.92510  87.16659 129.49574 178.27989 225.07593 300.79690 337.84393 386.36108 
#      90% 
#423.28070

# Arbitrary percentiles: the 37th, 53rd and 87th.
quantile(data, probs = c(0.37, 0.53, 0.87))
#     37%      53%      87% 
#159.9561 239.8420 418.4787

对于方法1,我认为您得到的结果大多是NA,是因为应该使用ntile(..., 5)而不是ntile(..., 20)

# Corrected attempt 1: within each (var5, var6) group, label every row with
# its ntile() bin and the quantile range that bin corresponds to.
final <- my_data %>%
  group_by(var5, var6) %>%
  # class3: ten bins of var3, each labelled with bounds 0.1 apart.
  mutate(
    class3 = case_when(
      ntile(var3, 10) == 1 ~ paste0(round(min(var3), 2), " to ", round(quantile(var3, 0.1), 2), " decile 1"),
      ntile(var3, 10) == 2 ~ paste0(round(quantile(var3, 0.1), 2), " to ", round(quantile(var3, 0.2), 2), " decile 2"),
      ntile(var3, 10) == 3 ~ paste0(round(quantile(var3, 0.2), 2), " to ", round(quantile(var3, 0.3), 2), " decile 3"),
      ntile(var3, 10) == 4 ~ paste0(round(quantile(var3, 0.3), 2), " to ", round(quantile(var3, 0.4), 2), " decile 4"),
      ntile(var3, 10) == 5 ~ paste0(round(quantile(var3, 0.4), 2), " to ", round(quantile(var3, 0.5), 2), " decile 5"),
      ntile(var3, 10) == 6 ~ paste0(round(quantile(var3, 0.5), 2), " to ", round(quantile(var3, 0.6), 2), " decile 6"),
      ntile(var3, 10) == 7 ~ paste0(round(quantile(var3, 0.6), 2), " to ", round(quantile(var3, 0.7), 2), " decile 7"),
      ntile(var3, 10) == 8 ~ paste0(round(quantile(var3, 0.7), 2), " to ", round(quantile(var3, 0.8), 2), " decile 8"),
      ntile(var3, 10) == 9 ~ paste0(round(quantile(var3, 0.8), 2), " to ", round(quantile(var3, 0.9), 2), " decile 9"),
      ntile(var3, 10) == 10 ~ paste0(round(quantile(var3, 0.9), 2), " to ", round(max(var3), 2), " decile 10")
    )
  ) %>%
  # class4: five bins of var4, so every bound is a multiple of 0.2. The first
  # bin therefore ends at the 0.2 quantile (not 0.1), matching the lower bound
  # of bin 2.
  mutate(
    class4 = case_when(
      ntile(var4, 5) == 1 ~ paste0(round(min(var4), 2), " to ", round(quantile(var4, 0.2), 2), " pcile 1"),
      ntile(var4, 5) == 2 ~ paste0(round(quantile(var4, 0.2), 2), " to ", round(quantile(var4, 0.4), 2), " pcile 2"),
      ntile(var4, 5) == 3 ~ paste0(round(quantile(var4, 0.4), 2), " to ", round(quantile(var4, 0.6), 2), " pcile 3"),
      ntile(var4, 5) == 4 ~ paste0(round(quantile(var4, 0.6), 2), " to ", round(quantile(var4, 0.8), 2), " pcile 4"),
      ntile(var4, 5) == 5 ~ paste0(round(quantile(var4, 0.8), 2), " to ", round(quantile(var4, 1), 2), " pcile 5")
    )
  )

编辑:如果从头开始,我会使用data.table的方法:

library(data.table)
# data.table approach: per (var5, var6) group, cut var3 into deciles and var4
# into quintiles, then store "<interval> <bin number>" labels by reference.
dt <- as.data.table(my_data)
dt[, c("class3", "class4") := {
  # The breaks start at the group's minimum. cut() builds (a, b] intervals by
  # default, so without include.lowest = TRUE the minimum value would fall
  # outside the first interval and be labelled NA.
  class3_cut <- cut(var3, quantile(var3, seq(0, 1, .1)), include.lowest = TRUE)
  class4_cut <- cut(var4, quantile(var4, seq(0, 1, .2)), include.lowest = TRUE)
  .(paste(class3_cut, as.integer(class3_cut)),
    paste(class4_cut, as.integer(class4_cut)))
},
by = .(var5, var6)]

最新更新