我想要一些关于在R中创建格式化表的帮助——无论是使用普通的IDE还是R Markdown。我主要想做两件事:
- 根据不同列按组显示描述性统计数据(平均值、中位数、最小值、最大值(
- 根据总样本(未分组的数据(呈现描述性统计数据
样本数据:
df <- data.frame(Gender = c("F", "M", "F", "M", "M", "M", "M", "F", "M", "M"),
Young = c("Y", "N", "Y", "N", "Y", "N", "Y", "N", "Y", "N"),
Age = c("14", "25", "13", "24", "14", "25", "13", "24", "10", "26"),
Location = c("Suburb", "Rural", "Suburb", "Rural","Suburb", "Rural","Suburb", "Rural","Suburb", "Rural"))
预期结果
变量 | 平均值 | 中值最大值 | 最小值|
---|---|---|---|
性别 | |||
女性 | |||
男 | |||
位置 | |||
郊区 | |||
农村 | |||
总计 |
您可以通过获取长格式的数据来获得所需的所有信息。
library(dplyr)
library(tidyr)
df <- type.convert(df, as.is = TRUE)
df %>%
pivot_longer(cols = -Age) %>%
group_by(name, value) %>%
summarise(min_age = min(Age),
max_age = max(Age),
median_age = median(Age),
mean_age = mean(Age))
# name value min_age max_age median_age mean_age
# <chr> <chr> <int> <int> <int> <dbl>
#1 Gender F 13 24 14 17
#2 Gender M 10 26 24 19.6
#3 Location Rural 24 26 25 24.8
#4 Location Suburb 10 14 13 12.8
#5 Young N 24 26 25 24.8
#6 Young Y 10 14 13 12.8
使用data.table的类似答案:
> library(data.table)
> df <- data.frame(Gender = c("F", "M", "F", "M", "M", "M", "M", "F", "M", "M"),
+ Young = c("Y", "N", "Y", "N", "Y", "N", "Y", "N", "Y", "N"),
+ Age = c("14", "25", "13", "24", "14", "25", "13", "24",
+ "10", "26"),
+ Location = c("Suburb", "Rural", "Suburb",
+ "Rural","Suburb", "Rural","Suburb",
+ "Rural","Suburb", "Rural"))
> setDT(df) # make it a data.table
> df[,Age:=as.integer(Age)] # correct age column
> df[,.(mean=mean(Age), median=median(Age), max=max(Age), min=min(Age)),
+ by=.(Gender,Location)]
Gender Location mean median max min
1: F Suburb 13.5000 13.5 14 13
2: M Rural 25.0000 25.0 26 24
3: M Suburb 12.3333 13.0 14 10
4: F Rural 24.0000 24.0 24 24
>
或者,如果我们想一次按一个变量进行分层:
> df[,.(mean=mean(Age), median=median(Age), max=max(Age),min=min(Age)),
+ by=.(Gender)]
Gender mean median max min
1: F 17.0000 14 24 13
2: M 19.5714 24 26 10
> df[,.(mean=mean(Age), median=median(Age), max=max(Age), min=min(Age)),
+ by=.(Location)]
Location mean median max min
1: Suburb 12.8 13 14 10
2: Rural 24.8 25 26 24
>
灵感来自Ronak的好答案,就像数据表中的一行:
> melt(df, id.vars="Age")[, .(mean=mean(Age),
+ median=median(Age),
+ min=min(Age),
+ max=max(Age)), by=.(variable,value)]
variable value mean median min max
1: Gender F 17.0000 14 13 24
2: Gender M 19.5714 24 10 26
3: Young Y 12.8000 13 10 14
4: Young N 24.8000 25 24 26
5: Location Suburb 12.8000 13 10 14
6: Location Rural 24.8000 25 24 26
>
几个包都为此封装了函数。我通常使用{psych}包中的describe
。
library(tidyverse)
df <- data.frame(Gender = c("F", "M", "F", "M", "M", "M", "M", "F", "M", "M"),
Young = c("Y", "N", "Y", "N", "Y", "N", "Y", "N", "Y", "N"),
Age = c("14", "25", "13", "24", "14", "25", "13", "24", "10", "26"),
Location = c("Suburb", "Rural", "Suburb", "Rural","Suburb", "Rural","Suburb", "Rural","Suburb", "Rural"))
df_summary <- psych::describe(df)
df_summary
vars n mean sd median trimmed mad min max range skew kurtosis se
Gender* 1 10 1.7 0.48 2.0 1.75 0.00 1 2 1 -0.75 -1.57 0.15
Young* 2 10 1.5 0.53 1.5 1.50 0.74 1 2 1 0.00 -2.19 0.17
Age* 3 10 3.5 1.58 3.5 3.50 2.22 1 6 5 0.00 -1.42 0.50
Location* 4 10 1.5 0.53 1.5 1.50 0.74 1 2 1 0.00 -2.19 0.17
然后,您可以使用dplyr
来执行任何您想要的操作。
df_summary %>% select(mean, median, max, min)