R中长格式数据的描述性统计



我有重复测量(5次或更少)的数据,其中包括血压测量值。我已将数据整理为长格式,但由于这是我第一次这样做,我不知道该如何获得各变量的描述性统计量。

我的示例数据:

# Example long-format data: one row per participant (`id`) and visit (`time`).
# Cohort 1 (ids 1-3) has rows for time1, time2, time3 and time5; cohort 2
# (ids 4-9) has rows for time4 and time5 only.
# The id vector lists ids 4-9 twice each so that it matches the printed tibble
# and every (id, time) pair is unique.
questiondata <- structure(
  list(
    id = c(
      1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
      4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9
    ),
    time = c(
      "time1", "time2", "time3", "time5",
      "time1", "time2", "time3", "time5",
      "time1", "time2", "time3", "time5",
      "time4", "time5", "time4", "time5",
      "time4", "time5", "time4", "time5",
      "time4", "time5", "time4", "time5"
    ),
    cohort = c(
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    ),
    systolicBP = c(
      102, 137, 132, 150, 152, 146, 160.5, 159.5, 144, 138, 137.5, 163,
      137, 147, 125, 141, 147, 150, 148, 167.5, 153.5, 164.5, 159, 123
    ),
    diastolicBP = c(
      56, 99, 78, 90, 77, 78, 80.5, 82, 72, 70, 67.5, 61,
      86, 90, 80.5, 84, 75, 81, 91, 96, 80, 87.5, 87, 79
    ),
    # eGFR was not recorded at every visit; missing values are NA.
    egfr = c(
      78.2, NA, 55.8, NA, NA, NA, 87.6, NA, NA, NA, 75.6, 70.9,
      71.9, 71.8, 47.9, 36.6, 93.7, 81.5, 93.2, 82.1, 92.9, 79.1, 66.6, 55.2
    )
  ),
  row.names = c(NA, -24L),
  class = c("tbl_df", "tbl", "data.frame")
)

其对应于以下tibble:


# A tibble: 24 x 6
id time  cohort systolicBP diastolicBP  egfr
<dbl> <chr>  <dbl>      <dbl>       <dbl> <dbl>
1 1 time1      1       102         56    78.2
2 1 time2      1       137         99    NA  
3 1 time3      1       132         78    55.8
4 1 time5      1       150         90    NA  
5 2 time1      1       152         77    NA  
6 2 time2      1       146         78    NA  
7 2 time3      1       160.        80.5  87.6
8 2 time5      1       160.        82    NA  
9 3 time1      1       144         72    NA  
10 3 time2      1       138         70    NA  
11 3 time3      1       138.        67.5  75.6
12 3 time5      1       163         61    70.9
13 4 time4      2       137         86    71.9
14 4 time5      2       147         90    71.8
15 5 time4      2       125         80.5  47.9
16 5 time5      2       141         84    36.6
17 6 time4      2       147         75    93.7
18 6 time5      2       150         81    81.5
19 7 time4      2       148         91    93.2
20 7 time5      2       168.        96    82.1
21 8 time4      2       154.        80    92.9
22 8 time5      2       164.        87.5  79.1
23 9 time4      2       159         87    66.6
24 9 time5      2       123         79    55.2

例如,现在我想

  • 获取每个队列中的人数
  • 对于每个队列和每个时间点,收缩压的平均值是多少

我相信这很容易,但我似乎无法得到任何可行的结果。

library(tidyverse)
# Per cohort: number of distinct participants (rows are visits, so count
# unique ids rather than rows) and the mean systolic BP over all visits.
questiondata %>%
  group_by(cohort) %>%
  summarise(
    n = n_distinct(id),
    mean_systolic = mean(systolicBP, na.rm = TRUE),
    .groups = "drop"
  )
#> # A tibble: 2 x 3
#>   cohort     n mean_systolic
#>    <dbl> <int>         <dbl>
#> 1      1     3          143.
#> 2      2     6          147.

# Mean systolic BP for every cohort / time-point combination present in the
# data; the result is returned ungrouped.
questiondata %>%
  group_by(cohort, time) %>%
  summarise(
    mean_systolic_time = mean(systolicBP, na.rm = TRUE),
    .groups = "drop"
  )
#> # A tibble: 6 x 3
#>   cohort time  mean_systolic_time
#>    <dbl> <chr>              <dbl>
#> 1      1 time1               133.
#> 2      1 time2               140.
#> 3      1 time3               143.
#> 4      1 time5               158.
#> 5      2 time4               145.
#> 6      2 time5               149.

由 reprex 包(v2.0.0)创建于 2021-06-25

# Same per-cohort, per-time mean as above, but rounded to a whole number and
# rendered as a character label with the unit appended (e.g. "133 mmHg").
questiondata %>%
  group_by(cohort, time) %>%
  summarise(
    mean_systolic_time = mean(systolicBP, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    mean_systolic_time = str_c(round(mean_systolic_time, digits = 0), " mmHg")
  )
# A tibble: 6 x 3
cohort time  mean_systolic_time
<dbl> <chr> <chr>             
1      1 time1 133 mmHg          
2      1 time2 140 mmHg          
3      1 time3 143 mmHg          
4      1 time5 158 mmHg          
5      2 time4 145 mmHg          
6      2 time5 149 mmHg

最新更新