计算特定时间范围内变量的百分比



我正在尝试按月获取颜色的频率。最后,我想做一个折线图,每种颜色都有一条线,显示每个月它在所有颜色中所占的百分比。我的数据如下:

ID color   color2  color3  date
55 red     blue     NA     2020-03-15
67 yellow  NA       NA     2020-05-02
83 blue    yellow   NA     2020-05-17
78 red     yellow   blue   2020-12-15  
43 green   NA       NA     2021-01-27
29 yellow  green    NA     2021-01-03

我需要这样的东西,这样我就可以制作图表:


Month      freq_blue freq_red  freq_yellow  freq_green %_blue %_red %_yellow %_green
03-2020       1        1          0           0            50    50     0       0
04-2020       0        0          0           0            0     0      0       0
05-2020       1        0          2           0            33.3  0      66.7    0
06-2020       0        0          0           0            0     0      0       0
07-2020       0        0          0           0            0     0      0       0
08-2020       0        0          0           0            0     0      0       0
09-2020       0        0          0           0            0     0      0       0
10-2020       0        0          0           0            0     0      0       0
11-2020       0        0          0           0            0     0      0       0
12-2020       1        1          1           0            33.3  33.3  33.3     0
01-2021       0        0          1           2            0     0     33.3    66.7

我试图按月份分组,然后计算频率,但我遇到了一些麻烦,因为有多个color列。

这里有一个使用tidyverse(和lubridate(的解决方案。

首先,您可以使用pivot_longer将颜色转换为长格式。您可以删除缺少颜色/值的行。

然后,您可以在按月份、年份和颜色(值(分组后使用count计算频率。通过按月份和年份分组,可以计算比例(百分比(。

最后,如果需要,您可以使用pivot_wider将数据转换为宽格式。函数complete也用于填充缺失的月份和颜色。

library(tidyverse)
library(lubridate)
df$date <- as.Date(df$date)
df %>%
mutate(month = month(date), year = year(date)) %>%
pivot_longer(cols = starts_with("color")) %>%
filter(!is.na(value)) %>%
group_by(month, year, value) %>%
count() %>%
group_by(month, year) %>%
mutate(percent = n/sum(n)) %>%
ungroup() %>%
complete(year, month = 1:12, value = c("blue", "red", "yellow", "green"), fill = list(n = 0, percent = 0)) %>%
pivot_wider(id_cols = c(month, year), names_from = value, values_from = c(n, percent))

输出

# A tibble: 24 × 10
month  year n_blue n_green n_red n_yellow percent_blue percent_green percent_red percent_yellow
<dbl> <dbl>  <dbl>   <dbl> <dbl>    <dbl>        <dbl>         <dbl>       <dbl>          <dbl>
1     1  2020      0       0     0        0        0                 0         0            0    
2     2  2020      0       0     0        0        0                 0         0            0    
3     3  2020      1       0     1        0        0.5               0         0.5          0    
4     4  2020      0       0     0        0        0                 0         0            0    
5     5  2020      1       0     0        2        0.333             0         0            0.667
6     6  2020      0       0     0        0        0                 0         0            0    
7     7  2020      0       0     0        0        0                 0         0            0    
8     8  2020      0       0     0        0        0                 0         0            0    
9     9  2020      0       0     0        0        0                 0         0            0    
10    10  2020      0       0     0        0        0                 0         0            0    
# … with 14 more rows
df <- tibble(ID = c(55,67,83,78,43,29),
color = c("red", "yellow", "blue", "red", "green", "yellow"),
color2 = c("red", NA, "yellow", "yellow", NA, "green"),
color3 = c(NA, NA, NA, "blue", NA, NA),
date = c("2020-03-15", "2020-05-02", "2020-05-17", "2020-12-15","2021-01-27", "2021-01-03"))

dat <- df %>% mutate(month = as.yearmon(date)) %>% group_by(month)%>%
pivot_longer(cols = c(color, color2, color3)) %>% mutate(val = 1) %>% drop_na() %>%
select(month, value, val) %>% 
pivot_wider(id_cols= month, names_from=value, names_prefix = "freq_",values_from = val, values_fn = sum) %>% 
unnest(cols = c(freq_red, freq_blue, freq_yellow, freq_green)) %>% 
replace_na(list(freq_red =0, freq_blue = 0, freq_yellow = 0, freq_green = 0)) %>%
ungroup() %>% mutate_if(is.numeric, as.double) %>%mutate(`red_%` = round(freq_red/rowSums(.[2:5])*100,2),
`yellow_%` = round(freq_yellow/rowSums(.[2:5])*100,2),
`blue_%` = round(freq_blue/rowSums(.[2:5])*100,2),
`green_%` = round(freq_green/rowSums(.[2:5])*100,2))

最新更新