r语言 - 即使缺少数据点,如何显示具有特定顺序(月-年)的宽表?


> df_1
# A tibble: 47 x 3
# Groups:   therapy_class [9]
therapy_class             Year_month count
<ord>                     <yearmon>  <int>
1 ALK Inhibitors            Dec 2019      16
2 ALK Inhibitors            Jan 2020      14
3 ALK Inhibitors            Feb 2020      14
4 ALK Inhibitors            Mar 2020      22
5 ALK Inhibitors            Apr 2020      13
6 ALK Inhibitors            May 2020      17
7 Anti-VEGF-based therapies Dec 2019      33
8 Anti-VEGF-based therapies Jan 2020      35
9 Anti-VEGF-based therapies Feb 2020      36
10 Anti-VEGF-based therapies Mar 2020      20
# … with 37 more rows

A tibble: 10 x 7
therapy_class                    `Dec 2019`         `Jan 2020`         `Feb 2020`        `Mar 2020`        `Apr 2020`        `May 2020`       
<ord>                            <chr>              <chr>              <chr>             <chr>             <chr>             <chr>            
1 ALK Inhibitors                   "16 <br>[2.7%]"    "14 <br>[2.0%]"    "14 <br>[2.2%]"   "22 <br>[3.3%]"   "13 <br>[2.1%]"   "17 <br>[3.4%]"  
2 Anti-VEGF-based therapies        "33 <br>[5.6%]"    "35 <br>[4.9%]"    "36 <br>[5.7%]"   "20 <br>[3.0%]"   "21 <br>[3.4%]"   "20 <br>[4.0%]"  
3 EGFR TKIs                        "52 <br>[8.8%]"    "57 <br>[8.0%]"    "60 <br>[9.5%]"   "52 <br>[7.8%]"   "56 <br>[9.2%]"   "49 <br>[9.8%]"  
4 EGFR-antibody based therapies    ""                 ""                 ""                ""                ""                ""               
5 Non-platinum-based chemotherapy… "1 <br>[0.2%]"     "4 <br>[0.6%]"     "4 <br>[0.6%]"    ""                "1 <br>[0.2%]"    ""               
6 IO-based therapies               "308 <br>[52.0%]"  "385 <br>[54.0%]"  "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]"
7 Platinum-based chemotherapy com… "123 <br>[20.8%]"  "147 <br>[20.6%]"  "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]"
8 Single agent chemotherapies      "29 <br>[4.9%]"    "33 <br>[4.6%]"    "17 <br>[2.7%]"   "28 <br>[4.2%]"   "25 <br>[4.1%]"   "22 <br>[4.4%]"  
9 Other                            "30 <br>[5.1%]"    "38 <br>[5.3%]"    "42 <br>[6.7%]"   "33 <br>[4.9%]"   "31 <br>[5.1%]"   "21 <br>[4.2%]"  
10 <strong>Total</strong>           "<strong>592</str… "<strong>713</str… "<strong>631</st… "<strong>668</st… "<strong>612</st… "<strong>501</st…

> df_2
# A tibble: 46 x 3
# Groups:   therapy_class [9]
therapy_class             Year_month count
<ord>                     <yearmon>  <int>
1 ALK Inhibitors            Dec 2019      16
2 ALK Inhibitors            Feb 2020      14
3 ALK Inhibitors            Mar 2020      22
4 ALK Inhibitors            Apr 2020      13
5 ALK Inhibitors            May 2020      17
6 Anti-VEGF-based therapies Dec 2019      33
7 Anti-VEGF-based therapies Jan 2020      35
8 Anti-VEGF-based therapies Feb 2020      36
9 Anti-VEGF-based therapies Mar 2020      20
10 Anti-VEGF-based therapies Apr 2020      21
# … with 36 more rows
> t2
# A tibble: 10 x 7
therapy_class                    `Dec 2019`         `Feb 2020`         `Mar 2020`        `Apr 2020`        `May 2020`        `Jan 2020`       
<ord>                            <chr>              <chr>              <chr>             <chr>             <chr>             <chr>            
1 ALK Inhibitors                   "16 <br>[2.7%]"    "14 <br>[2.2%]"    "22 <br>[3.3%]"   "13 <br>[2.1%]"   "17 <br>[3.4%]"   ""               
2 Anti-VEGF-based therapies        "33 <br>[5.6%]"    "36 <br>[5.7%]"    "20 <br>[3.0%]"   "21 <br>[3.4%]"   "20 <br>[4.0%]"   "35 <br>[5.0%]"  
3 EGFR TKIs                        "52 <br>[8.8%]"    "60 <br>[9.5%]"    "52 <br>[7.8%]"   "56 <br>[9.2%]"   "49 <br>[9.8%]"   "57 <br>[8.2%]"  
4 EGFR-antibody based therapies    ""                 ""                 ""                ""                ""                ""               
5 Non-platinum-based chemotherapy… "1 <br>[0.2%]"     "4 <br>[0.6%]"     ""                "1 <br>[0.2%]"    ""                "4 <br>[0.6%]"   
6 IO-based therapies               "308 <br>[52.0%]"  "330 <br>[52.3%]"  "379 <br>[56.7%]" "345 <br>[56.4%]" "265 <br>[52.9%]" "385 <br>[55.1%]"
7 Platinum-based chemotherapy com… "123 <br>[20.8%]"  "128 <br>[20.3%]"  "134 <br>[20.1%]" "120 <br>[19.6%]" "107 <br>[21.4%]" "147 <br>[21.0%]"
8 Single agent chemotherapies      "29 <br>[4.9%]"    "17 <br>[2.7%]"    "28 <br>[4.2%]"   "25 <br>[4.1%]"   "22 <br>[4.4%]"   "33 <br>[4.7%]"  
9 Other                            "30 <br>[5.1%]"    "42 <br>[6.7%]"    "33 <br>[4.9%]"   "31 <br>[5.1%]"   "21 <br>[4.2%]"   "38 <br>[5.4%]"  
10 <strong>Total</strong>           "<strong>592</str… "<strong>631</str… "<strong>668</st… "<strong>612</st… "<strong>501</st… "<strong>699</st…
> 

我试图创建一个宽表计数和百分比从长表。这些列是"月","年",需要按顺序排列。我的问题是,当第一组(ALK抑制剂)的某些"月年"缺失时,列的顺序就会中断。缺少的"月年"放在末尾。而且长桌子不是固定的桌子。它是由用户选择月份和年份范围的函数生成的。因此,Year_month列可以有任何范围。

在这个例子中,我使用了2019年12月到2020年5月的6个月范围。"df_1"有所有6个月,所以所产生的宽表是预期的。"df_2"2020年1月缺少ALK抑制剂。因此,结果表的末尾是'Jan 2020'。

这是我用来生成宽表的代码:

df_2 %>%
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")), list(
~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]", 100 * (. / sum(., na.rm = TRUE)))))
))

下面是样本数据df_2

structure(list(therapy_class = structure(c(1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 5L, 5L, 5L, 
5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 
8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors", 
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
"Non-platinum-based chemotherapy combinations", "IO-based therapies", 
"Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
"Other"), class = c("ordered", "factor")), Year_month = structure(c(2019.91666666667, 
2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333, 
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25, 
2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 
2020.25, 2020.33333333333, NA, 2019.91666666667, 2020, 2020.08333333333, 
2020.25, 2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 
2020.25, 2020.33333333333, 2019.91666666667, 2020, 2020.08333333333, 
2020.16666666667, 2020.25, 2020.33333333333, 2019.91666666667, 
2020, 2020.08333333333, 2020.16666666667, 2020.25, 2020.33333333333, 
2019.91666666667, 2020, 2020.08333333333, 2020.16666666667, 2020.25, 
2020.33333333333), class = "yearmon"), count = c(16L, 14L, 22L, 
13L, 17L, 33L, 35L, 36L, 20L, 21L, 20L, 52L, 57L, 60L, 52L, 56L, 
49L, NA, 1L, 4L, 4L, 1L, 308L, 385L, 330L, 379L, 345L, 265L, 
123L, 147L, 128L, 134L, 120L, 107L, 29L, 33L, 17L, 28L, 25L, 
22L, 30L, 38L, 42L, 33L, 31L, 21L)), row.names = c(NA, -46L), groups = structure(list(
therapy_class = structure(1:9, .Label = c("ALK Inhibitors", 
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
"Non-platinum-based chemotherapy combinations", "IO-based therapies", 
"Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
"Other"), class = c("ordered", "factor")), .rows = structure(list(
1:5, 6:11, 12:17, 18L, 19:22, 23:28, 29:34, 35:40, 41:46), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

这是一个新的数据集,其中包含月/季度数据

df <- structure(list(therapy_class = structure(c(1L, 1L, 1L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 4L, 5L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 
7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L), .Label = c("ALK Inhibitors", 
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
"Non-platinum-based chemotherapy combinations", "IO-based therapies", 
"Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
"Other"), class = c("ordered", "factor")), quarter = structure(c(2020.75, 
2021, 2021.25, 2020.5, 2020.75, 2021.25, 2020.5, 2020.75, 2021, 
2021.25, NA, NA, 2020.5, 2020.75, 2021, 2021.25, 2021.5, 2020.5, 
2020.75, 2021, 2021.25, 2021.5, 2020.5, 2020.75, 2021, 2021.25, 
2021.5, 2020.5, 2020.75, 2021, 2021.25, 2021.5), class = "yearqtr"), 
count = c(4L, 2L, 2L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, NA, NA, 
252L, 184L, 190L, 180L, 120L, 59L, 46L, 63L, 47L, 25L, 4L, 
4L, 1L, 2L, 1L, 16L, 12L, 21L, 21L, 11L)), row.names = c(NA, 
-32L), groups = structure(list(therapy_class = structure(1:9, .Label = c("ALK Inhibitors", 
"Anti-VEGF-based therapies", "EGFR TKIs", "EGFR-antibody based therapies", 
"Non-platinum-based chemotherapy combinations", "IO-based therapies", 
"Platinum-based chemotherapy combinations", "Single agent chemotherapies", 
"Other"), class = c("ordered", "factor")), .rows = structure(list(
1:3, 4:6, 7:10, 11L, 12L, 13:17, 18:22, 23:27, 28:32), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

——由于缺少数据,季度日期顺序颠倒

therapy_class                                `2020 Q4`      `2021 Q1`      `2021 Q2`      `2020 Q3`       `NA` `2021 Q3`     
<ord>                                        <chr>          <chr>          <chr>          <chr>          <int> <chr>         
1 ALK Inhibitors                               "4 [1.58%]"    "2 [0.72%]"    "2 [0.78%]"    ""                NA ""            
2 Anti-VEGF-based therapies                    "2 [0.79%]"    ""             "3 [1.17%]"    "3 [0.90%]"       NA ""            
3 EGFR TKIs                                    "1 [0.40%]"    "1 [0.36%]"    "1 [0.39%]"    "1 [0.30%]"       NA ""            
4 EGFR-antibody based therapies                ""             ""             ""             ""                NA ""            
5 Non-platinum-based chemotherapy combinations ""             ""             ""             ""                NA ""            
6 IO-based therapies                           "184 [72.73%]" "190 [68.35%]" "180 [70.31%]" "252 [75.22%]"    NA "120 [76.43%]"
7 Platinum-based chemotherapy combinations     "46 [18.18%]"  "63 [22.66%]"  "47 [18.36%]"  "59 [17.61%]"     NA "25 [15.92%]" 
8 Single agent chemotherapies                  "4 [1.58%]"    "1 [0.36%]"    "2 [0.78%]"    "4 [1.19%]"       NA "1 [0.64%]"   
9 Other                                        "12 [4.74%]"   "21 [7.55%]"   "21 [8.20%]"   "16 [4.78%]"      NA "11 [7.01%]"  
10 Total                                        "253"          "278"          "256"          "335"              0 "157" 

一个选项是在执行pivot_wider之前先使用complete创建缺少的年份月份。对于pivot_wider,默认顺序是基于顺序

中唯一值的外观。
library(dplyr)
library(tidyr)
library(zoo)
df_2 %>%
ungroup %>% 
mutate(Year_month = as.Date(Year_month)) %>% 
complete(therapy_class, Year_month =  seq(from = min(Year_month, 
na.rm = TRUE), to = max(Year_month, na.rm = TRUE),
by = '1 month')) %>% 
mutate(Year_month = as.yearmon(Year_month)) %>% 
pivot_wider(names_from = Year_month, values_from = count) %>%
ungroup() %>%
mutate_at(.vars = vars(contains("20")),
list(
~ ifelse(is.na(.), "", paste(., sprintf("<br>[%1.1f%%]",
100 * (. / sum(., na.rm = TRUE)))))
))

与产出

# A tibble: 9 × 8
therapy_class                                `Dec 2019`        `Jan 2020`        `Feb 2020`        `Mar 2020`        `Apr 2020`     `May 2020`     `NA`
<ord>                                        <chr>             <chr>             <chr>             <chr>             <chr>          <chr>         <int>
1 ALK Inhibitors                               "16 <br>[2.7%]"   ""                "14 <br>[2.2%]"   "22 <br>[3.3%]"   "13 <br>[2.1%… "17 <br>[3.4…    NA
2 Anti-VEGF-based therapies                    "33 <br>[5.6%]"   "35 <br>[5.0%]"   "36 <br>[5.7%]"   "20 <br>[3.0%]"   "21 <br>[3.4%… "20 <br>[4.0…    NA
3 EGFR TKIs                                    "52 <br>[8.8%]"   "57 <br>[8.2%]"   "60 <br>[9.5%]"   "52 <br>[7.8%]"   "56 <br>[9.2%… "49 <br>[9.8…    NA
4 EGFR-antibody based therapies                ""                ""                ""                ""                ""             ""               NA
5 Non-platinum-based chemotherapy combinations "1 <br>[0.2%]"    "4 <br>[0.6%]"    "4 <br>[0.6%]"    ""                "1 <br>[0.2%]" ""               NA
6 IO-based therapies                           "308 <br>[52.0%]" "385 <br>[55.1%]" "330 <br>[52.3%]" "379 <br>[56.7%]" "345 <br>[56.… "265 <br>[52…    NA
7 Platinum-based chemotherapy combinations     "123 <br>[20.8%]" "147 <br>[21.0%]" "128 <br>[20.3%]" "134 <br>[20.1%]" "120 <br>[19.… "107 <br>[21…    NA
8 Single agent chemotherapies                  "29 <br>[4.9%]"   "33 <br>[4.7%]"   "17 <br>[2.7%]"   "28 <br>[4.2%]"   "25 <br>[4.1%… "22 <br>[4.4…    NA
9 Other                                        "30 <br>[5.1%]"   "38 <br>[5.4%]"   "42 <br>[6.7%]"   "33 <br>[4.9%]"   "31 <br>[5.1%… "21 <br>[4.2…    NA

With new data

df %>% 
ungroup %>% 
mutate(Year_month = as.Date(quarter)) %>% 
complete(therapy_class, 
Year_month = seq(from = min(Year_month,na.rm = TRUE ), 
to = max(Year_month, na.rm = TRUE), by = '1 quarter')) %>% 
mutate(Year_month = as.yearqtr(Year_month))  %>% 
pivot_wider(names_from = Year_month, values_from = count) %>%  
ungroup() %>%   
mutate_at(.vars = vars(contains("20")), 
list(     ~ ifelse(is.na(.), "", 
paste(., sprintf("<br>[%1.1f%%]",   
100 * (. / sum(., na.rm = TRUE)))))   ))

与产出

# A tibble: 35 × 8
therapy_class             quarter   `2020 Q3`      `2020 Q4`      `2021 Q1`      `2021 Q2`      `2021 Q3`  `NA`
<ord>                     <yearqtr> <chr>          <chr>          <chr>          <chr>          <chr>     <int>
1 ALK Inhibitors            <NA>      ""             ""             ""             ""             ""           NA
2 ALK Inhibitors            2020 Q4   ""             "4 <br>[1.6%]" ""             ""             ""           NA
3 ALK Inhibitors            2021 Q1   ""             ""             "2 <br>[0.7%]" ""             ""           NA
4 ALK Inhibitors            2021 Q2   ""             ""             ""             "2 <br>[0.8%]" ""           NA
5 Anti-VEGF-based therapies 2020 Q3   "3 <br>[0.9%]" ""             ""             ""             ""           NA
6 Anti-VEGF-based therapies 2020 Q4   ""             "2 <br>[0.8%]" ""             ""             ""           NA
7 Anti-VEGF-based therapies <NA>      ""             ""             ""             ""             ""           NA
8 Anti-VEGF-based therapies 2021 Q2   ""             ""             ""             "3 <br>[1.2%]" ""           NA
9 EGFR TKIs                 2020 Q3   "1 <br>[0.3%]" ""             ""             ""             ""           NA
10 EGFR TKIs                 2020 Q4   ""             "1 <br>[0.4%]" ""             ""             ""           NA
# … with 25 more rows

最新更新