假设我有一个包含以下合成数据的数据帧:
> set.seed(1)
>
> patient_id = seq(101, 150)
> admit_date = sample(seq(as.Date('2021/01/01'), as.Date('2021/01/10'), by="day"), 50, replace = T)
> release_date = sample(seq(as.Date('2021/01/11'), as.Date('2021/01/31'), by="day"), 50, replace = T)
> all = data.frame(patient_id, admit_date, release_date)
>
> all
patient_id admit_date release_date
1 101 2021-01-03 2021-01-21
2 102 2021-01-04 2021-01-29
.
.
.
49 149 2021-01-08 2021-01-28
50 150 2021-01-07 2021-01-23
在上面的示例中,发布日期在接收的数据之后。
我想有效地创建一个表来计算在给定日期出现的患者数量?
我有以下的解决方案,我想改进:
> date_range = sort(unique(c(all$admit_date, all$release_date)))
> count_vector = vector()
>
> for(i in 1:length(date_range)){
> current_date = date_range
> count_vector[i] = length(which((all$admit_date <= date_range[i]) &
> (all$release_date >= date_range[i])) )
> }
>
> date_count = data.frame(date_range, count_vector)
> date_count
date_range count_vector
1 2021-01-01 3
2 2021-01-02 7
3 2021-01-03 12
.
.
.
27 2021-01-29 7
28 2021-01-30 2
29 2021-01-31 1
此解决方案适用于小数据集,但是是否有可能改进我现有的解决方案以有效地获得大数据集所需的计数?
您可以创建一个包含每个患者日期序列的向量,并将其pivot (unnest):
set.seed(1)
patient_id = seq(101, 150)
admit_date = sample(seq(as.Date('2021/01/01'),
as.Date('2021/01/10'),
by="day"), 50, replace = T)
release_date = sample(seq(as.Date('2021/01/11'),
as.Date('2021/01/31'),
by="day"), 50, replace = T)
all = data.frame(patient_id, admit_date, release_date)
library(tidyverse)
date_pivot <- all %>%
as_tibble() %>%
mutate(seq_dates = purrr::map2(admit_date,
release_date,
function(x,y)
seq(x,y, by = 1))) %>%
tidyr::unnest(cols = seq_dates)
> date_pivot
# A tibble: 801 × 4
patient_id admit_date release_date seq_dates
<int> <date> <date> <date>
1 101 2021-01-05 2021-01-22 2021-01-05
2 101 2021-01-05 2021-01-22 2021-01-06
3 101 2021-01-05 2021-01-22 2021-01-07
4 101 2021-01-05 2021-01-22 2021-01-08
5 101 2021-01-05 2021-01-22 2021-01-09
6 101 2021-01-05 2021-01-22 2021-01-10
7 101 2021-01-05 2021-01-22 2021-01-11
8 101 2021-01-05 2021-01-22 2021-01-12
9 101 2021-01-05 2021-01-22 2021-01-13
这样就更容易计算每天的病人数量了,因为每天只有一个病人:
count_dates <- date_pivot %>%
group_by(d_date = seq_dates) %>%
summarise(n_pat = n_distinct(patient_id))
> count_dates
# A tibble: 31 × 2
d_date n_pat
<date> <int>
1 2021-01-01 3
2 2021-01-02 6
3 2021-01-03 9
4 2021-01-04 13
5 2021-01-05 19
6 2021-01-06 25
7 2021-01-07 31
8 2021-01-08 34
9 2021-01-09 42
这会给出与代码相同的输出:
> date_range = sort(unique(c(all$admit_date, all$release_date)))
> count_vector = vector()
> for(i in 1:length(date_range)){
+ current_date = date_range
+ count_vector[i] = length(which((all$admit_date <= date_range[i]) &
+ (all$release_date >= date_range[i])) )
+ }
> count_vector[1:9]
[1] 3 6 9 13 19 25 31 34 42
您可以在计数中考虑或不考虑admit_date和release_date:
count_dates_inside_visit <- date_pivot %>%
filter(seq_dates > admit_date, seq_dates < release_date) %>%
group_by(d_date = seq_dates) %>%
summarise(n_pat = n_distinct(patient_id))
您可以对每个日期的范围和count
发生情况使用fuzzyjoin连接两个数据集。
library(fuzzyjoin)
library(dplyr)
date_range = sort(unique(c(all$admit_date, all$release_date)))
data.frame(date_range) %>%
fuzzy_inner_join(all,
by = c('date_range' = 'admit_date',
'date_range' = 'release_date'),
match_fun = c(`>=`, `<=`)) %>%
count(date_range)
# date_range n
#1 2021-01-01 3
#2 2021-01-02 6
#3 2021-01-03 9
#4 2021-01-04 13
#5 2021-01-05 19
#6 2021-01-06 25
#7 2021-01-07 31
#8 2021-01-08 34
#9 2021-01-09 42
#10 2021-01-10 50
#11 2021-01-11 50
#...
#...