R For Loop and If-else data.table



我被困在我试图创建的for循环上。示例数据集如下:

ex <- structure(list(person_id = c("79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", 
"79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", "79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", 
"79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", "8b6ea77b-e694-48fb-a9e9-ca8bf1accc65", 
"8b6ea77b-e694-48fb-a9e9-ca8bf1accc65", "8b6ea77b-e694-48fb-a9e9-ca8bf1accc65", 
"8b6ea77b-e694-48fb-a9e9-ca8bf1accc65"), prs_nat_key = c("8240588160001", 
"8240588160001", "8240588160001", "8240588160001", "106705689", 
"106705689", "106705689", "106705689"), serv_from_dt = structure(c(18262, 
18262, 18262, 18262, 18278, 18278, 18278, 18278), class = "Date"), 
serv_to_dt = structure(c(18262, 18262, 18262, 18265, 18282, 
18282, 18299, 18299), class = "Date"), new_pos = c("IP", 
"IP", "IP", "IP", "IP", "IP", "IP", "IP"), days_diff = c(0, 
0, 0, 3, 4, 4, 21, 21)), row.names = c(NA, -8L), class = c("data.table", 
"data.frame"))

我正在尝试创建一个名为start_date的新列。该列将基于每个person_id的serv_from_dt和serv_to_dt日期创建。到目前为止,我这样做的方法如下:

找到serv_from_dt和serv_to_dt之间的日期差异大于0的唯一serv_from_dt的每个person_id(我们就叫它diff_date);如果,按行计算,serv_frm_dt是>= person_id的最大唯一的diff_date, serv_to_dt <= person_id的最大唯一的diff_date,那么标记为唯一的diff_date。到目前为止我有这个:

values=ex[,.(uniqueN(sort(unique(serv_to_dt[ex$days_diff>0]), TRUE))), person_id]
n = as.numeric(values[,1])
m = as.numeric(values[,2])
for (i in m){
ex[,`:=`(min_start = fifelse((serv_to_dt<= sort(unique(serv_to_dt[ex$days_diff>0]), TRUE)[1] & 
serv_from_dt>= sort(unique(serv_from_dt[ex$days_diff>0]))[1]), 
sort(unique(serv_from_dt[ex$days_diff>0]))[1], fifelse((serv_to_dt<= sort(unique(serv_to_dt[ex$days_diff>0]), TRUE)[i] & 
               serv_from_dt>= sort(unique(serv_from_dt[ex$days_diff>0]))[i]), 
            sort(unique(serv_from_dt[ex$days_diff>0]))[i], serv_from_dt)),
max_end = fifelse((serv_to_dt<= sort(unique(serv_to_dt[ex$days_diff>0]), TRUE)[1] & 
serv_from_dt>= sort(unique(serv_from_dt[ex$days_diff>0]))[1]), 
sort(unique(serv_to_dt[ex$days_diff>0]), TRUE)[1], fifelse((serv_to_dt<= sort(unique(serv_to_dt[ex$days_diff>0]), TRUE)[i] & 
                   serv_from_dt>= sort(unique(serv_from_dt[ex$days_diff>0]))[i]), 
                sort(unique(serv_to_dt[ex$days_diff>0]), TRUE)[i], serv_from_dt))), prs_nat_key]
}

上面的代码给了我我想要的,但我不知道如何扩展这个更大的数据集与多个person_ids和多个day_diffs。我希望代码是这样的,如果serv_frm/serv_to_dts在最大唯一的diff_date之间不成立,循环到下一个唯一的diff_date。在这种情况下,两个person_id只有一个唯一的diff_date(所以m = 1),但我想更新代码以在m>1. 我也试过用base R来做,但总是出错:

for(j in 1:m){

ex[, min_start := if((serv_to_dt<= sort(unique(serv_to_dt[ex$days_diff>0]), TRUE)[j] & 
serv_from_dt>= sort(unique(serv_from_dt[ex$days_diff>0]))[j])) sort(unique(serv_from_dt[ex$days_diff>0]))[j]]
j = j+ 1

}

任何帮助都将非常感激。

我的最终目标是创建两个名为min_start和max_end的新列。我意识到不用ifelse语句,我可以做一个连接。下面是我使用一个稍大的示例数据集的步骤:

ex <- structure(list(person_id = c("79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", 
"79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", "79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", 
"79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", "79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", 
"79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", "79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", 
"79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8", "8b6ea77b-e694-48fb-a9e9-ca8bf1accc65", 
"8b6ea77b-e694-48fb-a9e9-ca8bf1accc65", "8b6ea77b-e694-48fb-a9e9-ca8bf1accc65", 
"8b6ea77b-e694-48fb-a9e9-ca8bf1accc65"), prs_nat_key = c("8240588160001", 
"8240588160001", "8240588160001", "8240588160001", "8240588160001", 
"8240588160001", "8240588160001", "8240588160001", "106705689", 
"106705689", "106705689", "106705689"), serv_from_dt = structure(c(18262, 
18262, 18262, 18262, 18275, 18275, 18275, 18275, 18278, 18278, 
18278, 18278), class = "Date"), serv_to_dt = structure(c(18262, 
18262, 18262, 18265, 18275, 18278, 18278, 18278, 18282, 18282, 
18299, 18299), class = "Date"), new_pos = c("IP", "IP", "IP", 
"IP", "IP", "IP", "IP", "IP", "IP", "IP", "IP", "IP"), days_diff = c(0, 
0, 0, 3, 0, 3, 3, 3, 4, 4, 21, 21)), row.names = c(NA, -12L), class = c("data.table", 
"data.frame"))

为每个人创建一个只有唯一开始/结束日期的新数据帧:

date_period <- ex[, .(unique_start = unique(serv_from_dt[days_diff>0]),
unique_end = unique(serv_to_dt[days_diff>0])), prs_nat_key][order(prs_nat_key,unique_start,-unique_end),]
date_period %<>% distinct(prs_nat_key, unique_start, .keep_all = TRUE) %>% setDT()
如果date_period$prs_nat_key = ex$prs_nat_key &$serv_from_dt>= date_period$unique_start &$serv_from_dt <= date_period$unique_end &$serv_to_dt>= date_period$unique_start &Ex $serv_to_dt <= date_period$unique_end
ex[, c("start_date", "end_date") := 
date_period[ex, # join
.(unique_start, unique_end),
on = .(unique_start < serv_from_dt,
unique_start < serv_to_dt,
unique_end > serv_to_dt,
unique_end > serv_from_dt,
prs_nat_key = prs_nat_key)]]

我从这个问题中发现——>data.table中的条件连接?

不确定最终结果应该是什么,但看起来太复杂了。例如,您创建的date_period表可以这样做:

ex[, .(unique_start = first(serv_from_dt), unique_end = last(serv_to_dt)), by = c("prs_nat_key", "serv_from_dt")]
#      prs_nat_key serv_from_dt unique_start unique_end
# 1: 8240588160001   2020-01-01   2020-01-01 2020-01-04
# 2: 8240588160001   2020-01-14   2020-01-14 2020-01-17
# 3:     106705689   2020-01-17   2020-01-17 2020-02-07

似乎你试图重新连接它回到原来的表,也许这是你想要的。是的,这是您发布的原始表所需要的所有内容。

ex[, `:=` (start_date = first(serv_from_dt), end_date = last(serv_to_dt)), by = c("prs_nat_key", "serv_from_dt")]
#                                person_id   prs_nat_key serv_from_dt serv_to_dt new_pos days_diff start_date   end_date
#  1: 79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8 8240588160001   2020-01-01 2020-01-01      IP         0 2020-01-01 2020-01-04
#  2: 79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8 8240588160001   2020-01-01 2020-01-01      IP         0 2020-01-01 2020-01-04
#  3: 79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8 8240588160001   2020-01-01 2020-01-01      IP         0 2020-01-01 2020-01-04
#  4: 79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8 8240588160001   2020-01-01 2020-01-04      IP         3 2020-01-01 2020-01-04
#  5: 79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8 8240588160001   2020-01-14 2020-01-14      IP         0 2020-01-14 2020-01-17
#  6: 79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8 8240588160001   2020-01-14 2020-01-17      IP         3 2020-01-14 2020-01-17
#  7: 79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8 8240588160001   2020-01-14 2020-01-17      IP         3 2020-01-14 2020-01-17
#  8: 79d8c6ee-62f4-4a09-a31e-a3d1a48d79a8 8240588160001   2020-01-14 2020-01-17      IP         3 2020-01-14 2020-01-17
#  9: 8b6ea77b-e694-48fb-a9e9-ca8bf1accc65     106705689   2020-01-17 2020-01-21      IP         4 2020-01-17 2020-02-07
# 10: 8b6ea77b-e694-48fb-a9e9-ca8bf1accc65     106705689   2020-01-17 2020-01-21      IP         4 2020-01-17 2020-02-07
# 11: 8b6ea77b-e694-48fb-a9e9-ca8bf1accc65     106705689   2020-01-17 2020-02-07      IP        21 2020-01-17 2020-02-07
# 12: 8b6ea77b-e694-48fb-a9e9-ca8bf1accc65     106705689   2020-01-17 2020-02-07      IP        21 2020-01-17 2020-02-07