R语言 - 如何循环遍历多个变量,并根据给定字符位置上的多个子字符串值赋值为1?



我正在从SAS移动到R,我正试图将以下代码从SAS转换为R:

/* Create Drug_inj from the rows of `initial`; total_drug is set to 1 on rows where a diagnosis code matches one of the tests below. */
data Drug_inj;
set initial;
/* Group the 25 variables odiag1-odiag25 so each can be addressed as odiag(i). */
array odiag(25) odiag1-odiag25;
do i = 1 to 25;
/* Each test: the leading characters equal the given prefix AND the single character at position 6 is '1'. */
if substrn(odiag(i), 1,3) = 'T36' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,3) = 'T37' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,3) = 'T38' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,3) = 'T39' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
/* This test compares only the first 2 characters, so it covers every code beginning with 'T4'. */
if substrn(odiag(i), 1,2) = 'T4'  and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
if substrn(odiag(i), 1,3) = 'T50' and (substrn(odiag(i), 6,1) = '1') then total_drug = 1;
/* NOTE(review): this line has no position-6 test, and the trailing `== 1` looks unintended -- confirm against the original program. */
if substrn(odiag(i), 1,4) = 'T369' == 1 then total_drug = 1;
end;
/* NOTE(review): total_drug is never assigned any value other than 1 in this step, and no closing `run;` is visible in this snippet. */

就像SAS代码一样,我希望R代码检查变量"odiag1"到"odiag25",然后在满足条件的每一行上将变量"total_drug"赋值为"1"。具体条件是:字符串的前3个字符在"T36"到"T50"之间,并且第6个字符等于"1"。

到目前为止,我认为在R中执行循环的最简单方法之一是创建一个列表
# NOTE(review): this attempt is kept exactly as posted; it does not parse or
# run as written. The comments below mark why.
# The list elements are not separated by commas, and `etc.` is a placeholder
# rather than R code.
my_list <- list("odiag1" "odiag2" "odiag3" "odiag4" etc. "odiag25")
# `for` needs a parenthesised header such as `for (i in seq_along(my_list))`;
# `length(my_list)` on its own is a single number, not a sequence.
for i in length(my_list))
{
# `Drug_inj$i` looks up a column literally named "i"; a column whose name is
# held in a variable has to be read with `Drug_inj[[...]]`.
# The comparisons (`3 == "T36"`, `1 == "1"`) are written inside substr() as its
# third argument instead of being applied to its result, so substr() returns
# character values and `&` on them raises the "numeric, logical or complex
# types" error. substr() also takes start/stop positions, not start/length,
# and ifelse() is missing its `yes`/`no` arguments.
Drug_inj$total_drug = ifelse(substr(Drug_inj$i, 1,3 == "T36") & substr(Drug_inj$i, 6,1 == "1")) == 1
ifelse(substr(Drug_inj$i, 1,3 == "T37") & substr(Drug_inj$i, 6,1 == "1")) == 1
ifelse(substr(Drug_inj$i, 1,3 == "T38") & substr(Drug_inj$i, 6,1 == "1")) == 1
ifelse(substr(Drug_inj$i, 1,3 == "T39") & substr(Drug_inj$i, 6,1 == "1")) == 1
ifelse(substr(Drug_inj$i, 1,2 == "T4") & substr(Drug_inj$i, 6,1 == "1")) == 1
ifelse(substr(Drug_inj$i, 1,3 == "T50") & substr(Drug_inj$i, 6,1 == "1")) == 1
ifelse(substr(Drug_inj$i, 1,4 == "T369")) == 1}

我得到这个错误:"substr(pdd_master$i, 1,3 == "T36") & substr(pdd_master$i, 6, : 只能对数字、逻辑或复数类型进行操作"

我使用的代码是从同事给我的一些代码修改的。不幸的是,他们的代码只需要前3或4个字符,所以他们使用了以下代码:

# Join odiag1-odiag25 into one space-separated string per row, stored in
# `all_causes`; `remove = FALSE` keeps the original diagnosis columns.
pdd_master <- pdd_master %>%
  unite(
    all_causes,
    all_of(paste0("odiag", 1:25)),
    sep = " ",
    remove = FALSE
  )

# Flag a row with 1 when its combined string contains any of the prefixes
# T36, T37, T38, T39, T4 or T50, and with 0 otherwise. A single alternation
# replaces the nested ifelse() chain; the former "T3691" test is already
# covered by "T36", so it needs no separate entry.
pdd_master$total_drug_unint <- as.numeric(
  str_detect(pdd_master$all_causes, "T3[6-9]|T4|T50")
)

由于他们的版本只需要查看前2或3个位置的字符,而不需要同时查看第6个位置的字符,我认为我需要写一个更传统的循环。

使用dplyr::if_any(),您可以使用tidyselect语法跨多个列应用测试。还可以通过测试起始子字符串是否在paste0("T", 36:50)范围内,将所有测试合并为一个测试:
library(dplyr)
# Flag each row with total_drug = 1 when any odiag<number> column holds a code
# that either starts with "T36" through "T50" and has "1" as its 6th
# character, or starts with "T369"; all other rows get 0.
# The result is printed, not stored: assign it
# (`Drug_inj <- Drug_inj %>% ...`) to keep the new column.
Drug_inj %>%
  mutate(
    total_drug = as.integer(
      if_any(
        # Select every odiag<number> column by name, so the same code works
        # for a 5-column sample and for the full odiag1-odiag25 set.
        .cols = matches("^odiag[0-9]+$"),
        # %in% never returns NA, so a missing diagnosis counts as "no match"
        # instead of turning the whole row's flag into NA. The 6th character
        # is compared with the string "1" rather than the number 1.
        .fns = ~ (substr(.x, 1, 3) %in% paste0("T", 36:50) &
          substr(.x, 6, 6) %in% "1") |
          substr(.x, 1, 4) %in% "T369"
      )
    )
  )
id odiag1 odiag2 odiag3 odiag4 odiag5 total_drug
1   1 T69880 T48900 T15200 T19781 T96201          0
2   2 T17160 T57341 T77861 T11291 T54481          0
3   3 T58691 T23971 T98041 T70501 T44780          0
4   4 T19430 T69631 T86840 T94860 T21231          0
5   5 T90850 T73650 T59201 T27471 T24791          0
6   6 T36911 T57890 T20900 T33501 T78321          1
7   7 T94121 T43891 T54210 T83670 T73520          1
8   8 T53430 T93100 T71920 T40301 T29870          1
9   9 T71301 T75980 T83571 T66510 T73021          0
10 10 T85040 T42281 T31631 T82660 T98990          1
11 11 T80390 T66010 T91921 T61350 T68470          0
12 12 T69930 T24641 T91030 T82221 T43860          0
13 13 T85660 T39360 T54991 T28981 T64351          0
14 14 T99820 T88390 T88320 T65480 T17440          0
15 15 T40760 T36190 T44520 T27561 T99881          0
16 16 T28401 T69920 T97600 T75070 T42180          0
17 17 T66851 T55650 T28491 T45501 T97011          1
18 18 T88631 T27251 T37961 T67121 T57060          1
19 19 T30791 T57310 T88331 T79461 T37131          1
20 20 T62440 T81541 T65160 T68280 T41260          0

示例数据:

# Reproducible toy data: an `id` column plus five diagnosis columns
# (odiag1-odiag5), each value built as "T" + a 4-digit number + a 0/1 digit.
set.seed(13)
Drug_inj <- data.frame(id = seq_len(20))
for (col_name in paste0("odiag", seq_len(5))) {
  # Draw the 4-digit part first and the trailing digit second; the order of
  # the two sample() calls determines the generated values.
  code_digits <- sample(1000:9999, size = 20)
  flag_digit <- sample(0:1, size = 20, replace = TRUE)
  Drug_inj[[col_name]] <- paste0("T", code_digits, flag_digit)
}

最新更新