我想整理一个数据框架并自动化这个过程。给定以下data.frame
:
library(survival)
library(rms)
library(broom)
library(tidyverse)
res.cox <- coxph(Surv(time, status) ~ rcs(age, 3) + sex + ph.ecog +
rcs(meal.cal, 4), data = lung)
output <- tidy(res.cox)
output
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 3)age -0.00306 0.0219 -0.140 0.889
# 2 rcs(age, 3)age' 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 rcs(meal.cal, 4)meal.cal -0.000416 0.00104 -0.400 0.689
# 6 rcs(meal.cal, 4)meal.cal' 0.00118 0.00232 0.509 0.611
# 7 rcs(meal.cal, 4)meal.cal'' -0.00659 0.0114 -0.577 0.564
我想从term
变量中删除rcs样条信息,并留下:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age -0.00306 0.0219 -0.140 0.889
# 2 s2 age 0.0154 0.0261 0.592 0.554
# 3 sex -0.525 0.192 -2.74 0.00620
# 4 ph.ecog 0.421 0.131 3.22 0.00128
# 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
# 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
# 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
我希望这个解决方案也能很容易地适用于其他情况,所以当你增加结的数量时:
res.cox2 <- coxph(Surv(time, status) ~ rcs(age, 4) + rcs(meal.cal, 6) +
sex + ph.ecog, data = lung)
output2 <- tidy(res.cox2)
output2
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 rcs(age, 4)age 0.0419 0.0403 1.04 0.298
# 2 rcs(age, 4)age' -0.101 0.0806 -1.26 0.208
# 3 rcs(age, 4)age'' 0.569 0.388 1.47 0.142
# 4 rcs(meal.cal, 6)meal.cal -0.000974 0.00155 -0.631 0.528
# 5 rcs(meal.cal, 6)meal.cal' 0.00751 0.0115 0.655 0.512
# 6 rcs(meal.cal, 6)meal.cal'' -0.0217 0.0358 -0.607 0.544
# 7 rcs(meal.cal, 6)meal.cal''' 0.0614 0.123 0.501 0.616
# 8 rcs(meal.cal, 6)meal.cal'''' -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
你会得到:
# term estimate std.error statistic p.value
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 s1 age 0.0419 0.0403 1.04 0.298
# 2 s2 age -0.101 0.0806 -1.26 0.208
# 3 s3 age 0.569 0.388 1.47 0.142
# 4 s1 meal.cal -0.000974 0.00155 -0.631 0.528
# 5 s2 meal.cal 0.00751 0.0115 0.655 0.512
# 6 s3 meal.cal -0.0217 0.0358 -0.607 0.544
# 7 s4 meal.cal 0.0614 0.123 0.501 0.616
# 8 s5 meal.cal -0.0775 0.163 -0.475 0.634
# 9 sex -0.552 0.195 -2.83 0.00465
# 10 ph.ecog 0.440 0.132 3.34 0.000835
等等……
到目前为止,我的尝试让我得到了一些方式,但我不确定处理'
,''
(注意第一项不包含'
)等的最佳方法:
output %>%
mutate(rcs_indicator = str_detect(term, fixed("rcs(")),
term = str_replace_all(term, "rcs\(.+?\)", ""))
# term estimate std.error statistic p.value rcs_indicator
# <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 age -0.00306 0.0219 -0.140 0.889 TRUE
# 2 age' 0.0154 0.0261 0.592 0.554 TRUE
# 3 sex -0.525 0.192 -2.74 0.00620 FALSE
# 4 ph.ecog 0.421 0.131 3.22 0.00128 FALSE
# 5 meal.cal -0.000416 0.00104 -0.400 0.689 TRUE
# 6 meal.cal' 0.00118 0.00232 0.509 0.611 TRUE
# 7 meal.cal'' -0.00659 0.0114 -0.577 0.564 TRUE
直接处理我需要更改的术语可能会很有用:
unique(str_subset(output$term, fixed("rcs(")) %>%
str_replace_all("'", ""))
# [1] "rcs(age, 3)age" "rcs(meal.cal, 4)meal.cal"
我觉得有一种比我现在所做的步骤更简单的方法。
有什么建议吗?
感谢这个有点笨拙,但应该可以工作:
library(dplyr)
library(stringr)
output %>%
group_by(group =str_extract(term, 'rcs\(.')) %>%
mutate(row = row_number()) %>%
mutate(term = str_replace_all(term, 'rcs\(', paste0("s",row, " "))) %>%
mutate(term = ifelse(str_detect(term, 's\d'),
str_extract(term, '.\d\s.*\s'), term)) %>%
mutate(term = str_trim(term)) %>%
mutate(term = str_replace_all(term, '\,', '')) %>%
ungroup() %>%
select(-c(group, row))
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 s1 age -0.00306 0.0219 -0.140 0.889
2 s2 age 0.0154 0.0261 0.592 0.554
3 sex -0.525 0.192 -2.74 0.00620
4 ph.ecog 0.421 0.131 3.22 0.00128
5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
6 s2 meal.cal 0.00118 0.00232 0.509 0.611
7 s3 meal.cal -0.00659 0.0114 -0.577 0.564
这也没有期望的那么优雅,但应该可以用于多个结
output %>%
mutate(is_spline = grepl("^rcs\(.*?, \d\)", term),
n_term = str_count(term, "'") + 1,
pre = ifelse(is_spline, paste0('s', n_term, ' '), ""),
term = paste0(pre, gsub("(^rcs\(.*?, \d\))|(\'+$)", "", term))) %>%
select(-is_spline, -n_term, -pre)
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 s1 age -0.00306 0.0219 -0.140 0.889
#> 2 s2 age 0.0154 0.0261 0.592 0.554
#> 3 sex -0.525 0.192 -2.74 0.00620
#> 4 ph.ecog 0.421 0.131 3.22 0.00128
#> 5 s1 meal.cal -0.000416 0.00104 -0.400 0.689
#> 6 s2 meal.cal 0.00118 0.00232 0.509 0.611
#> 7 s3 meal.cal -0.00659 0.0114 -0.577 0.564