我有一个这样的数据集:
ICD_10 | 诊断 |
---|---|
A00 | 霍乱 |
A01-A03 | 其他肠道传染病 |
A15 | 呼吸系统结核 |
A17-A19 | 其他结核病 |
使用专用icd包:
# Example data: ICD-10 codes or code ranges with their diagnosis labels
d <- data.frame(
  ICD_10 = c("A00", "A01-A03", "A15", "A17-A19"),
  diagnosis = c(
    "Cholera",
    "Other Intestinal infectious diseases",
    "Respiratory tuberculosis",
    "Other tuberculosis"
  ),
  stringsAsFactors = FALSE
)
#remotes::install_github("jackwasey/icd")
library(icd)
为了避免在范围内生成不存在的代码或遗漏已有的代码,我们使用expand_range。例如,下面的调用返回33个代码,而不是3个;如果只是按顺序填充A01、A02、A03,结果就是错误的。
# Expand the range from A01 to A03; the result also contains sub-codes
# (see the printed output below).
expand_range("A01", "A03")
# [1] "A01" "A010" "A0100" "A0101" "A0102" "A0103" "A0104" "A0105"
# [9] "A0109" "A011" "A012" "A013" "A014" "A02" "A020" "A021"
# [17] "A022" "A0220" "A0221" "A0222" "A0223" "A0224" "A0225" "A0229"
# [25] "A028" "A029" "A03" "A030" "A031" "A032" "A033" "A038"
# [33] "A039"
我们还使用explain_code来描述新创建的代码,例如用法:
# Look up the description of code A01.
explain_code("A01")
# [1] "Typhoid and paratyphoid fevers"
现在,将两个函数合并为一个函数,以获得整洁的输出:
# Expand one ICD-10 entry into a data frame with one row per code.
#
# icd10:     a single code ("A00") or a hyphenated range ("A01-A03").
# diagnosis: label of the entry; repeated on every output row.
#
# Returns a data frame with columns icd10, diagnosis, icd10range and desc,
# where icd10range holds the codes from expand_range() (or the code itself
# when no range is given) and desc the result of explain_code() on them.
f <- function(icd10, diagnosis) {
  bounds <- unlist(strsplit(icd10, "-"))
  # Anything other than exactly one piece is treated as a start/end pair.
  codes <- if (length(bounds) != 1) {
    expand_range(bounds[1], bounds[2])
  } else {
    bounds
  }
  data.frame(
    icd10 = icd10,
    diagnosis = diagnosis,
    icd10range = codes,
    desc = explain_code(codes)
  )
}
并循环代码进行扩展,然后rowbind:
# Apply f to every (code, diagnosis) pair of d and stack the resulting
# data frames; names are dropped so the row names stay plain integers.
res <- do.call(rbind, unname(Map(f, d$ICD_10, d$diagnosis)))
head(res)
# icd10 diagnosis icd10range desc
# 1 A00 Cholera A00 Cholera
# 2 A01-A03 Other Intestinal infectious diseases A01 Typhoid and paratyphoid fevers
# 3 A01-A03 Other Intestinal infectious diseases A010 Typhoid fever
# 4 A01-A03 Other Intestinal infectious diseases A0100 Typhoid fever, unspecified
# 5 A01-A03 Other Intestinal infectious diseases A0101 Typhoid meningitis
# 6 A01-A03 Other Intestinal infectious diseases A0102 Typhoid fever with heart involvement
正如预期的那样,A01-A03现在扩展到33行:
# Count the expanded rows per original ICD_10 entry.
table(res$icd10)
# A00 A01-A03 A15 A17-A19
# 1 33 1 53
# Expand ICD-10 codes or "start-end" code ranges into individual codes.
#
# vec: character vector with entries such as "A00" or "A01-A03". The first
#   character of each entry is reused as the prefix of every code generated
#   for that entry; the numeric parts are zero-padded to two digits.
#
# Returns a list with one character vector per element of vec (named by the
# prefix character), suitable for use as a list column.
fun <- function(vec) {
  ltr <- substring(vec, 1, 1)
  # Keep only digits and the hyphen, then split into integer range bounds.
  bounds <- lapply(strsplit(gsub("[^-0-9]", "", vec), "-"), as.integer)
  # SIMPLIFY = FALSE: always return a list. The default would collapse
  # equal-length results into a vector or matrix, which cannot be unnested.
  mapply(
    function(ltr, z) {
      sprintf("%s%02i", ltr, if (length(z) > 1) seq(z[1], z[2]) else z)
    },
    ltr, bounds,
    SIMPLIFY = FALSE
  )
}
# Replace each ICD_10 entry by its expanded codes, then give every code
# its own row.
quux |>
  dplyr::mutate(ICD_10 = fun(ICD_10)) |>
  tidyr::unnest(ICD_10)
# # A tibble: 8 x 2
# ICD_10 diagnosis
# <chr> <chr>
# 1 A00 Cholera
# 2 A01 Other Intestinal infectious diseases
# 3 A02 Other Intestinal infectious diseases
# 4 A03 Other Intestinal infectious diseases
# 5 A15 Respiratory tuberculosis
# 6 A17 Other tuberculosis
# 7 A18 Other tuberculosis
# 8 A19 Other tuberculosis
数据
# Example data: ICD-10 codes or code ranges with their diagnosis labels
quux <- data.frame(
  ICD_10 = c("A00", "A01-A03", "A15", "A17-A19"),
  diagnosis = c(
    "Cholera",
    "Other Intestinal infectious diseases",
    "Respiratory tuberculosis",
    "Other tuberculosis"
  ),
  stringsAsFactors = FALSE
)
一个选项:
# Expand each ICD-10 code range into one row per two-digit code.
# All codes in this example share the prefix "A", which is hard-coded below.
tibble::tribble(
  ~ICD_10, ~diagnosis,
  "A00", "Cholera",
  "A01-A03", "Other Intestinal infectious diseases",
  "A15", "Respiratory tuberculosis",
  "A17-A19", "Other tuberculosis"
) |>
  # One row per range endpoint ("A01-A03" becomes "A01" and "A03").
  tidyr::separate_rows(ICD_10, sep = "-") |>
  dplyr::mutate(id = readr::parse_number(ICD_10)) |>
  dplyr::group_by(diagnosis) |>
  # Fill in the numbers lying between the endpoints of each diagnosis.
  tidyr::complete(id = min(id):max(id)) |>
  # Zero-pad to two digits so that 0 becomes "A00" rather than "A0".
  dplyr::mutate(ICD_10 = sprintf("A%02d", as.integer(id))) |>
  dplyr::ungroup()