# Example data: one row per id, with up to two codes per row stored as
# strings; an empty string marks a missing code.
mydata <- data.frame(
  id = seq_len(5L),
  item1 = c("", "", "1222", "1222", ""),
  item2 = c("13", "", "", "", "382")
)
> mydata
id item1 item2
1 1 13
2 2
3 3 1222
4 4 1222
5 5 382
我有一个包含不同代码的数据集。我想根据 dictionary 把这些代码映射为对应的字符串:
# Lookup table: each numeric code and the text entry it stands for.
dictionary <- data.frame(
  code = c(1, 13, 382, 987, 1222),
  entry = c("ballet", "soccer", "basketball", "painting", "pottery")
)
> dictionary
code entry
1 1 ballet
2 13 soccer
3 382 basketball
4 987 painting
5 1222 pottery
期望的输出是一个包含字符串的数据框:
id item1 item2
1 1 soccer
2 2
3 3 pottery
4 4 pottery
5 5 basketball
mydata <- data.frame(
  id = 1:5,
  item1 = c("", "", "1222", "1222", ""),
  item2 = c("13", "", "", "", "382")
)
dictionary <- data.frame(
  code = c(1, 13, 382, 987, 1222),
  entry = c("ballet", "soccer", "basketball", "painting", "pottery")
)

# Replace every code in item1/item2 by its dictionary entry. match() gives the
# dictionary row of each code (NA when the code is not listed); codes without
# a match become "".
item_cols <- c("item1", "item2")
mydata[item_cols] <- lapply(mydata[item_cols], function(codes) {
  position <- match(codes, dictionary$code)
  ifelse(is.na(position), "", dictionary$entry[position])
})
mydata
#> id item1 item2
#> 1 1 soccer
#> 2 2
#> 3 3 pottery
#> 4 4 pottery
#> 5 5 basketball
由 reprex v2.0.2 创建于 2022-12-15
注意:
当然,一种规范的方法是使用因子(和NA,而不是空字符串):
# Codes become factor levels and entries their labels; values that are not a
# dictionary code come out as NA.
factor(mydata$item1, levels = dictionary$code, labels = dictionary$entry)
# <NA> <NA> pottery pottery <NA>
使用命名向量作为字典可能会更容易一些。使用dplyr:
library(dplyr)
# Named lookup vector: the names are the codes, the values are the entries.
dictionary_vec <- setNames(as.character(dictionary$entry), dictionary$code)
# Translate every column except id. The lookup is done by name, so the codes
# are passed through as.character(); a code with no matching name yields NA,
# which coalesce() turns into "". coalesce() comes from dplyr itself, whereas
# replace_na() is a tidyr function and is not available with only dplyr
# attached.
mydata %>%
  mutate(across(
    !id,
    ~ coalesce(unname(dictionary_vec[as.character(.x)]), "")
  ))
或者使用基础 R(base R):
# Named lookup vector: the names are the codes, the values are the entries.
dictionary_vec <- setNames(dictionary$entry, dictionary$code)
# Look up every column except the first by name; codes without a matching
# name give NA.
mydata[-1] <- lapply(mydata[-1], function(codes) unname(dictionary_vec[codes]))
# Show the unmatched cells as empty strings instead of NA.
mydata[is.na(mydata)] <- ""
两种方法的结果:
id item1 item2
1 1 soccer
2 2
3 3 pottery
4 4 pottery
5 5 basketball
这非常有效。
mydata <- data.frame(id = 1:5,
                     item1 = c("", "", "1222", "1222", ""),
                     item2 = c("13", "", "", "", "382"))
dictionary <- data.frame(code = c(1, 13, 382, 987, 1222),
                         entry = c("ballet", "soccer", "basketball", "painting", "pottery"))

# Translate a vector of codes into dictionary entries, element by element.
#
# codes:      vector of codes (compared to dictionary$code as strings).
# dictionary: data frame with a `code` column and an `entry` column.
# Returns a character vector as long as `codes`; values that are not a
# dictionary code are returned unchanged.
#
# match() is used so that each element gets the entry of its own code.
# Pairing `x[x %in% code]` with `entry[code %in% x]` lines the values up in
# dictionary order and recycles them, which is only right when the rows
# happen to follow the dictionary order.
lookup_entry <- function(codes, dictionary) {
  codes <- as.character(codes)
  position <- match(codes, dictionary$code)
  found <- !is.na(position)
  codes[found] <- as.character(dictionary$entry)[position[found]]
  codes
}

mydata$item1 <- lookup_entry(mydata$item1, dictionary)
mydata$item2 <- lookup_entry(mydata$item2, dictionary)
mydata
id item1 item2
1 1 soccer
2 2
3 3 pottery
4 4 pottery
5 5 basketball
“规范方式”:使用 factor。
# Turn every column except the first into a factor whose levels are the
# dictionary codes and whose labels are the entries, then drop the levels
# that do not occur in that column.
mydata[-1] <- lapply(mydata[-1], \(codes) {
  entries <- factor(codes, levels = dictionary$code, labels = dictionary$entry)
  droplevels(entries)
})
mydata
# id item1 item2
# 1 1 <NA> soccer
# 2 2 <NA> <NA>
# 3 3 pottery <NA>
# 4 4 pottery <NA>
# 5 5 <NA> basketball
`|> lapply(droplevels)` 只保留每一列中实际出现的级别。如果您想要字符向量,只需将其替换为 `|> lapply(as.character)`。