如何根据数据字典创建因子(factor)?我尝试使用 `Map`,但所有变量都被转换成了缺失值(NA)。处理这个问题的最佳方法是什么?也欢迎使用 purrr 的解决方案。
library(dplyr)
# Example data: a_1 and a_2 are plain numeric columns; b and c hold numeric
# codes whose meaning is described by `dictionary` below.
mydata <- tibble(
  a_1 = c(20, 22, 13, 14, 44),
  a_2 = c(42, 13, 32, 31, 14),
  b   = c(1, 2, 1, 1, 2),
  c   = c(1, 2, 1, 3, 1)
)

# Data dictionary: one row per variable. `values` and `valuelabel` store the
# codes and their labels as comma-separated strings (NA when not applicable).
dictionary <- tibble(
  variable   = c("a", "b", "c"),
  label      = c("Age", "Gender", "Education"),
  type       = c("mselect", "select", "select"),
  values     = c(NA_character_, "1, 2", "1, 2,3"),
  valuelabel = c(
    NA_character_,
    "Male, Female",
    "Primary, Secondary, Tertiary"
  )
)
# Expected result, written out by hand: b and c become factors whose levels
# and labels match the corresponding rows of the dictionary.
expectedata <- mydata %>%
  mutate(
    b = factor(x = b, levels = c(1, 2), labels = c("Male", "Female"))
  ) %>%
  mutate(
    c = factor(
      x = c,
      levels = c(1, 2, 3),
      labels = c("Primary", "Secondary", "Tertiary")
    )
  )
expectedata
# Convert the dictionary's "select" variables in `mydata` to factors.
#
# Two things made the original attempt return only NA:
#   1. `Map()` iterated over *every* column of `mydata`, while the level and
#      label vectors only had one entry per factor variable, so they were
#      recycled onto the wrong columns.
#   2. `values` / `valuelabel` are single comma-separated strings
#      (e.g. "1, 2"); passed to `factor()` unsplit they are one level that
#      never matches the data.

# Dictionary rows describing factor variables that actually exist in `mydata`.
factor_spec <- dictionary %>%
  filter(type == "select", variable %in% names(mydata))

# Select the factor variables
factor_vars <- factor_spec %>%
  pull(variable)

# Split a comma-separated string into a trimmed character vector, so that
# "1, 2,3" becomes c("1", "2", "3") regardless of spacing.
split_csv <- function(x) {
  trimws(strsplit(x, ",", fixed = TRUE)[[1]])
}

# Only the factor columns are mapped over and assigned back; the rows of
# `factor_spec` are in the same order as `factor_vars`, so the three inputs
# stay aligned.
mydata[factor_vars] <- Map(
  function(x, fctvalues, fctlabels) {
    factor(x, levels = split_csv(fctvalues), labels = split_csv(fctlabels))
  },
  mydata[factor_vars],
  factor_spec$values,
  factor_spec$valuelabel
)
通过 `pivot_longer()` / `pivot_wider()`、`left_join()` 和一些数据整理(data wrangling):
数据
library(tidyverse)
# Input data: b and c are numeric codes to be replaced by their labels.
mydata <- tibble(
  a_1 = c(20, 22, 13, 14, 44),
  a_2 = c(42, 13, 32, 31, 14),
  b   = c(1, 2, 1, 1, 2),
  c   = c(1, 2, 1, 3, 1)
)

# Dictionary with comma-separated codes (`values`) and labels (`valuelabel`)
# for each variable; both are NA for variable "a".
dictionary <- tibble(
  variable   = c("a", "b", "c"),
  label      = c("Age", "Gender", "Education"),
  type       = c("mselect", "select", "select"),
  values     = c(NA_character_, "1, 2", "1, 2, 3"),
  valuelabel = c(
    NA_character_,
    "Male, Female",
    "Primary, Secondary, Tertiary"
  )
)
代码
# Long-format lookup table: the comma-separated `values` and `valuelabel`
# strings are split in parallel, giving one row per variable/code/label.
target_dictionary <- dictionary %>%
  # optional: filter(type == "select") %>%
  select(variable, values, valuelabel) %>%
  separate_rows(values, valuelabel)
# Replace the codes in b and c by their labels: reshape to long format, look
# the label up in `target_dictionary`, then reshape back to wide format.
target_mydata <- mydata %>%
  # No unique identifier is assumed, so a temporary row id is added to be
  # able to put the rows back together in pivot_wider()
  rownames_to_column(var = "id") %>%
  pivot_longer(
    cols = all_of(c("b", "c")),
    names_to = "var_name",
    values_to = "var_value"
  ) %>%
  # The dictionary stores codes as character, the data as numeric, so the
  # join key has to be converted first
  mutate(across(all_of("var_value"), as.character)) %>%
  left_join(
    target_dictionary,
    by = c(var_name = "variable", var_value = "values")
  ) %>%
  pivot_wider(
    id_cols = all_of(c("id", "a_1", "a_2")),
    names_from = var_name,
    values_from = valuelabel
  ) %>%
  # The temporary row id is no longer needed
  select(!id)
结果:
> target_mydata
# A tibble: 5 × 4
a_1 a_2 b c
<dbl> <dbl> <chr> <chr>
1 20 42 Male Primary
2 22 13 Female Secondary
3 13 32 Male Primary
4 14 31 Male Tertiary
5 44 14 Female Primary
编辑:您还可以更进一步,重命名因子列名。
重命名列
# Rename the coded columns to the labels given in the dictionary.
# setNames() builds a named vector of the form c(<new name> = "<old name>"),
# which rename() accepts through a tidyselect helper. any_of() skips
# dictionary variables (here "a") that are not columns of the piped table, so
# the selection no longer depends on the separate `mydata` object.
target_mydata %>%
  rename(any_of(setNames(dictionary$variable, dictionary$label)))
结果:
# A tibble: 5 × 4
a_1 a_2 Gender Education
<dbl> <dbl> <chr> <chr>
1 20 42 Male Primary
2 22 13 Female Secondary
3 13 32 Male Primary
4 14 31 Male Tertiary
5 44 14 Female Primary