如何在 R 中根据数据字典(data dictionary)创建因子?



我正在尝试根据数据字典创建因子。我尝试使用 Map,但所有变量都被转换成了缺失值(NA)。处理这类问题的最佳方法是什么?也欢迎使用 purrr 的解决方案。

library(dplyr)
# Example data: a_1 and a_2 hold plain numeric values, while b and c hold
# numeric codes that the dictionary below maps to labels.
mydata <- tibble(
  a_1 = c(20, 22, 13, 14, 44),
  a_2 = c(42, 13, 32, 31, 14),
  b   = c(1, 2, 1, 1, 2),
  c   = c(1, 2, 1, 3, 1)
)

# Data dictionary: one row per variable. `values` and `valuelabel` store the
# codes and their labels as comma-separated strings (NA where none are given).
dictionary <- tibble(
  variable   = c("a", "b", "c"),
  label      = c("Age", "Gender", "Education"),
  type       = c("mselect", "select", "select"),
  values     = c(NA, "1, 2", "1, 2,3"),
  valuelabel = c(NA, "Male, Female", "Primary, Secondary, Tertiary")
)
# Expected results: b and c become factors whose labels come from the
# dictionary; a_1 and a_2 are left untouched.
expectedata <- mutate(
  mydata,
  b = factor(b, levels = c(1, 2), labels = c("Male", "Female")),
  c = factor(
    c,
    levels = c(1, 2, 3),
    labels = c("Primary", "Secondary", "Tertiary")
  )
)
expectedata

# Select the factor variables
# (the dictionary rows whose type is "select"; with the data above: "b", "c")
factor_vars <- dictionary %>%
filter(type == "select") %>% pull(variable)

# NOTE(review): this attempt turns every column into NA, for two reasons that
# are visible in the call below:
#   1. Map() iterates over ALL columns of mydata (a_1, a_2, b, c), while the
#      levels/labels vectors hold only one entry per factor variable, so they
#      are recycled onto columns they were never meant for.
#   2. Each levels/labels entry is one comma-separated string such as "1, 2",
#      not a vector of individual levels, so no observed value ever matches a
#      level and factor() returns NA for it.
mydata[] <- Map(
function(x, fctvalues, fctlabels)  factor(x, fctvalues,  fctlabels) ,
mydata,
dictionary$values[ match(factor_vars,
dictionary$variable) ],
dictionary$valuelabel[ match(factor_vars,
dictionary$variable) ]
)

可以通过 pivot(pivot_longer / pivot_wider)、left_join 以及一些数据整理(data wrangling)来实现:

数据

library(tidyverse)
# Example data, written row by row: a_1 and a_2 are plain numeric columns,
# b and c hold numeric codes described by the dictionary below.
mydata <- tribble(
  ~a_1, ~a_2, ~b, ~c,
    20,   42,  1,  1,
    22,   13,  2,  2,
    13,   32,  1,  1,
    14,   31,  1,  3,
    44,   14,  2,  1
)

# Data dictionary, one row per variable. `values` and `valuelabel` keep the
# codes and their labels as comma-separated strings (NA where none are given).
dictionary <- tribble(
  ~variable, ~label,      ~type,     ~values,       ~valuelabel,
  "a",       "Age",       "mselect", NA_character_, NA_character_,
  "b",       "Gender",    "select",  "1, 2",        "Male, Female",
  "c",       "Education", "select",  "1, 2, 3",     "Primary, Secondary, Tertiary"
)

代码

# Long-format lookup table: one row per (variable, code, label) combination.
target_dictionary <- dictionary %>%
  # optional: filter(type == "select") %>%
  # Split on commas (and the whitespace around them) only. The default
  # separator splits on every non-alphanumeric character, so a multi-word
  # label such as "High school" would be broken apart and the codes and
  # labels would no longer line up.
  separate_rows(values, valuelabel, sep = "\\s*,\\s*") %>%
  select(variable, values, valuelabel)
# Columns whose codes should be replaced by labels: every variable the
# dictionary declares as "select", instead of a hard-coded list.
coded_vars <- dictionary %>%
  filter(type == "select") %>%
  pull(variable)

# Replace the codes in the coded columns by their dictionary labels.
# The labelled columns come back as character vectors; wrap them in factor()
# afterwards if factors are required.
target_mydata <- mydata %>%
  # Assuming you have no unique identifier
  rownames_to_column("id") %>%
  pivot_longer(
    # any_of() ignores dictionary variables that are absent from the data
    cols = any_of(coded_vars),
    names_to = "var_name",
    values_to = "var_value"
  ) %>%
  # because the data types don't match here
  mutate(var_value = as.character(var_value)) %>%
  left_join(
    target_dictionary,
    by = c("var_name" = "variable", "var_value" = "values")
  ) %>%
  # Drop the raw codes so that every remaining column other than the
  # name/label pair acts as an id column when widening; this avoids
  # hard-coding the id columns.
  select(-var_value) %>%
  pivot_wider(
    names_from = var_name,
    values_from = valuelabel
  ) %>%
  select(-id)

结果:

> target_mydata
# A tibble: 5 × 4
    a_1   a_2 b      c
  <dbl> <dbl> <chr>  <chr>
1    20    42 Male   Primary  
2    22    13 Female Secondary
3    13    32 Male   Primary  
4    14    31 Male   Tertiary 
5    44    14 Female Primary  

编辑:您还可以更进一步,重命名因子列名。

重命名列

# Rename the coded columns to their human-readable labels from the dictionary
# (e.g. "b" -> "Gender"). Columns are selected from the piped data itself;
# any_of() skips dictionary variables that are not present as columns.
target_mydata %>%
  rename_with(
    # unname() so that a plain character vector of new names is returned
    .fn = function(nm) {
      unname(setNames(dictionary$label, dictionary$variable)[nm])
    },
    .cols = any_of(dictionary$variable)
  )

结果:

# A tibble: 5 × 4
    a_1   a_2 Gender Education
  <dbl> <dbl> <chr>  <chr>
1    20    42 Male   Primary  
2    22    13 Female Secondary
3    13    32 Male   Primary  
4    14    31 Male   Tertiary 
5    44    14 Female Primary  

相关内容

  • 没有找到相关文章

最新更新