r-根据另一组具有相应结尾的变量,重新编码与字母A、B等对应的多组变量

  • 本文关键字:变量 编码 新编码 一组 结尾 r dplyr
  • 更新时间 :
  • 英文 :


我有以下数据:

mydata <- data.frame(var1_A=c(1,2,0,5,NA),var1_B=c(1,1,NA,0,3),var1_C=c(1,NA,1,1,0),
var1_D=c(1,1,NA,0,3),var1_E=c(1,2,0,5,NA),var1_F=c(1,2,NA,1,0),
var1_G=c(1,2,NA,5,NA),var1_H=c(1,NA,1,0,3),var1_I=c(1,NA,2,1,0), 
ideology2_A=c(2,2,2,3,4), ideology2_B=c(NA,3,3,1,4), 
ideology2_C=c(1,4,7,6,9), ideology2_D=c(8,9,7,6,4), 
ideology2_E=c(NA,9,5,6,4),
ideology2_F=c(1,1,1,1,4), ideology2_G=c(8,9,7,6,4), 
ideology2_H=c(8,9,7,6,5), ideology2_I=c(8,9,8,9,8),
ex_ideology_A=c(3,3,3,3,4), ex_ideology_B=c(3,NA,3,1,4), 
ex_ideology_C=c(1,4,7,6,NA),ex_ideology_D=c(3,3,3,NA,0),
ex_ideology_E=c(3,NA,3,1,1), ex_ideology_F = c(1,1,1,2,3),
ex_ideology_G=c(1,2,3,4,5), ex_ideology_H=c(1,1,1,1,1)
ex_ideology_I=c(2,2,2,NA,1), no_change_A=c(1,1,1,1,1),
no_change_B=c(2,2,2,2,2), no_change_C=c(1,1,1,1,1),
no_change_D=c(1,3,3,1,NA), no_change_E=c(1,1,1,1,1),
no_change_F=c(0,0,1,1,1), no_change_G=c(1,1,2,2,2),
no_change_H=c(5,6,7,8,9), no_change_I=c(1,1,1,1,1)
country=c("BRA","USA","URU,"BRA","UK"),
ID=c(1,2,3,4,5), v5=c(6:10))   

我需要重新编码具有单词"的变量组中的所有值;意识形态;对应于var1中具有相同字母的每个变量。更具体地说,每当var1_A==0或var1_A=NA时,将ideology2_A和ex_ideology3_A重新编码为NA,依此类推。事实上,生成新变量更合适,保留原始变量(如dplyr的"突变",而不是"转化"(。

我的实际数据有更多的列和60k行,但这个问题中涉及的列只有这三组9,由字母索引(还有其他变量组也用这些字母索引,就像上面例子中的"no_change"一样,但这不应该干扰解决方案(。

编辑:我的示例数据集现在更接近于我需要处理的真实数据。在帖子的第一个版本中,它是一个数据集,每组变量(字母a到C(只有3列,所有组的名称相似(var1、var2、var3(。除了基准和要重新编码的变量之外,它也没有其他变量。

这个基本R选项怎么样

do.call(
cbind,
lapply(
unname(u <- split.default(mydata, gsub(".*_", "", names(mydata)))),
function(x) cbind(x[1], x[-1] * ifelse(x[[1]] %in% c(0, NA), NA, 1))
)
)[match(c(sapply(u, names)), names(mydata))]

它给出

var1_A var1_B var1_C var2_A var2_B var2_C var3_A var3_B var3_C
1      1      1      1      2     NA      1      3      3      1
2      2      1     NA      2      3     NA      3     NA     NA
3      0     NA      1     NA     NA      7     NA     NA      7
4      5      0      1      3     NA      6      3     NA      6
5     NA      3      0     NA      4     NA     NA      4     NA

如果您想创建重新编码的列并保留原始列,请尝试下面的代码

do.call(
cbind,
lapply(
unname(u <- split.default(mydata, gsub(".*_", "", names(mydata)))),
function(x) cbind(x, x[-1] * ifelse(x[[1]] %in% c(0, NA), NA, 1))
)
)

它给出

var1_A var2_A var3_A var2_A var3_A var1_B var2_B var3_B var2_B var3_B var1_C
1      1      2      3      2      3      1     NA      3     NA      3      1
2      2      2      3      2      3      1      3     NA      3     NA     NA
3      0      2      3     NA     NA     NA      3      3     NA     NA      1
4      5      3      3      3      3      0      1      1     NA     NA      1
5     NA      4      4     NA     NA      3      4      4      4      4      0
var2_C var3_C var2_C var3_C
1      1      1      1      1
2      4      4     NA     NA
3      7      7      7      7
4      6      6      6      6
5      9     NA     NA     NA

数据

> dput(mydata)
structure(list(var1_A = c(1, 2, 0, 5, NA), var1_B = c(1, 1, NA, 
0, 3), var1_C = c(1, NA, 1, 1, 0), var2_A = c(2, 2, 2, 3, 4),
var2_B = c(NA, 3, 3, 1, 4), var2_C = c(1, 4, 7, 6, 9), var3_A = c(3,
3, 3, 3, 4), var3_B = c(3, NA, 3, 1, 4), var3_C = c(1, 4,
7, 6, NA)), class = "data.frame", row.names = c(NA, -5L))

使用各种枢轴:

library(dplyr)
library(tidyr)
mydata %>% 
mutate(row = row_number()) %>% 
pivot_longer(-row, names_to = "key", values_to = "values") %>% 
separate(key, c("var", "letter")) %>% 
pivot_wider(names_from = var, values_from = values) %>% 
mutate(
var2 = if_else(var1 == 0 | is.na(var1), NA_real_, var2),
var3 = if_else(var1 == 0 | is.na(var1), NA_real_, var3),
) %>% 
pivot_longer(starts_with("var"), names_to = "var", values_to = "values") %>% 
mutate(name = paste(var, letter, sep = "_")) %>% 
pivot_wider(id_cols = row, names_from = name, values_from = values)

类似于@eastclintw00d的解决方案,但方法略有不同。

tmp <- mydata %>% 
mutate(obs = 1:n()) %>% 
pivot_longer(-obs, 
names_pattern="(.*)_(.*)", 
names_to=c("vars", "letter"), 
values_to="vals") %>% 
group_by(letter, obs) %>% 
mutate(val1 = vals[which(vars == "var1")], 
new = case_when(val1 == 0 | is.na(val1) ~ NA_real_, TRUE ~ vals)) %>% 
select(-val1) %>% 
pivot_wider(names_from = c("vars", "letter"), 
values_from=c("vals", "new"))
names(tmp) <- gsub("vals_", "", names(tmp))

编辑:使用更新的数据

这应该适用于更新后的数据。不同之处在于如何选择要透视的变量——它会找到以下划线和任何大写字母结尾的变量。ends_with()函数似乎不适用于正则表达式,所以我使用了which()中封装的stringr中的str_detect()

tmp <- mydata %>% 
mutate(obs = 1:n()) %>% 
pivot_longer(which(str_detect(names(.), "_[A-Z]")), 
names_pattern="(.*)_(.*)", 
names_to=c("vars", "letter"), 
values_to="vals") %>% 
group_by(letter, obs) %>% 
mutate(val1 = vals[which(vars == "var1")], 
new = case_when(val1 == 0 | is.na(val1) ~ NA_real_, TRUE ~ vals)) %>% 
select(-val1) %>% 
pivot_wider(names_from = c("vars", "letter"), 
values_from=c("vals", "new"))
names(tmp) <- gsub("vals_", "", names(tmp))

最新更新