我有以下数据:
mydata <- data.frame(var1_A=c(1,2,0,5,NA),var1_B=c(1,1,NA,0,3),var1_C=c(1,NA,1,1,0),
var1_D=c(1,1,NA,0,3),var1_E=c(1,2,0,5,NA),var1_F=c(1,2,NA,1,0),
var1_G=c(1,2,NA,5,NA),var1_H=c(1,NA,1,0,3),var1_I=c(1,NA,2,1,0),
ideology2_A=c(2,2,2,3,4), ideology2_B=c(NA,3,3,1,4),
ideology2_C=c(1,4,7,6,9), ideology2_D=c(8,9,7,6,4),
ideology2_E=c(NA,9,5,6,4),
ideology2_F=c(1,1,1,1,4), ideology2_G=c(8,9,7,6,4),
ideology2_H=c(8,9,7,6,5), ideology2_I=c(8,9,8,9,8),
ex_ideology_A=c(3,3,3,3,4), ex_ideology_B=c(3,NA,3,1,4),
ex_ideology_C=c(1,4,7,6,NA),ex_ideology_D=c(3,3,3,NA,0),
ex_ideology_E=c(3,NA,3,1,1), ex_ideology_F = c(1,1,1,2,3),
ex_ideology_G=c(1,2,3,4,5), ex_ideology_H=c(1,1,1,1,1)
ex_ideology_I=c(2,2,2,NA,1), no_change_A=c(1,1,1,1,1),
no_change_B=c(2,2,2,2,2), no_change_C=c(1,1,1,1,1),
no_change_D=c(1,3,3,1,NA), no_change_E=c(1,1,1,1,1),
no_change_F=c(0,0,1,1,1), no_change_G=c(1,1,2,2,2),
no_change_H=c(5,6,7,8,9), no_change_I=c(1,1,1,1,1)
country=c("BRA","USA","URU,"BRA","UK"),
ID=c(1,2,3,4,5), v5=c(6:10))
我需要重新编码具有单词"的变量组中的所有值;意识形态;对应于var1中具有相同字母的每个变量。更具体地说,每当var1_A==0或var1_A=NA时,将ideology2_A和ex_ideology3_A重新编码为NA,依此类推。事实上,生成新变量更合适,保留原始变量(如dplyr的"突变",而不是"转化"(。
我的实际数据有更多的列和60k行,但这个问题中涉及的列只有这三组9,由字母索引(还有其他变量组也用这些字母索引,就像上面例子中的"no_change"一样,但这不应该干扰解决方案(。
编辑:我的示例数据集现在更接近于我需要处理的真实数据。在帖子的第一个版本中,它是一个数据集,每组变量(字母a到C(只有3列,所有组的名称相似(var1、var2、var3(。除了基准和要重新编码的变量之外,它也没有其他变量。
这个基本R选项怎么样
do.call(
cbind,
lapply(
unname(u <- split.default(mydata, gsub(".*_", "", names(mydata)))),
function(x) cbind(x[1], x[-1] * ifelse(x[[1]] %in% c(0, NA), NA, 1))
)
)[match(c(sapply(u, names)), names(mydata))]
它给出
var1_A var1_B var1_C var2_A var2_B var2_C var3_A var3_B var3_C
1 1 1 1 2 NA 1 3 3 1
2 2 1 NA 2 3 NA 3 NA NA
3 0 NA 1 NA NA 7 NA NA 7
4 5 0 1 3 NA 6 3 NA 6
5 NA 3 0 NA 4 NA NA 4 NA
如果您想创建重新编码的列并保留原始列,请尝试下面的代码
do.call(
cbind,
lapply(
unname(u <- split.default(mydata, gsub(".*_", "", names(mydata)))),
function(x) cbind(x, x[-1] * ifelse(x[[1]] %in% c(0, NA), NA, 1))
)
)
它给出
var1_A var2_A var3_A var2_A var3_A var1_B var2_B var3_B var2_B var3_B var1_C
1 1 2 3 2 3 1 NA 3 NA 3 1
2 2 2 3 2 3 1 3 NA 3 NA NA
3 0 2 3 NA NA NA 3 3 NA NA 1
4 5 3 3 3 3 0 1 1 NA NA 1
5 NA 4 4 NA NA 3 4 4 4 4 0
var2_C var3_C var2_C var3_C
1 1 1 1 1
2 4 4 NA NA
3 7 7 7 7
4 6 6 6 6
5 9 NA NA NA
数据
> dput(mydata)
structure(list(var1_A = c(1, 2, 0, 5, NA), var1_B = c(1, 1, NA,
0, 3), var1_C = c(1, NA, 1, 1, 0), var2_A = c(2, 2, 2, 3, 4),
var2_B = c(NA, 3, 3, 1, 4), var2_C = c(1, 4, 7, 6, 9), var3_A = c(3,
3, 3, 3, 4), var3_B = c(3, NA, 3, 1, 4), var3_C = c(1, 4,
7, 6, NA)), class = "data.frame", row.names = c(NA, -5L))
使用各种枢轴:
library(dplyr)
library(tidyr)
mydata %>%
mutate(row = row_number()) %>%
pivot_longer(-row, names_to = "key", values_to = "values") %>%
separate(key, c("var", "letter")) %>%
pivot_wider(names_from = var, values_from = values) %>%
mutate(
var2 = if_else(var1 == 0 | is.na(var1), NA_real_, var2),
var3 = if_else(var1 == 0 | is.na(var1), NA_real_, var3),
) %>%
pivot_longer(starts_with("var"), names_to = "var", values_to = "values") %>%
mutate(name = paste(var, letter, sep = "_")) %>%
pivot_wider(id_cols = row, names_from = name, values_from = values)
类似于@eastclintw00d的解决方案,但方法略有不同。
tmp <- mydata %>%
mutate(obs = 1:n()) %>%
pivot_longer(-obs,
names_pattern="(.*)_(.*)",
names_to=c("vars", "letter"),
values_to="vals") %>%
group_by(letter, obs) %>%
mutate(val1 = vals[which(vars == "var1")],
new = case_when(val1 == 0 | is.na(val1) ~ NA_real_, TRUE ~ vals)) %>%
select(-val1) %>%
pivot_wider(names_from = c("vars", "letter"),
values_from=c("vals", "new"))
names(tmp) <- gsub("vals_", "", names(tmp))
编辑:使用更新的数据
这应该适用于更新后的数据。不同之处在于如何选择要透视的变量——它会找到以下划线和任何大写字母结尾的变量。ends_with()
函数似乎不适用于正则表达式,所以我使用了which()
中封装的stringr
中的str_detect()
。
tmp <- mydata %>%
mutate(obs = 1:n()) %>%
pivot_longer(which(str_detect(names(.), "_[A-Z]")),
names_pattern="(.*)_(.*)",
names_to=c("vars", "letter"),
values_to="vals") %>%
group_by(letter, obs) %>%
mutate(val1 = vals[which(vars == "var1")],
new = case_when(val1 == 0 | is.na(val1) ~ NA_real_, TRUE ~ vals)) %>%
select(-val1) %>%
pivot_wider(names_from = c("vars", "letter"),
values_from=c("vals", "new"))
names(tmp) <- gsub("vals_", "", names(tmp))