数据文件:-redacted-
我正在使用ANES 2016时间序列研究数据。使用以下代码和类别对缺失数据进行编码,以表明它们缺失的原因:
-1。不适用
-2。文本回答:以单独文件提供,或其编码版本将包含在未来的版本中
-3。受限
-4。错误
-5。中断,但已完成足够部分的访谈(IW)
-6。没有选举后访谈
-7。无选举后数据,因访谈(IW)不完整而删除
-8。不知道
-9。拒绝回答
我想使用 tidyverse 中的 case_when 和 across,把数据中的这些缺失代码全部替换为 NA。下面是一个示例:我尝试把选中的所有列中的值 1 替换为 NA,但它只是返回了原始数据帧。我想保留不匹配的数据。
所有数据都被格式化为带标签的数字,因此强制转换为整数值。
#Returns original data frame
anes %>%
mutate(
across(V162078:V161522,
as.integer,
~case_when(. == 1 ~ NA_real_,
. != 1 ~ .)))
class(anes$V162078)
[1] "labelled" "numeric"
> dput(head(anes))
structure(list(V160101f = c(0.8877, 1.1605, 0.4161, 0.3852, 0.6931,
0.7588), V161010e = c("LA", "AR", "MS", "TN", "OH", "NJ"), V162078 = c(15,
50, 50, 15, 30, 0), V162079 = c(85, 60, 70, 60, 15, 65), V161002 = c(1,
1, 1, 1, 2, 1), V161003 = c(3, 1, 4, 3, 3, 1), V161004 = c(1,
1, 3, 2, 2, 1), V161005 = c(2, 1, 2, 1, 1, 1), V161006 = c(-1,
1, -1, 2, 1, 2), V161008 = c(3, 7, 3, 7, 5, 7), V161011 = c(1,
1, 3, 1, 1, 1), V161019 = c(2, 4, -1, -1, -1, 4), V161020 = c(-1,
-1, 2, -1, -1, -1), V161021 = c(2, 2, 2, 2, 2, 2), V161021a = c(-1,
-1, -1, -1, -1, -1), V161022 = c(2, 2, -1, 1, 2, 2), V161030 = c(1,
1, -1, -1, 1, -8), V161080 = c(2, 2, 2, 1, 2, 2), V161081 = c(2,
2, 2, 2, 2, 2), V161082 = c(2, 2, 2, 2, 2, 2), V161083 = c(2,
-8, 2, 2, 2, 2), V161084 = c(2, -8, 2, 2, 2, 2), V161085 = c(2,
2, 2, 2, 2, 2), V161110 = c(3, 3, 3, 1, 5, 3), V161126 = c(99,
5, 99, 99, 4, 6), V161128 = c(1, 1, 3, 2, 3, 1), V161129 = c(6,
6, 5, 4, -8, 5), V161146 = c(2, 1, 2, 2, 1, 2), V161149 = c(2,
1, 2, 1, 1, 2), V161151x = c(4, 4, 6, 7, 5, 1), V161158x = c(7,
6, 3, 5, 3, 5), V161204x = c(4, 6, 4, 6, 7, 7), V161215 = c(4,
4, 4, 2, 5, 5), V161216 = c(1, 1, 1, 1, 2, 1), V161217 = c(1,
1, 1, 1, 3, 1), V161223 = c(1, 2, 2, 3, 2, 1), V161227x = c(4,
1, 5, 1, 5, 1), V161228x = c(1, 1, 1, 1, 4, -8), V161235x = c(5,
3, 3, 3, 4, 2), V161241 = c(1, 2, 2, 1, 1, 1), V161265x = c(2,
8, 8, 2, 7, 4), V161267 = c(29, 26, 23, 58, 38, 60), V161268 = c(1,
6, 6, 1, 4, 2), V161270 = c(9, 13, 9, 9, 9, 14), V161310x = c(1,
1, 1, 1, 1, 1), V161315 = c(1, 1, 1, 1, 1, 1), V161324 = c(1,
0, 2, 0, 3, 1), V161326 = c(1, 1, 1, 1, 2, 1), V161361x = c(13,
17, 6, 20, 3, 1), V161522 = c(2, 3, 1, 2, 2, 1)), row.names = c(NA,
6L), class = "data.frame")
如果我在原始数据上运行它而不强制转换为数字类型,则会发生以下情况:
anes %>%
mutate(
across(V162078:V161522,
~case_when(. == 1 ~ NA_real_,
. != 1 ~ .)))
> rlang::last_error()
█
├─<error/dplyr:::mutate_error>
│ Problem with `mutate()` input `..1`.
│ x must have class `numeric`, not class `labelled/numeric`.
│ ℹ Input `..1` is `(function (.cols = everything(), .fns = NULL, ..., .names = NULL) ...`.
└─<error/rlang_error>
must have class `numeric`, not class `labelled/numeric`.
Backtrace:
1. `%>%`(...)
8. dplyr::case_when(. == 1 ~ NA_real_, . != 1 ~ .)
9. dplyr:::replace_with(...)
10. dplyr:::check_class(val, x, name)
11. dplyr:::glubort(header, "must have class `{exp_classes}`, not class `{out_classes}`.")
Run `rlang::last_trace()` to see the full context.
最终答案:
anes %>%
mutate(across(V162078:V161522,
~case_when(
. == -1 ~ NA_real_,
. == -2 ~ NA_real_,
. == -3 ~ NA_real_,
. == -4 ~ NA_real_,
. == -5 ~ NA_real_,
. == -6 ~ NA_real_,
. == -7 ~ NA_real_,
. == -8 ~ NA_real_,
. == -9 ~ NA_real_,
. == 99 ~ NA_real_,
. == 998 ~ NA_real_,
. == 999 ~ NA_real_,
TRUE ~ as.numeric((.))))) %>% #This catches all values that are not declared in case_when
mutate(across(V162078:V161522, as.integer))
anes %>%
mutate(across(V162078:V161522, as.integer),
across(V162078:V161522,
~if_else(.x == 1, NA_integer_, .x)))
或
anes %>%
mutate(across(V162078:V161522, as.integer),
across(V162078:V161522,
~case_when(.x == 1 ~ NA_integer_,
TRUE ~ .x)))
或者我们可以使用dplyr::near
来测试是否接近相等,这样我们就可以完全跳过类型转换:
anes %>%
mutate(across(V162078:V161522,
~if_else(near(.x, 1), NA_real_, .x)))