我在每个时间点都有一列分类变量,我想用mutate()
和across()
把它们转换为多个布尔列(每个类别一列),如下所示:
# Example data: one categorical column per time point (t1 to t3); some
# values are missing.
data <- data.frame(
  category_t1 = c("A", "B", "C", "C", "A", "B"),
  category_t2 = c("A", "C", "B", "B", "B", NA),
  category_t3 = c("C", "C", NA, "B", NA, "A")
)
# For each level (A, B, C) add one logical column per category_* column:
# TRUE when the value equals the level, FALSE for any other non-missing value,
# and NA when the value is missing (no case_when() branch matches).
# The three across() calls differ only in the level letter.
data %>%
  mutate(
    across(
      starts_with("category"),
      ~ case_when(.x == "A" ~ TRUE, !is.na(.x) ~ FALSE),
      .names = "{str_replace(.col, 'category', 'A')}"
    ),
    across(
      starts_with("category"),
      ~ case_when(.x == "B" ~ TRUE, !is.na(.x) ~ FALSE),
      .names = "{str_replace(.col, 'category', 'B')}"
    ),
    across(
      starts_with("category"),
      ~ case_when(.x == "C" ~ TRUE, !is.na(.x) ~ FALSE),
      .names = "{str_replace(.col, 'category', 'C')}"
    )
  )
其输出为:
category_t1 category_t2 category_t3 A_t1 A_t2 A_t3 B_t1 B_t2 B_t3 C_t1 C_t2
1 A A C TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
2 B C C FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE NA FALSE TRUE NA TRUE FALSE
4 C B B FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
5 A B <NA> TRUE FALSE NA FALSE TRUE NA FALSE FALSE
6 B <NA> A FALSE NA TRUE TRUE NA FALSE FALSE NA
它是有效的,但我想知道是否有更好的办法,因为我在这里把相同的代码重复了3次,而不是写成一段通用的代码(想象一下,如果我需要重复10次…)。我以为我可以用map()
来做,但我没能让它起作用。我认为存在问题,因为across()
中的.names
参数无法与我在case_when()
中使用的字符串连接。
我认为在...
参数中也许可以做些什么,比如:
# NOTE: the surrounding text states that this attempt does NOT work; it only
# sketches the desired interface. `mod` is set to the levels of category_t1,
# and the intent is to use it both in the comparison and in the `.names` spec.
data %>% mutate(across(starts_with("category"),
~case_when(.x == mod ~ TRUE, !is.na(.x) ~ FALSE),
mod = levels(as.factor(data$category_t1)),
.names = "{str_replace(.col, 'category', mod)}"))
但这在这里当然不起作用。你知道怎么做吗?
非常感谢。
我们可以在across
中使用table
library(dplyr)
library(stringr)
library(tidyr)
# One-hot encode every column into logical indicators, keeping NA where the
# source value is missing.
data %>%
  mutate(
    across(
      everything(),
      ~ {
        # Row-by-level contingency table of the column.
        counts <- table(row_number(), .x)
        # NA^FALSE is 1 and NA^TRUE is NA, so the product turns rows with a
        # missing value into NA.
        na_mask <- NA^(is.na(.x))
        as.data.frame.matrix(counts * na_mask > 0)
      },
      .names = "{str_remove(.col, 'category_')}"
    )
  ) %>%
  # Each new column is itself a data frame; spread it into separate columns
  # named "<outer>.<inner>".
  unpack(where(is.data.frame), names_sep = ".")
-输出
# A tibble: 6 × 12
category_t1 category_t2 category_t3 t1.A t1.B t1.C t2.A t2.B t2.C t3.A t3.B t3.C
<chr> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE FALSE
或使用base R
中的model.matrix
# Recode missing values as the literal string "NA" so that they are treated as
# an ordinary level.
data1 <- replace(data, is.na(data), "NA")
# Use the same explicit level set, with "NA" first, for every column.
# Fix: the anonymous function needs the `\(x)` lambda shorthand; a bare `(x)`
# in front of the body is a syntax error.
lvls <- lapply(data1, \(x) levels(factor(x, levels = c("NA", "A", "B", "C"))))
# Indicator matrix without an intercept column.
m1 <- model.matrix(~ 0 + ., data = data1, xlev = lvls)
# Drop every indicator column whose name contains "NA" and convert the
# remaining 0/1 values to logical before binding them to the original data.
out <- cbind(data, m1[, -grep("NA", colnames(m1))] > 0)
-输出
out
category_t1 category_t2 category_t3 category_t1A category_t1B category_t1C category_t2A category_t2B category_t2C category_t3A category_t3B category_t3C
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
6 B <NA> A FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
> colnames(out)
[1] "category_t1" "category_t2" "category_t3"
[4] "category_t1A" "category_t1B" "category_t1C"
[7] "category_t2A" "category_t2B" "category_t2C"
[10] "category_t3A"
[11] "category_t3B" "category_t3C"
或table
的另一个选项
# For each column build a row-by-level table, turn rows with a missing value
# into NA (NA^FALSE is 1, NA^TRUE is NA), convert to logical, and bind all the
# per-column results to the original data.
# Fix: the anonymous function needs the `\(x)` lambda shorthand; a bare `(x)`
# in front of the body is not a function definition.
cbind(
  data,
  do.call(
    cbind.data.frame,
    lapply(data, \(x) (table(seq_along(x), x) * NA^is.na(x)) > 0)
  )
)
-输出
category_t1 category_t2 category_t3 category_t1.A category_t1.B category_t1.C category_t2.A category_t2.B category_t2.C category_t3.A category_t3.B
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE
category_t3.C
1 TRUE
2 TRUE
3 NA
4 FALSE
5 NA
6 FALSE
这不是tidyverse
方案(不过它与管道兼容),但借助fastDummies
包可以非常容易地实现:
# Append indicator columns for the columns of `data`. With ignore_na = TRUE,
# rows whose source value is missing show NA in that column's indicators
# (see the output below).
fastDummies::dummy_cols(.data = data, ignore_na = TRUE)
category_t1 category_t2 category_t3 category_t1_A category_t1_B category_t1_C category_t2_A category_t2_B category_t2_C category_t3_A category_t3_B category_t3_C
1 A A C 1 0 0 1 0 0 0 0 1
2 B C C 0 1 0 0 0 1 0 0 1
3 C B <NA> 0 0 1 0 1 0 NA NA NA
4 C B B 0 0 1 0 1 0 0 1 0
5 A B <NA> 1 0 0 0 1 0 NA NA NA
6 B <NA> A 0 1 0 NA NA NA 1 0 0
purrr
的map_dfc
可以很好地匹配您当前的方法:
library(dplyr)
library(purrr)
library(stringr) # str_replace() is called inside the .names glue spec

# For each level letter, build only the new indicator columns
# (.keep = "none"), column-bind the per-letter results, and attach them to the
# original data.
# Fix: the anonymous function needs the `\(letter)` lambda shorthand; a bare
# `(letter) { ... }` is a syntax error.
bind_cols(
  data,
  map_dfc(LETTERS[1:3], \(letter) {
    mutate(
      data,
      across(
        starts_with("category"),
        ~ case_when(.x == letter ~ TRUE, !is.na(.x) ~ FALSE),
        # Splice the letter into the glue spec, e.g.
        # "{str_replace(.col, 'category', 'A')}".
        .names = paste0("{str_replace(.col, 'category', '", letter, "')}")
      ),
      .keep = "none"
    )
  })
)
或者跳过bind_cols
,使用.keep = ifelse(letter == "A", "all", "none")
。
输出:
category_t1 category_t2 category_t3 A_t1 A_t2 A_t3 B_t1 B_t2 B_t3 C_t1 C_t2 C_t3
1 A A C TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
3 C B <NA> FALSE FALSE NA FALSE TRUE NA TRUE FALSE NA
4 C B B FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE
5 A B <NA> TRUE FALSE NA FALSE TRUE NA FALSE FALSE NA
6 B <NA> A FALSE NA TRUE TRUE NA FALSE FALSE NA FALSE
使用嵌套lapply()
的base R
解决方案:
# For every column, compare the column against each of its observed levels,
# giving one logical indicator per level; a missing value compares as NA.
# Fix: both anonymous functions need the `\(x)` / `\(y)` lambda shorthand; a
# bare `(x) {` is a syntax error.
cbind(data, lapply(data, \(x) {
  lev <- levels(factor(x))
  # vapply() pins the result to one logical vector of length(x) per level,
  # unlike sapply(), whose return type depends on its input.
  vapply(setNames(lev, lev), \(y) x == y, logical(length(x)))
}))
category_t1 category_t2 category_t3 category_t1.A category_t1.B category_t1.C category_t2.A category_t2.B category_t2.C category_t3.A category_t3.B category_t3.C
1 A A C TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE
2 B C C FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
3 C B <NA> FALSE FALSE TRUE FALSE TRUE FALSE NA NA NA
4 C B B FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE
5 A B <NA> TRUE FALSE FALSE FALSE TRUE FALSE NA NA NA
6 B <NA> A FALSE TRUE FALSE NA NA NA TRUE FALSE FALSE