在示例 df 中,我想把具有 5 个水平的 "group" 变量拆分成 5 个独立的变量,用 TRUE/FALSE 标明每一行属于哪一组。(我知道这种格式有些奇怪,但它更便于后续的计算和分析。)
df例子:
|x |group |date |
|1 |1 |2021-01-01 |
|1 |1 |2021-01-02 |
|1 |1 |2021-01-03 |
|1 |2 |2021-01-10 |
|1 |2 |2021-01-11 |
|1 |3 |2021-01-20 |
|1 |3 |2021-01-21 |
|1 |3 |2021-01-22 |
|1 |4 |2021-02-22 |
|1 |5 |2021-03-22 |
预期结果:
|x |date |group1 |group2 |group3 |group4 |group5 |
|1 |2021-01-01 |TRUE |FALSE |FALSE |FALSE |FALSE |
|1 |2021-01-02 |TRUE |FALSE |FALSE |FALSE |FALSE |
|1 |2021-01-03 |TRUE |FALSE |FALSE |FALSE |FALSE |
|1 |2021-01-10 |FALSE |TRUE |FALSE |FALSE |FALSE |
|1 |2021-01-11 |FALSE |TRUE |FALSE |FALSE |FALSE |
|1 |2021-01-20 |FALSE |FALSE |TRUE |FALSE |FALSE |
|1 |2021-01-21 |FALSE |FALSE |TRUE |FALSE |FALSE |
|1 |2021-01-22 |FALSE |FALSE |TRUE |FALSE |FALSE |
|1 |2021-02-22 |FALSE |FALSE |FALSE |TRUE |FALSE |
|1 |2021-03-22 |FALSE |FALSE |FALSE |FALSE |TRUE |
我试过pivot_wider,但我不知道应该从哪里得到值。我还尝试了以下代码:
# NOTE(review): this attempt does not run as written. The result of `mutate()`
# is piped (`%>%`) straight into a `for` statement, so the pipe tries to pass
# the data frame to `for` as an extra argument; the error reported for this
# snippet is about the number of arguments passed to 'for'.
# Inside the loop, `group` and `group1`..`group5` are bare names used outside
# any dplyr verb, and the loop index `i` is never used to pick a row.
df = df %>%
mutate(group1= 0,
group2= 0,
group3= 0,
group4= 0,
group5= 0) %>%
for (i in 1:nrow(df)){
if(group == 1){group1 = TRUE}
else if(group == 2){group2 = TRUE}
else if(group == 3){group3 = TRUE}
else if(group == 3){group4 = TRUE}  # NOTE(review): repeats the `group == 3` test; presumably `group == 4` was intended
else {group5 = TRUE}
}
错误信息是:Error in for (. in i) 1:nrow(df) : 4 arguments passed to 'for' which requires 3(传递给 'for' 的参数有 4 个,而它需要 3 个)
将 group 列转换为 factor 类型后,我们可以这样做:
1. tidyverse 方式:
library(purrr)
library(tidyr)
library(dplyr)
# Build one logical indicator column per level of `group`.
# For every row, compare all factor levels against that row's value, giving a
# named logical vector; `unnest_wider()` then spreads the vector into columns
# named after its elements. The elements are therefore named "group_<level>"
# so the resulting columns are group_1, group_2, ... rather than 1, 2, ...
df %>%
  mutate(
    group = as.factor(group),
    group_X = map(
      group,
      ~ set_names(levels(group) == .x, paste0("group_", levels(group)))
    )
  ) %>%
  unnest_wider(group_X)
x group date group_1 group_2 group_3 group_4 group_5
<int> <fct> <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
1 1 1 2021-01-01 TRUE FALSE FALSE FALSE FALSE
2 1 1 2021-01-02 TRUE FALSE FALSE FALSE FALSE
3 1 1 2021-01-03 TRUE FALSE FALSE FALSE FALSE
4 1 2 2021-01-10 FALSE TRUE FALSE FALSE FALSE
5 1 2 2021-01-11 FALSE TRUE FALSE FALSE FALSE
6 1 3 2021-01-20 FALSE FALSE TRUE FALSE FALSE
7 1 3 2021-01-21 FALSE FALSE TRUE FALSE FALSE
8 1 3 2021-01-22 FALSE FALSE TRUE FALSE FALSE
9 1 4 2021-02-22 FALSE FALSE FALSE TRUE FALSE
10 1 5 2021-03-22 FALSE FALSE FALSE FALSE TRUE
2. sapply 方式:
library(dplyr)
df$group <- as.factor(df$group)

# Compare the group column against each bare factor level ("1", "2", ...).
# The "group_" prefix is used only for the names of the new columns: comparing
# the data against prefixed strings such as "group_1" can never match and
# would yield FALSE in every cell.
group_levels <- levels(df$group)
indicators <- sapply(
  setNames(group_levels, paste0("group_", group_levels)),
  function(lvl) df$group == lvl
)

# Append the logical indicator matrix (one column per level) to the data.
df %>% cbind(indicators)
#    x group date       group_1 group_2 group_3 group_4 group_5
# 1  1 1     2021-01-01 TRUE    FALSE   FALSE   FALSE   FALSE
# 2  1 1     2021-01-02 TRUE    FALSE   FALSE   FALSE   FALSE
# 3  1 1     2021-01-03 TRUE    FALSE   FALSE   FALSE   FALSE
# 4  1 2     2021-01-10 FALSE   TRUE    FALSE   FALSE   FALSE
# 5  1 2     2021-01-11 FALSE   TRUE    FALSE   FALSE   FALSE
# 6  1 3     2021-01-20 FALSE   FALSE   TRUE    FALSE   FALSE
# 7  1 3     2021-01-21 FALSE   FALSE   TRUE    FALSE   FALSE
# 8  1 3     2021-01-22 FALSE   FALSE   TRUE    FALSE   FALSE
# 9  1 4     2021-02-22 FALSE   FALSE   FALSE   TRUE    FALSE
# 10 1 5     2021-03-22 FALSE   FALSE   FALSE   FALSE   TRUE
library(tidyverse)
library(lubridate)
# Example data: 20 rows, each with a random group (1-5) and a random date
# drawn from 2021-01-01 .. 2021-02-01. No seed is set, so every run differs.
df <- tibble(
  x = 1,
  group = sample(1:5, 20, replace = TRUE),
  date = sample(
    # Step through the range one calendar day at a time. A lubridate duration
    # such as ddays(1) is stored in seconds, so passing it as the step appears
    # to be read as a step of 86400 days, which collapses the sequence to the
    # start date only.
    seq(ymd("20210101"), ymd("20210201"), by = "day"),
    20,
    replace = TRUE
  )
)

# One logical indicator column per group value; `group` itself is kept.
df %>%
  mutate(
    group1 = group == 1,
    group2 = group == 2,
    group3 = group == 3,
    group4 = group == 4,
    group5 = group == 5
  )
输出(仅为示例;由于使用了随机抽样,每次运行的结果都会不同):A tibble: 20 x 8
x group date group1 group2 group3 group4 group5
<dbl> <int> <date> <lgl> <lgl> <lgl> <lgl> <lgl>
1 1 1 2021-01-01 TRUE FALSE FALSE FALSE FALSE
2 1 3 2021-01-01 FALSE FALSE TRUE FALSE FALSE
3 1 5 2021-01-01 FALSE FALSE FALSE FALSE TRUE
4 1 1 2021-01-01 TRUE FALSE FALSE FALSE FALSE
5 1 3 2021-01-01 FALSE FALSE TRUE FALSE FALSE
6 1 4 2021-01-01 FALSE FALSE FALSE TRUE FALSE
7 1 4 2021-01-01 FALSE FALSE FALSE TRUE FALSE
8 1 5 2021-01-01 FALSE FALSE FALSE FALSE TRUE
9 1 4 2021-01-01 FALSE FALSE FALSE TRUE FALSE
10 1 5 2021-01-01 FALSE FALSE FALSE FALSE TRUE
11 1 2 2021-01-01 FALSE TRUE FALSE FALSE FALSE
12 1 1 2021-01-01 TRUE FALSE FALSE FALSE FALSE
13 1 5 2021-01-01 FALSE FALSE FALSE FALSE TRUE
14 1 1 2021-01-01 TRUE FALSE FALSE FALSE FALSE
15 1 1 2021-01-01 TRUE FALSE FALSE FALSE FALSE
16 1 3 2021-01-01 FALSE FALSE TRUE FALSE FALSE
17 1 4 2021-01-01 FALSE FALSE FALSE TRUE FALSE
18 1 5 2021-01-01 FALSE FALSE FALSE FALSE TRUE
19 1 2 2021-01-01 FALSE TRUE FALSE FALSE FALSE
20 1 5 2021-01-01 FALSE FALSE FALSE FALSE TRUE
我们可以使用 tidyr 中的 pivot_wider 将数据转换为宽格式。通过 values_fn,对数据中存在的组值返回 TRUE;其余位置则通过 values_fill 参数填充为 FALSE。
# Spread `group` into one logical column per group value.
# `group` supplies both the new column names (prefixed with "group") and the
# cell values; `values_fn` turns any value that is present into TRUE, and
# `values_fill` puts FALSE in every cell that has no value.
tidyr::pivot_wider(
  data = df,
  names_from = group,
  names_prefix = "group",
  values_from = group,
  values_fn = function(x) TRUE,
  values_fill = FALSE
)
# x date group1 group2 group3 group4 group5
# <int> <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 1 2021-01-01 TRUE FALSE FALSE FALSE FALSE
# 2 1 2021-01-02 TRUE FALSE FALSE FALSE FALSE
# 3 1 2021-01-03 TRUE FALSE FALSE FALSE FALSE
# 4 1 2021-01-10 FALSE TRUE FALSE FALSE FALSE
# 5 1 2021-01-11 FALSE TRUE FALSE FALSE FALSE
# 6 1 2021-01-20 FALSE FALSE TRUE FALSE FALSE
# 7 1 2021-01-21 FALSE FALSE TRUE FALSE FALSE
# 8 1 2021-01-22 FALSE FALSE TRUE FALSE FALSE
# 9 1 2021-02-22 FALSE FALSE FALSE TRUE FALSE
#10 1 2021-03-22 FALSE FALSE FALSE FALSE TRUE
如果您以可复现的格式提供数据,别人会更容易提供帮助,例如:
# Reproducible example data: 10 rows with a constant integer `x`, an integer
# `group` id (1-5) and the date stored as a character string.
df <- data.frame(
  x = rep(1L, 10L),
  group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L),
  date = c(
    "2021-01-01", "2021-01-02", "2021-01-03",
    "2021-01-10", "2021-01-11",
    "2021-01-20", "2021-01-21", "2021-01-22",
    "2021-02-22",
    "2021-03-22"
  ),
  stringsAsFactors = FALSE
)