R:扩展和扩大关于日期的数据集



在示例 df 中,我想把包含 5 个水平的 group 变量拆成 5 个不同的变量(列),用来标明每一行属于哪一组。(我知道这种格式有点奇怪,但这样更便于后续的计算和分析。)

df例子:

|x   |group   |date          |
|1   |1       |2021-01-01    |
|1   |1       |2021-01-02    |
|1   |1       |2021-01-03    |
|1   |2       |2021-01-10    |
|1   |2       |2021-01-11    |
|1   |3       |2021-01-20    |
|1   |3       |2021-01-21    |
|1   |3       |2021-01-22    |
|1   |4       |2021-02-22    |
|1   |5       |2021-03-22    |

预期结果:

|x   |date          |group1   |group2   |group3   |group4   |group5   |
|1   |2021-01-01    |TRUE     |FALSE    |FALSE    |FALSE    |FALSE    |
|1   |2021-01-02    |TRUE     |FALSE    |FALSE    |FALSE    |FALSE    |
|1   |2021-01-03    |TRUE     |FALSE    |FALSE    |FALSE    |FALSE    |
|1   |2021-01-10    |FALSE    |TRUE     |FALSE    |FALSE    |FALSE    |
|1   |2021-01-11    |FALSE    |TRUE     |FALSE    |FALSE    |FALSE    |
|1   |2021-01-20    |FALSE    |FALSE    |TRUE     |FALSE    |FALSE    |
|1   |2021-01-21    |FALSE    |FALSE    |TRUE     |FALSE    |FALSE    |
|1   |2021-01-22    |FALSE    |FALSE    |TRUE     |FALSE    |FALSE    |
|1   |2021-02-22    |FALSE    |FALSE    |FALSE    |TRUE     |FALSE    |
|1   |2021-03-22    |FALSE    |FALSE    |FALSE    |FALSE    |TRUE     |

我试过pivot_wider,但我不知道应该从哪里得到值。我还尝试了以下代码:

df = df %>%
mutate(group1= 0,
group2= 0,
group3= 0,
group4= 0,
group5= 0) %>%
for (i in 1:nrow(df)){
if(group == 1){group1 = TRUE}
else if(group == 2){group2 = TRUE}
else if(group == 3){group3 = TRUE}
else if(group == 3){group4 = TRUE}
else {group5 = TRUE}
}

错误是:Error in for (. in i) 1:nrow(df) : 4 arguments passed to 'for' which requires 3(传递给 'for' 的参数有 4 个,而它只需要 3 个)

将 group 列转换为 factor 类型之后,可以采用下面的做法:

1. tidyverse 方式:

library(purrr)
library(tidyr)
library(dplyr)
# Build one logical indicator column per level of `group`.
# For every row, map() creates a named logical vector that is TRUE only at the
# row's own level. The element names carry the "group_" prefix so that
# unnest_wider() creates columns group_1 ... group_5 rather than bare `1` ... `5`.
df %>%
  mutate(
    group = as.factor(group),
    group_X = map(
      group,
      ~ set_names(levels(group) == .x, paste0("group_", levels(group)))
    )
  ) %>%
  unnest_wider(group_X)
x group date       group_1 group_2 group_3 group_4 group_5
<int> <fct> <chr>      <lgl>   <lgl>   <lgl>   <lgl>   <lgl>  
1     1 1     2021-01-01 TRUE    FALSE   FALSE   FALSE   FALSE  
2     1 1     2021-01-02 TRUE    FALSE   FALSE   FALSE   FALSE  
3     1 1     2021-01-03 TRUE    FALSE   FALSE   FALSE   FALSE  
4     1 2     2021-01-10 FALSE   TRUE    FALSE   FALSE   FALSE  
5     1 2     2021-01-11 FALSE   TRUE    FALSE   FALSE   FALSE  
6     1 3     2021-01-20 FALSE   FALSE   TRUE    FALSE   FALSE  
7     1 3     2021-01-21 FALSE   FALSE   TRUE    FALSE   FALSE  
8     1 3     2021-01-22 FALSE   FALSE   TRUE    FALSE   FALSE  
9     1 4     2021-02-22 FALSE   FALSE   FALSE   TRUE    FALSE  
10     1 5     2021-03-22 FALSE   FALSE   FALSE   FALSE   TRUE   

2. 使用 sapply:

library(dplyr)
df$group <- as.factor(df$group)
# Compare the bare factor levels with the group column. The "group_" prefix is
# applied only to the element names, which sapply() turns into column names.
# Comparing prefixed strings such as "group_1" with `group` can never match,
# so every indicator would come out FALSE.
group_levels <- levels(df$group)
df %>%
  cbind(sapply(
    setNames(group_levels, paste0("group_", group_levels)),
    `==`,
    .$group
  ))
# Expected result for the 10-row example df:
#    x group       date group_1 group_2 group_3 group_4 group_5
# 1  1     1 2021-01-01    TRUE   FALSE   FALSE   FALSE   FALSE
# 2  1     1 2021-01-02    TRUE   FALSE   FALSE   FALSE   FALSE
# 3  1     1 2021-01-03    TRUE   FALSE   FALSE   FALSE   FALSE
# 4  1     2 2021-01-10   FALSE    TRUE   FALSE   FALSE   FALSE
# 5  1     2 2021-01-11   FALSE    TRUE   FALSE   FALSE   FALSE
# 6  1     3 2021-01-20   FALSE   FALSE    TRUE   FALSE   FALSE
# 7  1     3 2021-01-21   FALSE   FALSE    TRUE   FALSE   FALSE
# 8  1     3 2021-01-22   FALSE   FALSE    TRUE   FALSE   FALSE
# 9  1     4 2021-02-22   FALSE   FALSE   FALSE    TRUE   FALSE
# 10 1     5 2021-03-22   FALSE   FALSE   FALSE   FALSE    TRUE
library(tidyverse)
library(lubridate)
# Example data: 20 rows, each with a random group (1-5) and a random date drawn
# from the daily sequence 2021-01-01 .. 2021-02-01.
# seq() needs by = "day" here: a lubridate Duration such as ddays(1) is treated
# as a plain number of days (86400), which leaves 2021-01-01 as the only date
# in the sequence. Sample output generated with that Duration step therefore
# shows 2021-01-01 on every row.
# No seed is set, so each run produces different rows.
df <- tibble(
  x = 1,
  group = sample(1:5, 20, replace = TRUE),
  date = sample(
    seq(ymd("20210101"), ymd("20210201"), by = "day"),
    20,
    replace = TRUE
  )
)
df %>% mutate(
group1 = group==1,
group2 = group==2,  
group3 = group==3,
group4 = group==4,
group5 = group==5,
)

输出
A tibble: 20 x 8
x group date       group1 group2 group3 group4 group5
<dbl> <int> <date>     <lgl>  <lgl>  <lgl>  <lgl>  <lgl> 
1     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
2     1     3 2021-01-01 FALSE  FALSE  TRUE   FALSE  FALSE 
3     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
4     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
5     1     3 2021-01-01 FALSE  FALSE  TRUE   FALSE  FALSE 
6     1     4 2021-01-01 FALSE  FALSE  FALSE  TRUE   FALSE 
7     1     4 2021-01-01 FALSE  FALSE  FALSE  TRUE   FALSE 
8     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
9     1     4 2021-01-01 FALSE  FALSE  FALSE  TRUE   FALSE 
10     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
11     1     2 2021-01-01 FALSE  TRUE   FALSE  FALSE  FALSE 
12     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
13     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
14     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
15     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
16     1     3 2021-01-01 FALSE  FALSE  TRUE   FALSE  FALSE 
17     1     4 2021-01-01 FALSE  FALSE  FALSE  TRUE   FALSE 
18     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
19     1     2 2021-01-01 FALSE  TRUE   FALSE  FALSE  FALSE 
20     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  

我们可以从tidyr中使用pivot_wider来获得宽格式的数据。在values_fn中,对于已存在的组值可以返回TRUE,否则使用values_fill参数返回FALSE

# Reshape df to wide format with one logical column per group value.
# - names_from / names_prefix: each distinct `group` value becomes a column
#   named "group<value>" (group1, group2, ...).
# - values_from / values_fn: cells that exist for a row are filled by calling
#   values_fn, which ignores its input and returns TRUE.
# - values_fill: cells with no matching group value are filled with FALSE.
tidyr::pivot_wider(df, 
names_from = group, 
values_from = group, 
values_fn = function(x) TRUE, 
names_prefix = 'group', 
values_fill = FALSE)
#       x date       group1 group2 group3 group4 group5
#   <int> <chr>      <lgl>  <lgl>  <lgl>  <lgl>  <lgl> 
# 1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
# 2     1 2021-01-02 TRUE   FALSE  FALSE  FALSE  FALSE 
# 3     1 2021-01-03 TRUE   FALSE  FALSE  FALSE  FALSE 
# 4     1 2021-01-10 FALSE  TRUE   FALSE  FALSE  FALSE 
# 5     1 2021-01-11 FALSE  TRUE   FALSE  FALSE  FALSE 
# 6     1 2021-01-20 FALSE  FALSE  TRUE   FALSE  FALSE 
# 7     1 2021-01-21 FALSE  FALSE  TRUE   FALSE  FALSE 
# 8     1 2021-01-22 FALSE  FALSE  TRUE   FALSE  FALSE 
# 9     1 2021-02-22 FALSE  FALSE  FALSE  TRUE   FALSE 
#10     1 2021-03-22 FALSE  FALSE  FALSE  FALSE  TRUE  

如果您能以可复现的格式(例如 dput() 的输出)提供数据,别人会更容易帮到您。下面是本例所用的数据:

df <- structure(list(x = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L), date = c("2021-01-01", 
"2021-01-02", "2021-01-03", "2021-01-10", "2021-01-11", "2021-01-20", 
"2021-01-21", "2021-01-22", "2021-02-22", "2021-03-22")), row.names = c(NA, 
-10L), class = "data.frame")

相关内容

  • 没有找到相关文章