我有一个带有多个问卷的df1,每个人(id(在特定时间点回答(日期(。通常,每个人每个环节都应填写三份问卷(第一次、前一次、后一次(。一些参与者未能填写全部三份问卷。他们可能只回答三个中的一个或两个。因此,可能的模式可能是完整的(参与者A(、缺少"后"(参与者B(、缺少了"第一"(参与者C(、缺少的"前"(参与者D(,或者只回答了三者中的一个(参与者E、F、G(。
参见df1:
df1 <- structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 6L, 7L), .Label = c("A", "B", "C", "D", "E", "F", "G"), class = "factor"), measure = structure(c(1L, 3L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 1L, 3L, 2L), .Label = c("first", "post", "pre"), class = "factor"), date = structure(c(17558, 17558, 17558, 17558, 17559, 17559, 17559, 17559, 17558, 17558, 17558, 17558 ), class = "Date"), result = c(1, 5, 4, 7, 8, 7, 2, 1, 3, 5, 7, 7)), class = "data.frame", row.names = c(NA, -12L))
现在,我想在数据集中添加缺失的行,其中包含id和measure,以及缺失日期和结果的"NA"。最后的df应该看起来像df2。
df2 <- structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L), .Label = c("A", "B", "C", "D", "E", "F", "G"), class = "factor"), measure = structure(c(1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L), .Label = c("first", "post", "pre"), class = "factor"), date = structure(c(17558, 17558, 17558, 17558, 17559, NA, NA, 17559, 17559, 17559, NA, 17558, 17558, NA, NA, NA, 17558, NA, NA, NA, 17558), class = "Date"), result = c(1, 5, 4, 7, 8, NA, NA, 7, 2, 1, NA, 3, 5, NA, NA, NA, 7, NA, NA, NA, 7)), class = "data.frame", row.names = c(NA, -21L))
我试着用可能缺失的组合进行分组并插入一行。但这并没有带来预期的结果。
require (tidyverse)
final <- df1 %>%
group_by(id, measure == "first" & lag(measure, 1, default=NA) == "post") %>%
do(add_row(., measure = "pre", .after = 0)) %>%
ungroup()
我也试过
final <- df1 %>% complete(id, nesting(measure, date))
也许更为复杂的是,参与者可以参加不止一次会议。因此,有可能每个id都有x*(first,post,pre(。
应该简单地由complete(df1, id, measure)
完成。试试这个:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
df1 <- structure(list(
id = structure(c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 6L, 7L),
.Label = c("A", "B", "C", "D", "E", "F", "G"),
class = "factor"),
measure = structure(c(1L, 3L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 1L, 3L, 2L),
.Label = c("first", "post", "pre"),
class = "factor"),
date = structure(c(17558, 17558, 17558, 17558, 17559, 17559, 17559, 17559, 17558, 17558, 17558, 17558 ), class = "Date"),
result = c(1, 5, 4, 7, 8, 7, 2, 1, 3, 5, 7, 7)), class = "data.frame", row.names = c(NA, -12L))
df2 <- structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L), .Label = c("A", "B", "C", "D", "E", "F", "G"), class = "factor"), measure = structure(c(1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L), .Label = c("first", "post", "pre"), class = "factor"), date = structure(c(17558, 17558, 17558, 17558, 17559, NA, NA, 17559, 17559, 17559, NA, 17558, 17558, NA, NA, NA, 17558, NA, NA, NA, 17558), class = "Date"), result = c(1, 5, 4, 7, 8, NA, NA, 7, 2, 1, NA, 3, 5, NA, NA, NA, 7, NA, NA, NA, 7)), class = "data.frame", row.names = c(NA, -21L))
# Result with complete(df1, id, measure) and setting order of measure
complete(df1, id, measure) %>%
mutate(measure = factor(measure, levels = c("first", "pre", "post"))) %>%
arrange(id, measure, date) %>%
as.data.frame()
#> id measure date result
#> 1 A first 2018-01-27 1
#> 2 A pre 2018-01-27 5
#> 3 A post 2018-01-27 4
#> 4 B first 2018-01-27 7
#> 5 B pre 2018-01-28 8
#> 6 B post <NA> NA
#> 7 C first <NA> NA
#> 8 C pre 2018-01-28 7
#> 9 C post 2018-01-28 2
#> 10 D first 2018-01-28 1
#> 11 D pre <NA> NA
#> 12 D post 2018-01-27 3
#> 13 E first 2018-01-27 5
#> 14 E pre <NA> NA
#> 15 E post <NA> NA
#> 16 F first <NA> NA
#> 17 F pre 2018-01-27 7
#> 18 F post <NA> NA
#> 19 G first <NA> NA
#> 20 G pre <NA> NA
#> 21 G post 2018-01-27 7
# Desired output
df2 %>%
mutate(measure = factor(measure, levels = c("first", "pre", "post"))) %>%
arrange(id, measure, date)
#> id measure date result
#> 1 A first 2018-01-27 1
#> 2 A pre 2018-01-27 5
#> 3 A post 2018-01-27 4
#> 4 B first 2018-01-27 7
#> 5 B pre 2018-01-28 8
#> 6 B post <NA> NA
#> 7 C first <NA> NA
#> 8 C pre 2018-01-28 7
#> 9 C post 2018-01-28 2
#> 10 D first 2018-01-28 1
#> 11 D pre <NA> NA
#> 12 D post 2018-01-27 3
#> 13 E first 2018-01-27 5
#> 14 E pre <NA> NA
#> 15 E post <NA> NA
#> 16 F first <NA> NA
#> 17 F pre 2018-01-27 7
#> 18 F post <NA> NA
#> 19 G first <NA> NA
#> 20 G pre <NA> NA
#> 21 G post 2018-01-27 7
由reprex包(v0.3.0(于2020-03-09创建