使用dplyr计算R数据帧中的成对差异



我创建了一个简单的数据帧:

library(dplyr)

# Example questionnaire data: one row per user and answer date, holding the
# answers to Q1 and Q2. Every row also carries that user's earliest and
# latest answer date.
df <- tibble(
  UserId = rep(c("A", "B"), times = c(5, 4)),
  Answer_Date = as.Date(c(
    "2010-12-31", "2011-12-29", "2012-12-25", "2013-12-10", "2014-12-31",
    "2010-10-31", "2011-10-28", "2013-10-31", "2015-10-31"
  )),
  Q1 = c(3, 1, 1, 0, 1, 4, 2, 5, 4),
  Q2 = c(2, 0, 1, 2, 1, 8, 2, 6, 5)
) %>%
  group_by(UserId) %>%
  mutate(
    First_Date = min(Answer_Date),
    Last_Date = max(Answer_Date)
  ) %>%
  # Drop the grouping so later steps operate on the whole table again.
  ungroup()

这给了我

> df
# A tibble: 9 x 6
UserId Answer_Date    Q1    Q2 First_Date Last_Date 
<chr>  <date>      <dbl> <dbl> <date>     <date>    
1 A      2010-12-31      3     2 2010-12-31 2014-12-31
2 A      2011-12-29      1     0 2010-12-31 2014-12-31
3 A      2012-12-25      1     1 2010-12-31 2014-12-31
4 A      2013-12-10      0     2 2010-12-31 2014-12-31
5 A      2014-12-31      1     1 2010-12-31 2014-12-31
6 B      2010-10-31      4     8 2010-10-31 2015-10-31
7 B      2011-10-28      2     2 2010-10-31 2015-10-31
8 B      2013-10-31      5     6 2010-10-31 2015-10-31
9 B      2015-10-31      4     5 2010-10-31 2015-10-31

我现在想计算每个受试者在第一次和最后一次回答问卷的日期之间的答案变化。我首先编写了如下代码:

# Answers given on each user's first answer date, renamed to
# First_Response_<question>, then joined back onto every row of that user.
df_tmp <- df %>%
  filter(Answer_Date == First_Date) %>%
  select(UserId, First_Response_Q1 = Q1, First_Response_Q2 = Q2)
df <- merge(df, df_tmp, by = "UserId")

# Same again for the answers given on each user's last answer date
# (Last_Response_<question>).
df_tmp <- df %>%
  filter(Answer_Date == Last_Date) %>%
  select(UserId, Last_Response_Q1 = Q1, Last_Response_Q2 = Q2)
df <- merge(df, df_tmp, by = "UserId")

给我

> df
UserId Answer_Date Q1 Q2 First_Date  Last_Date First_Q1 First_Q2 Last_Q1 Last_Q2
1      A  2010-12-31  3  2 2010-12-31 2014-12-31        3        2       1       1
2      A  2011-12-29  1  0 2010-12-31 2014-12-31        3        2       1       1
3      A  2012-12-25  1  1 2010-12-31 2014-12-31        3        2       1       1
4      A  2013-12-10  0  2 2010-12-31 2014-12-31        3        2       1       1
5      A  2014-12-31  1  1 2010-12-31 2014-12-31        3        2       1       1
6      B  2010-10-31  4  8 2010-10-31 2015-10-31        4        8       4       5
7      B  2011-10-28  2  2 2010-10-31 2015-10-31        4        8       4       5
8      B  2013-10-31  5  6 2010-10-31 2015-10-31        4        8       4       5
9      B  2015-10-31  4  5 2010-10-31 2015-10-31        4        8       4       5

我现在希望创建两个新列:Delta_Q1 = Last_Q1 - First_Q1 和 Delta_Q2 = Last_Q2 - First_Q2,但(可能)要借助 mutate、paste0("First_", c("Q1", "Q2"))、paste0("Last_", c("Q1", "Q2")) 和 paste0("Delta_", c("Q1", "Q2")) 来完成。

按顺序计算列对之间的差异(或者更一般地,计算两个变量的某个函数)的正确语法是什么?我不想手动写下这些差异的原因很简单——真正的数据帧有很多列对。

非常感谢您的帮助。

真诚的

Thomas Philips

您可以创建两个列索引向量,然后直接将对应的列相减来创建新列。

# Positions of the first/last response columns, found by name fragment.
first_r_col <- grep("First_Response", names(df), fixed = TRUE)
last_r_col <- grep("Last_Response", names(df), fixed = TRUE)

# Subtract position by position (i-th "last" column minus i-th "first"
# column); the results are stored as delta1, delta2, ...
df[paste0("delta", seq_along(first_r_col))] <-
  df[last_r_col] - df[first_r_col]

使用 dplyr 的 select 语句可能是选择列的更简单方法。

library(dplyr)

# Pick the two sets of columns by name prefix.
last_responses <- df %>% select(starts_with("Last_Response"))
first_responses <- df %>% select(starts_with("First_Response"))

# Subtract position by position and store the results as delta1, delta2, ...
# The number of new columns comes from the selection itself, so this snippet
# does not depend on an index vector computed elsewhere.
df[paste0("delta", seq_along(first_responses))] <-
  last_responses - first_responses

以下是一种不需要创建 First_Date 和 Last_Date 列的方法:

library(dplyr)

# Per-user change in each answer between the earliest and the latest
# Answer_Date, computed directly from the long data.
df %>%
  # Sort by date within user so first()/last() below pick the earliest and
  # latest answers.
  arrange(UserId, Answer_Date) %>%
  group_by(UserId) %>%
  # last() - first() rather than diff() over the two boundary rows: a user
  # with a single response yields a delta of 0 instead of a zero-length
  # result that would drop the user from the summary.
  summarize(
    Delta_Q1 = last(Q1) - first(Q1),
    Delta_Q2 = last(Q2) - first(Q2)
  )

我认为不需要太多编码,下面是dplyr解决方案:

# One row per user: first and last answer to each question (by Answer_Date)
# together with the change between them.
df %>%
  group_by(UserId) %>%
  # Rows must be in date order for first()/last() to mean earliest/latest.
  arrange(Answer_Date) %>%
  summarise(
    First_Q1 = first(Q1),
    First_Q2 = first(Q2),
    Last_Q1 = last(Q1),
    Last_Q2 = last(Q2),
    # Later summarise() expressions can use the columns created above.
    Delta_Q1 = Last_Q1 - First_Q1,
    Delta_Q2 = Last_Q2 - First_Q2
  )

输出:

# A tibble: 2 x 7
UserId First_Q1 First_Q2 Last_Q1 Last_Q2 Delta_Q1 Delta_Q2
<chr>     <dbl>    <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
1 A             3        2       1       1       -2       -1
2 B             4        8       4       5        0       -3

事后看来,我错过了显而易见的答案,并使它变得比应该的更难:

# Question columns and the prefixes used to build the derived column names.
QUESTIONS <- c("Q1", "Q2")
FIRST_RESPONSE_PREFIX <- "First_"
LAST_RESPONSE_PREFIX <- "Last_"
DELTA_RESPONSE_PREFIX <- "Delta_"
first_response_cols <- paste0(FIRST_RESPONSE_PREFIX, QUESTIONS)
last_response_cols <- paste0(LAST_RESPONSE_PREFIX, QUESTIONS)
delta_response_cols <- paste0(DELTA_RESPONSE_PREFIX, QUESTIONS)

# Answers given on each user's first answer date, renamed to
# First_<question> and joined back onto every row of that user.
# df's date columns are First_Date / Last_Date, as created where df is
# built; the printed result shown after this snippet labels them
# First_Answer_Date / Last_Answer_Date.
df_tmp1 <- df %>%
  filter(Answer_Date == First_Date) %>%
  # all_of() marks QUESTIONS as an external vector of column names.
  select(UserId, all_of(QUESTIONS))
colnames(df_tmp1) <- c("UserId", first_response_cols)
df <- merge(df, df_tmp1, by = "UserId")

# Same again for the answers given on each user's last answer date.
df_tmp2 <- df %>%
  filter(Answer_Date == Last_Date) %>%
  select(UserId, all_of(QUESTIONS))
colnames(df_tmp2) <- c("UserId", last_response_cols)
df <- merge(df, df_tmp2, by = "UserId")

# All Delta_<question> columns in one step: the two column sets are built
# from the same QUESTIONS vector, so they line up position by position.
df[delta_response_cols] <- df[last_response_cols] - df[first_response_cols]

当我运行代码时,我得到了我想要的:

> df
UserId Answer_Date Q1 Q2 First_Answer_Date Last_Answer_Date First_Q1 First_Q2 Last_Q1 Last_Q2 Delta_Q1 Delta_Q2
1      A  2010-12-31  3  2        2010-12-31       2014-12-31        3        2       1       1       -2       -1
2      A  2011-12-29  1  0        2010-12-31       2014-12-31        3        2       1       1       -2       -1
3      A  2012-12-25  1  1        2010-12-31       2014-12-31        3        2       1       1       -2       -1
4      A  2013-12-10  0  2        2010-12-31       2014-12-31        3        2       1       1       -2       -1
5      A  2014-12-31  1  1        2010-12-31       2014-12-31        3        2       1       1       -2       -1
6      B  2010-10-31  4  8        2010-10-31       2015-10-31        4        8       4       5        0       -3
7      B  2011-10-28  2  2        2010-10-31       2015-10-31        4        8       4       5        0       -3
8      B  2013-10-31  5  6        2010-10-31       2015-10-31        4        8       4       5        0       -3
9      B  2015-10-31  4  5        2010-10-31       2015-10-31        4        8       4       5        0       -3

也就是说,谢谢你的帮助——我通过查看建议的答案学到了一些东西。

最新更新