r-如何根据只复制一次的行来减去数据帧中的多列值



想象一下我有一个这样的数据帧:

#Stack example
df <- data.frame(DATE = c("2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", 
"2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
"2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", 
"2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
"2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", 
"2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
"2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", 
"2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
"2022-09-05", "2022-08-29", "2022-09-05", "2022-08-29", "2022-09-05", "2022-08-29"), 
ID = c("1", "2", "3", "4", "5", "6", 
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6", 
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6", 
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6", 
"1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "9", "10"),
CV = c("SV", "SV", "SV", "SV", "SV", "SV", 
"SV", "SV", "SV", "SV", "SV", "SV", 
"PD", "PD", "PD", "PD", "PD", "PD",
"PD", "PD", "PD", "PD", "PD", "PD",
"SV", "SV", "SV", "SV", "SV", "SV", 
"SV", "SV", "SV", "SV", "SV", "SV", 
"PD", "PD", "PD", "PD", "PD", "PD",
"PD", "PD", "PD", "PD", "PD", "PD",
"PD", "PD", "PD", "SV", "PD", "SV"),
TR= c("T1", "T1", "T1", "T1", "T1", "T1", 
"T1", "T1", "T1", "T1", "T1", "T1", 
"T1", "T1", "T1", "T1", "T1", "T1",
"T1", "T1", "T1", "T1", "T1", "T1",
"T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2"),
Values_1 = c(34.9003695, 34.9003695, 28.2389394, 28.2389394, 26.0821875, 26.0821875,
30.5515533, 30.5515533, 22.6469958, 22.6469958, 34.5662974, 34.5662974,
35.2881883, 35.2881883, 41.3885176, 41.3885176, 19.9440042, 19.9440042,
5.6987524,  5.6987524, 37.4641052, 37.4641052,  2.4808126,  2.4808126,
1.7883822, 21.1799057, 21.1799057, 21.1799057,  2.7334442,  2.7334442,
2.7334442, 11.3880187, 11.3880187, 11.3880187,  7.8442267,  7.8442267,
7.8442267,  5.2445510,  5.2445510,  5.2445510, 20.4706600, 20.4706600,
15.7275634, 15.7275634,  4.4575814,  4.4575814, 17.0854186, 17.0854186,
5.6987524,  5.6987524, 37.4641052, 37.4641052,  2.4808126,  2.4808126),
Values_2 = c(76.24359,  76.24359,  58.52421,  58.52421,  80.14131,  80.14131,  
59.05000, 102.19699, 102.19699,  72.39848,  72.39848,  58.15000,
68.31217, 68.31217,  53.67941,  53.67941,  56.88980,  56.88980, 
108.98399,  96.64207,  96.64207,  38.88542,  38.88542,  54.60000, 
52.12500,  52.12500,  17.20875,  17.20875,  47.26923,  47.26923, 
67.80738,  60.41250,  60.41250,  83.93404,  83.93404,  37.20336, 
50.02500,  50.02500,  94.73309,  94.73309,  41.31748,  41.31748, 
56.88344,  59.74702,  59.74702,  48.23750,  48.23750,  95.14831,
108.98399,  96.64207,  96.64207,  38.88542,  38.88542,  54.60000))

对于具有两个时间点("2022-08-29"one_answers"2022-09-05"(的每个ID、CV、TR组合,我想从最后一个时间点减去第一个时间点值(values_1和values 2(,并返回减去的差值,生成以下输出:

TR CV ID    Values_1  Values_2
1  T1 PD  1 -29.5894359  40.67182
2  T1 PD  2 -29.5894359  28.32990
3  T1 PD  3  -3.9244124  42.96266
4  T1 PD  4  -3.9244124 -14.79399
5  T1 PD  5 -17.4631916 -18.00438
6  T1 PD  6 -17.4631916  -2.28980
7  T1 SV  1  -4.3488162 -17.19359
8  T1 SV  2  -4.3488162  25.95340
9  T1 SV  3  -5.5919436  43.67278
10 T1 SV  4  -5.5919436  13.87427
11 T1 SV  5   8.4841099  -7.74283
12 T1 SV  6   8.4841099 -21.99131
13 T2 PD  1   7.8833367   6.85844
14 T2 PD  2  10.4830124   9.72202
15 T2 PD  3  -0.7869696 -34.98607
16 T2 PD  4  -0.7869696 -46.49559
17 T2 PD  5  -3.3852414   6.92002
18 T2 PD  6  -3.3852414  53.83083
19 T2 SV  1   0.9450620  15.68238
20 T2 SV  2  -9.7918870   8.28750
21 T2 SV  3  -9.7918870  43.20375
22 T2 SV  4  -9.7918870  66.72529
23 T2 SV  5   5.1107825  36.66481
24 T2 SV  6   5.1107825 -10.06587

我找到了以下解决方案,但它没有返回所需的结果,尤其是当存在杂散行或额外重复时,因为我的解决方案基于对数据帧进行排序,并且假设我有一个正确排序的数据帧:

#Remove rows that are not duplicated (do not have two timepoints)
dupe = df[,c("ID", "CV", "TR")] # select columns to check duplicates
ONLY_DUPES=df[duplicated(dupe) | duplicated(dupe, fromLast=TRUE),]
#Now we substract last timepoint with first timepoint
#First we melt the data
ONLY_DUPES <- as.data.frame(ONLY_DUPES)
melted<-melt(ONLY_DUPES, id=c("DATE", "ID", "CV", "TR"))
#Then we cast with diff
#First we order the dataframe
melted= melted[with(melted, order(TR, CV, ID, variable, DATE)),]
casted=cast(melted, TR+CV+ID  ~variable, fun.aggregate = diff)
#Returns:
TR CV ID    Values_1  Values_2
1  T1 PD  1 -29.5894359  40.67182
2  T1 PD  2 -29.5894359  28.32990
3  T1 PD  3  -3.9244124  42.96266
4  T1 PD  4  -3.9244124 -14.79399
5  T1 PD  5 -17.4631916 -18.00438
6  T1 PD  6 -17.4631916  -2.28980
7  T1 SV  1  -4.3488162 -17.19359
8  T1 SV  2  -4.3488162  25.95340
9  T1 SV  3  -5.5919436  43.67278
10 T1 SV  4  -5.5919436  13.87427
11 T1 SV  5   8.4841099  -7.74283
12 T1 SV  6   8.4841099 -21.99131
13 T2 PD  1   7.8833367   6.85844
14 T2 PD  2  10.4830124   9.72202
15 T2 PD  3  -0.7869696 -34.98607
16 T2 PD  4  -0.7869696 -46.49559
17 T2 PD  5  -3.3852414   6.92002
18 T2 PD  6  -3.3852414  53.83083
19 T2 PD  9 -34.9832926 -57.75665 !
20 T2 SV  1   0.9450620  15.68238
21 T2 SV 10 -34.9832926  15.71458 !
22 T2 SV  2  -9.7918870   8.28750
23 T2 SV  3  -9.7918870  43.20375
24 T2 SV  4  -9.7918870  66.72529
25 T2 SV  5   5.1107825  36.66481
26 T2 SV  6   5.1107825 -10.06587

我可以应用什么方法以防故障的方式执行此操作?

更新>2个值列:

如果有多个列,可以使用across()定义要应用函数的列。有了这个,你只需要输入你想操作的开始和结束列:

对于这个例子,我只是复制了Values_1Values_2,并将它们分别存储在新的变量Values_3Values_4中。

library(tidyverse)
df %>%
group_by(ID, CV, TR) %>%
summarise(across(Values_1:Values_4, ~ .x[DATE == "2022-09-05"] - .x[DATE == "2022-08-29"])) %>%
arrange(TR, CV) %>%
print(., n = 24)
#> `summarise()` has grouped output by 'ID', 'CV', 'TR'. You can override using
#> the `.groups` argument.
#> # A tibble: 24 × 7
#> # Groups:   ID, CV, TR [24]
#>    ID    CV    TR    Values_1 Values_2 Values_3 Values_4
#>    <chr> <chr> <chr>    <dbl>    <dbl>    <dbl>    <dbl>
#>  1 1     PD    T1     -29.6      40.7   -29.6      40.7 
#>  2 2     PD    T1     -29.6      28.3   -29.6      28.3 
#>  3 3     PD    T1      -3.92     43.0    -3.92     43.0 
#>  4 4     PD    T1      -3.92    -14.8    -3.92    -14.8 
#>  5 5     PD    T1     -17.5     -18.0   -17.5     -18.0 
#>  6 6     PD    T1     -17.5      -2.29  -17.5      -2.29
#>  7 1     SV    T1      -4.35    -17.2    -4.35    -17.2 
#>  8 2     SV    T1      -4.35     26.0    -4.35     26.0 
#>  9 3     SV    T1      -5.59     43.7    -5.59     43.7 
#> 10 4     SV    T1      -5.59     13.9    -5.59     13.9 
#> 11 5     SV    T1       8.48     -7.74    8.48     -7.74
#> 12 6     SV    T1       8.48    -22.0     8.48    -22.0 
#> 13 1     PD    T2       7.88      6.86    7.88      6.86
#> 14 2     PD    T2      10.5       9.72   10.5       9.72
#> 15 3     PD    T2      -0.787   -35.0    -0.787   -35.0 
#> 16 4     PD    T2      -0.787   -46.5    -0.787   -46.5 
#> 17 5     PD    T2      -3.39      6.92   -3.39      6.92
#> 18 6     PD    T2      -3.39     53.8    -3.39     53.8 
#> 19 1     SV    T2       0.945    15.7     0.945    15.7 
#> 20 2     SV    T2      -9.79      8.29   -9.79      8.29
#> 21 3     SV    T2      -9.79     43.2    -9.79     43.2 
#> 22 4     SV    T2      -9.79     66.7    -9.79     66.7 
#> 23 5     SV    T2       5.11     36.7     5.11     36.7 
#> 24 6     SV    T2       5.11    -10.1     5.11    -10.1

创建于2022-09-06,reprex v2.0.2

第一个答案:

这应该行得通。使用tidyverse软件包,您可以首先根据变量IDCVTR进行分组,然后使用summarise()根据DATE 1和DATE 2:上的值计算减法

请注意,最后一个命令print(., n = 24)仅用于显示此处的输出,在复制代码时可以省略。

library(tidyverse)
df %>%
group_by(ID, CV, TR) %>%
summarise(Values_1_new = Values_1[DATE == "2022-09-05"] - Values_1[DATE == "2022-08-29"],
Values_2_new = Values_2[DATE == "2022-09-05"] - Values_2[DATE == "2022-08-29"]) %>%
arrange(TR, CV) %>%
print(., n = 24)
#> `summarise()` has grouped output by 'ID', 'CV', 'TR'. You can override using
#> the `.groups` argument.
#> # A tibble: 24 × 5
#> # Groups:   ID, CV, TR [24]
#>    ID    CV    TR    Values_1_new Values_2_new
#>    <chr> <chr> <chr>        <dbl>        <dbl>
#>  1 1     PD    T1         -29.6          40.7 
#>  2 2     PD    T1         -29.6          28.3 
#>  3 3     PD    T1          -3.92         43.0 
#>  4 4     PD    T1          -3.92        -14.8 
#>  5 5     PD    T1         -17.5         -18.0 
#>  6 6     PD    T1         -17.5          -2.29
#>  7 1     SV    T1          -4.35        -17.2 
#>  8 2     SV    T1          -4.35         26.0 
#>  9 3     SV    T1          -5.59         43.7 
#> 10 4     SV    T1          -5.59         13.9 
#> 11 5     SV    T1           8.48         -7.74
#> 12 6     SV    T1           8.48        -22.0 
#> 13 1     PD    T2           7.88          6.86
#> 14 2     PD    T2          10.5           9.72
#> 15 3     PD    T2          -0.787       -35.0 
#> 16 4     PD    T2          -0.787       -46.5 
#> 17 5     PD    T2          -3.39          6.92
#> 18 6     PD    T2          -3.39         53.8 
#> 19 1     SV    T2           0.945        15.7 
#> 20 2     SV    T2          -9.79          8.29
#> 21 3     SV    T2          -9.79         43.2 
#> 22 4     SV    T2          -9.79         66.7 
#> 23 5     SV    T2           5.11         36.7 
#> 24 6     SV    T2           5.11        -10.1

创建于2022-09-06,reprex v2.0.2

最新更新