想象一下我有一个这样的数据帧:
#Stack example
# Example data: 54 observations. The first 48 rows form a balanced design
# (two DATEs x six IDs x {SV, PD} x {T1, T2}); the last 6 rows (IDs 7-10)
# are the "stray" rows that break the sort-based approach discussed below.
df <- data.frame(
  DATE = c(rep(rep(c("2022-08-29", "2022-09-05"), each = 6), times = 4),
           rep(c("2022-09-05", "2022-08-29"), times = 3)),
  ID = c(rep(as.character(1:6), times = 8),
         "7", "8", "9", "10", "9", "10"),
  CV = c(rep(rep(c("SV", "PD"), each = 12), times = 2),
         "PD", "PD", "PD", "SV", "PD", "SV"),
  TR = c(rep("T1", 24), rep("T2", 30)),
  Values_1 = c(34.9003695, 34.9003695, 28.2389394, 28.2389394, 26.0821875, 26.0821875,
               30.5515533, 30.5515533, 22.6469958, 22.6469958, 34.5662974, 34.5662974,
               35.2881883, 35.2881883, 41.3885176, 41.3885176, 19.9440042, 19.9440042,
               5.6987524, 5.6987524, 37.4641052, 37.4641052, 2.4808126, 2.4808126,
               1.7883822, 21.1799057, 21.1799057, 21.1799057, 2.7334442, 2.7334442,
               2.7334442, 11.3880187, 11.3880187, 11.3880187, 7.8442267, 7.8442267,
               7.8442267, 5.2445510, 5.2445510, 5.2445510, 20.4706600, 20.4706600,
               15.7275634, 15.7275634, 4.4575814, 4.4575814, 17.0854186, 17.0854186,
               5.6987524, 5.6987524, 37.4641052, 37.4641052, 2.4808126, 2.4808126),
  Values_2 = c(76.24359, 76.24359, 58.52421, 58.52421, 80.14131, 80.14131,
               59.05000, 102.19699, 102.19699, 72.39848, 72.39848, 58.15000,
               68.31217, 68.31217, 53.67941, 53.67941, 56.88980, 56.88980,
               108.98399, 96.64207, 96.64207, 38.88542, 38.88542, 54.60000,
               52.12500, 52.12500, 17.20875, 17.20875, 47.26923, 47.26923,
               67.80738, 60.41250, 60.41250, 83.93404, 83.93404, 37.20336,
               50.02500, 50.02500, 94.73309, 94.73309, 41.31748, 41.31748,
               56.88344, 59.74702, 59.74702, 48.23750, 48.23750, 95.14831,
               108.98399, 96.64207, 96.64207, 38.88542, 38.88542, 54.60000)
)
对于具有两个时间点("2022-08-29"和"2022-09-05")的每个ID、CV、TR组合,我想用最后一个时间点的值减去第一个时间点的值(Values_1和Values_2),并返回相减后的差值,生成以下输出:
TR CV ID Values_1 Values_2
1 T1 PD 1 -29.5894359 40.67182
2 T1 PD 2 -29.5894359 28.32990
3 T1 PD 3 -3.9244124 42.96266
4 T1 PD 4 -3.9244124 -14.79399
5 T1 PD 5 -17.4631916 -18.00438
6 T1 PD 6 -17.4631916 -2.28980
7 T1 SV 1 -4.3488162 -17.19359
8 T1 SV 2 -4.3488162 25.95340
9 T1 SV 3 -5.5919436 43.67278
10 T1 SV 4 -5.5919436 13.87427
11 T1 SV 5 8.4841099 -7.74283
12 T1 SV 6 8.4841099 -21.99131
13 T2 PD 1 7.8833367 6.85844
14 T2 PD 2 10.4830124 9.72202
15 T2 PD 3 -0.7869696 -34.98607
16 T2 PD 4 -0.7869696 -46.49559
17 T2 PD 5 -3.3852414 6.92002
18 T2 PD 6 -3.3852414 53.83083
19 T2 SV 1 0.9450620 15.68238
20 T2 SV 2 -9.7918870 8.28750
21 T2 SV 3 -9.7918870 43.20375
22 T2 SV 4 -9.7918870 66.72529
23 T2 SV 5 5.1107825 36.66481
24 T2 SV 6 5.1107825 -10.06587
我找到了以下解决方案,但它没有返回所需的结果,尤其是当存在杂散行或额外重复时,因为我的解决方案基于对数据帧进行排序,并且假设我有一个正确排序的数据帧:
#Remove rows that are not duplicated (do not have two timepoints)
# NOTE(review): duplication is checked on (ID, CV, TR) only, so stray extra
# repeats of a combination (e.g. two rows on the same DATE) also pass this
# filter -- that is why the unwanted ID 9/10 rows survive into the result.
dupe = df[,c("ID", "CV", "TR")] # select columns to check duplicates
ONLY_DUPES=df[duplicated(dupe) | duplicated(dupe, fromLast=TRUE),]
#Now we subtract last timepoint with first timepoint
#First we melt the data
# NOTE(review): melt()/cast() here presumably come from the "reshape"
# package (reshape2 has dcast(), not cast()) -- confirm which is loaded.
ONLY_DUPES <- as.data.frame(ONLY_DUPES)
melted<-melt(ONLY_DUPES, id=c("DATE", "ID", "CV", "TR"))
#Then we cast with diff
#First we order the dataframe
# diff() assumes each (TR, CV, ID, variable) group holds exactly two rows
# sorted first-date-first; the explicit order() below enforces the sorting
# but not the two-row assumption, which is the flaw described above.
melted= melted[with(melted, order(TR, CV, ID, variable, DATE)),]
casted=cast(melted, TR+CV+ID ~variable, fun.aggregate = diff)
#Returns:
TR CV ID Values_1 Values_2
1 T1 PD 1 -29.5894359 40.67182
2 T1 PD 2 -29.5894359 28.32990
3 T1 PD 3 -3.9244124 42.96266
4 T1 PD 4 -3.9244124 -14.79399
5 T1 PD 5 -17.4631916 -18.00438
6 T1 PD 6 -17.4631916 -2.28980
7 T1 SV 1 -4.3488162 -17.19359
8 T1 SV 2 -4.3488162 25.95340
9 T1 SV 3 -5.5919436 43.67278
10 T1 SV 4 -5.5919436 13.87427
11 T1 SV 5 8.4841099 -7.74283
12 T1 SV 6 8.4841099 -21.99131
13 T2 PD 1 7.8833367 6.85844
14 T2 PD 2 10.4830124 9.72202
15 T2 PD 3 -0.7869696 -34.98607
16 T2 PD 4 -0.7869696 -46.49559
17 T2 PD 5 -3.3852414 6.92002
18 T2 PD 6 -3.3852414 53.83083
19 T2 PD 9 -34.9832926 -57.75665 !
20 T2 SV 1 0.9450620 15.68238
21 T2 SV 10 -34.9832926 15.71458 !
22 T2 SV 2 -9.7918870 8.28750
23 T2 SV 3 -9.7918870 43.20375
24 T2 SV 4 -9.7918870 66.72529
25 T2 SV 5 5.1107825 36.66481
26 T2 SV 6 5.1107825 -10.06587
我可以应用什么方法以防故障的方式执行此操作?
更新:多于2个值列的情况:
如果有多个列,可以使用across()
定义要应用函数的列。有了这个,你只需要输入你想操作的开始和结束列:
对于这个例子,我只是复制了Values_1
和Values_2
,并将它们分别存储在新的变量Values_3
和Values_4
中。
library(tidyverse)
# Per-group difference (last timepoint minus first) for every Values_* column.
# The original snippet referenced Values_3:Values_4 without ever creating
# them, so it errored on the df defined above; the mutate() below performs
# the copy the surrounding text describes, making the snippet reproducible.
df %>%
  mutate(Values_3 = Values_1, Values_4 = Values_2) %>%
  group_by(ID, CV, TR) %>%
  # Groups missing one of the two dates produce a zero-length result, which
  # dplyr drops from the summary -- so the stray rows vanish automatically.
  summarise(across(Values_1:Values_4, ~ .x[DATE == "2022-09-05"] - .x[DATE == "2022-08-29"])) %>%
  arrange(TR, CV) %>%
  print(., n = 24)
#> `summarise()` has grouped output by 'ID', 'CV', 'TR'. You can override using
#> the `.groups` argument.
#> # A tibble: 24 × 7
#> # Groups: ID, CV, TR [24]
#> ID CV TR Values_1 Values_2 Values_3 Values_4
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 PD T1 -29.6 40.7 -29.6 40.7
#> 2 2 PD T1 -29.6 28.3 -29.6 28.3
#> 3 3 PD T1 -3.92 43.0 -3.92 43.0
#> 4 4 PD T1 -3.92 -14.8 -3.92 -14.8
#> 5 5 PD T1 -17.5 -18.0 -17.5 -18.0
#> 6 6 PD T1 -17.5 -2.29 -17.5 -2.29
#> 7 1 SV T1 -4.35 -17.2 -4.35 -17.2
#> 8 2 SV T1 -4.35 26.0 -4.35 26.0
#> 9 3 SV T1 -5.59 43.7 -5.59 43.7
#> 10 4 SV T1 -5.59 13.9 -5.59 13.9
#> 11 5 SV T1 8.48 -7.74 8.48 -7.74
#> 12 6 SV T1 8.48 -22.0 8.48 -22.0
#> 13 1 PD T2 7.88 6.86 7.88 6.86
#> 14 2 PD T2 10.5 9.72 10.5 9.72
#> 15 3 PD T2 -0.787 -35.0 -0.787 -35.0
#> 16 4 PD T2 -0.787 -46.5 -0.787 -46.5
#> 17 5 PD T2 -3.39 6.92 -3.39 6.92
#> 18 6 PD T2 -3.39 53.8 -3.39 53.8
#> 19 1 SV T2 0.945 15.7 0.945 15.7
#> 20 2 SV T2 -9.79 8.29 -9.79 8.29
#> 21 3 SV T2 -9.79 43.2 -9.79 43.2
#> 22 4 SV T2 -9.79 66.7 -9.79 66.7
#> 23 5 SV T2 5.11 36.7 5.11 36.7
#> 24 6 SV T2 5.11 -10.1 5.11 -10.1
创建于2022-09-06,reprex v2.0.2
第一个答案:
这应该行得通。使用tidyverse
软件包,您可以首先根据变量ID
、CV
和TR
进行分组,然后使用summarise()
根据DATE 1和DATE 2上的值计算减法:
请注意,最后一个命令print(., n = 24)
仅用于显示此处的输出,在复制代码时可以省略。
library(tidyverse)
# Subtract the first timepoint's value from the second for one column,
# within the current summarise() group.
timepoint_diff <- function(values, date) {
  values[date == "2022-09-05"] - values[date == "2022-08-29"]
}
# One row per (ID, CV, TR) combination holding the between-date differences;
# the trailing print(., n = 24) only forces the full tibble to display.
df %>%
  group_by(ID, CV, TR) %>%
  summarise(Values_1_new = timepoint_diff(Values_1, DATE),
            Values_2_new = timepoint_diff(Values_2, DATE)) %>%
  arrange(TR, CV) %>%
  print(., n = 24)
#> `summarise()` has grouped output by 'ID', 'CV', 'TR'. You can override using
#> the `.groups` argument.
#> # A tibble: 24 × 5
#> # Groups: ID, CV, TR [24]
#> ID CV TR Values_1_new Values_2_new
#> <chr> <chr> <chr> <dbl> <dbl>
#> 1 1 PD T1 -29.6 40.7
#> 2 2 PD T1 -29.6 28.3
#> 3 3 PD T1 -3.92 43.0
#> 4 4 PD T1 -3.92 -14.8
#> 5 5 PD T1 -17.5 -18.0
#> 6 6 PD T1 -17.5 -2.29
#> 7 1 SV T1 -4.35 -17.2
#> 8 2 SV T1 -4.35 26.0
#> 9 3 SV T1 -5.59 43.7
#> 10 4 SV T1 -5.59 13.9
#> 11 5 SV T1 8.48 -7.74
#> 12 6 SV T1 8.48 -22.0
#> 13 1 PD T2 7.88 6.86
#> 14 2 PD T2 10.5 9.72
#> 15 3 PD T2 -0.787 -35.0
#> 16 4 PD T2 -0.787 -46.5
#> 17 5 PD T2 -3.39 6.92
#> 18 6 PD T2 -3.39 53.8
#> 19 1 SV T2 0.945 15.7
#> 20 2 SV T2 -9.79 8.29
#> 21 3 SV T2 -9.79 43.2
#> 22 4 SV T2 -9.79 66.7
#> 23 5 SV T2 5.11 36.7
#> 24 6 SV T2 5.11 -10.1
创建于2022-09-06,reprex v2.0.2