我正在聚合具有可变bin大小的数据(请参阅前面的问题:R:根据第二列的sum(n(,用变量n聚合每n行(。除了计算变量范围组上的和和和均值外,我还需要在每个组范围的中点提取单值协变量。当我尝试在运行中这样做时,我只得到第一组的值和剩余组的NA。
df.summary<-as.data.frame(df %>%
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE), d=sum(d, na.rm=T), ,i.start=first(rn), i.end=last(rn), y=nth(y, round(first(rn)+(last(rn)-first(rn))/2-1))))
head(df.summary)
grp x d i.start i.end y
1 1 0.07458317 88.99342 1 4 19.78992
2 2 0.07594546 97.62130 5 8 NA
3 3 0.05353308 104.69683 9 12 NA
4 4 0.06498291 106.23468 13 16 NA
5 5 0.08601759 98.24939 17 20 NA
6 6 0.06262427 84.43745 21 23 NA
样本数据:
structure(list(x = c(0.10000112377193, 0.110742170350877, 0.0300274304561404,
0.0575619395964912, 0.109060465438596, 0.0595491225614035, 0.0539270264912281,
0.0812452063859649, 0.0341699389122807, 0.0391744879122807, 0.0411787485614035,
0.0996091644385965, 0.0970479474912281, 0.0595715843684211, 0.0483489989122807,
0.0549631194561404, 0.0705080555964912, 0.080437472631579, 0.105883664631579,
0.0872411613684211, 0.103236660631579, 0.0381296894912281, 0.0465064491578947,
0.0936565184561403, 0.0410095752631579, 0.0311180032105263, 0.0257758157894737,
0.0354721928947368, 0.0584999394736842, 0.0241286060175439, 0.112053376666667,
0.0769823868596491, 0.0558137530526316, 0.0374491000701754, 0.0419279142631579,
0.0260257506842105, 0.0544360374561404, 0.107411071842105, 0.103873468,
0.0419322114035088, 0.0483912961052632, 0.0328373653157895, 0.0866868717719298,
0.063990467245614, 0.0799280314035088, 0.123490407070175, 0.145676836280702,
0.0292878782807018, 0.0432093036666667, 0.0203547443684211),
d = c(22.2483512600033, 22.2483529247042, 22.2483545865809,
22.2483562542823, 22.24835791863, 25.1243105415557, 25.1243148759953,
25.1243192107884, 25.1243235416981, 25.1243278750792, 27.2240858553058,
27.2240943134697, 27.2241027638674, 27.224111222031, 27.2241196741942,
24.5623431981188, 24.5623453409221, 24.5623474809012, 24.562349626705,
24.5623517696847, 28.1458125837154, 28.1458157376341, 28.1458188889053,
28.1458220452951, 28.1458251983314, 27.8293318542146, 27.8293366652115,
27.8293414829159, 27.829346292148, 27.8293511094993, 27.5271773325046,
27.5271834011289, 27.5271894694002, 27.5271955369655, 27.5272016048837,
28.0376097925214, 28.0376146410729, 28.0376194959786, 28.0376243427651,
28.0376291969647, 26.8766095768196, 26.8766122563318, 26.8766149309023,
26.8766176123562, 26.8766202925746, 27.8736950101666, 27.8736960528853,
27.8736971017815, 27.8736981446767, 27.8736991932199), y = c(19.79001,
19.789922, 19.789834, 19.789746, 19.789658, 19.78957, 19.789468,
19.789366, 19.789264, 19.789162, 19.78906, 19.78896, 19.78886,
19.78876, 19.78866, 19.78856, 19.788458, 19.788356, 19.788254,
19.788152, 19.78805, 19.787948, 19.787846, 19.787744, 19.787642,
19.78754, 19.787442, 19.787344, 19.787246, 19.787148, 19.78705,
19.786956, 19.786862, 19.786768, 19.786674, 19.78658, 19.786486,
19.786392, 19.786298, 19.786204, 19.78611, 19.786016, 19.785922,
19.785828, 19.785734, 19.78564, 19.785544, 19.785448, 19.785352,
19.785256)), row.names = c(NA, 50L), class = "data.frame")
让我们在summarise
部分添加变量z
和n
。这些变量的定义如下。
df %>%
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE),
d=sum(d, na.rm=T), ,i.start=first(rn),
i.end=last(rn),
z = round(first(rn)+(last(rn)-first(rn))/2-1),
n = n())
grp x d i.start i.end z n
<dbl> <dbl> <dbl> <int> <int> <dbl> <int>
1 1 0.0746 89.0 1 4 2 4
2 2 0.0759 97.6 5 8 6 4
3 3 0.0535 105. 9 12 10 4
4 4 0.0650 106. 13 16 14 4
5 5 0.0860 98.2 17 20 18 4
6 6 0.0626 84.4 21 23 21 3
7 7 0.0479 112. 24 27 24 4
8 8 0.0394 83.5 28 30 28 3
9 9 0.0706 110. 31 34 32 4
10 10 0.0575 112. 35 38 36 4
11 11 0.0647 83.0 39 41 39 3
12 12 0.0659 108. 42 45 42 4
13 13 0.0854 111. 46 49 46 4
14 14 0.0204 27.9 50 50 49 1
在上面的数据帧中,n
表示由grp
分隔的每个组的样本大小。但是,当您声明group_by(grp)
时,当您调用nth(y, z)
时,您将通过GROUP调用Z-TH VALUE。
这意味着对于第五组,虽然只有4个值,但您调用y
的第18个值。所以它打印NA
。
要做到这一点,我认为最简单的方法是使用n()
。
df %>%
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE),
d=sum(d, na.rm=T), ,i.start=first(rn),
i.end=last(rn),
y=nth(y, round(n()/2)))
grp x d i.start i.end y
<dbl> <dbl> <dbl> <int> <int> <dbl>
1 1 0.0746 89.0 1 4 19.8
2 2 0.0759 97.6 5 8 19.8
3 3 0.0535 105. 9 12 19.8
4 4 0.0650 106. 13 16 19.8
5 5 0.0860 98.2 17 20 19.8
6 6 0.0626 84.4 21 23 19.8
7 7 0.0479 112. 24 27 19.8
8 8 0.0394 83.5 28 30 19.8
9 9 0.0706 110. 31 34 19.8
10 10 0.0575 112. 35 38 19.8
11 11 0.0647 83.0 39 41 19.8
12 12 0.0659 108. 42 45 19.8
13 13 0.0854 111. 46 49 19.8
14 14 0.0204 27.9 50 50 NA
您将把floor(n/2)
称为第y
,意思是位于每组中间的y
。请注意,您也可以尝试floor(n/2)+1
。
df %>%
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE),
d = sum(d, na.rm=T),
i.start=first(rn),
i.end=last(rn),
y = nth(y, floor(median(rn)) - i.start))