R:dplyr使用nth()调用按组汇总数据,在聚合过程中计算变量n



我正在聚合具有可变bin大小的数据(请参阅前面的问题:R:根据第二列的sum(n(,用变量n聚合每n行(。除了计算变量范围组上的和和和均值外,我还需要在每个组范围的中点提取单值协变量。当我尝试在运行中这样做时,我只得到第一组的值和剩余组的NA。

df.summary<-as.data.frame(df %>% 
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE), d=sum(d, na.rm=T), ,i.start=first(rn), i.end=last(rn), y=nth(y, round(first(rn)+(last(rn)-first(rn))/2-1))))
head(df.summary)
grp          x         d i.start i.end        y
1   1 0.07458317  88.99342       1     4 19.78992
2   2 0.07594546  97.62130       5     8       NA
3   3 0.05353308 104.69683       9    12       NA
4   4 0.06498291 106.23468      13    16       NA
5   5 0.08601759  98.24939      17    20       NA
6   6 0.06262427  84.43745      21    23       NA

样本数据:

structure(list(x = c(0.10000112377193, 0.110742170350877, 0.0300274304561404, 
0.0575619395964912, 0.109060465438596, 0.0595491225614035, 0.0539270264912281, 
0.0812452063859649, 0.0341699389122807, 0.0391744879122807, 0.0411787485614035, 
0.0996091644385965, 0.0970479474912281, 0.0595715843684211, 0.0483489989122807, 
0.0549631194561404, 0.0705080555964912, 0.080437472631579, 0.105883664631579, 
0.0872411613684211, 0.103236660631579, 0.0381296894912281, 0.0465064491578947, 
0.0936565184561403, 0.0410095752631579, 0.0311180032105263, 0.0257758157894737, 
0.0354721928947368, 0.0584999394736842, 0.0241286060175439, 0.112053376666667, 
0.0769823868596491, 0.0558137530526316, 0.0374491000701754, 0.0419279142631579, 
0.0260257506842105, 0.0544360374561404, 0.107411071842105, 0.103873468, 
0.0419322114035088, 0.0483912961052632, 0.0328373653157895, 0.0866868717719298, 
0.063990467245614, 0.0799280314035088, 0.123490407070175, 0.145676836280702, 
0.0292878782807018, 0.0432093036666667, 0.0203547443684211), 
d = c(22.2483512600033, 22.2483529247042, 22.2483545865809, 
22.2483562542823, 22.24835791863, 25.1243105415557, 25.1243148759953, 
25.1243192107884, 25.1243235416981, 25.1243278750792, 27.2240858553058, 
27.2240943134697, 27.2241027638674, 27.224111222031, 27.2241196741942, 
24.5623431981188, 24.5623453409221, 24.5623474809012, 24.562349626705, 
24.5623517696847, 28.1458125837154, 28.1458157376341, 28.1458188889053, 
28.1458220452951, 28.1458251983314, 27.8293318542146, 27.8293366652115, 
27.8293414829159, 27.829346292148, 27.8293511094993, 27.5271773325046, 
27.5271834011289, 27.5271894694002, 27.5271955369655, 27.5272016048837, 
28.0376097925214, 28.0376146410729, 28.0376194959786, 28.0376243427651, 
28.0376291969647, 26.8766095768196, 26.8766122563318, 26.8766149309023, 
26.8766176123562, 26.8766202925746, 27.8736950101666, 27.8736960528853, 
27.8736971017815, 27.8736981446767, 27.8736991932199), y = c(19.79001, 
19.789922, 19.789834, 19.789746, 19.789658, 19.78957, 19.789468, 
19.789366, 19.789264, 19.789162, 19.78906, 19.78896, 19.78886, 
19.78876, 19.78866, 19.78856, 19.788458, 19.788356, 19.788254, 
19.788152, 19.78805, 19.787948, 19.787846, 19.787744, 19.787642, 
19.78754, 19.787442, 19.787344, 19.787246, 19.787148, 19.78705, 
19.786956, 19.786862, 19.786768, 19.786674, 19.78658, 19.786486, 
19.786392, 19.786298, 19.786204, 19.78611, 19.786016, 19.785922, 
19.785828, 19.785734, 19.78564, 19.785544, 19.785448, 19.785352, 
19.785256)), row.names = c(NA, 50L), class = "data.frame")

让我们在summarise部分添加变量zn。这些变量的定义如下。

df %>% 
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE), 
d=sum(d, na.rm=T), ,i.start=first(rn), 
i.end=last(rn), 
z = round(first(rn)+(last(rn)-first(rn))/2-1),
n = n())
grp      x     d i.start i.end     z     n
<dbl>  <dbl> <dbl>   <int> <int> <dbl> <int>
1     1 0.0746  89.0       1     4     2     4
2     2 0.0759  97.6       5     8     6     4
3     3 0.0535 105.        9    12    10     4
4     4 0.0650 106.       13    16    14     4
5     5 0.0860  98.2      17    20    18     4
6     6 0.0626  84.4      21    23    21     3
7     7 0.0479 112.       24    27    24     4
8     8 0.0394  83.5      28    30    28     3
9     9 0.0706 110.       31    34    32     4
10    10 0.0575 112.       35    38    36     4
11    11 0.0647  83.0      39    41    39     3
12    12 0.0659 108.       42    45    42     4
13    13 0.0854 111.       46    49    46     4
14    14 0.0204  27.9      50    50    49     1

在上面的数据帧中,n表示由grp分隔的每个组的样本大小。但是,当您声明group_by(grp)时,当您调用nth(y, z)时,您将通过GROUP调用Z-TH VALUE

这意味着对于第五组,虽然只有4个值,但您调用y的第18个值。所以它打印NA

要做到这一点,我认为最简单的方法是使用n()

df %>% 
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE), 
d=sum(d, na.rm=T), ,i.start=first(rn), 
i.end=last(rn), 
y=nth(y, round(n()/2)))
grp      x     d i.start i.end     y
<dbl>  <dbl> <dbl>   <int> <int> <dbl>
1     1 0.0746  89.0       1     4  19.8
2     2 0.0759  97.6       5     8  19.8
3     3 0.0535 105.        9    12  19.8
4     4 0.0650 106.       13    16  19.8
5     5 0.0860  98.2      17    20  19.8
6     6 0.0626  84.4      21    23  19.8
7     7 0.0479 112.       24    27  19.8
8     8 0.0394  83.5      28    30  19.8
9     9 0.0706 110.       31    34  19.8
10    10 0.0575 112.       35    38  19.8
11    11 0.0647  83.0      39    41  19.8
12    12 0.0659 108.       42    45  19.8
13    13 0.0854 111.       46    49  19.8
14    14 0.0204  27.9      50    50  NA  

您将把floor(n/2)称为第y,意思是位于每组中间的y。请注意,您也可以尝试floor(n/2)+1

df %>% 
mutate(rn = row_number()) %>%
group_by(grp = (cumsum(d)-1)%/% 100 + 1) %>%
summarise(x=mean(x, na.rm = TRUE),
d = sum(d, na.rm=T),
i.start=first(rn), 
i.end=last(rn), 
y = nth(y, floor(median(rn)) - i.start))

最新更新