如何操作我的数据来计算具有多个NA实例的R中的平均值



我使用的是R studio,有一个非常大的数据集,其中有4个变量。我试图计算每天06:30:00到19:00:00之间O2柱的平均值。数据集的示例如下:

RBWQ <- read.csv(".../R/WY06 to WY19.csv", skip = 2, header = TRUE, sep = ",", na.strings=c("","NA"))
RBWQ <- setNames(RBWQ, c("datetime","NDCQ","O2","SpCond","PaseoQ"))
View(RBWQ)
datetime              NDCQ O2   SpCond PaseoQ
1   2005-10-01 00:00:00   NA   NA   NA     827
2   2005-10-01 00:15:00   NA   NA   NA     857
3   2005-10-01 00:30:00   NA   NA   NA     857
4   2005-10-01 00:45:00   NA   NA   NA     807
5   2005-10-01 01:00:00   NA   10   300    797
6   2005-10-01 01:15:00   NA   10.3 319    817
7   2005-10-01 01:30:00   NA   10   401    797
8   2005-10-01 01:45:00   NA   10.2 398    788
9   2005-10-01 02:00:00   NA   NA   NA     788
10  2005-10-01 02:15:00   NA   NA   NA     807

每小时有15分钟的间隔,正如你所看到的,这个数据集中有很多NA。我如何操作这个数据集来确定每天特定时间段的平均值,并让它在表中生成一个以R表示的日期和平均值的报告?

dput(head(RBWQ))
structure(list(datetime = structure(list(sec = c(0, 0, 0, 0, 
0, 0), min = c(0L, 15L, 30L, 45L, 0L, 15L), hour = c(0L, 0L, 
0L, 0L, 1L, 1L), mday = c(1L, 1L, 1L, 1L, 1L, 1L), mon = c(9L, 
9L, 9L, 9L, 9L, 9L), year = c(105L, 105L, 105L, 105L, 105L, 105L
), wday = c(6L, 6L, 6L, 6L, 6L, 6L), yday = c(273L, 273L, 273L, 
273L, 273L, 273L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L), zone = c("MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT"), gmtoff = c(NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), class = c("POSIXlt", 
"POSIXt")), NDCQ = c(NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), O2 = c(NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), SpCond = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), PaseoQ = c(827L, 857L, 857L, 807L, 
797L, 817L)), row.names = c(NA, 6L), class = "data.frame")

有些日子可能会因为各种原因,比如被埋葬,导致仪器读数不正确,一整天都没有NA。

dput(head(RBWQ, 96))
structure(list(datetime = structure(list(sec = c(0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 15L, 30L, 45L, 0L, 15L, 
30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 
45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 
0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 
15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 
30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 
45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 
0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L), hour = c(0L, 
0L, 0L, 0L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 
4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 
8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 
11L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 
15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L, 
18L, 18L, 18L, 19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L, 21L, 21L, 
21L, 21L, 22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L), mday = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
mon = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), year = c(105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 
105L, 105L, 105L), wday = c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), yday = c(273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 
273L, 273L, 273L, 273L, 273L), isdst = c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), zone = c("MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", 
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT"
), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_)), class = c("POSIXlt", "POSIXt"
)), NDCQ = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), O2 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), 
SpCond = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_), PaseoQ = c(827L, 857L, 857L, 807L, 797L, 817L, 
797L, 788L, 788L, 807L, 788L, 797L, 807L, 797L, 778L, 797L, 
807L, 797L, 750L, 741L, 750L, 714L, 714L, 696L, 696L, 687L, 
687L, 678L, 678L, 661L, 669L, 661L, 678L, 678L, 696L, 661L, 
644L, 635L, 661L, 644L, 644L, 635L, 635L, 627L, 635L, 652L, 
652L, 652L, 644L, 635L, 644L, 652L, 661L, 652L, 661L, 635L, 
661L, 644L, 652L, 635L, 644L, 619L, 619L, 619L, 635L, 635L, 
635L, 652L, 635L, 652L, 652L, 661L, 652L, 652L, 652L, 652L, 
661L, 652L, 644L, 627L, 619L, 635L, 619L, 619L, 603L, 611L, 
603L, 603L, 619L, 635L, 635L, 635L, 627L, 619L, 635L, 619L
)), row.names = c(NA, 96L), class = "data.frame")

对于时间序列数据,请考虑xts包。根据您的数据创建一个时间序列(在第一列中显示您的日期时间(。然后根据时间帧进行子集,并使用colMeans来计算日均值(去除NA(。如果这是你的想法,请告诉我。

library(xts)
RBWQ_xts <- xts(RBWQ[, -1], RBWQ[, 1])
apply.daily(RBWQ_xts["T06:30:00/T19:00:00"], colMeans, na.rm = TRUE)

由于行太多,我建议使用data.table方法。

首先,我们分配新的列datetime,以便在下一步中轻松地对它们进行筛选。我们可以使用优化的data.table函数as.IDateas.ITime,以便快速进行计算。

接下来,我们过滤您感兴趣的时间。

我们用基R的mean函数计算均值,并使用参数na.rm = TRUE

最后,我们对by = date进行分组,以便计算并返回每天的平均值。

library(data.table)
setDT(RBWQ)[,`:=`(date = as.IDate(datetime),time = as.ITime(datetime))][
time > as.ITime("06:30:00") & time < as.ITime("19:00:00"),
.(meanO2 = mean(O2,na.rm=TRUE)),
by = date]
#         date    meanO2
#1: 2005-10-01 10.210526
#2: 2005-10-02  9.910526

我怀疑大约450000行的计算可以在几秒钟内完成。

最新更新