我使用的是R studio,有一个非常大的数据集,其中有4个变量。我试图计算每天06:30:00到19:00:00之间O2柱的平均值。数据集的示例如下:
RBWQ <- read.csv(".../R/WY06 to WY19.csv", skip = 2, header = TRUE, sep = ",", na.strings=c("","NA"))
RBWQ <- setNames(RBWQ, c("datetime","NDCQ","O2","SpCond","PaseoQ"))
View(RBWQ)
datetime NDCQ O2 SpCond PaseoQ
1 2005-10-01 00:00:00 NA NA NA 827
2 2005-10-01 00:15:00 NA NA NA 857
3 2005-10-01 00:30:00 NA NA NA 857
4 2005-10-01 00:45:00 NA NA NA 807
5 2005-10-01 01:00:00 NA 10 300 797
6 2005-10-01 01:15:00 NA 10.3 319 817
7 2005-10-01 01:30:00 NA 10 401 797
8 2005-10-01 01:45:00 NA 10.2 398 788
9 2005-10-01 02:00:00 NA NA NA 788
10 2005-10-01 02:15:00 NA NA NA 807
每小时有15分钟的间隔,正如你所看到的,这个数据集中有很多NA。我如何操作这个数据集来确定每天特定时间段的平均值,并让它在表中生成一个以R表示的日期和平均值的报告?
dput(head(RBWQ))
structure(list(datetime = structure(list(sec = c(0, 0, 0, 0,
0, 0), min = c(0L, 15L, 30L, 45L, 0L, 15L), hour = c(0L, 0L,
0L, 0L, 1L, 1L), mday = c(1L, 1L, 1L, 1L, 1L, 1L), mon = c(9L,
9L, 9L, 9L, 9L, 9L), year = c(105L, 105L, 105L, 105L, 105L, 105L
), wday = c(6L, 6L, 6L, 6L, 6L, 6L), yday = c(273L, 273L, 273L,
273L, 273L, 273L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L), zone = c("MDT",
"MDT", "MDT", "MDT", "MDT", "MDT"), gmtoff = c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), class = c("POSIXlt",
"POSIXt")), NDCQ = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), O2 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), SpCond = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), PaseoQ = c(827L, 857L, 857L, 807L,
797L, 817L)), row.names = c(NA, 6L), class = "data.frame")
有些日子可能会因为各种原因,比如被埋葬,导致仪器读数不正确,一整天都没有NA。
dput(head(RBWQ, 96))
structure(list(datetime = structure(list(sec = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 15L, 30L, 45L, 0L, 15L,
30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L,
45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L,
0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L,
15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L,
30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L,
45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L,
0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L, 0L, 15L, 30L, 45L), hour = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L,
8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 11L, 11L, 11L,
11L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L,
15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L,
18L, 18L, 18L, 19L, 19L, 19L, 19L, 20L, 20L, 20L, 20L, 21L, 21L,
21L, 21L, 22L, 22L, 22L, 22L, 23L, 23L, 23L, 23L), mday = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
mon = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), year = c(105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L, 105L,
105L, 105L, 105L), wday = c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), yday = c(273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L, 273L,
273L, 273L, 273L, 273L, 273L), isdst = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), zone = c("MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT",
"MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT", "MDT"
), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), class = c("POSIXlt", "POSIXt"
)), NDCQ = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), O2 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
SpCond = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), PaseoQ = c(827L, 857L, 857L, 807L, 797L, 817L,
797L, 788L, 788L, 807L, 788L, 797L, 807L, 797L, 778L, 797L,
807L, 797L, 750L, 741L, 750L, 714L, 714L, 696L, 696L, 687L,
687L, 678L, 678L, 661L, 669L, 661L, 678L, 678L, 696L, 661L,
644L, 635L, 661L, 644L, 644L, 635L, 635L, 627L, 635L, 652L,
652L, 652L, 644L, 635L, 644L, 652L, 661L, 652L, 661L, 635L,
661L, 644L, 652L, 635L, 644L, 619L, 619L, 619L, 635L, 635L,
635L, 652L, 635L, 652L, 652L, 661L, 652L, 652L, 652L, 652L,
661L, 652L, 644L, 627L, 619L, 635L, 619L, 619L, 603L, 611L,
603L, 603L, 619L, 635L, 635L, 635L, 627L, 619L, 635L, 619L
)), row.names = c(NA, 96L), class = "data.frame")
对于时间序列数据,请考虑xts
包。根据您的数据创建一个时间序列(在第一列中显示您的日期时间(。然后根据时间帧进行子集,并使用colMeans
来计算日均值(去除NA
(。如果这是你的想法,请告诉我。
library(xts)
RBWQ_xts <- xts(RBWQ[, -1], RBWQ[, 1])
apply.daily(RBWQ_xts["T06:30:00/T19:00:00"], colMeans, na.rm = TRUE)
由于行太多,我建议使用data.table
方法。
首先,我们分配新的列date
和time
,以便在下一步中轻松地对它们进行筛选。我们可以使用优化的data.table
函数as.IDate
和as.ITime
,以便快速进行计算。
接下来,我们过滤您感兴趣的时间。
我们用基R的mean
函数计算均值,并使用参数na.rm = TRUE
。
最后,我们对by = date
进行分组,以便计算并返回每天的平均值。
library(data.table)
setDT(RBWQ)[,`:=`(date = as.IDate(datetime),time = as.ITime(datetime))][
time > as.ITime("06:30:00") & time < as.ITime("19:00:00"),
.(meanO2 = mean(O2,na.rm=TRUE)),
by = date]
# date meanO2
#1: 2005-10-01 10.210526
#2: 2005-10-02 9.910526
我怀疑大约450000行的计算可以在几秒钟内完成。