我有这个数据集
x<-data.frame(matrix(c("01-01-2010", "01-01-2010", "01-01-2010","01-01-2010"," 00:01"," 00:02"," 00:03"," 00:04", "12.2", "12.1", "13.1", "11.4", "12", "13", "5", "8","12", "4","7","9", "16.9", "17.5","18.8", "21.0"), ncol=6))
names(x)<-c("date","time","pressure","temperature","rain","windspeed")
date time pressure temperature rain windspeed
1 01-01-2010 00:01 12.2 12 12 16.9
2 01-01-2010 00:02 12.1 13 4 17.5
3 01-01-2010 00:03 13.1 5 7 18.8
4 01-01-2010 00:04 11.4 8 9 21.0
这是我的数据集的简化版本。我的数据集从2010年1月1日00:01到2017年12月31日23:59。
我在找
1( 平均气压、温度和风速转换为每小时的数据。
2( 将降雨汇总为每小时的数据。
制作一个新的每小时时间戳来粘贴所有这些新数据很简单,我只需要知道什么是对不同列进行平均和求和的最佳方法,并且最多只重复60行(制作1小时需要60分钟(,直到2017年12月31日23:59
谢谢你的建议。
我建议使用整洁的包和tibbletime来清晰轻松地完成任务。我添加了一些清理代码,以获得所需格式的示例数据。
这种方法具有高度的可重复性和可解释性。TibbleTime允许您在使用通用函数的同时对基于时间的数据进行大量聚合和滚动计算。
# The provided example data -----------------------------------------------
x<-data.frame(matrix(c("01-01-2010", "01-01-2010", "01-01-2010","01-01-2010"," 00:01"," 00:02"," 00:03"," 00:04", "12.2", "12.1", "13.1", "11.4", "12", "13", "5", "8","12", "4","7","9", "16.9", "17.5","18.8", "21.0"), ncol=6),
stringsAsFactors = FALSE)
names(x)<-c("date","time","pressure","temperature","rain","windspeed")
# Load Libraries ----------------------------------------------------------
library(dplyr)
library(lubridate)
library(tibbletime)
# Fix column classes of data ----------------------------------------------
x <- x %>%
mutate_at(vars(pressure:windspeed),as.numeric)
# Convert to tibbletime object --------------------------------------------
x <- x %>%
mutate(date_time = mdy_hm(paste0(date,time))) %>%
as_tbl_time(index = date_time) %>%
select(date_time,everything())
# Use tibbletime function to roll up hourly -------------------------------
x_hourly <- x %>%
collapse_by('hourly',side = 'start') %>%
group_by(date_time) %>%
summarise(pressure = mean(pressure, na.rm = TRUE),
temperature = mean(temperature, na.rm = TRUE),
rain = sum(rain, na.rm = TRUE),
windspeed = mean(windspeed, na.rm = TRUE))
结果:
> x_hourly
# A time tibble: 1 x 5
# Index: date_time
date_time pressure temperature rain windspeed
<dttm> <dbl> <dbl> <dbl> <dbl>
1 2010-01-01 00:01:00 12.2 9.5 32 18.6
# sample data
x1 <- data.frame(matrix(c("01-01-2010", "01-01-2010", "01-01-2010","01-01-
2010","00:00:01","00:00:02","00:00:03","00:00:04", "12.2", "12.1", "13.1", "11.4",
"12", "13", "5", "8","12", "4","7","9", "16.9", "17.5","18.8", "21.0"), ncol=6))
x2 <- data.frame(matrix(c("01-01-2010", "01-01-2010", "01-01-2010","01-01-
2010","01:00:01","01:00:02","01:00:03","01:00:04", "12.2", "12.1", "13.1", "11.4",
"12", "13", "5", "8","12", "4","7","9", "16.9", "17.5","18.8", "21.0"), ncol=6))
x <- rbind(x1, x2)
names(x) <- c("date","time","pressure","temperature","rain","windspeed")
x[,3:6] <- apply(x[,3:6], 2, as.numeric)
# two separate aggregates
aggregate(x[,c('pressure', 'temperature', 'windspeed')], by = list(paste0(x$date,
substring(x$time, 1, 2))), FUN = 'mean')
aggregate(x[,c('rain'), drop = FALSE],
by = list(paste0(x$date, substring(x$time, 1, 2))), FUN = 'sum')
# Group.1 pressure temperature windspeed
#1 01-01-201000 12.2 9.5 18.55
#2 01-01-201001 12.2 9.5 18.55
# Group.1 rain
#1 01-01-201000 32
#2 01-01-201001 32