我在一个名为 ulyDataLefs60_12 的数据帧中有这些相当复杂的数据:
Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7 E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
1 2000 122 0 1 38.01 3.31 0.662 0.662 2.65 1.32 0.000 3.310 1.32 1.980 1.980 0.662 0.000
2 2000 122 0 1 50.10 1.98 3.310 1.980 1.98 1.98 1.320 4.630 1.32 1.320 0.662 0.000 3.310
3 2000 122 0 2 2.19 1.98 1.320 3.970 1.98 1.32 0.662 0.662 3.97 1.320 0.662 1.320 0.662
4 2000 122 0 2 14.28 2.65 1.320 2.650 3.31 2.65 1.320 3.970 2.65 2.650 0.000 0.662 2.650
5 2000 122 0 2 26.38 3.97 6.620 0.662 3.31 3.31 4.630 5.290 1.98 0.000 0.000 1.980 0.662
6 2000 122 0 2 38.47 2.65 0.662 3.310 1.98 1.32 1.980 1.980 2.65 0.662 1.320 1.980 1.320
E2.S5 E2.S6 E2.S7 E2.S8 E3.S1 E3.S2 E3.S3 E3.S4 E3.S5 E3.S6 E3.S7 E3.S8 E4.S1 E4.S2 E4.S3 E4.S4
1 1.320 1.32 2.65 2.650 0.662 0.000 1.320 2.650 1.320 0.000 1.320 1.320 0.000 0.000 0.662 0.662
2 0.000 0.00 1.98 0.662 0.000 0.662 0.000 0.662 1.980 1.980 0.662 1.320 0.000 0.000 0.000 0.662
3 0.662 1.98 2.65 1.980 0.000 0.662 0.662 1.320 0.662 0.000 1.320 3.310 0.662 0.000 1.980 0.662
4 0.662 1.32 1.32 0.662 0.000 0.662 0.662 0.662 0.662 0.662 0.662 0.000 0.000 0.662 0.000 0.000
5 0.000 1.32 1.32 0.662 0.662 0.000 0.000 0.662 0.000 0.662 1.320 0.662 0.000 0.000 0.000 0.662
6 1.320 1.32 1.32 0.000 1.320 0.000 0.000 0.662 1.320 0.000 0.662 0.662 0.662 1.320 0.000 0.000
E4.S5 E4.S6 E4.S7 E4.S8 FP5.S1 FP5.S2 FP5.S3 FP5.S4 FP5.S5 FP5.S6 FP5.S7 FP5.S8 FP6.S1 FP6.S2
1 0.000 0.662 0.662 0.000 0.331 0 0.662 0.000 0.662 0 0.000 0.331 0 0.331
2 0.000 0.000 0.662 0.662 0.331 0 0.662 0.000 0.662 0 0.000 0.331 0 0.331
3 0.662 0.000 0.662 1.320 0.000 0 0.662 0.000 0.331 0 0.000 0.000 0 0.000
4 0.662 0.662 0.000 0.662 0.000 0 0.662 0.000 0.331 0 0.000 0.000 0 0.000
5 0.000 0.000 0.662 0.000 0.331 0 0.000 0.331 0.331 0 0.331 0.000 0 0.000
6 0.000 0.000 0.662 0.662 0.331 0 0.000 0.331 0.331 0 0.331 0.000 0 0.000
FP6.S3 FP6.S4 FP6.S5 FP6.S6 FP6.S7 FP6.S8 FP7.S1 FP7.S2 FP7.S3 FP7.S4 FP7.S5 FP7.S6 FP7.S7 FP7.S8
1 0.331 0.000 0.000 0.000 0 0.000 0 0.331 0.331 0.662 0 0.000 0.331 0.000
2 0.331 0.000 0.000 0.000 0 0.000 0 0.331 0.331 0.662 0 0.000 0.331 0.000
3 0.662 0.000 0.662 0.000 0 0.331 0 0.000 0.000 0.331 0 0.000 0.000 0.000
4 0.662 0.000 0.662 0.000 0 0.331 0 0.000 0.000 0.331 0 0.000 0.000 0.000
5 0.000 0.662 0.000 0.992 0 0.000 0 0.000 0.000 0.000 0 0.331 0.000 0.331
6 0.000 0.662 0.000 0.992 0 0.000 0 0.000 0.000 0.000 0 0.331 0.000 0.331
PA.LEFS60S1 PA.LEFS60S2 PA.LEFS60S3 PA.LEFS60S4 PA.LEFS60S5 PA.LEFS60S6 PA.LEFS60S7 PA.LEFS60S8
1 64.2 52.0 70.9 105.0 144 170 134 96.2
2 62.6 49.5 68.8 104.0 142 168 134 95.4
3 62.7 47.7 66.2 101.0 140 167 135 96.5
4 62.4 46.3 64.4 99.3 138 166 135 96.7
5 59.9 43.7 63.2 98.8 138 164 133 94.8
6 62.3 45.7 63.7 98.7 137 166 136 96.9
BX BY BZ Bmag....nT. X datetime
1 2.64 4.98 2.25 6.07 NA 2000-05-01 00:01:38
2 2.67 5.16 2.03 6.15 NA 2000-05-01 00:01:50
3 2.52 5.35 1.88 6.21 NA 2000-05-01 00:02:02
4 2.43 5.45 1.74 6.22 NA 2000-05-01 00:02:14
5 2.53 5.46 1.46 6.19 NA 2000-05-01 00:02:26
6 2.29 5.26 1.61 5.96 NA 2000-05-01 00:02:38
dput(head(ulyDataLefs60_12))
structure(list(Year = c(2000L, 2000L, 2000L, 2000L, 2000L, 2000L
), Day = c(122L, 122L, 122L, 122L, 122L, 122L), Hour = c(0L,
0L, 0L, 0L, 0L, 0L), Min = c(1L, 1L, 2L, 2L, 2L, 2L), Sec. = c(38.01,
50.1, 2.19, 14.28, 26.38, 38.47), E1.S1 = c(3.31, 1.98, 1.98,
2.65, 3.97, 2.65), E1.S2 = c(0.662, 3.31, 1.32, 1.32, 6.62, 0.662
), E1.S3 = c(0.662, 1.98, 3.97, 2.65, 0.662, 3.31), E1.S4 = c(2.65,
1.98, 1.98, 3.31, 3.31, 1.98), E1.S5 = c(1.32, 1.98, 1.32, 2.65,
3.31, 1.32), E1.S6 = c(0, 1.32, 0.662, 1.32, 4.63, 1.98), E1.S7 = c(3.31,
4.63, 0.662, 3.97, 5.29, 1.98), E1.S8 = c(1.32, 1.32, 3.97, 2.65,
1.98, 2.65), E2.S1 = c(1.98, 1.32, 1.32, 2.65, 0, 0.662), E2.S2 = c(1.98,
0.662, 0.662, 0, 0, 1.32), E2.S3 = c(0.662, 0, 1.32, 0.662, 1.98,
1.98), E2.S4 = c(0, 3.31, 0.662, 2.65, 0.662, 1.32), E2.S5 = c(1.32,
0, 0.662, 0.662, 0, 1.32), E2.S6 = c(1.32, 0, 1.98, 1.32, 1.32,
1.32), E2.S7 = c(2.65, 1.98, 2.65, 1.32, 1.32, 1.32), E2.S8 = c(2.65,
0.662, 1.98, 0.662, 0.662, 0), E3.S1 = c(0.662, 0, 0, 0, 0.662,
1.32), E3.S2 = c(0, 0.662, 0.662, 0.662, 0, 0), E3.S3 = c(1.32,
0, 0.662, 0.662, 0, 0), E3.S4 = c(2.65, 0.662, 1.32, 0.662, 0.662,
0.662), E3.S5 = c(1.32, 1.98, 0.662, 0.662, 0, 1.32), E3.S6 = c(0,
1.98, 0, 0.662, 0.662, 0), E3.S7 = c(1.32, 0.662, 1.32, 0.662,
1.32, 0.662), E3.S8 = c(1.32, 1.32, 3.31, 0, 0.662, 0.662), E4.S1 = c(0,
0, 0.662, 0, 0, 0.662), E4.S2 = c(0, 0, 0, 0.662, 0, 1.32), E4.S3 = c(0.662,
0, 1.98, 0, 0, 0), E4.S4 = c(0.662, 0.662, 0.662, 0, 0.662, 0
), E4.S5 = c(0, 0, 0.662, 0.662, 0, 0), E4.S6 = c(0.662, 0, 0,
0.662, 0, 0), E4.S7 = c(0.662, 0.662, 0.662, 0, 0.662, 0.662),
E4.S8 = c(0, 0.662, 1.32, 0.662, 0, 0.662), FP5.S1 = c(0.331,
0.331, 0, 0, 0.331, 0.331), FP5.S2 = c(0, 0, 0, 0, 0, 0),
FP5.S3 = c(0.662, 0.662, 0.662, 0.662, 0, 0), FP5.S4 = c(0,
0, 0, 0, 0.331, 0.331), FP5.S5 = c(0.662, 0.662, 0.331, 0.331,
0.331, 0.331), FP5.S6 = c(0, 0, 0, 0, 0, 0), FP5.S7 = c(0,
0, 0, 0, 0.331, 0.331), FP5.S8 = c(0.331, 0.331, 0, 0, 0,
0), FP6.S1 = c(0, 0, 0, 0, 0, 0), FP6.S2 = c(0.331, 0.331,
0, 0, 0, 0), FP6.S3 = c(0.331, 0.331, 0.662, 0.662, 0, 0),
FP6.S4 = c(0, 0, 0, 0, 0.662, 0.662), FP6.S5 = c(0, 0, 0.662,
0.662, 0, 0), FP6.S6 = c(0, 0, 0, 0, 0.992, 0.992), FP6.S7 = c(0,
0, 0, 0, 0, 0), FP6.S8 = c(0, 0, 0.331, 0.331, 0, 0), FP7.S1 = c(0,
0, 0, 0, 0, 0), FP7.S2 = c(0.331, 0.331, 0, 0, 0, 0), FP7.S3 = c(0.331,
0.331, 0, 0, 0, 0), FP7.S4 = c(0.662, 0.662, 0.331, 0.331,
0, 0), FP7.S5 = c(0, 0, 0, 0, 0, 0), FP7.S6 = c(0, 0, 0,
0, 0.331, 0.331), FP7.S7 = c(0.331, 0.331, 0, 0, 0, 0), FP7.S8 = c(0,
0, 0, 0, 0.331, 0.331), PA.LEFS60S1 = c(64.2, 62.6, 62.7,
62.4, 59.9, 62.3), PA.LEFS60S2 = c(52, 49.5, 47.7, 46.3,
43.7, 45.7), PA.LEFS60S3 = c(70.9, 68.8, 66.2, 64.4, 63.2,
63.7), PA.LEFS60S4 = c(105, 104, 101, 99.3, 98.8, 98.7),
PA.LEFS60S5 = c(144, 142, 140, 138, 138, 137), PA.LEFS60S6 = c(170,
168, 167, 166, 164, 166), PA.LEFS60S7 = c(134, 134, 135,
135, 133, 136), PA.LEFS60S8 = c(96.2, 95.4, 96.5, 96.7, 94.8,
96.9), BX = c(2.64, 2.67, 2.52, 2.43, 2.53, 2.29), BY = c(4.98,
5.16, 5.35, 5.45, 5.46, 5.26), BZ = c(2.25, 2.03, 1.88, 1.74,
1.46, 1.61), Bmag....nT. = c(6.07, 6.15, 6.21, 6.22, 6.19,
5.96), X = c(NA, NA, NA, NA, NA, NA), datetime = structure(list(
sec = c(38, 50, 2, 14, 26, 38), min = c(1L, 1L, 2L, 2L,
2L, 2L), hour = c(0L, 0L, 0L, 0L, 0L, 0L), mday = c(1L,
1L, 1L, 1L, 1L, 1L), mon = c(4L, 4L, 4L, 4L, 4L, 4L),
year = c(100L, 100L, 100L, 100L, 100L, 100L), wday = c(1L,
1L, 1L, 1L, 1L, 1L), yday = c(121L, 121L, 121L, 121L,
121L, 121L), isdst = c(1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("sec",
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst"
), class = c("POSIXlt", "POSIXt"))), .Names = c("Year", "Day",
"Hour", "Min", "Sec.", "E1.S1", "E1.S2", "E1.S3", "E1.S4", "E1.S5",
"E1.S6", "E1.S7", "E1.S8", "E2.S1", "E2.S2", "E2.S3", "E2.S4",
"E2.S5", "E2.S6", "E2.S7", "E2.S8", "E3.S1", "E3.S2", "E3.S3",
"E3.S4", "E3.S5", "E3.S6", "E3.S7", "E3.S8", "E4.S1", "E4.S2",
"E4.S3", "E4.S4", "E4.S5", "E4.S6", "E4.S7", "E4.S8", "FP5.S1",
"FP5.S2", "FP5.S3", "FP5.S4", "FP5.S5", "FP5.S6", "FP5.S7", "FP5.S8",
"FP6.S1", "FP6.S2", "FP6.S3", "FP6.S4", "FP6.S5", "FP6.S6", "FP6.S7",
"FP6.S8", "FP7.S1", "FP7.S2", "FP7.S3", "FP7.S4", "FP7.S5", "FP7.S6",
"FP7.S7", "FP7.S8", "PA.LEFS60S1", "PA.LEFS60S2", "PA.LEFS60S3",
"PA.LEFS60S4", "PA.LEFS60S5", "PA.LEFS60S6", "PA.LEFS60S7", "PA.LEFS60S8",
"BX", "BY", "BZ", "Bmag....nT.", "X", "datetime"), row.names = c(NA,
6L), class = "data.frame")
我想要的是每隔一定行数计算一次平均值和中位数。比方说,我想得到一个新的数据帧,它不包含所有原始值,而是所有列(或者至少是从 E1.S1 列开始的所有列)中每 5 行的平均值或中位数。
我首先看了一个例子:计算行的平均数,它确实让我能够获得数据帧的单列N行的平均值。
# Reshape one column into a 5-row matrix (filled column by column, so each
# matrix column holds one block of 5 consecutive rows) and reduce each block.
# Note: colSums() returns the block sums; colMeans() would return the means.
# matrix() recycles values (with a warning) when the column length is not a
# multiple of 5.
ulyDataLefs60_12_avg = colSums(matrix(ulyDataLefs60_12$E1.S1, nrow=5))
问题是,我想使用的 R 函数 colSums 不适用于某些字段,即日期时间字段(原因很明显),所以我无法将其应用于所有列来得到一个完整的平均值数据帧。
# Fails: a data.frame is a list of columns, so matrix() does not produce a
# numeric matrix from it, and colSums() rejects the result with
# "'x' must be numeric" (the error shown below).
ulyDataLefs60_12_avg = colSums(matrix(ulyDataLefs60_12, nrow=5))
Error in colSums(matrix(ulyDataLefs60_12, nrow = 5)) :
'x' must be numeric
我希望用每 5 行开头的 datetime 值作为该组平均值和中位数的时间戳(如果能把 datetime 设在这 5 个值区间的中心会更好),但到目前为止,我还没有找到能同时做到这两点的办法。
也许这是一件很容易做的事,但它让我头疼。
对于此数据:
> dput(df)
# Six-row sample holding the first 17 columns of the data (time fields plus
# E1.S1 ... E2.S4). Year/Day/Hour/Min are integer columns, the rest are double.
# Row names are the character strings "1" to "6".
df <- data.frame(
  Year  = rep(2000L, 6L),
  Day   = rep(122L, 6L),
  Hour  = rep(0L, 6L),
  Min   = c(1L, 1L, 2L, 2L, 2L, 2L),
  Sec.  = c(38.01, 50.1, 2.19, 14.28, 26.38, 38.47),
  E1.S1 = c(3.31, 1.98, 1.98, 2.65, 3.97, 2.65),
  E1.S2 = c(0.662, 3.31, 1.32, 1.32, 6.62, 0.662),
  E1.S3 = c(0.662, 1.98, 3.97, 2.65, 0.662, 3.31),
  E1.S4 = c(2.65, 1.98, 1.98, 3.31, 3.31, 1.98),
  E1.S5 = c(1.32, 1.98, 1.32, 2.65, 3.31, 1.32),
  E1.S6 = c(0, 1.32, 0.662, 1.32, 4.63, 1.98),
  E1.S7 = c(3.31, 4.63, 0.662, 3.97, 5.29, 1.98),
  E1.S8 = c(1.32, 1.32, 3.97, 2.65, 1.98, 2.65),
  E2.S1 = c(1.98, 1.32, 1.32, 2.65, 0, 0.662),
  E2.S2 = c(1.98, 0.662, 0.662, 0, 0, 1.32),
  E2.S3 = c(0.662, 0, 1.32, 0.662, 1.98, 1.98),
  E2.S4 = c(0, 3.31, 0.662, 2.65, 0.662, 1.32),
  row.names = as.character(1:6)
)
这样做是可行的:
# Column means within consecutive blocks of 5 rows. Returns a list with one
# named numeric vector per block; the last block may hold fewer than 5 rows.
lapply(
  split(seq_len(nrow(df)), (seq_len(nrow(df)) - 1) %/% 5 + 1),
  function(rows) colMeans(df[rows, , drop = FALSE])
)
# $`1`
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7
# 2000.0000 122.0000 0.0000 1.6000 26.1920 2.7780 2.6464 1.9848 2.6460 2.1160 1.5864 3.5724
# E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# 2.2480 1.4540 0.6608 0.9248 1.4568
#
# $`2`
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7 E1.S8
# 2000.000 122.000 0.000 2.000 38.470 2.650 0.662 3.310 1.980 1.320 1.980 1.980 2.650
# E2.S1 E2.S2 E2.S3 E2.S4
# 0.662 1.320 1.980 1.320
#
然后你可以直接用 rbind 把它们合并起来:
# Same per-block column means as above, stacked into a matrix with one row per
# block of 5 rows and one column per variable.
do.call(
  rbind,
  lapply(
    split(seq_len(nrow(df)), (seq_len(nrow(df)) - 1) %/% 5 + 1),
    function(rows) colMeans(df[rows, , drop = FALSE])
  )
)
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7 E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# 1 2000 122 0 1.6 26.192 2.778 2.6464 1.9848 2.646 2.116 1.5864 3.5724 2.248 1.454 0.6608 0.9248 1.4568
# 2 2000 122 0 2.0 38.470 2.650 0.6620 3.3100 1.980 1.320 1.9800 1.9800 2.650 0.662 1.3200 1.9800 1.3200
注意:最好先执行下面的命令,确认要计算 mean 的所有列都是 integer 或 numeric 类型:
# List the class of every column; colMeans() needs all of them to be numeric
# (integer or double).
> sapply(df, class)
# Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7
# "integer" "integer" "integer" "integer" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# E1.S8 E2.S1 E2.S2 E2.S3 E2.S4
# "numeric" "numeric" "numeric" "numeric" "numeric"
编辑:根据提问者(OP)的评论:
# Group id for every row of `dd`: rows 1-5 -> 1, rows 6-10 -> 2, ...
idx <- ceiling(seq_len(nrow(dd)) / 5)
# Column means per group on all columns except the last one, which is assumed
# to be the non-numeric "datetime" column (adjust if the column order differs).
res <- lapply(split(dd[-ncol(dd)], idx), colMeans, na.rm = TRUE)
# Label each group with the first "datetime" value of its 5-row block.
# The positions must be derived from `dd` itself (not from another object such
# as `df`), otherwise the number of labels does not match the number of groups.
names(res) <- as.character(dd$datetime[seq(1, nrow(dd), by = 5)])
# Stack the per-group vectors into a matrix with one row per group.
res <- do.call(rbind, res)
或者,如果您希望得到一个 data.frame,并把 datetime 作为其中一列:
# Group id for every row: consecutive blocks of 5 rows share one id.
idx <- ceiling(seq_len(nrow(dd)) / 5)
# Build the result step by step: split all columns except the last one into
# the 5-row blocks, average every column per block, then stack the per-block
# vectors into a data.frame with one row per block.
res <- split(dd[-ncol(dd)], idx)
res <- lapply(res, colMeans, na.rm = TRUE)
res <- as.data.frame(do.call(rbind, res))
# Attach the first "datetime" value of each block as a regular column.
res$datetime <- dd$datetime[seq(1, nrow(dd), by = 5)]
您可以将数据视为时间序列,然后使用 xts 包中的 period.apply 函数:
# xts(), endpoints() and period.apply() all come from the xts package.
library(xts)

# Build an xts series from every column except the last one, indexed by the
# "datetime" column.
dat.xts <- xts(dat[, -ncol(dat)], order.by = dat$datetime)
# Group by calendar minute. The sample rows are about 12 seconds apart, so one
# minute holds roughly five rows; endpoints() cuts on clock boundaries, so a
# group is not guaranteed to contain exactly five rows. Use a coarser period
# (e.g. on = "hours") when more data is available.
pts <- endpoints(dat.xts, on = "mins")
# Apply mean() to each period; each result row is stamped with the last
# timestamp of its period.
period.apply(dat.xts, pts, mean)
Year Day Hour Min Sec. E1.S1 E1.S2 E1.S3 E1.S4 E1.S5 E1.S6 E1.S7 E1.S8 E2.S1 E2.S2 E2.S3 E2.S4 E2.S5 E2.S6
2000-05-01 00:01:50 2000 122 0 1 44.055 2.6450 1.9860 1.321 2.315 1.65 0.660 3.9700 1.3200 1.650 1.3210 0.3310 1.6550 0.660 0.660
2000-05-01 00:02:38 2000 122 0 2 20.330 2.8125 2.4805 2.648 2.645 2.15 2.148 2.9755 2.8125 1.158 0.4955 1.4855 1.3235 0.661 1.485
编辑:演示如何将 xts 对象转换为 data.frame:
要用 ggplot2 绘制数据,需要先将 xts 对象强制转换为 data.frame。例如,可以这样做:
# Flatten the xts object into a plain data.frame: the time index becomes the
# `date` column and the remaining columns come from the underlying data matrix.
# Note that this overwrites the earlier `dat` object.
dat <- data.frame(date=index(dat.xts),coredata(dat.xts))
然后绘制E1.S1与日期的关系图:
library(ggplot2)
# Line plot of the E1.S1 column against the date column of `dat`.
ggplot(dat, aes(x = date, y = E1.S1)) +
  geom_line()