我有一个数据帧:
Date Mean_sum Current_value Working_perc Type1 Type2
2020-08-04 124 142 15% 1 A
2020-08-04 117 109 24% 1 B
2020-08-04 98 105 21% 1 C
2020-08-04 114 127 11% 2 A
2020-08-04 145 139 22% 2 B
2020-08-04 115 125 55% 2 C
2020-08-05 121 125 12% 1 A
2020-08-05 111 122 24% 1 B
2020-08-05 105 108 21% 1 C
2020-08-05 119 122 11% 2 A
2020-08-05 148 149 22% 2 B
2020-08-05 118 127 55% 2 C
我想将"日期"列从00:00:00扩展到04:00:00,每小时保持相同的值。我想将Mean_sum、Current_value和Working_perc列合并为一列:Metrics。所以,想要的结果必须是这样的:
Date Date_hours Metrics Value Type1 Type2
2020-08-04 2020-08-04 00:00:00 Mean_sum 124 1 A
2020-08-04 2020-08-04 01:00:00 Mean_sum 124 1 A
2020-08-04 2020-08-04 02:00:00 Mean_sum 124 1 A
2020-08-04 2020-08-04 03:00:00 Mean_sum 124 1 A
2020-08-04 2020-08-04 04:00:00 Mean_sum 124 1 A
2020-08-04 2020-08-04 00:00:00 Current_value 142 1 B
2020-08-04 2020-08-04 01:00:00 Current_value 142 1 B
2020-08-04 2020-08-04 02:00:00 Current_value 142 1 B
2020-08-04 2020-08-04 03:00:00 Current_value 142 1 B
2020-08-04 2020-08-04 04:00:00 Current_value 142 1 B
2020-08-04 2020-08-04 00:00:00 Working_perc 15% 1 C
2020-08-04 2020-08-04 01:00:00 Working_perc 15% 1 C
2020-08-04 2020-08-04 02:00:00 Working_perc 15% 1 C
2020-08-04 2020-08-04 03:00:00 Working_perc 15% 1 C
2020-08-04 2020-08-04 04:00:00 Working_perc 15% 1 C
.............................................................................
我怎么能那样做?我不知道
这与@thelatemail的答案基本相同,但使用了tidyverse
函数。
获取长格式的数据,添加行,在每个日期时间中添加小时。
library(dplyr)
library(tidyr)
df %>%
mutate(across(c(Mean_sum, Current_value,Working_perc), as.character),
Date = as.POSIXct(Date)) %>%
pivot_longer(cols = c(Mean_sum, Current_value,Working_perc)) %>%
uncount(5) %>%
mutate(Date = Date + lubridate::hours(0:4))
# A tibble: 180 x 5
# Date Type1 Type2 name value
# <dttm> <int> <chr> <chr> <chr>
# 1 2020-08-04 00:00:00 1 A Mean_sum 124
# 2 2020-08-04 01:00:00 1 A Mean_sum 124
# 3 2020-08-04 02:00:00 1 A Mean_sum 124
# 4 2020-08-04 03:00:00 1 A Mean_sum 124
# 5 2020-08-04 04:00:00 1 A Mean_sum 124
# 6 2020-08-04 00:00:00 1 A Current_value 142
# 7 2020-08-04 01:00:00 1 A Current_value 142
# 8 2020-08-04 02:00:00 1 A Current_value 142
# 9 2020-08-04 03:00:00 1 A Current_value 142
#10 2020-08-04 04:00:00 1 A Current_value 142
# … with 170 more rows
数据
df <- structure(list(Date = c("2020-08-04", "2020-08-04", "2020-08-04",
"2020-08-04", "2020-08-04", "2020-08-04", "2020-08-05", "2020-08-05",
"2020-08-05", "2020-08-05", "2020-08-05", "2020-08-05"), Mean_sum = c(124L,
117L, 98L, 114L, 145L, 115L, 121L, 111L, 105L, 119L, 148L, 118L
), Current_value = c(142L, 109L, 105L, 127L, 139L, 125L, 125L,
122L, 108L, 122L, 149L, 127L), Working_perc = c("15%", "24%",
"21%", "11%", "22%", "55%", "12%", "24%", "21%", "11%", "22%",
"55%"), Type1 = c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L), Type2 = c("A", "B", "C", "A", "B", "C", "A", "B", "C", "A",
"B", "C")), class = "data.frame", row.names = c(NA, -12L))
看起来像是从宽到长的重塑,然后重复每一行。第一部分大约有400万种方法,所以这里有一个基本的R版本:
## reshape to long format
idv <- match(c("Type1","Type2","Date"), names(dat))
out <- reshape(dat, idvar=idv, varying=list(-idv), direction="long",
v.names="Value", timevar="Metrics", times=names(dat)[-idv])
## repeat each row 5 times and add 0:4 hours in turn to each set of 5 rows
out <- out[rep(seq_len(nrow(out)), each=5),]
out$Date <- as.POSIXct(out$Date, tz="UTC") + as.difftime(0:4, units="hours")
rownames(out) <- NULL
## check a few rows at the start and end
rbind(head(out), tail(out))
# Date Type1 Type2 Metrics Value
#1 2020-08-04 00:00:00 1 A Mean_sum 124
#2 2020-08-04 01:00:00 1 A Mean_sum 124
#3 2020-08-04 02:00:00 1 A Mean_sum 124
#4 2020-08-04 03:00:00 1 A Mean_sum 124
#5 2020-08-04 04:00:00 1 A Mean_sum 124
#6 2020-08-04 00:00:00 1 B Mean_sum 117
#175 2020-08-05 04:00:00 2 B Working_perc 22%
#176 2020-08-05 00:00:00 2 C Working_perc 55%
#177 2020-08-05 01:00:00 2 C Working_perc 55%
#178 2020-08-05 02:00:00 2 C Working_perc 55%
#179 2020-08-05 03:00:00 2 C Working_perc 55%
#180 2020-08-05 04:00:00 2 C Working_perc 55%
我使用的数据是:
dat <- read.table(text=" Date Mean_sum Current_value Working_perc Type1 Type2
2020-08-04 124 142 15% 1 A
2020-08-04 117 109 24% 1 B
2020-08-04 98 105 21% 1 C
2020-08-04 114 127 11% 2 A
2020-08-04 145 139 22% 2 B
2020-08-04 115 125 55% 2 C
2020-08-05 121 125 12% 1 A
2020-08-05 111 122 24% 1 B
2020-08-05 105 108 21% 1 C
2020-08-05 119 122 11% 2 A
2020-08-05 148 149 22% 2 B
2020-08-05 118 127 55% 2 C", header=TRUE)