我有一个数据集,如示例数据集员工中给出的:
s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 10-Jan-19 10:45 11:45 1
2 George 10-Jan-19 15-Jan-19 06:00 09:00 3
3 George 15-Jan-19 15-Jan-19 12:00 03:00 3
4 George 5-Feb-19 18-Feb-19 12:50 14:50 2
5 Jacob 2-Feb-19 20-Feb-19 15:50 16:50 1
6 Jacob 20-Feb-19 24-Feb-19 14:30 18:30 4
7 Jacob 3-Dec-19 25-Dec-19 06:40 11:40 5
8 Jacob 25-Dec-19 30-Dec-19 09:40 12:40 3
9 Mike 02-Jun-19 02-Jun-19 6:40 07:40 1
10 Mike 02-Jun-19 02-Jun-19 2:45 3:45 1
11 Mike 02-Jun-19 20-Jun-19 10:00 12:00 2
12 Mike 23-Jun-19 25-Jun-19 4:00 5:00 1
我想要的输出是:
s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 15-Jan-19 10:45 03:00 7
2 George 5-Feb-19 18-Feb-19 12:50 14:50 2
3 Jacob 2-Feb-19 24-Feb-19 15:50 18:30 5
4 Jacob 3-Dec-19 30-Dec-19 06:40 12:40 8
5 Mike 2-Jun-19 20-Jun-19 6:40 12:00 4
6 Mike 23-Jun-19 25-Jun-19 4:00 5:00 1
我正在使用 dplyr库来总结这一点,但我在使用这段代码时遇到了一些问题,但我没有得到我想要的输出
employee <- employee %>% group_by(e.name) %>% summarise(
s.date=first(s.date),
e.date=last(e.date),
s.time=first(s.time),
e.time=last(e.time),
total.hrs=sum(total.hrs))
从我的代码中,我得到的结果如下
s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 18-Jan-19 10:45 14:50 6
2 Jacob 2-Feb-19 30-Dec-19 15:50 12:40 12
但我想要这样的结果
s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 15-Jan-19 10:45 03:00 7
2 George 5-Feb-19 18-Feb-19 12:50 14:50 2
3 Jacob 2-Feb-19 24-Feb-19 15:50 18:30 5
4 Jacob 3-Dec-19 30-Dec-19 06:40 12:40 8
5 Mike 2-Jun-19 20-Jun-19 6:40 12:00 4
6 Mike 23-Jun-19 25-Jun-19 4:00 5:00 1
with data.table...
library(data.table)
setDT(DT)
res = DT[, .(
s.no = first(s.no),
s.date = first(s.date),
e.date = last(e.date),
s.time = first(s.time),
e.time = last(e.time),
total.hrs = sum(total.hrs)
), by=.(e.name, .g = cumsum(s.date != shift(e.date, fill=first(s.date))))]
res[, .g := NULL]
e.name s.no s.date e.date s.time e.time total.hrs
1: George 1 1-Jan-19 15-Jan-19 10:45 03:00 7
2: George 4 5-Feb-19 18-Feb-19 12:50 14:50 2
3: Jacob 5 2-Feb-19 24-Feb-19 15:50 18:30 5
4: Jacob 7 3-Dec-19 30-Dec-19 06:40 12:40 8
5: Mike 9 02-Jun-19 20-Jun-19 6:40 12:00 4
6: Mike 12 23-Jun-19 25-Jun-19 4:00 5:00 1
dplyr 中的模拟:
library(dplyr)
DT %>% group_by(e.name, .g = cumsum(s.date != lag(e.date, default=first(s.date)))) %>%
summarise(
s.no = first(s.no),
s.date = first(s.date),
e.date = last(e.date),
s.time = first(s.time),
e.time = last(e.time),
total.hrs = sum(total.hrs)
) %>% select(-.g)
# A tibble: 6 x 7
# Groups: e.name [3]
e.name s.no s.date e.date s.time e.time total.hrs
<chr> <int> <chr> <chr> <chr> <chr> <int>
1 George 1 1-Jan-19 15-Jan-19 10:45 03:00 7
2 George 4 5-Feb-19 18-Feb-19 12:50 14:50 2
3 Jacob 5 2-Feb-19 24-Feb-19 15:50 18:30 5
4 Jacob 7 3-Dec-19 30-Dec-19 06:40 12:40 8
5 Mike 9 02-Jun-19 20-Jun-19 6:40 12:00 4
6 Mike 12 23-Jun-19 25-Jun-19 4:00 5:00 1
这个答案...
- 跳过读取数据
- 跳过将日期和时间转换为正确格式
- 假设数据已排序
@Mouad的答案更彻底,因为它修复了这些问题(就像OP应该处理它们的真实数据一样)。那里的方法也基本相同 - 按e.name
和标志cumsum
分组,以表示s.date
从其先前/移动/滞后值更改的位置。
数据
library(data.table)
DT = fread("s.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 10-Jan-19 10:45 11:45 1
2 George 10-Jan-19 15-Jan-19 06:00 09:00 3
3 George 15-Jan-19 15-Jan-19 12:00 03:00 3
4 George 5-Feb-19 18-Feb-19 12:50 14:50 2
5 Jacob 2-Feb-19 20-Feb-19 15:50 16:50 1
6 Jacob 20-Feb-19 24-Feb-19 14:30 18:30 4
7 Jacob 3-Dec-19 25-Dec-19 06:40 11:40 5
8 Jacob 25-Dec-19 30-Dec-19 09:40 12:40 3
9 Mike 02-Jun-19 02-Jun-19 6:40 07:40 1
10 Mike 02-Jun-19 02-Jun-19 2:45 3:45 1
11 Mike 02-Jun-19 20-Jun-19 10:00 12:00 2
12 Mike 23-Jun-19 25-Jun-19 4:00 5:00 1")
require(dplyr)
my_df <- read.table(text =
's.no e.name s.date e.date s.time e.time total.hrs
1 George 1-Jan-19 10-Jan-19 10:45 11:45 1
2 George 10-Jan-19 15-Jan-19 06:00 09:00 3
3 George 5-Feb-19 18-Feb-19 12:50 14:50 2
4 Jacob 2-Feb-19 20-Feb-19 15:50 16:50 1
5 Jacob 20-Feb-19 24-Feb-19 14:30 18:30 4
5 Jacob 3-Dec-19 25-Dec-19 06:40 11:40 5
6 Jacob 25-Dec-19 30-Dec-19 09:40 12:40 3',
header = TRUE, stringsAsFactors = FALSE) %>% as_tibble()
my_df <-
my_df %>%
mutate(s.date2 = as.Date(s.date, '%d-%B-%y'),
e.date2 = as.Date(e.date, '%d-%B-%y')) %>%
arrange(e.name, s.date2) %>%
group_by(e.name) %>%
mutate(lag_e.date2 = lag(e.date2,1)) %>%
ungroup %>%
mutate(new_episode = as.numeric(is.na(lag_e.date2) | s.date2 != lag_e.date2 )) %>%
mutate(episode = cumsum(new_episode)) %>%
group_by(episode) %>%
mutate(asc_rank = rank(s.date2),
desc_rank = rank(desc(s.date2)),
sum_hours = sum(total.hrs)) %>%
ungroup
然后
my_df %>%
mutate(s.date_new = if_else(asc_rank ==1, s.date2, as.Date('1900-01-01')),
s.time_new = if_else(asc_rank ==1, s.time, '00:00'),
e.date_new = if_else(desc_rank ==1, e.date2, as.Date('1900-01-01')),
e.time_new = if_else(desc_rank ==1, e.time, '00:00')) %>%
select(e.name, s.date_new, e.date_new, s.time_new, e.time_new, sum_hours, episode) %>%
group_by(episode) %>%
mutate(s.date = max(s.date_new, na.rm = TRUE),
e.date = max(e.date_new, na.rm = TRUE),
s.time = max(s.time_new, na.rm = TRUE),
e.time = max(e.time_new, na.rm = TRUE),
sum_hours = max(sum_hours)) %>%
ungroup %>%
select(e.name, s.date, e.date, s.time, e.time, sum_hours) %>% distinct
# # A tibble: 4 x 6
# e.name s.date e.date s.time e.time sum_hours
# <chr> <date> <date> <chr> <chr> <dbl>
# 1 George 2019-01-01 2019-01-15 10:45 09:00 4
# 2 George 2019-02-05 2019-02-18 12:50 14:50 2
# 3 Jacob 2019-02-02 2019-02-24 15:50 18:30 5
# 4 Jacob 2019-12-03 2019-12-30 06:40 12:40 8
# >
#