我有一个包含变量ID、月份(或期间)和当月收入的数据集。如果客户在未来3个月内有购买,我需要标记为1,否则标记为0,并对所有ID执行此操作。例如,如果当前是第1个月,而该客户在接下来的3个月内有一次购买,那么该客户的这一行就填1。在每个ID的最后几个期间,由于后面不足3个月,结果应为NA。
# Sample data: 3 customers (ID) observed over 13 consecutive months each.
# Incomes is the amount purchased in that month (0 means no purchase).
# All three columns are integer, and the result is a plain data.frame.
df <- data.frame(
  ID = rep(1:3, each = 13L),
  Month = rep(1:13, times = 3L),
  Incomes = c(
    # ID 1, months 1-13
    5000L, 0L, 0L, 0L, 0L, 0L, 400L, 300L, 0L, 0L, 0L, 0L, 400L,
    # ID 2, months 1-13
    0L, 100L, 0L, 0L, 0L, 0L, 0L, 1500L, 0L, 0L, 0L, 100L, 750L,
    # ID 3, months 1-13
    0L, 0L, 0L, 0L, 700L, 240L, 100L, 0L, 0L, 0L, 0L, 500L, 760L
  )
)
# ID Month Incomes
# 1 1 5000
# 1 2 0
# 1 3 0
# 1 4 0
# 1 5 0
# 1 6 0
# 1 7 400
# 1 8 300
# 1 9 0
# 1 10 0
# 1 11 0
# 1 12 0
# 1 13 400
# 2 1 0
# 2 2 100
# 2 3 0
# 2 4 0
# 2 5 0
# 2 6 0
# 2 7 0
# 2 8 1500
# 2 9 0
# 2 10 0
# 2 11 0
# 2 12 100
# 2 13 750
# 3 1 0
# 3 2 0
# 3 3 0
# 3 4 0
# 3 5 700
# 3 6 240
# 3 7 100
# 3 8 0
# 3 9 0
# 3 10 0
# 3 11 0
# 3 12 500
# 3 13 760
我希望应该是这样的:
# Expected result: Quarter is 1 when the customer has any positive income in
# the following three months, 0 when there is none, and NA for the last three
# months of each ID (fewer than three months remain).
# Fix: ID 1 / month 10 is 1, not 0, because month 13 has an income of 400.
dffinal <- tibble::tribble(
  ~ID_RUT, ~Month, ~Incomes, ~Quarter,
  1L, 1L, 5000L, 0L,
  1L, 2L, 0L, 0L,
  1L, 3L, 0L, 0L,
  1L, 4L, 0L, 1L,
  1L, 5L, 0L, 1L,
  1L, 6L, 0L, 1L,
  1L, 7L, 400L, 1L,
  1L, 8L, 300L, 0L,
  1L, 9L, 0L, 0L,
  1L, 10L, 0L, 1L,
  1L, 11L, 0L, NA,
  1L, 12L, 0L, NA,
  1L, 13L, 400L, NA,
  2L, 1L, 0L, 1L,
  2L, 2L, 100L, 0L,
  2L, 3L, 0L, 0L,
  2L, 4L, 0L, 0L,
  2L, 5L, 0L, 1L,
  2L, 6L, 0L, 1L,
  2L, 7L, 0L, 1L,
  2L, 8L, 1500L, 0L,
  2L, 9L, 0L, 1L,
  2L, 10L, 0L, 1L,
  2L, 11L, 0L, NA,
  2L, 12L, 100L, NA,
  2L, 13L, 750L, NA,
  3L, 1L, 0L, 0L,
  3L, 2L, 0L, 1L,
  3L, 3L, 0L, 1L,
  3L, 4L, 0L, 1L,
  3L, 5L, 700L, 1L,
  3L, 6L, 240L, 1L,
  3L, 7L, 100L, 0L,
  3L, 8L, 0L, 0L,
  3L, 9L, 0L, 1L,
  3L, 10L, 0L, 1L,
  3L, 11L, 0L, NA,
  3L, 12L, 500L, NA,
  3L, 13L, 760L, NA
)
# ID Month Incomes Quarterly
# 1 1 5000 0
# 1 2 0 0
# 1 3 0 0
# 1 4 0 1
# 1 5 0 1
# 1 6 0 1
# 1 7 400 1
# 1 8 300 0
# 1 9 0 0
# 1 10 0 1
# 1 11 0 NA
# 1 12 0 NA
# 1 13 400 NA
# 2 1 0 1
# 2 2 100 0
# 2 3 0 0
# 2 4 0 0
# 2 5 0 1
# 2 6 0 1
# 2 7 0 1
# 2 8 1500 0
# 2 9 0 1
# 2 10 0 1
# 2 11 0 NA
# 2 12 100 NA
# 2 13 750 NA
# 3 1 0 0
# 3 2 0 1
# 3 3 0 1
# 3 4 0 1
# 3 5 700 1
# 3 6 240 1
# 3 7 100 0
# 3 8 0 0
# 3 9 0 1
# 3 10 0 1
# 3 11 0 NA
# 3 12 500 NA
# 3 13 760 NA
有人知道怎么做吗?感谢您抽出时间
1)使用rollapply沿Incomes > 0向前滚动:窗口内只要有一个值为TRUE就返回TRUE,否则返回FALSE。使用+将结果转换为数字。1:3表示使用相对当前点的偏移1、2、3,即接下来的三个收入。如果在每组末尾剩余不足三个月的位置,也想只根据剩下的一个或两个收入来计算,请给rollapply加上partial = TRUE参数。
library(dplyr)
library(zoo)
# Per customer, look at offsets 1..3 ahead of the current row and record
# whether any of those months has positive income (1/0). Rows without three
# following months are filled with NA.
df %>%
  group_by(ID) %>%
  mutate(
    Quarter = as.integer(
      rollapply(Incomes > 0, width = list(1:3), FUN = any, fill = NA)
    )
  ) %>%
  ungroup()
2)SQL解决方案是:
library(sqldf)
# Window definition shared by the queries: within each ID, the frame is the
# three rows that follow the current row.
# NOTE(review): the window has no ORDER BY, so this presumably relies on df
# already being sorted by Month within each ID -- confirm.
over <- "partition by ID rows between 1 following and 3 following"
# The fn$ prefix makes $over inside the query string get replaced by the value
# of `over` before the query runs.
# max(Incomes > 0) over the frame is 1 if any following row has positive
# income, else 0. The case expression adds Null when the frame holds fewer
# than 3 rows, which turns Quarter into NA for the trailing rows of each ID.
fn$sqldf("select
*,
(max(Incomes > 0) over ($over)) +
(case when (count(*) over ($over)) = 3 then 0 else Null end) as Quarter
from df")
如果允许对后面不足3行的行也进行计算(而不是返回NA),查询可以简化如下。over沿用上面的定义:
# Simplified variant: no row-count check, so rows with only one or two
# following rows are evaluated on whatever rows remain in the frame.
# Uses the `over` string defined for the previous query.
fn$sqldf("select *, (max(Incomes > 0) over ($over)) as Quarter from df")
dplyr解决方案:使用lead取出接下来三个月的收入并求和,然后取结果的符号。
# Per customer, add up the incomes of the next three months and keep only the
# sign of that sum: 1 if anything was purchased, 0 if nothing was. lead()
# yields NA past the end of a group, so the last three rows of each ID are NA.
df %>%
  group_by(ID) %>%
  mutate(
    quarter = sign(
      lead(Incomes, n = 3L) + lead(Incomes, n = 2L) + lead(Incomes, n = 1L)
    )
  ) %>%
  as.data.frame()
#> ID Month Incomes quarter
#> 1 1 1 5000 0
#> 2 1 2 0 0
#> 3 1 3 0 0
#> 4 1 4 0 1
#> 5 1 5 0 1
#> 6 1 6 0 1
#> 7 1 7 400 1
#> 8 1 8 300 0
#> 9 1 9 0 0
#> 10 1 10 0 1
#> 11 1 11 0 NA
#> 12 1 12 0 NA
#> 13 1 13 400 NA
#> 14 2 1 0 1
#> 15 2 2 100 0
#> 16 2 3 0 0
#> 17 2 4 0 0
#> 18 2 5 0 1
#> 19 2 6 0 1
#> 20 2 7 0 1
#> 21 2 8 1500 0
#> 22 2 9 0 1
#> 23 2 10 0 1
#> 24 2 11 0 NA
#> 25 2 12 100 NA
#> 26 2 13 750 NA
#> 27 3 1 0 0
#> 28 3 2 0 1
#> 29 3 3 0 1
#> 30 3 4 0 1
#> 31 3 5 700 1
#> 32 3 6 240 1
#> 33 3 7 100 0
#> 34 3 8 0 0
#> 35 3 9 0 1
#> 36 3 10 0 1
#> 37 3 11 0 NA
#> 38 3 12 500 NA
#> 39 3 13 760 NA
另一个选项:
library(dplyr)
# Per customer, flag (1/0) whether any of the following three rows has a
# positive income. Rows with fewer than three following rows get NA.
df %>%
  group_by(ID) %>%
  mutate(
    Quarterly = c(
      # seq_len() yields an empty sequence for groups with three or fewer
      # rows, whereas 1:(n() - 3) would count backwards (1:0, 1:-1, ...) and
      # produce a vector of the wrong length. vapply() pins the result type
      # to integer even when there is nothing to iterate over.
      vapply(
        seq_len(max(n() - 3L, 0L)),
        function(x) +any(Incomes[(x + 1):(x + 3)] > 0),
        integer(1)
      ),
      # Pad with NA for the trailing rows, never more than the group size.
      rep(NA_integer_, min(n(), 3L))
    )
  ) %>%
  as.data.frame()
输出:
ID Month Incomes Quarterly
1 1 1 5000 0
2 1 2 0 0
3 1 3 0 0
4 1 4 0 1
5 1 5 0 1
6 1 6 0 1
7 1 7 400 1
8 1 8 300 0
9 1 9 0 0
10 1 10 0 1
11 1 11 0 NA
12 1 12 0 NA
13 1 13 400 NA
14 2 1 0 1
15 2 2 100 0
16 2 3 0 0
17 2 4 0 0
18 2 5 0 1
19 2 6 0 1
20 2 7 0 1
21 2 8 1500 0
22 2 9 0 1
23 2 10 0 1
24 2 11 0 NA
25 2 12 100 NA
26 2 13 750 NA
27 3 1 0 0
28 3 2 0 1
29 3 3 0 1
30 3 4 0 1
31 3 5 700 1
32 3 6 240 1
33 3 7 100 0
34 3 8 0 0
35 3 9 0 1
36 3 10 0 1
37 3 11 0 NA
38 3 12 500 NA
39 3 13 760 NA
以及等价的base R写法:
# Base R equivalent: ave() applies the function to the Incomes of each ID and
# puts the results back in the original row order. For each row the flag is 1
# if any of the next three incomes is positive, 0 otherwise, and NA when
# fewer than three rows follow.
transform(
  df,
  Quarterly = ave(Incomes, ID, FUN = function(x) {
    # Number of rows that have a full three-row look-ahead. Clamped at 0 so
    # groups with three or fewer rows do not produce a backwards 1:0 sequence.
    n_full <- max(length(x) - 3L, 0L)
    c(
      vapply(
        seq_len(n_full),
        function(y) +any(x[(y + 1):(y + 3)] > 0),
        integer(1)
      ),
      # Remaining rows of the group (at most three) have no full window.
      rep(NA_integer_, length(x) - n_full)
    )
  })
)