R count连续列中模式匹配的个数



我有一个'0'和'1'的数据帧,如下所示:

DATA <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
"V2" = c(1,0,0,0,1,1,0,1,1,1),
"V3" = c(0,0,0,0,1,0,0,1,1,1),
"V4" = c(1,1,1,0,1,1,0,1,1,1),
"V5" = c(0,0,0,0,1,1,0,1,1,1))

我想知道每行中'0'后跟下一列'1'的次数。如果第一列的值是'1',这也应该被计算在内。

我有一个循环,它将每一行绑定到一个向量中,然后使用stringi::stri_count_fixedstringr::str_count计算'01'的数量:

for(n in 1:nrow(DATA)) {
# Paste row into a single character vector, with extra 0 at start in case
# the first column value is 1.
STRING <- do.call(paste0, c(0, DATA[n, 1:ncol(DATA)]))
# Count number of 0-1 transitions.
COUNT <- stringr::str_count(STRING, pattern = "01")
# Add this to the summary column.
DATA$Count[n] <- COUNT
}

然而,对于我的真实数据集(3000 - 4000列),这两种方法都非常慢。有什么加快速度的办法吗?

所需输出:

> DATA$Count
[1] 2 1 1 0 1 2 0 1 1 1

一个可能的解决方案,在base R:

DATA$Count <- 
apply(DATA, 1, (x) x[1] + sum((x[2:length(x)] - x[1:(length(x)-1)]) > 0))
DATA
#>    V1 V2 V3 V4 V5 Count
#> 1   0  1  0  1  0     2
#> 2   0  0  0  1  0     1
#> 3   0  0  0  1  0     1
#> 4   0  0  0  0  0     0
#> 5   1  1  1  1  1     1
#> 6   1  1  0  1  1     2
#> 7   0  0  0  0  0     0
#> 8   1  1  1  1  1     1
#> 9   1  1  1  1  1     1
#> 10  1  1  1  1  1     1

使用dplyr:

DATA %>%
rowwise() %>%
mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
V1    V2    V3    V4    V5 count
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1     0     1     0     1     0     2
2     0     0     0     1     0     1
3     0     0     0     1     0     1
4     0     0     0     0     0     0
5     1     1     1     1     1     1
6     1     1     0     1     1     2
7     0     0     0     0     0     0
8     1     1     1     1     1     1
9     1     1     1     1     1     1
10     1     1     1     1     1     1
library(data.table)
DATA$Count = 
lapply(transpose(DATA), (x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |> 
unlist(use.names = FALSE)
# > DATA
#    V1 V2 V3 V4 V5 Count
# 1   0  1  0  1  0     2
# 2   0  0  0  1  0     1
# 3   0  0  0  1  0     1
# 4   0  0  0  0  0     0
# 5   1  1  1  1  1     1
# 6   1  1  0  1  1     2
# 7   0  0  0  0  0     0
# 8   1  1  1  1  1     1
# 9   1  1  1  1  1     1
# 10  1  1  1  1  1     1

基准:

df = data.table::setDF(lapply(seq_len(4000L), (x) sample(0L:1L, size = 100L, replace=TRUE)))

bench::mark(
sindri = {
lapply(transpose(df), (x) sum(shift(x, fill = 0L) == 0L & x == 1L)) |> 
unlist(use.names = FALSE)
},
tmfmnk = {
df %>%
rowwise() %>%
mutate(count = sum(diff(c(0, c_across(everything()))) == 1))
},
Yuriy = {
rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1) + (rowSums(df) == ncol(df))
},
iterations = 1L,
check = FALSE,
relative = TRUE
)

expression   min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                  time           gc              
<bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                  <list>         <list>          
1 sindri       1      1      113.        1         NaN     1     0    12.77ms <NULL> <Rprofmem [559 x 3]>    <bench_tm [1]> <tibble [1 x 3]>
2 tmfmnk     113.   113.       1        31.1       Inf     1     1      1.45s <NULL> <Rprofmem [30,205 x 3]> <bench_tm [1]> <tibble [1 x 3]>
3 Yuriy       35.8   35.8      3.17      1.37      NaN     1     0   456.42ms <NULL> <Rprofmem [8,260 x 3]>  <bench_tm [1]> <tibble [1 x 3]>

base

df <- data.frame("V1" = c(0,0,0,0,1,1,0,1,1,1),
"V2" = c(1,0,0,0,1,1,0,1,1,1),
"V3" = c(0,0,0,0,1,0,0,1,1,1),
"V4" = c(1,1,1,0,1,1,0,1,1,1),
"V5" = c(0,0,0,0,1,1,0,1,1,1))
df$Count <-
rowSums((df[1:(ncol(df) - 1)] - df[2:ncol(df)]) == 1) + 
(rowSums(df) == ncol(df))

df
#>    V1 V2 V3 V4 V5 Count
#> 1   0  1  0  1  0     2
#> 2   0  0  0  1  0     1
#> 3   0  0  0  1  0     1
#> 4   0  0  0  0  0     0
#> 5   1  1  1  1  1     1
#> 6   1  1  0  1  1     1
#> 7   0  0  0  0  0     0
#> 8   1  1  1  1  1     1
#> 9   1  1  1  1  1     1
#> 10  1  1  1  1  1     1

在2022-05-30由reprex包(v2.0.1)创建

最新更新