r语言 - 具有相同后缀的变量相乘和求和



我的数据框架有22个变量。这是一个简化的示例。变量包括x1 x2 y1_ y2_。我想创建一个新变量。变量值为x1*y1_+x2*y2_。代码如下:

df <- data.frame(x1=c(0,0,0,1),x2=c(0,0,0,1),y1_=c(3,0,2,1),y2_=c(1,0,0,1))
df$var <- df$x1*df$y1_+df$x2*df$y2_

如果没有。变量的个数为22,以上代码不合理。那么,如何得到这个变量呢?

根据列名拆分数据框,然后相乘,然后逐行求和:

x <- colnames(df)
df$var <- rowSums(df[, grepl("^x", x)] * df[, grepl("^y", x)])
df
#   x1 x2 y1_ y2_ var
# 1  0  0   3   1   0
# 2  0  0   0   0   0
# 3  0  0   2   0   0
# 4  1  1   1   1   2

底线:

df$var <- do.call(`+`, 
lapply(split.default(df, gsub(".*([0-9]+)_?$", "\1", names(df))),
function(z) apply(z, 1, prod)))
df
#   x1 x2 y1_ y2_ var
# 1  0  0   3   1   0
# 2  0  0   0   0   0
# 3  0  0   2   0   0
# 4  1  1   1   1   2

初始步骤:

gsub(".*([0-9]+)_?$", "\1", names(df))
# [1] "1" "2" "1" "2"
split.default(df, gsub(".*([0-9]+)_?$", "\1", names(df)))
# $`1`
#   x1 y1_
# 1  0   3
# 2  0   0
# 3  0   2
# 4  1   1
# $`2`
#   x2 y2_
# 1  0   1
# 2  0   0
# 3  0   0
# 4  1   1
lapply(split.default(df, gsub(".*([0-9]+)_?$", "\1", names(df))),
function(z) apply(z, 1, prod))
# $`1`
# [1] 0 0 0 1
# $`2`
# [1] 0 0 0 1

1)使用dplyr,假设以数字结尾的列与以_结尾的列的顺序相同,并且两组分别以数字和下划线结尾,我们可以像这样使用across

library(dplyr)
df %>% mutate(var = rowSums(across(matches("\d$")) * across(ends_with("_"))))

x1 x2 y1_ y2_ var
1  0  0   3   1   0
2  0  0   0   0   0
3  0  0   2   0   0
4  1  1   1   1   2

2)另一种变体是:

df %>%
rowwise %>%
mutate(var = sum(c_across(matches("\d$")) * c_across(ends_with("_")))) %>%
ungroup

注意

df <- structure(list(x1 = c(0, 0, 0, 1), x2 = c(0, 0, 0, 1), y1_ = c(3, 
0, 2, 1), y2_ = c(1, 0, 0, 1)), class = "data.frame", row.names = c(NA, 
-4L))
df
##   x1 x2 y1_ y2_
## 1  0  0   3   1
## 2  0  0   0   0
## 3  0  0   2   0
## 4  1  1   1   1

您可以使用tidyverse

中的各种枢轴,映射和突变函数的组合来实现这一点。
df <- data.frame(x1=c(1,2,3,4),x2=c(1,1,0,1),y1_=c(3,0,2,1),y2_=c(1,4,0,1))
library(tidyr)
library(dplyr)
library(purrr)
pivoted <-
pivot_longer(
df,
cols = everything(),
names_to = c("letter", "number"),
names_pattern = "(.)(.)"
)
pivoted
#> # A tibble: 16 × 3
#>    letter number value
#>    <chr>  <chr>  <dbl>
#>  1 x      1          1
#>  2 x      2          1
#>  3 y      1          3
#>  4 y      2          1
#>  5 x      1          2
#>  6 x      2          1
#>  7 y      1          0
#>  8 y      2          4
#>  9 x      1          3
#> 10 x      2          0
#> 11 y      1          2
#> 12 y      2          0
#> 13 x      1          4
#> 14 x      2          1
#> 15 y      1          1
#> 16 y      2          1
nested <- 
pivoted |> 
group_by(letter, number) |> 
nest(num_data = value)
nested
#> # A tibble: 4 × 3
#> # Groups:   letter, number [4]
#>   letter number num_data        
#>   <chr>  <chr>  <list>          
#> 1 x      1      <tibble [4 × 1]>
#> 2 x      2      <tibble [4 × 1]>
#> 3 y      1      <tibble [4 × 1]>
#> 4 y      2      <tibble [4 × 1]>
summarised <-
nested |>
group_by(number) |> 
summarise(across(num_data, pmap, list))
summarised
#> # A tibble: 2 × 2
#>   number num_data    
#>   <chr>  <named list>
#> 1 1      <list [2]>  
#> 2 2      <list [2]>
summarised <- rowwise(summarised)
summarised <- 
transmute(
summarised,
products = list(
pmap(num_data, prod)
)
)
summarised[["products"]]
#> [[1]]
#> [[1]][[1]]
#> [1] 3
#> 
#> [[1]][[2]]
#> [1] 0
#> 
#> [[1]][[3]]
#> [1] 6
#> 
#> [[1]][[4]]
#> [1] 4
#> 
#> 
#> [[2]]
#> [[2]][[1]]
#> [1] 1
#> 
#> [[2]][[2]]
#> [1] 4
#> 
#> [[2]][[3]]
#> [1] 0
#> 
#> [[2]][[4]]
#> [1] 1
df[["var"]] <- 
summarised[["products"]] |> 
pmap_dbl(sum)
df
#>   x1 x2 y1_ y2_ var
#> 1  1  1   3   1   4
#> 2  2  1   0   4   4
#> 3  3  0   2   0   6
#> 4  4  1   1   1   5

创建于2022-11-18与reprex v2.0.2

最新更新