r-将数据帧中的多列与外部向量进行比较



假设我们有这个向量:

# Reference set of product codes. The values are quoted so that they are
# character strings rather than lookups of (undefined) objects named a, b, ...
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")

下面是这样一个数据帧:

seller_a seller_b seller_c
a        b        d
d        d        e
g        g        g
h        l        h
t        n        t
z        y        w

我想在数据帧中添加一行,用来表示每个seller列与products向量的匹配程度(即该列中有多少个元素出现在products中)。

换句话说,我的目标是使原始数据帧看起来像这样:

seller_a seller_b seller_c
6        3        4
a        b        d
d        d        e
g        g        g
h        l        h
t        n        t
z        y        w

使用dplyr中的summarise和across:

library(dplyr)

# Build a one-row summary holding, for every column, the number of entries
# that occur in `products` (as text, so it can share a column with the
# original values), then stack the original rows underneath it.
DF %>%
  summarise(
    across(everything(), function(col) as.character(sum(col %in% products)))
  ) %>%
  bind_rows(DF)
#>   seller_a seller_b seller_c
#> 1        6        3        4
#> 2        a        b        d
#> 3        d        d        e
#> 4        g        g        g
#> 5        h        l        h
#> 6        t        n        t
#> 7        z        y        w

由reprex包(v2.0.0)创建于2021-06-07

您还可以使用tibble中的add_row函数向数据集添加额外的行:

library(dplyr)

# Insert one row ahead of the existing rows. Each cell is the number of
# entries of that seller column found in `products`, converted to text so it
# can be stored alongside the column's other values.
df %>%
  add_row(
    seller_a = as.character(sum(df[["seller_a"]] %in% products)),
    seller_b = as.character(sum(df[["seller_b"]] %in% products)),
    seller_c = as.character(sum(df[["seller_c"]] %in% products)),
    .before = 1
  )
# A tibble: 7 x 3
seller_a seller_b seller_c
<chr>    <chr>    <chr>   
1 6        3        4       
2 a        b        d       
3 d        d        e       
4 g        g        g       
5 h        l        h       
6 t        n        t       
7 z        y        w 

使用末尾“备注”中以可重现形式给出的输入:

# For every column, put the number of entries found in `products` in front of
# the column's own values, then reassemble the columns into a data frame.
as.data.frame(
  lapply(DF, function(col) c(sum(col %in% products), col))
)
##   seller_a seller_b seller_c
## 1        6        3        4
## 2        a        b        d
## 3        d        d        e
## ...snip...

数值向量

但是,一列中的所有元素必须是相同的类型,因此数字会被强制转换为字符。您可能更愿意单独创建一个数值向量。

# Per-column count of entries that occur in `products`, as a named integer
# vector. vapply() fixes the result type to one integer per column, so a data
# frame with no columns yields integer(0) instead of an empty list.
vapply(DF, function(col) sum(col %in% products), integer(1))
## seller_a seller_b seller_c 
##        6        3        4 

S3

这可能有些小题大做,但可以创建一个新的S3类,把匹配计数存储为数值属性而不是一行数据,只在打印时将其显示为一行。

# Generic constructor for "data.frame1" objects; dispatches on the class of x.
as.data.frame1 <- function(x, ...) UseMethod("as.data.frame1")

# Data-frame method: tags `x` with the class "data.frame1" and stores, in the
# "product" attribute, how many entries of each column occur in `product`.
#
# x        A data frame.
# product  Vector of reference values that each column is matched against.
# ...      Unused; accepted for compatibility with the generic.
#
# Returns `x` with class c("data.frame1", class(x)) and a named integer
# "product" attribute holding one count per column.
as.data.frame1.data.frame <- function(x, product, ...) {
  out <- structure(x, class = c("data.frame1", class(x)))
  # Count against the arguments rather than the globals `DF` / `products`, so
  # the method works for whatever data frame and reference vector it is given.
  attr(out, "product") <- vapply(
    x,
    function(col) sum(col %in% product),
    integer(1)
  )
  out
}

# Format method: builds a data frame whose first row is the "product" counts
# followed by the original rows, and formats that.
format.data.frame1 <- function(x, ...) {
  format(as.data.frame(rbind(attr(x, "product"), x)))
}

# Print method: prints the formatted view (counts row first) and, as print
# methods conventionally do, returns the object itself invisibly.
print.data.frame1 <- function(x, ...) {
  print(format(x), ...)
  invisible(x)
}
DF1 <- as.data.frame1(DF, products)
DF1
##   seller_a seller_b seller_c
## 1        6        3        4
## 2        a        b        d
## 3        d        d        e
## ...snip...
attr(DF1, "product")  # numeric vector
## seller_a seller_b seller_c 
##        6        3        4 
as.data.frame(DF1)
##   seller_a seller_b seller_c
## 1        a        b        d
## 2        d        d        e
## 3        g        g        g
## ...snip...

备注

# Reference product codes, parsed from a comma-separated string.
products <- scan(
  text = "a, b, d, f, g, h, i, j, m, o, t, z",
  what = character(), sep = ",", strip.white = TRUE
)
Lines <- "seller_a seller_b seller_c
a        b        d
d        d        e
g        g        g
h        l        h
t        n        t
z        y        w"
# colClasses keeps every column character on all R versions. Before R 4.0,
# read.table() converted strings to factors by default, and a factor column
# cannot take a count value prepended to it.
DF <- read.table(text = Lines, header = TRUE, colClasses = "character")

数据:

# Example input: one character column per seller.
df <- tibble(
  a = c("a", "d", "g", "h", "t", "z"),
  b = c("b", "d", "g", "l", "n", "y"),
  c = c("d", "e", "g", "h", "t", "w")
)

# Reference product codes the seller columns are compared against.
products <- c(
  "a", "b", "d", "f", "g", "h",
  "i", "j", "m", "o", "t", "z"
)

代码:

library(tidyverse)

# Count, per column, how many of its entries occur in `products` (so repeated
# values in a column are each counted), then stack that row on top of the
# data. Both dots refer to `df`; because one of them is passed to rbind()
# directly, the pipe does not additionally insert `df` as the first argument.
df %>%
  rbind(map_int(., ~ sum(.x %in% products)), .)
a     b     c    
<chr> <chr> <chr>
1 6     3     4    
2 a     b     d    
3 d     d     e    
4 g     g     g    
5 h     l     h    
6 t     n     t    
7 z     y     w 

请注意,第一行中的数字将变为字符类型。此外,如果列是因子(factor),这段代码将不起作用(这就是我使用tibble的原因)。

使用data.table的解决方案:

library(data.table)

# Reference product codes.
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")

# One character column per seller.
DT <- data.table(
  seller_a = c("a", "d", "g", "h", "t", "z"),
  seller_b = c("b", "d", "g", "l", "n", "y"),
  seller_c = c("d", "e", "g", "h", "t", "w")
)

# One-row table with, for each column, the number of its entries that occur
# in `products`. .SD covers every column, so no column name is spelled out
# and the code keeps working if sellers are added or renamed.
DT1 <- DT[, lapply(.SD, function(col) sum(col %in% products))]
# -------------------
> DT1
seller_a seller_b seller_c
1:        6        3        4
> rbind(DT1, DT)
seller_a seller_b seller_c
1:        6        3        4
2:        a        b        d
3:        d        d        e
4:        g        g        g
5:        h        l        h
6:        t        n        t
7:        z        y        w

基础R方案:

# Put the per-column match counts on top of the data. vapply() pins the counts
# to exactly one integer per column, whereas sapply()'s return type depends on
# its input.
rbind(vapply(df, function(col) sum(col %in% products), integer(1)), df)
#  a b c
#1 6 3 4
#2 a b d
#3 d d e
#4 g g g
#5 h l h
#6 t n t
#7 z y w

我们可以使用

library(dplyr)

# Prepend to every column the number of its entries that occur in `products`
# (repeated values in a column are each counted); c() coerces the count to
# character alongside the column's own values.
# NOTE(review): dplyr >= 1.1.0 deprecates multi-row results from summarise()
# in favour of reframe() - confirm the installed version before switching.
df %>%
  summarise(across(everything(), ~ c(sum(.x %in% products), .x)))

-输出

# A tibble: 7 x 3
a     b     c    
<chr> <chr> <chr>
1 6     3     4    
2 a     b     d    
3 d     d     e    
4 g     g     g    
5 h     l     h    
6 t     n     t    
7 z     y     w

使用基础R,您可以执行以下操作:

#///////////////////
#your data
products <- c("a", "b", "d", "f", "g", "h", "i", "j", "m", "o", "t", "z")
seller_a <- c("a", "d", "g", "h", "t", "z")
seller_b <- c("b", "d", "g", "l", "n", "y")
seller_c <- c("d", "e", "g", "h", "t", "w")
# Build the data frame directly from the vectors. Going through cbind() makes
# a character matrix first, and on R < 4.0 as.data.frame() turns its columns
# into factors, which cannot accept the count row added below.
d <- data.frame(seller_a, seller_b, seller_c, stringsAsFactors = FALSE)
#///////////////////
# Number of entries of each seller column that occur in `products`, computed
# for every column at once instead of once per hard-coded column name.
a <- unname(vapply(d, function(col) sum(col %in% products), integer(1)))
# Put the counts on top; they are coerced to character to fit the columns.
d <- rbind(a, d)

最新更新