R最常见的字符串值

我有一个如下所示的数据框（data frame），其中包含 NA 值：

id  cat1    cat2    cat3    cat4
1   apple   banana  banana  orange
2   orange  banana  apple   orange
3   apple   NA      NA      orange
4   orange  banana  apple   NA

我们可以使用如下定义的 Mode（众数）函数

# Return the most frequent non-missing value of a vector (its mode).
#
# x: an atomic vector or factor; missing values are ignored.
#
# Returns a length-one vector. When several values share the highest count,
# the one that appears first in x is returned. When x has no non-missing
# values, a single NA of the same type as x is returned, so that row-wise
# callers such as apply() always receive exactly one value per row.
Mode <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0L) {
    # Indexing with NA yields one missing value of x's own type.
    return(x[NA_integer_])
  }
  ux <- unique(x)
  # match() maps each element to its position in ux; tabulate() counts them.
  ux[which.max(tabulate(match(x, ux)))]
}

然后将它应用到每一行。

# Keep the first column and append, for every row, the mode of the
# remaining columns.
cbind(
  df[1],
  cat = apply(df[-1], MARGIN = 1, FUN = Mode)
)
#  id    cat
#1  1 banana
#2  2 orange

# Example data: an id column followed by four categorical columns.
df <- data.frame(
  id = 1:2,
  cat1 = c("apple", "orange"),
  cat2 = c("banana", "banana"),
  cat3 = c("banana", "apple"),
  cat4 = c("orange", "orange"),
  stringsAsFactors = FALSE
)

使用 data.table 的方案

# Convert df to a data.table, then for each id tabulate the non-missing
# values of the remaining columns and keep the name of the last entry of
# the ascending-sorted counts.
setDT(df)[
  ,
  .(cat = names(tail(sort(table(na.omit(unlist(.SD)))), n = 1L))),
  by = id
]

结果为

id    cat
1:  1 banana
2:  2 orange

使用 apply 的 base R 方案

# Keep the first column and append, for every row, the name of the last
# entry of the ascending-sorted value counts of the remaining columns.
cbind(
  df[1],
  cat = apply(df[-1], MARGIN = 1, FUN = function(values) {
    counts <- sort(table(na.omit(unlist(values))))
    names(tail(counts, n = 1L))
  })
)

结果为

id    cat
1  1 banana
2  2 orange

> dput(df)
structure(list(id = 1:2, cat1 = c("apple", "orange"), cat2 = c("banana",
"banana"), cat3 = c("banana", "apple"), cat4 = c("orange", "orange"
)), class = "data.frame", row.names = c(NA, -2L))

这样可以吗？

library(dplyr)
library(tidyr)
# Reshape to long form, count each value per id and keep the most frequent.
# values_drop_na = TRUE removes missing cells before counting; otherwise
# count() treats NA as a category of its own and NA can be returned as the
# mode. An id whose cells are all NA has no rows left and drops out of the
# result.
df %>%
  pivot_longer(cols = -id, values_drop_na = TRUE) %>%
  count(id, value) %>%
  group_by(id) %>%
  summarise(cat = value[which.max(n)])
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 2 x 2
id cat   
<int> <chr> 
1     1 banana
2     2 orange

使用tidyverse

library(dplyr)
library(purrr)
# For every row, pass all columns except the first to Mode() and keep the
# result next to id.
df %>%
  transmute(
    id,
    cat = pmap_chr(.[-1], function(...) Mode(c(...)))
  )
#  id    cat
#1  1 banana
#2  2 orange

# Return the most frequent non-missing value of a vector (its mode).
#
# x: an atomic vector or factor; missing values are ignored.
#
# Returns a length-one vector. When several values share the highest count,
# the one that appears first in x is returned. When x has no non-missing
# values, a single NA of the same type as x is returned, so that callers
# expecting exactly one value per input (e.g. pmap_chr()) do not fail.
Mode <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0L) {
    # Indexing with NA yields one missing value of x's own type.
    return(x[NA_integer_])
  }
  ux <- unique(x)
  # match() maps each element to its position in ux; tabulate() counts them.
  ux[which.max(tabulate(match(x, ux)))]
}

数据

# Example data: an id column followed by four categorical columns.
df <- data.frame(
  id = 1:2,
  cat1 = c("apple", "orange"),
  cat2 = c("banana", "banana"),
  cat3 = c("banana", "apple"),
  cat4 = c("orange", "orange"),
  stringsAsFactors = FALSE
)

相关内容

最新更新

热门标签：