r-按组排列的唯一单词



这是我的示例数据帧

example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))

我只想在组内和通过组获得唯一的单词

所以我想得到这个

group   word
A       car, tree
B       sun

我用了聚合得到了这个

aggregate(word ~ group , data = example,  FUN = paste0) 
group                  word
1     A car, car, house, tree
2     B       sun ,sun, house

但现在我只需要选择唯一的值,但即使这样也不能实现

for (i in 1:nrow(cluster)) {cluster[i, ][["word"]] = lapply(unlist(cluster[i, ][["word"]]), unique)}

带有

Error in `[[<-.data.frame`(`*tmp*`, "word", value = list("car", "car, house",  : 
replacement has 3 rows, data has 1

使用aggregate+subset+ave的基本R选项,类似于

with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, "[, ]+"))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
)

给出

ind    values
1   A car, tree
2   B       sun

这里有一个dplyr解决方案:

library(dplyr)
library(tidyr)
example %>% 
separate_rows(word) %>% 
distinct(group, word) %>% 
group_by(word) %>% 
filter(n() == 1) %>% 
group_by(group) %>% 
summarise(word = toString(word))

输出

group word       
1 A     car, tree
2 B     sun      

base中,您可以使用strsplit来获取单词,split获取,并使用unique来获取每个组的唯一单词。使用table来获取相同单词的数量,并取那些只出现一次的单词。

t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, (x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))
#  group      word
#A     A car, tree
#B     B       sun

或者从问题中已经使用的aggregate开始。

t1 <- aggregate(word ~ group , data = example,  FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, (x) x[x %in% t3])
t1
#  group      word
#1     A car, tree
#2     B       sun

只是为了好玩基准

library(bench)
library(dplyr)
library(tidyr)
library(tidyverse)
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
bench::mark(check = FALSE,
GKi = {t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, (x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))},
GKi2 = {t1 <- aggregate(word ~ group , data = example,  FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, (x) x[x %in% t3])
t1},
ThomasIsCoding = with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, ", "))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
),
Mael = {example %>% 
separate_rows(word) %>% 
distinct(group, word) %>% 
group_by(word) %>% 
filter(n() == 1) %>% 
group_by(group) %>% 
summarise(word = toString(word))},
"Nir Graham" = {example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))
}
)

结果

expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
<bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 GKi            445.13µs 486.26µs    1997.    16.03KB     6.15   974     3
2 GKi2           916.97µs 968.68µs    1023.      7.3KB     6.15   499     3
3 ThomasIsCoding   3.54ms   3.73ms     266.     8.19KB     8.45   126     4
4 Mael            16.07ms  16.48ms      60.1   60.04KB     6.68    27     3
5 Nir Graham      37.29ms  39.49ms      24.0   90.59KB     8.00     9     3

GKi比GKi2快2倍,比ThomasIsCoding快7倍,比Mael快30倍,比Nir Graham快80倍。

library(tidyverse)
example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))

最新更新