这是我的示例数据帧
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
我只想在组内和通过组获得唯一的单词
所以我想得到这个
group word
A car, tree
B sun
我用了聚合得到了这个
aggregate(word ~ group , data = example, FUN = paste0)
group word
1 A car, car, house, tree
2 B sun ,sun, house
但现在我只需要选择唯一的值,但即使这样也不能实现
for (i in 1:nrow(cluster)) {cluster[i, ][["word"]] = lapply(unlist(cluster[i, ][["word"]]), unique)}
带有
Error in `[[<-.data.frame`(`*tmp*`, "word", value = list("car", "car, house", :
replacement has 3 rows, data has 1
使用aggregate
+subset
+ave
的基本R选项,类似于
with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, "[, ]+"))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
)
给出
ind values
1 A car, tree
2 B sun
这里有一个dplyr
解决方案:
library(dplyr)
library(tidyr)
example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))
输出
group word
1 A car, tree
2 B sun
在base中,您可以使用strsplit
来获取单词,split
由组获取,并使用unique
来获取每个组的唯一单词。使用table
来获取相同单词的数量,并取那些只出现一次的单词。
t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, (x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))
# group word
#A A car, tree
#B B sun
或者从问题中已经使用的aggregate
开始。
t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, (x) x[x %in% t3])
t1
# group word
#1 A car, tree
#2 B sun
只是为了好玩基准
library(bench)
library(dplyr)
library(tidyr)
library(tidyverse)
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
bench::mark(check = FALSE,
GKi = {t1 <- lapply(split(strsplit(example$word, "[, ]+"), example$group),
(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, (x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))},
GKi2 = {t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ]+"), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, (x) x[x %in% t3])
t1},
ThomasIsCoding = with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, ", "))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
),
Mael = {example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))},
"Nir Graham" = {example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))
}
)
结果
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 GKi 445.13µs 486.26µs 1997. 16.03KB 6.15 974 3
2 GKi2 916.97µs 968.68µs 1023. 7.3KB 6.15 499 3
3 ThomasIsCoding 3.54ms 3.73ms 266. 8.19KB 8.45 126 4
4 Mael 16.07ms 16.48ms 60.1 60.04KB 6.68 27 3
5 Nir Graham 37.29ms 39.49ms 24.0 90.59KB 8.00 9 3
GKi比GKi2快2倍,比ThomasIsCoding快7倍,比Mael快30倍,比Nir Graham快80倍。
library(tidyverse)
example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))