有一个这样的数据框:
# Example input: one row per document; `ner_words` holds the words of that
# document as a single comma-separated string.
df <- structure(
  list(
    doc_id = c("1", "2"),
    ner_words = c("John, Google", "Amazon, Python, Canada")
  ),
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)
如何得到类似 table(df$ner_words) 的频数表,但要把每一行中以逗号分隔的各个单词分别计数?预期结果示例:
# Expected result: one row per distinct word together with its frequency.
data.frame(
  text = c("John", "Google", "Amazon", "Python", "Canada"),
  frq = rep(1, 5)
)
先用 separate_rows() 把每个单词拆分成单独的一行,然后用 count() 计数:
library(dplyr)
library(tidyr)
# Split each entry on commas (plus any whitespace that follows) so that every
# word gets its own row, then count occurrences into a column named "frq".
# An explicit `sep` is passed because the default separator splits on every run
# of characters that are neither alphanumeric nor a period, which would also
# break entries containing spaces or hyphens (e.g. "New York") into pieces.
df %>%
  separate_rows(ner_words, sep = ",\\s*") %>%
  count(ner_words, name = "frq")
# # A tibble: 5 x 2
# ner_words frq
# <chr> <int>
# 1 Amazon 1
# 2 Canada 1
# 3 Google 1
# 4 John 1
# 5 Python 1
这是一个选项:
library(dplyr)
library(tidyr)
# Split the comma-separated entries into one row per word, then build a
# frequency table with exactly one row per distinct word, kept in order of
# first appearance.
# add_count() + distinct() replaces group_by() + transmute(frq = n()): the
# latter returns one row per occurrence (a word that appears twice would be
# listed twice) and leaves the result grouped.
df %>%
  separate_rows(ner_words, sep = ", ") %>%
  add_count(ner_words, name = "frq") %>%
  distinct(ner_words, frq)
# # A tibble: 5 x 2
#   ner_words   frq
#   <chr>     <int>
# 1 John          1
# 2 Google        1
# 3 Amazon        1
# 4 Python        1
# 5 Canada        1
使用 str_extract_all 和 count 的方案:
library(dplyr)
library(stringr)
library(tidyr)
# Extract every run of word characters from each entry (giving a list-column
# with one character vector per row), unnest to one word per row, then count.
# The backslash must be doubled inside an R string literal: a single "\w" is an
# unrecognized escape and makes the code fail at parse time.
# Note: \w+ matches single word tokens, so an entry that contains spaces is
# split into several words.
df %>%
  transmute(ner_words = str_extract_all(ner_words, "\\w+")) %>%
  unnest(c(ner_words)) %>%
  count(ner_words)
# A tibble: 5 x 2
# ner_words n
#* <chr> <int>
#1 Amazon 1
#2 Canada 1
#3 Google 1
#4 John 1
#5 Python 1