R: referencing ngrams back to docnames in quanteda

I am trying to create a data table similar to the output of quanteda::textstat_frequency, but with an extra docnames column: a string of the names of the documents that contain each token. For example:

a_corpus <- quanteda::corpus(c("some corpus text of no consequence that in practice is going to be very large",
                               "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
                               "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working"))
ngrams_dfm <- quanteda::dfm(a_corpus, tolower = TRUE, stem = FALSE, ngrams = 2)
freq <- textstat_frequency(ngrams_dfm)
# freq's header has: feature, frequency, rank, docfreq, group
data.table(feature = featnames(ngrams_dfm)[1:50],
           frequency = colSums(ngrams_dfm)[1:50],
           doc_names = paste(docnames, collapse = ','),  # what should be here?
           keep.rownames = FALSE,
           stringsAsFactors = FALSE)
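
For reference, one direct way to build that column (a sketch, assuming quanteda and data.table are attached and ngrams_dfm is built as above) is to scan each dfm column for the documents with a nonzero count:

# sketch: for each feature, collect the names of the documents containing it
mat <- as.matrix(ngrams_dfm) > 0
doc_names <- apply(mat, 2, function(has_feature)
  paste(docnames(ngrams_dfm)[has_feature], collapse = ","))
data.table(feature = featnames(ngrams_dfm),
           frequency = colSums(ngrams_dfm),
           doc_names = doc_names)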

Another (opinionated) approach could be to use the udpipe R package. An example is below; its advantage is that it is easy to select on part-of-speech tags, and you can also use it to select specific dependency-parse results, which is much better than bigrams (but that's another question).

library(udpipe)
library(data.table)
txt <- c("some corpus text of no consequence that in practice is going to be very large",
         "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
         "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working")
x <- udpipe(txt, "english", trace = TRUE) ## rich output, but takes a while for large volumes of text
x <- setDT(x)
x[, bigram_lemma := txt_nextgram(lemma, n = 2, sep = "-"), by = list(doc_id, paragraph_id, sentence_id)]
x[, upos_next := txt_next(upos, n = 1), by = list(doc_id, paragraph_id, sentence_id)]
x_nouns <- subset(x, upos %in% c("ADJ") & upos_next %in% c("NOUN"))
View(x)
freqs <- document_term_frequencies(x, document = "doc_id", term = c("bigram_lemma", "lemma"))
dtm <- document_term_matrix(freqs)
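
Since freqs already holds one row per doc_id/term pair, collapsing it into the shape the question asks for is short (a sketch; doc_id, term and freq are the columns returned by document_term_frequencies):

# sketch: total frequency per term plus a comma-separated list of documents
freqs[, .(frequency = sum(freq),
          doc_names = paste(unique(doc_id), collapse = ",")),
      by = term][order(-frequency)]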

First, you can add document names to the corpus:

document_names <- c("doc1", "doc2", "doc3")
a_corpus <- quanteda::corpus(x = c("some corpus text of no consequence that in practice is going to be very large",
                                   "and so one might expect a very large number of ngrams but for nlp purposes only care about top ten",
                                   "adding some corpus text word repeats to ensure ngrams top ten selection approaches are working"),
                             docnames = document_names)
a_corpus
# Corpus consisting of 3 documents and 0 docvars.

Now you can use the document names in subsequent quanteda function calls.

ngrams_dfm <- quanteda::dfm(a_corpus, tolower = TRUE, stem = FALSE, ngrams = 2)
ngrams_dfm
# Document-feature matrix of: 3 documents, 43 features (63.6% sparse).

You can also use the groups option in textstat_frequency to get the document names into the frequency results:

freq <- textstat_frequency(ngrams_dfm, groups = docnames(ngrams_dfm))
head(freq)
           feature frequency rank docfreq group
1      some_corpus         1    1       1  doc1
2      corpus_text         1    2       1  doc1
3          text_of         1    3       1  doc1
4            of_no         1    4       1  doc1
5   no_consequence         1    5       1  doc1
6 consequence_that         1    6       1  doc1

If you want to get the data from ngrams_dfm into a data.frame, quanteda has the convert function for that:

convert(ngrams_dfm, to = "data.frame")
document some_corpus corpus_text text_of of_no no_consequence consequence_that that_in in_practice practice_is is_going going_to to_be
1     doc1           1           1       1     1              1                1       1           1           1        1        1     1
2     doc2           0           0       0     0              0                0       0           0           0        0        0     0
3     doc3           1           1       0     0              0                0       0           0           0        0        0     0

You can reshape that to get what you want; here is a dplyr/tidyr example:

library(dplyr)
convert(ngrams_dfm, to = "data.frame") %>%
  tidyr::gather(feature, frequency, -document) %>%
  group_by(document, feature) %>%
  summarise(frequency = sum(frequency))
# A tibble: 129 x 3
# Groups:   document [?]
document feature          frequency
<chr>    <chr>                <dbl>
1 doc1     a_very                   0
2 doc1     about_top                0
3 doc1     adding_some              0
4 doc1     and_so                   0
5 doc1     approaches_are           0
6 doc1     are_working              0
7 doc1     be_very                  1
8 doc1     but_for                  0
9 doc1     care_about               0
10 doc1     consequence_that         1
# ... with 119 more rows
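
From there it is a small step to the doc_names column the question asks for: drop the zero counts and collapse the document names per feature (a sketch, reusing the document column shown above; newer quanteda versions may call it doc_id):

convert(ngrams_dfm, to = "data.frame") %>%
  tidyr::gather(feature, frequency, -document) %>%
  filter(frequency > 0) %>%                       # keep only documents that contain the feature
  group_by(feature) %>%
  summarise(frequency = sum(frequency),
            doc_names = paste(document, collapse = ","))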

Or with data.table:

out <- data.table(convert(ngrams_dfm, to = "data.frame"))
melt(out, id.vars = "document",
     variable.name = "feature", value.name = "freq")
document     feature freq
1:     doc1 some_corpus    1
2:     doc2 some_corpus    0
3:     doc3 some_corpus    1
4:     doc1 corpus_text    1
5:     doc2 corpus_text    0
---                          
125:     doc2  care_about    1
126:     doc3  care_about    0
127:     doc1   about_top    0
128:     doc2   about_top    1
129:     doc3   about_top    0
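
The same final step works with data.table (a sketch continuing from the melt above):

long <- melt(out, id.vars = "document",
             variable.name = "feature", value.name = "freq")
# keep only nonzero counts, then collapse document names per feature
long[freq > 0,
     .(frequency = sum(freq),
       doc_names = paste(document, collapse = ",")),
     by = feature][order(-frequency)]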

Interesting answers... but they do not address the OP's question. Without judging why you would want this, here is exactly what you asked for, using data.table:

library(data.table)
library(magrittr)  # for the pipe

# set up the data.table without the doc_names
freq_dt <- textstat_frequency(ngrams_dfm) %>%
  data.table()
setkey(freq_dt, feature)

# do the docnames collapsing as a separate data.table
docnames_dt <-
  textstat_frequency(ngrams_dfm, groups = docnames(ngrams_dfm))[, c("feature", "group")] %>%
  data.table()
docnames_dt <- docnames_dt[, doc_names := paste(group, collapse = ","), by = feature]
docnames_dt <- unique(docnames_dt[, c("feature", "doc_names")])
setkey(docnames_dt, feature)

# quick merge on the shared feature key
answerdt <- freq_dt[docnames_dt][, c("feature", "frequency", "doc_names")]

# show the results, most frequent first
setorder(answerdt, -frequency)
head(answerdt, 10)
##            feature frequency   doc_names
##  1:    corpus_text         2 text1,text3
##  2:    some_corpus         2 text1,text3
##  3:        top_ten         2 text2,text3
##  4:     very_large         2 text1,text2
##  5:         a_very         1       text2
##  6:      about_top         1       text2
##  7:    adding_some         1       text3
##  8:         and_so         1       text2
##  9: approaches_are         1       text3
## 10:    are_working         1       text3
