我有两组文档:580篇新闻文章和大约560项政治决策。我想找出各篇新闻文章与各项政治决策之间是否存在相似之处。也就是说,每一篇新闻文章都应该使用余弦相似度与560项政治决策中的每一项进行比较。我正在使用 quanteda 包。
这是我到目前为止所尝试的:
# Load both document sets and wrap each in a quanteda corpus.
# NOTE(review): both readtext() calls glob the same `txt_directory`, so as
# written the two corpora hold the same files. Presumably each document set
# should be read from its own directory -- confirm.
news_articles <- readtext(paste0(txt_directory, "*"), encoding = "UTF-8")
news_articles_corpus <- corpus(news_articles)
pol_decisions <- readtext(paste0(txt_directory, "*"), encoding = "UTF-8")
pol_decisions_corpus <- corpus(pol_decisions)

# Tokenise a corpus and normalise its tokens so that two document sets can be
# compared on the same footing.
#
# Steps: split into words (dropping punctuation, symbols, numbers, URLs and
# separators), lower-case everything, remove stopwords, then stem.
#
# @param x A quanteda corpus.
# @param language Language name handed to both stopwords() and
#   tokens_wordstem(), so the stopword list and the stemmer always agree.
# @return A quanteda tokens object.
preprocess_tokens <- function(x, language = "danish") {
  toks <- tokens(
    x,
    what = "word",
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE,
    verbose = TRUE
  )
  toks <- tokens_tolower(toks, keep_acronyms = FALSE)
  toks <- tokens_select(toks, stopwords(language), selection = "remove")
  # Pass the language explicitly: without it tokens_wordstem() uses the
  # package-wide default stemmer rather than one matching the stopword list.
  tokens_wordstem(toks, language = language)
}

# Both sets go through the identical pipeline so their features are comparable.
news_articles_toks <- preprocess_tokens(news_articles_corpus)
pol_decisions_toks <- preprocess_tokens(pol_decisions_corpus)

# Document-feature matrices used as input for the similarity computation.
news_articles_dfm <- dfm(news_articles_toks)
pol_decisions_dfm <- dfm(pol_decisions_toks)
# Cosine similarity with the news articles as `x` and the political decisions
# as `y`, compared document by document.
cosine <- textstat_simil(
  x = news_articles_dfm,
  y = pol_decisions_dfm,
  margin = "documents",
  method = "cosine"
)

# Convert to a data frame, sort by its `cosine` column (highest first) and
# export the result to an Excel file.
cosine <- as.data.frame(cosine)
cosine <- cosine[order(-cosine[["cosine"]]), ]
write_xlsx(cosine, "Test.xlsx")
我的问题是,当我运行textstat_simil函数时,R返回所有组合的余弦值-在两组文档内部和之间。但我不想知道两篇新闻文章或两项政治决策之间的余弦相似度。我只想知道一篇新闻文章和一项政治决定之间的余弦相似度。
有什么办法可以解决这个问题吗?
在 textstat_simil() 中只使用 x 和 y:
require(quanteda)
#> Loading required package: quanteda
#> Package version: 3.2.1
#> Unicode version: 13.0
#> ICU version: 69.1
#> Parallel computing: 4 of 4 threads used.
#> See https://quanteda.io for tutorials and examples.
require(quanteda.textstats)
#> Loading required package: quanteda.textstats
# Two small named corpora standing in for the news articles and the
# political decisions.
corp_news <- corpus(c(
  news1 = "politics party vote",
  news2 = "crime police family"
))
corp_pol <- corpus(c(
  pol1 = "member party vote",
  pol2 = "family income",
  pol3 = "crime prison"
))

# Tokenise each corpus and build its document-feature matrix.
dfmt_news <- dfm(tokens(corp_news))
dfmt_pol <- dfm(tokens(corp_pol))

dfmt_news
#> Document-feature matrix of: 2 documents, 6 features (50.00% sparse) and 0 docvars.
#>        features
#> docs    politics party vote crime police family
#>   news1        1     1    1     0      0      0
#>   news2        0     0    0     1      1      1
dfmt_pol
#> Document-feature matrix of: 3 documents, 7 features (66.67% sparse) and 0 docvars.
#>       features
#> docs   member party vote family income crime prison
#>   pol1      1     1    1      0      0     0      0
#>   pol2      0     0    0      1      1     0      0
#>   pol3      0     0    0      0      0     1      1

# Supplying both `x` and `y` gives one row per news document and one column
# per political-decision document, as the recorded output below shows.
textstat_simil(x = dfmt_news, y = dfmt_pol, method = "cosine")
#> textstat_simil object; method = "cosine"
#>        pol1  pol2  pol3
#> news1 0.667 0     0
#> news2 0     0.408 0.408
由reprex包(v2.0.1)创建于2022-06-25