R - Compare the bags of words of two documents and find the matching words and their frequency in the second document



I computed the bags of words for 'yelp.csv', 'yelpp.csv', and 'yelpn.csv' and created word-frequency matrices for the individual datasets. Now I want to compare the bag of words of yelp with that of yelpn, check how many words from yelp appear in yelpn and with what frequency, and store the result in a variable as a matrix; then do the same for yelpp. yelp contains both positive and negative reviews, yelpp only positive ones, and yelpn only negative ones. Can anyone complete the code? I don't know whether the code below is relevant, but I hope it is.

library(tm)    # removeNumbers, removePunctuation, bundled English stopword list
library(tau)   # remove_stopwords
library(qdap)  # bag_o_words, freq_terms

getwd()
setwd("/Users/ash/RProjects/exc")
getwd()
df <- read.csv("yelp.csv", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
strip.white = TRUE)
df
dfd<-as.character(df[,2])
dfd
df2<-as.character(df[,1])
df2
words <- readLines(system.file("stopwords", "english.dat",
package = "tm"))
s<-remove_stopwords(dfd, words, lines = TRUE)
s
print(paste("****Stopwords are removed successfully****"))
n<-removeNumbers(s)
n
t<-removePunctuation(n, preserve_intra_word_dashes = FALSE)
t
#pos
dfp <- read.csv("yelpp.CSV",header = TRUE,quote=""",stringsAsFactors= TRUE,
strip.white = TRUE)
dfp
dfdp<-as.character(dfp[,2])
dfdp
df2p<-as.character(dfp[,1])
df2p
wordsp <- readLines(system.file("stopwords", "english.dat",
package = "tm"))
sp<-remove_stopwords(dfdp, wordsp, lines = TRUE)
sp
print(paste("****Stopwords are removed successfully****"))
np<-removeNumbers(sp)
np
tp<-removePunctuation(np, preserve_intra_word_dashes = FALSE)
tp
#neg
dfn <- read.csv("yelpn.CSV",header = TRUE,quote=""",stringsAsFactors=   TRUE,
strip.white = TRUE)
dfn
dfdn<-as.character(dfn[,2])
dfdn
df2n<-as.character(dfn[,1])
df2n
wordsn <- readLines(system.file("stopwords", "english.dat",
package = "tm"))
sn<-remove_stopwords(dfdn, wordsn, lines = TRUE)
sn
print(paste("****Stopwords are removed successfully****"))
nn<-removeNumbers(sn)
nn
tn<-removePunctuation(nn, preserve_intra_word_dashes = FALSE)
tn

#bag
b<-bag_o_words(t, apostrophe.remove = TRUE)
b
b.mat = as.matrix(b)
b.mat
bp<-bag_o_words(tp, apostrophe.remove = TRUE)
bp
bp.mat = as.matrix(bp)
bp.mat
bn<-bag_o_words(tn, apostrophe.remove = TRUE)
bn
bn.mat = as.matrix(bn)
bn.mat
#frequent terms
frequent_terms <- freq_terms(t, 2000)
frequent_terms
frequent_termsp <- freq_terms(tp, 2000)
frequent_termsp
frequent_termsn <- freq_terms(tn, 2000)
frequent_termsn
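
The missing comparison step could look like the following minimal sketch in base R. It assumes b, bp, and bn are the token vectors produced by bag_o_words above, uses exact matching (not agrep-style approximate matching), and the result name match.yelp.yelpn is a placeholder:

freq.b  <- table(b)   # word frequencies in yelp's bag
freq.bn <- table(bn)  # word frequencies in yelpn's bag
# words of yelp that also occur in yelpn
common <- intersect(names(freq.b), names(freq.bn))
# one row per shared word, with its frequency in each bag
match.yelp.yelpn <- cbind(freq.yelp  = as.integer(freq.b[common]),
                          freq.yelpn = as.integer(freq.bn[common]))
rownames(match.yelp.yelpn) <- common
match.yelp.yelpn
# repeat with bp in place of bn for the yelp/yelpp comparison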

I am taking the text from the Wikipedia article on text mining. Using the tm package together with the findFreqTerms and agrep functions is the gist of this approach.

agrep

Searches for approximate matches to pattern (the first argument) within each element of the string x (the second argument), using the generalized Levenshtein edit distance (the minimal possibly weighted number of insertions, deletions and substitutions needed to transform one string into another).
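
A quick illustration of what agrep returns, namely the indices of the elements of x that contain an approximate match (here with the default max.distance):

> agrep("mining", c("text mining", "minning", "data"))
[1] 1 2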

Steps of the approach:

Text -> Corpus -> Data cleaning -> Document-term matrix -> Find frequent terms -> Compare with the other corpus's terms

library(tm)
c1 <- Corpus(VectorSource("Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning"))
c2 <- Corpus(VectorSource("Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output"))
c3 <- Corpus(VectorSource("Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)"))
# Data Cleaning and transformation
c1 <- tm_map(c1, content_transformer(tolower))
c2 <- tm_map(c2, content_transformer(tolower))
c3 <- tm_map(c3, content_transformer(tolower))
c1 <- tm_map(c1, removePunctuation)
c1 <- tm_map(c1, removeNumbers)
c1 <- tm_map(c1, removeWords, stopwords("english"))
c1 <- tm_map(c1, stripWhitespace)
c2 <- tm_map(c2, removePunctuation)
c2 <- tm_map(c2, removeNumbers)
c2 <- tm_map(c2, removeWords, stopwords("english"))
c2 <- tm_map(c2, stripWhitespace)
c3 <- tm_map(c3, removePunctuation)
c3 <- tm_map(c3, removeNumbers)
c3 <- tm_map(c3, removeWords, stopwords("english"))
c3 <- tm_map(c3, stripWhitespace)
dtm1 <- DocumentTermMatrix(c1, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm2 <- DocumentTermMatrix(c2, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm3 <- DocumentTermMatrix(c3, control = list(weighting = weightTfIdf, stopwords = TRUE))
ft1 <- findFreqTerms(dtm1)
ft2 <- findFreqTerms(dtm2)
ft3 <- findFreqTerms(dtm3)
#similarity between c1 and c2
common.c1c2 <- data.frame(term = character(0), freq = integer(0))
for(t in ft1){
  find <- agrep(t, ft2)
  if(length(find) != 0){
    common.c1c2 <- rbind(common.c1c2, data.frame(term = t, freq = length(find)))
  }
}
# Note: for large texts, this for loop can be replaced with apply-family functions (see the sketch below)
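
One way that replacement could look, as a sketch using sapply over the same ft1 and ft2:

# count, for each term of ft1, its approximate matches in ft2
match.counts <- sapply(ft1, function(term) length(agrep(term, ft2)))
common.c1c2 <- data.frame(term = ft1[match.counts > 0],
                          freq = match.counts[match.counts > 0],
                          row.names = NULL)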

common.c1c2 contains the words common to corpus 1 and corpus 2, together with their frequencies:

> common.c1c2
term freq
1     also    1
2     data    2
3  derived    1
4 deriving    1
5   mining    1
6  pattern    1
7 patterns    1
8  process    1
9     text    1
> ft1
[1] "also"        "analytics"   "data"        "derived"     "deriving"    "devising"    "equivalent" 
[8] "highquality" "information" "learning"    "means"       "mining"      "pattern"     "patterns"   
[15] "process"     "referred"    "roughly"     "statistical" "text"        "trends"      "typically"  
> ft2
[1] "addition"       "along"          "data"           "database"       "derived"        "deriving"      
[7] "evaluation"     "features"       "finally"        "input"          "insertion"      "interpretation"
[13] "involves"       "linguistic"     "mining"         "others"         "output"         "parsing"       
[19] "patterns"       "process"        "removal"        "structured"     "structuring"    "subsequent"    
[25] "text"           "usually"        "within"        

This is not the most efficient solution, but I hope it helps.
