Topic modeling error (doc2bow expects an array of unicode tokens on input, not a single string)


from nltk.tokenize import RegexpTokenizer
#from stop_words import get_stop_words
from gensim import corpora, models
import gensim
import os
from os import path
from time import sleep
filename_2 = "buisness1.txt"
file1 = open(filename_2, encoding='utf-8')
Reader = file1.read()
tdm = []
# Tokenized the text to individual terms and created the stop list
tokens = Reader.split()
#insert stopwords files
stopwordfile = open("StopWords.txt", encoding='utf-8')
# Use this to read file content as a stream
readstopword = stopwordfile.read()
stop_words = readstopword.split()
for r in tokens:
    if not r in stop_words:
        #stopped_tokens = [i for i in tokens if not i in en_stop]
        tdm.append(r)
dictionary = corpora.Dictionary(tdm)
corpus = [dictionary.doc2bow(i) for i in tdm]
sleep(3)
#Implemented the LdaModel
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary)
print(ldamodel.print_topics(num_topics=1, num_words=1))

I am trying to remove stop words using a separate txt file that contains the stop words. After filtering, I append every word from the text file that is not a stop word to tdm. At dictionary = corpora.Dictionary(tdm) I get the error doc2bow expects an array of unicode tokens on input, not a single string.

Can anyone help me correct my code?

This is almost certainly a duplicate, but the cause is simple: corpora.Dictionary expects a collection of documents, where each document is itself a list of tokens. Your tdm is a flat list of tokens, so the constructor iterates over it and hands each individual string to doc2bow, which raises the error. Wrap the token list so it becomes a one-document corpus:

dictionary = corpora.Dictionary([tdm])
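
The same wrapping applies to doc2bow itself, which takes a list of tokens rather than a single string. For completeness, here is a minimal sketch of the corrected pipeline, assuming the same file names (buisness1.txt, StopWords.txt) and model settings as in the question:

from gensim import corpora, models

# Read the document and the stop-word list
with open("buisness1.txt", encoding="utf-8") as f:
    tokens = f.read().split()
with open("StopWords.txt", encoding="utf-8") as f:
    stop_words = set(f.read().split())

# tdm is a flat list of tokens for a single document
tdm = [t for t in tokens if t not in stop_words]

# Dictionary expects a list of documents, each a list of tokens
dictionary = corpora.Dictionary([tdm])

# doc2bow likewise takes a token list, not a single string,
# so the corpus here has exactly one bag-of-words entry
corpus = [dictionary.doc2bow(tdm)]

ldamodel = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary)
print(ldamodel.print_topics(num_topics=1, num_words=1))

Note that with only one document, LDA has very little to learn from; to get meaningful topics you would normally build the dictionary and corpus from several token lists, e.g. one per file or per paragraph.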
