Loading LSA sklearn vector



我用sklearn训练了一个LSA模型,这个模型用pickle保存。

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import numpy as np
import os.path
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pickle

def load_data(path, file_name):
    """Load a text file in which every line is one document.

    Args:
        path: Directory that contains the file; it is joined with
            ``file_name`` via ``os.path.join``.
        file_name: Name of the file to read.

    Returns:
        A tuple ``(documents_list, titles)``. ``documents_list`` holds every
        line with leading/trailing whitespace stripped. ``titles`` holds, for
        each document, its first 100 characters (the whole text if shorter).
        Both lists are empty when the file is empty.
    """
    documents_list = []
    titles = []
    with open(os.path.join(path, file_name), "r") as fin:
        # Iterate the file object directly; readlines() would build a
        # throwaway list of all lines first.
        for line in fin:
            text = line.strip()
            documents_list.append(text)
            # Record one title per document. Appending after the loop would
            # keep only the last title and fail on an empty file, where
            # ``text`` is never assigned. Slicing already clamps to the
            # string length, so no explicit min() is needed.
            titles.append(text[:100])
    print("Total Number of Documents:", len(documents_list))
    return documents_list, titles
# Read the corpus from the file "a-choose" in the current directory.
document_list, titles = load_data("", "a-choose")

# Stage 1: turn raw documents into a TF-IDF matrix (English stop words removed).
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)

# Stage 2: reduce the TF-IDF matrix to 4 components with truncated SVD.
svd_model = TruncatedSVD(n_components=4, algorithm='randomized', n_iter=10)

# Chain both stages so they can be fitted and applied as a single estimator.
svd_transformer = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('svd', svd_model),
])

# Fit the pipeline on the corpus and keep the transformed documents.
svd_matrix = svd_transformer.fit_transform(document_list)

# Project two one-word queries with the fitted pipeline.
sentence = ["football"]
sentence2 = ["match"]
query = svd_transformer.transform(sentence2)
query_vector = svd_transformer.transform(sentence)

# NOTE(review): the object written here is svd_matrix, i.e. the result of
# fit_transform (the transformed documents), not the fitted svd_transformer
# pipeline. Whatever is unpickled from this file has no transform() method.
with open("lsa_model.bin", "wb") as model_file:
    pickle.dump(svd_matrix, model_file)

作为第二步,我使用另一个程序来加载这个模型,并用它来比较词向量。问题是我无法加载这些向量,我的代码如下:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import numpy as np
import numpy as np
from gensim.models import KeyedVectors
import codecs
import pickle
# SECURITY: unpickling can execute arbitrary code; only load pickle files
# that you created yourself or otherwise trust.
# The context manager closes the file handle once the object is loaded.
with open('lsa_model.bin', 'rb') as model_file:
    model = pickle.load(model_file)

# scikit-learn text vectorizers expect an iterable of documents, not a bare
# string, so the single query is wrapped in a list.
query = ["best"]

# NOTE(review): this call requires the unpickled object to provide
# transform(), e.g. a fitted Pipeline; a plain numpy array does not.
query_vector = model.transform(query)
print(query_vector)

运行后产生如下错误:

query_vector = model.transform(query) AttributeError: 'numpy.ndarray' object has no attribute 'transform'

我认为你需要在这里使用fit而不是fit_transform:fit_transform返回的是降维后的numpy数组,而fit返回的是拟合好的Pipeline对象本身,只有后者才有transform方法,因此应当保存fit的返回值:

svd_matrix = svd_transformer.fit(document_list)

我不知道为什么它只在第二部分起作用

最新更新