如何保存使用矢量器、管道和GridSearchCV的scikit-learn分类器



我使用以下步骤构建了一个情感分类器:

# --- Feature extraction: bag-of-words and tf-idf ---
# load dataset with pandas (assumes `x` is the raw text column — TODO confirm)
count = CountVectorizer()
bag = count.fit_transform(x)        # sparse document-term count matrix
bag.toarray()                       # NOTE(review): result is discarded — likely for inspection only
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
tfidf.fit_transform(bag).toarray()  # NOTE(review): result is discarded as well

# --- Raw vocabulary frequency counts over the corpus ---
from collections import Counter

vocab = Counter()
for text in x:
    for word in text.split(" "):
        vocab[word] += 1

# Reduced vocabulary with English stop words removed.
import nltk
from nltk.corpus import stopwords

stop = stopwords.words('english')
vocab_reduced = Counter()
for w, c in vocab.items():
    if w not in stop:  # fixed: `not w in stop` -> idiomatic `w not in stop`
        vocab_reduced[w] = c
def preprocessor(text):
""" Return a cleaned version of text
"""
# Remove HTML markup
text = re.sub('<[^>]*>', '', text)
# Save emoticons for later appending
emoticons = re.findall('(?::|;|=)(?:-)?(?:)|(|D|P)', text)
# Remove any non-word character and append the emoticons,
# removing the nose character for standarization. Convert to lower case
text = (re.sub('[W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))

return text
from nltk.stem import PorterStemmer

porter = PorterStemmer()


def tokenizer(text):
    """Split *text* on whitespace into raw tokens."""
    return text.split()


def tokenizer_porter(text):
    """Split *text* on whitespace and Porter-stem every token."""
    return [porter.stem(word) for word in text.split()]
# Vectorizer for the pipeline. Preprocessing / tokenization variants are
# supplied through the parameter grid below rather than fixed here.
tfidf = TfidfVectorizer(strip_accents=None,
lowercase=False,
preprocessor=None)
# Hyper-parameter grid searched over the "vect" (TfidfVectorizer) and
# "clf" (MultinomialNB) steps of the pipeline.
# NOTE(review): 'clf__alpha': [0,1] includes alpha=0 (no smoothing), which
# MultinomialNB warns about — possibly [0.1, 1] was intended; confirm.
param_grid = [{'vect__ngram_range': [(1, 1)],
'vect__stop_words': [stop, None],
'vect__tokenizer': [tokenizer, tokenizer_porter],
'vect__preprocessor': [None, preprocessor],
'vect__use_idf':[False],
'vect__norm':[None],
"clf__alpha":[0,1],
"clf__fit_prior":[False,True]},
]
# Pipeline: TfidfVectorizer -> MultinomialNB. Because vectorizer and
# classifier live in one Pipeline, persisting the single fitted object
# (e.g. joblib.dump(gs_multi_tfidf.best_estimator_, 'model.pkl')) saves
# both together — this is the answer to the save/load problem above.
multi_tfidf = Pipeline([("vect", tfidf),
( "clf", MultinomialNB())])
# 5-fold cross-validated grid search, all cores (n_jobs=-1).
gs_multi_tfidf = GridSearchCV(multi_tfidf, param_grid,
scoring="accuracy",
cv=5,
verbose=1,
n_jobs=-1)
gs_multi_tfidf.fit(X_train,y_train)

我试着用joblib保存管道，也试过同时保存分类器和管道，然后将其用于网站。但我每次尝试都没有成功。我要么得到：ValueError: not enough values to unpack (expected 2, got 1)（当同时保存了管道和分类器时），要么在只使用分类器时得到 TypeError: 'module' object is not callable

请尝试使用以下内容。为什么不把CountVectorizer()和TfidfTransformer()直接包含在管道中？您还应该准确地说明尝试保存模型的方式。

# Suggested fix: put the TfidfVectorizer directly inside the Pipeline so
# that one fitted object (vectorizer + classifier together) can be saved
# and reloaded with joblib, avoiding the unpack/module errors above.
multi_tfidf = Pipeline([("vect", TfidfVectorizer()),
( "clf", MultinomialNB())])

最新更新