在 FeatureUnion(特征联合)中使用自定义转换器和 word2vec



我正在尝试使用多组特征对一组文本文档进行分类。我使用 sklearn 的 FeatureUnion 把不同的特征组合起来,以便拟合单个模型。其中一组特征是使用 gensim 的 word2vec 得到的词嵌入。

import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
# Newsgroup categories used as a small demo corpus.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
data = fetch_20newsgroups(subset='train', categories=categories)  # dummy dataset
# Word2Vec expects an iterable of token lists. Passing the raw strings would
# make it treat every single character as a "word", so each document is split
# into whitespace-separated tokens first.
# NOTE: `size`, `wv.index2word` and `wv.syn0` are the gensim 3.x names;
# gensim >= 4 renamed them to `vector_size`, `wv.index_to_key`, `wv.vectors`.
w2v_model = Word2Vec([doc.split() for doc in data.data],
                     size=100, window=5, min_count=5, workers=2)
# Dictionary of word embeddings: word -> embedding vector.
word2vec = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}
feat_select = SelectKBest(score_func=chi2, k=10)  # other features
TSVD = TruncatedSVD(n_components=50, algorithm="randomized", n_iter=5)  # other features

为了包含 sklearn 中尚不可用的转换器/估计器,我正在尝试将我的 word2vec 结果包装到返回向量平均值的自定义转换器类中。

class w2vTransformer(TransformerMixin):
    """
    Wrapper class for running word2vec into pipelines and FeatureUnions.

    Every input document is turned into the mean of the embedding vectors of
    the words it contains. Documents without any known word become a zero
    vector of the embedding dimension.

    Parameters
    ----------
    word2vec : dict
        Mapping from word to its embedding vector.
    **kwargs
        Stored unchanged on the instance as ``self.kwargs``; not used by the
        methods of this class.
    """
    def __init__(self, word2vec, **kwargs):
        self.word2vec = word2vec
        self.kwargs = kwargs
        # The fallback zero vector must have the length of ONE embedding
        # vector, not the number of words in the vocabulary; otherwise rows
        # for documents with no known words have a different length than the
        # averaged rows.
        first_vector = next(iter(word2vec.values()), None)
        self.dim = 0 if first_vector is None else len(first_vector)

    def fit(self, x, y=None):
        """Do nothing and return ``self``; the embeddings are already trained."""
        return self

    def transform(self, X):
        """
        Return one averaged embedding vector per document in ``X``.

        ``X`` is iterated document by document, and each document is iterated
        item by item; every item is looked up in ``self.word2vec``, so the
        items must be hashable tokens (e.g. a list of words per document).
        """
        return np.array([
            # An empty list is falsy, so documents with no known word fall
            # back to a single zero vector before averaging.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

但是,当需要拟合模型时,我会收到错误。

# Join the word-embedding transformer and the two other feature extractors
# into a single FeatureUnion.
# NOTE: in the pipeline below this union comes after the TF-IDF step, so every
# member -- including w2vTransformer -- receives the sparse TF-IDF matrix
# rather than the documents themselves. That is what triggers
# "TypeError: unhashable type: 'csr_matrix'" inside w2vTransformer.transform.
# Leaving out the "w2v_class" entry (only "feat" and "TSVD") lets the fit run.
combined_features = FeatureUnion(
    transformer_list=[
        ("w2v_class", w2vTransformer(word2vec)),
        ("feat", feat_select),
        ("TSVD", TSVD),
    ]
)
# Full classifier: token counts -> TF-IDF -> combined features -> linear model.
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('feature_selection', combined_features),
        ('clf-svm', SGDClassifier(loss="modified_huber")),
    ]
)
# Fit on the raw documents and their labels.
text_clf_svm_1 = text_clf_svm.fit(data.data, data.target)
text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
Traceback (most recent call last):
  File "<ipython-input-8-a085b7d40f8f>", line 1, in <module>
    text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 248, in fit
    Xt, fit_params = self._fit(X, y, **fit_params)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 213, in _fit
    **fit_params_steps[name])
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibmemory.py", line 362, in __call__
    return self.func(*args, **kwargs)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 581, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 739, in fit_transform
    for name, trans, weight in self._iter())
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 779, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 625, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 588, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblib_parallel_backends.py", line 111, in apply_async
    result = ImmediateResult(func)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblib_parallel_backends.py", line 332, in __init__
    self.results = batch()
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 131, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 131, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 581, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnbase.py", line 520, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "<ipython-input-6-cbc52cd420cd>", line 16, in transform
    for words in X
  File "<ipython-input-6-cbc52cd420cd>", line 16, in <listcomp>
    for words in X
  File "<ipython-input-6-cbc52cd420cd>", line 14, in <listcomp>
    np.mean([self.word2vec[w] for w in words if w in self.word2vec]
TypeError: unhashable type: 'csr_matrix'
Traceback (most recent call last):
  File "<ipython-input-8-a085b7d40f8f>", line 1, in <module>
    text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 248, in fit
    Xt, fit_params = self._fit(X, y, **fit_params)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 213, in _fit
    **fit_params_steps[name])
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibmemory.py", line 362, in __call__
    return self.func(*args, **kwargs)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 581, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 739, in fit_transform
    for name, trans, weight in self._iter())
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 779, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 625, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 588, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblib_parallel_backends.py", line 111, in apply_async
    result = ImmediateResult(func)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblib_parallel_backends.py", line 332, in __init__
    self.results = batch()
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 131, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnexternalsjoblibparallel.py", line 131, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnpipeline.py", line 581, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:UsersrluskAppDataLocalContinuumAnaconda3libsite-packagessklearnbase.py", line 520, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "<ipython-input-6-cbc52cd420cd>", line 16, in transform
    for words in X
  File "<ipython-input-6-cbc52cd420cd>", line 16, in <listcomp>
    for words in X
  File "<ipython-input-6-cbc52cd420cd>", line 14, in <listcomp>
    np.mean([self.word2vec[w] for w in words if w in self.word2vec]
TypeError: unhashable type: 'csr_matrix'

我知道出错的原因是变量"words"是一个 csr_matrix,而它需要是一个可迭代对象,例如列表。我的问题是:应该如何修改转换器类或数据,才能把词嵌入作为特征送入 FeatureUnion?这是我在 SO 上的第一个帖子,请多多包涵。

您可以使用Gensim直接提供的新scikit-learnAPI来避免该错误,而不是自定义转换器! https://radimrehurek.com/gensim/sklearn_api/w2vmodel.html

此外,这取决于您使用的 Gensim 版本;就我而言,通过 word2vec 对象的 wv 属性来索引(而不是直接对该对象本身索引),就解决了同样的错误。

在 w2vTransformer 类的转换方法中:

self.word2vec.wv[w]

而不是

self.word2vec[w]

希望对您有所帮助!

相关内容

  • 没有找到相关文章

最新更新