Adding new text to an existing Sklearn TFIDF vectorizer (Python)



Is there a function for adding to an existing corpus? I've already generated my matrix, and I'd like to add to it periodically without reprocessing the whole shebang.

For example:

articleList = ['here is some text blah blah','another text object', 'more foo for your bar right now']
tfidf_vectorizer = TfidfVectorizer(
                        max_df=.8,
                        max_features=2000,
                        min_df=.05,
                        preprocessor=prep_text,
                        use_idf=True,
                        tokenizer=tokenize_text
                    )
tfidf_matrix = tfidf_vectorizer.fit_transform(articleList)
#### ADDING A NEW ARTICLE TO EXISTING SET?
bigger_tfidf_matrix = tfidf_vectorizer.fit_transform(['the last article I wanted to add'])
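
(For reference, plain transform maps new documents into the already-fitted feature space without refitting, but it never updates the vocabulary or the idf statistics, which is exactly the gap the answers below work around. A minimal illustration, reusing the vectorizer fitted above:)

# transform (unlike fit_transform) reuses the fitted vocabulary and idf weights;
# tokens never seen during fit are silently ignored
row = tfidf_vectorizer.transform(['the last article I wanted to add'])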

You can access the vectorizer's vocabulary_ attribute directly, and the idf_ vector via _tfidf._idf_diag, so you can monkey-patch it like this:

import re 
import numpy as np
from scipy.sparse import dia_matrix  # public path; scipy.sparse.dia is private/deprecated
from sklearn.feature_extraction.text import TfidfVectorizer

def partial_fit(self, X):
    max_idx = max(self.vocabulary_.values())
    for a in X:
        #update vocabulary_
        if self.lowercase: a = a.lower()
        tokens = re.findall(self.token_pattern, a)
        for w in tokens:
            if w not in self.vocabulary_:
                max_idx += 1
                self.vocabulary_[w] = max_idx
        # recover document frequencies from the stored idf_, then update them
        df = (self.n_docs + self.smooth_idf)/np.exp(self.idf_ - 1) - self.smooth_idf
        self.n_docs += 1
        df.resize(len(self.vocabulary_))
        for w in tokens:
            df[self.vocabulary_[w]] += 1
        idf = np.log((self.n_docs + self.smooth_idf)/(df + self.smooth_idf)) + 1
        self._tfidf._idf_diag = dia_matrix((idf, 0), shape=(len(idf), len(idf)))
TfidfVectorizer.partial_fit = partial_fit

articleList = ['here is some text blah blah', 'another text object', 'more foo for your bar right now']
vec = TfidfVectorizer()
vec.fit(articleList)
# n_docs is not stored by the vectorizer, so it has to be tracked manually
vec.n_docs = len(articleList)
vec.partial_fit(['the last text I wanted to add'])
vec.transform(['the last text I wanted to add']).toarray()
# array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
#          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
#          0.        ,  0.        ,  0.27448674,  0.        ,  0.43003652,
#          0.43003652,  0.43003652,  0.43003652,  0.43003652]])
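
The line that rebuilds df works because it simply inverts sklearn's smoothed idf formula, idf = log((n_docs + smooth_idf) / (df + smooth_idf)) + 1. A quick standalone sanity check of that round trip (illustrative values only):

import numpy as np

n_docs, smooth_idf = 3, 1
df = np.array([1.0, 2.0, 3.0])
idf = np.log((n_docs + smooth_idf) / (df + smooth_idf)) + 1
# invert: exp(idf - 1) == (n_docs + smooth_idf) / (df + smooth_idf)
recovered = (n_docs + smooth_idf) / np.exp(idf - 1) - smooth_idf
assert np.allclose(recovered, df)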

I believe the (excellent) answer given above has a couple of bugs: the document frequency should be updated only once even if a token occurs several times in the same document, and the vocabulary dictionary has to be kept in a different order:

def _partial_fit(self, X):
    # NB: relies on the same imports as above, and on a my_stop_words
    # collection being defined elsewhere
    X = X.copy()
    for doc in X:
        if self.lowercase:
            doc = doc.lower()
        tokens = re.findall(self.token_pattern, doc)
        tokens = [token for token in tokens if token not in my_stop_words]
        indices_to_insert = []
        for w in tokens:
            # update the vocabulary with any new tokens
            if w not in self.vocabulary_:
                # temporary placeholder in the dict
                self.vocabulary_[w] = -1
                # rebuild the index in alphabetical order: each token's value
                # in the dict equals its position in the sorted key list,
                # which matches the internal dict of sklearn's TfidfVectorizer
                tmp_keys = sorted(self.vocabulary_.keys())
                tmp_dict = {tmp_keys[i]: i for i in range(len(tmp_keys))}
                # keep the dict in the order the tokens were seen,
                # with the new token included
                self.vocabulary_ = {k: tmp_dict[k] for k in self.vocabulary_}
                # bump the feature count by 1 so data validation passes
                self._tfidf.n_features_in_ += 1
                # remember the indices of all new tokens
                indices_to_insert.append(self.vocabulary_[w])
        # recover the document frequencies from the stored idf values
        doc_frequency = (self.n_docs + self.smooth_idf) / np.exp(
            self.idf_ - 1
        ) - self.smooth_idf
        # the new token indices must be added
        for index_to_insert in indices_to_insert:
            doc_frequency = np.insert(doc_frequency, index_to_insert, 0)
        self.n_docs += 1
        # document frequency does not depend on how many times a token
        # appears in a doc, only on whether it appears at all
        for w in set(tokens):
            doc_frequency[self.vocabulary_[w]] += 1
        # update the internal inverse document frequency
        idf = (
            np.log(
                (self.n_docs + self.smooth_idf) / (doc_frequency + self.smooth_idf)
            )
            + 1
        )
        # both values must be updated for `transform` to return correct results
        self._tfidf.idf_ = idf
        self._tfidf._idf_diag = dia_matrix((idf, 0), shape=(len(idf), len(idf)))
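
A minimal usage sketch under the same assumptions as the first answer: the method is monkey-patched onto the class, n_docs is tracked by hand, and my_stop_words must exist before the call (an empty set here, purely for illustration):

my_stop_words = set()  # assumed stop list; empty only for this sketch
TfidfVectorizer._partial_fit = _partial_fit

vec = TfidfVectorizer()
vec.fit(articleList)
vec.n_docs = len(articleList)
vec._partial_fit(['the last text I wanted to add'])
print(vec.transform(['the last text I wanted to add']).toarray())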
