Is there a function for adding documents to an existing corpus? I've already generated my matrix, and I'd like to add to the table periodically without reprocessing the whole sha-bang.

e.g.:
articleList = ['here is some text blah blah','another text object', 'more foo for your bar right now']
tfidf_vectorizer = TfidfVectorizer(
    max_df=.8,
    max_features=2000,
    min_df=.05,
    preprocessor=prep_text,
    use_idf=True,
    tokenizer=tokenize_text
)
tfidf_matrix = tfidf_vectorizer.fit_transform(articleList)
#### ADDING A NEW ARTICLE TO EXISTING SET?
bigger_tfidf_matrix = tfidf_vectorizer.fit_transform(['the last article I wanted to add'])
You can access the vectorizer's vocabulary_ attribute directly, and the idf_ vector is exposed through _tfidf._idf_diag, so you can monkey-patch it like this:
import re
import numpy as np
from scipy.sparse import dia_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

def partial_fit(self, X):
    max_idx = max(self.vocabulary_.values())
    for a in X:
        # update vocabulary_
        if self.lowercase:
            a = a.lower()
        tokens = re.findall(self.token_pattern, a)
        for w in tokens:
            if w not in self.vocabulary_:
                max_idx += 1
                self.vocabulary_[w] = max_idx
        # update idf_: recover document frequencies by inverting
        # idf = log((n_docs + smooth_idf) / (df + smooth_idf)) + 1
        df = (self.n_docs + self.smooth_idf) / np.exp(self.idf_ - 1) - self.smooth_idf
        self.n_docs += 1
        df.resize(len(self.vocabulary_))  # pads new token slots with zeros
        for w in tokens:
            df[self.vocabulary_[w]] += 1
        idf = np.log((self.n_docs + self.smooth_idf) / (df + self.smooth_idf)) + 1
        self._tfidf._idf_diag = dia_matrix((idf, 0), shape=(len(idf), len(idf)))

TfidfVectorizer.partial_fit = partial_fit
articleList = ['here is some text blah blah','another text object', 'more foo for your bar right now']
vec = TfidfVectorizer()
vec.fit(articleList)
vec.n_docs = len(articleList)
vec.partial_fit(['the last text I wanted to add'])
vec.transform(['the last text I wanted to add']).toarray()
# array([[ 0. , 0. , 0. , 0. , 0. ,
# 0. , 0. , 0. , 0. , 0. ,
# 0. , 0. , 0.27448674, 0. , 0.43003652,
# 0.43003652, 0.43003652, 0.43003652, 0.43003652]])
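
As a quick sanity check (my own addition, not part of the answer, assuming the default token_pattern and smooth_idf, and that the added document contains no repeated tokens; see the correction below), the per-token idf after partial_fit should match a vectorizer refit on all four documents. Only the index order differs, since partial_fit appends new tokens at the end while a full fit sorts the vocabulary:

full_vec = TfidfVectorizer()
full_vec.fit(articleList + ['the last text I wanted to add'])

for token, idx in vec.vocabulary_.items():
    # same idf per token, even though the two vocabularies index tokens differently
    assert np.isclose(vec.idf_[idx], full_vec.idf_[full_vec.vocabulary_[token]])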
I believe the (excellent) answer given above has a couple of bugs: if a token appears multiple times in a document, the document frequency should only be updated once, and the vocabulary dictionary needs to be kept in a different order (sorted alphabetically, matching sklearn's internal indexing):
import re
import numpy as np
from scipy.sparse import dia_matrix

# NOTE: my_stop_words is assumed to be defined by the caller; an empty set
# keeps this snippet self-contained
my_stop_words = set()

def _partial_fit(self, X):
    X = X.copy()
    for doc in X:
        if self.lowercase:
            doc = doc.lower()
        tokens = re.findall(self.token_pattern, doc)
        tokens = [token for token in tokens if token not in my_stop_words]
        indices_to_insert = []
        for w in tokens:
            # update the vocabulary with any new tokens
            if w not in self.vocabulary_:
                # temporary placeholder in the dict
                self.vocabulary_[w] = -1
                # build the keys in alphabetical order; each token's value in
                # the dict is its position in that sorted list, which matches
                # the internal vocabulary of sklearn's TfidfVectorizer
                tmp_keys = sorted(self.vocabulary_.keys())
                tmp_dict = {tmp_keys[i]: i for i in range(len(tmp_keys))}
                # keep the keys in the order they were seen, but remap every
                # value to its new alphabetical index, including the new token
                self.vocabulary_ = {k: tmp_dict[k] for k in self.vocabulary_}
                # update the number of features by 1 for data validation
                self._tfidf.n_features_in_ += 1
                # keep a list of the indices of all new tokens
                indices_to_insert.append(self.vocabulary_[w])
        # recover document frequencies by inverting the idf formula
        doc_frequency = (self.n_docs + self.smooth_idf) / np.exp(
            self.idf_ - 1
        ) - self.smooth_idf
        # insert zero counts at the new token indices
        for index_to_insert in indices_to_insert:
            doc_frequency = np.insert(doc_frequency, index_to_insert, 0)
        self.n_docs += 1
        # document frequency does not depend on how many times a token
        # appears in a doc, only on whether it appears at all
        for w in set(tokens):
            doc_frequency[self.vocabulary_[w]] += 1
        # update the internal inverse document frequency
        idf = np.log(
            (self.n_docs + self.smooth_idf) / (doc_frequency + self.smooth_idf)
        ) + 1
        # these values are updated so that `transform` returns correct results
        self._tfidf.idf_ = idf
        self._tfidf._idf_diag = dia_matrix((idf, 0), shape=(len(idf), len(idf)))
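
For completeness, a minimal usage sketch (my illustration, not part of the answer): monkey-patch the corrected method onto TfidfVectorizer just like the first answer does, and remember that n_docs must still be initialized by hand after the initial fit:

from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVectorizer.partial_fit = _partial_fit

vec = TfidfVectorizer()
vec.fit(['here is some text blah blah', 'another text object'])
vec.n_docs = 2  # must be set manually after the initial fit
vec.partial_fit(['a new incoming document'])
print(vec.transform(['a new incoming document']).toarray())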