如何获取scipy.sparse.csr.csr_matrix的log2



这段代码我在np.log2()运行错误。如果我将其作为输入而不是tmptmp.data,它将起作用.这使我得出结论np.log2()不适用于scipy.sparse.csr.csr_matrix,但可以通过调用csr_matrix.datascipy.sparse.csr.csr_matrix转换为ndarray

唯一的问题是,我可以更改的代码下面的代码期望得到scipy.sparse.csr.csr_matrix.我该怎么办?

import numpy as np
from numpy.linalg import norm
import scipy
from scipy.sparse import csr_matrix, diags
from scipy.sparse.linalg import svds
import sys
if len(sys.argv) != 2:
print("Usage: {} output_filename".format(sys.argv[0]))
sys.exit(1)
# here are a few helper functions you might find useful...
def multiply_by_rows(matrix, coefficients):
diag = diags(coefficients, 0)
return diag * matrix
def multiply_by_columns(matrix, coefficients):
diag = diags(coefficients, 0)
return matrix * diag
print("Loading cooccurrence matrix...")
with np.load("cooccur.npz") as loader:
PPMI = csr_matrix((loader['data'], loader['indices'],
loader['indptr']), shape=loader['shape'])
print("Computing PPMI...")
print(type(PPMI))
##### FILL IN THE CODE HERE #####
# you should compute the PMI matrix and save it into the PPMI variable.
# Hint: the following functions/attributes might be useful:
#
# - csr_matrix.sum()
# - csr_matrix.data
# - array.sum()
# - np.reciprocal()
# - np.log2()
# - multiply_by_rows() (above)
# - multiply_by_columns() (above)
# Refer to page 16 of chapter word embeddings
sum_of_all_values = csr_matrix.sum(PPMI)
sum_of_all_cols   = csr_matrix.sum(PPMI,axis=0)
sum_of_all_rows   = csr_matrix.sum(PPMI,axis=1) 
print(type(sum_of_all_values))
joint_probabilities = PPMI / sum_of_all_values
joint_probabilities_cols = sum_of_all_cols / sum_of_all_values
joint_probabilities_rows = sum_of_all_rows / sum_of_all_values
print(type(joint_probabilities))
tmp = multiply_by_rows(joint_probabilities, np.reciprocal(np.squeeze(np.asarray(joint_probabilities_cols))))
tmp = multiply_by_columns(tmp , np.reciprocal(np.squeeze(np.asarray(joint_probabilities_rows))))
print(type(tmp))
PPMI = np.log2(tmp)
print(type(PPMI))
##### STOP FILLING IN THE CODE HERE
# At this point, PPMI is actually PMI, so let's drop all negative values,
# sparsify, and then compute rank-50 SVD
#PPMI = max(0, PPMI)
PPMI.data[PPMI.data < 0] = 0
# sparisfy
PPMI.eliminate_zeros()
print("Computing SVD...")
u, s, vt = svds(PPMI, k = 50)
p = 1
emb = u * (s ** p)
# normalize embeddings to unit length so cos(x, y) == x.T * y
emb = (emb.T / norm(emb, axis=1, ord=2)).T
print("Saving embeddings...")
np.save(sys.argv[1], emb)
print("Done!")

输出和错误

Loading cooccurrence matrix...
Computing PPMI...
<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.float32'>
<class 'scipy.sparse.csr.csr_matrix'>
<class 'scipy.sparse.csr.csr_matrix'>
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-55-f37e357d0d51> in <module>()
44 #tmp3 = np.dot(tmp3, IC)
45 
---> 46 PPMI = np.log2(tmp)
47 print(type(PPMI))
48 ##### STOP FILLING IN THE CODE HERE
C:ProgramDataAnaconda3libsite-packagesscipysparsebase.py in __getattr__(self, attr)
684             return self.getnnz()
685         else:
--> 686             raise AttributeError(attr + " not found")
687 
688     def transpose(self, axes=None, copy=False):
AttributeError: log2 not found

这奏效了。

with np.load("cooccur.npz") as loader:
PPMI = csr_matrix((PPMI, loader['indices'],
loader['indptr']), shape=loader['shape']) 

仅使用csr_matrix (PPMI( 会给 SVD 带来形状问题

另一种方法是使用基数变化公式.csr_matrix.log1p据说采用我不需要的自然对数。基本的数学操作会给我们csr_matrix.log1p(PPMI(/csr_matrix.log1p(2(等同于np.log2(PPMI(

最新更新