如何解决以下错误:
dist = np.sum(train_data_features, axis=0)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/fromnumeric.py", line 1711, in sum
return sum(axis=axis, dtype=dtype, out=out)
TypeError: sum() got an unexpected keyword argument 'dtype'
这是我的代码:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from KaggleWord2VecUtility import KaggleWord2VecUtility
import pandas as pd
import numpy as np
if __name__ == '__main__':
    # Load the train/test CSVs from the "data" directory next to this script.
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    train = pd.read_csv(os.path.join(data_dir, 'NYTimesBlogTrain.csv'), header=0)
    test = pd.read_csv(os.path.join(data_dir, 'NYTimesBlogTest.csv'), header=0)

    # NOTE: the earlier bare `train["Abstract"].fillna(0)` call discarded its
    # result and therefore changed nothing; it has been dropped.  Missing
    # abstracts are handled explicitly in the cleaning loop below.

    print('A sample Abstract is:')
    print(train["Abstract"][0])

    # Cleaned abstracts, one string per processed row.
    clean_train_reviews = []

    print("Cleaning and parsing the training set abstracts...\n")
    # Only the first 10 rows are processed here; iterate over
    # len(train["Abstract"]) instead to clean the whole training set.
    for i in range(10):
        abstract = train["Abstract"][i]
        if pd.isnull(abstract):
            # Missing abstract: append a placeholder so that list positions
            # still correspond to row numbers.
            clean_train_reviews.append(" ")
        else:
            clean_train_reviews.append(" ".join(
                KaggleWord2VecUtility.review_to_wordlist(abstract, True)))
    print(clean_train_reviews)

    # ****** Create a bag of words from the training set
    print("Creating the bag of words...\n")

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    # fit_transform() does two things: first, it fits the model and learns
    # the vocabulary; second, it transforms the training data into feature
    # vectors.  Its input should be a list of strings.
    print(clean_train_reviews)
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    print('train_data_features')
    print(train_data_features)
    print(train_data_features.shape)

    # Take a look at the words in the vocabulary.
    vocab = vectorizer.get_feature_names()
    print(vocab)

    # Sum up the counts of each vocabulary word.  fit_transform() returns a
    # scipy sparse matrix, and np.sum(train_data_features, axis=0) fails on
    # it with "TypeError: sum() got an unexpected keyword argument 'dtype'",
    # so call the sparse matrix's own sum() method instead.
    dist = train_data_features.sum(axis=0)
看起来不能用 np.sum 对向量化器(CountVectorizer)返回的结果求和,因为它返回的是 scipy 稀疏矩阵。你需要换一种求和方式,它应该可以在 scipy 的稀疏矩阵库(scipy.sparse)中找到,很可能只需调用稀疏矩阵自带的 sum 方法:
dist = train_data_features.sum (axis=0)
这是我从 scipy.sparse.coo_matrix 的 sum 方法文档中查到的。详细信息请见下文。
来自 sklearn 文档:
此实现使用 scipy.sparse.coo_matrix生成计数的稀疏表示形式。
在谷歌上搜索此类错误,可以找到如下说明:
这种用法(对稀疏矩阵调用 np.sum)从来就行不通,因为 numpy 对 scipy.sparse 一无所知。