'no unique mode; found %d equally common values' % len(表)统计。统计错误:无唯一模式;找到 2 个同样常见的值



当我使用大量数据时显示此错误:('没有唯一模式;发现%d个相同的值'%len(表(统计信息。StatisticsError:没有唯一模式;发现2个相同的共同值(。但是使用100个数据,它是有效的。我不明白它不起作用的原因是什么。请帮助和如何解决这个错误

数据链接:https://github.com/YoeriNijs/TweetAnalyzer

代码:

import warnings
warnings.filterwarnings("ignore")
import nltk, random, csv, sys
from nltk.probability import FreqDist, ELEProbDist
from nltk.classify.util import apply_features,accuracy
from nltk.corpus import names
from nltk.tokenize import word_tokenize
import nltk.classify.util
from nltk import NaiveBayesClassifier
from textblob import TextBlob
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
def __init__(self, *classifiers):
self._classifiers = classifiers

def classify(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
return mode(votes)

def get_words_in_tweets(tweets):
all_words = []
try:
for (words, sentiment) in tweets:
all_words.extend(words)
return all_words
except Exception as e:
print(e)

def get_word_features(wordlist):
wordlist = FreqDist(wordlist)
word_features = wordlist.keys()
#print (word_features)
return word_features

def selectTweets(row):
tweetWords = []
words = row[0].split()
for i in words:
i = i.lower()
i = i.strip('@#'"?,.!')
tweetWords.append(i)
row[0] = tweetWords
if counter <= 120:
trainTweets.append(row)
#print(trainTweets)
#print(('*')*30)
else:
testTweets.append(row)
#print(testTweets)

def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features 

trainTweets = []
testTweets = []
#csvfile.csv
while True:
# Ask for filename
filename =  str(input("> Please enter a filename (.csv): "))
#Check if filename ends with .csv
if filename.endswith(".csv"):
try:
#Open file
with open(filename, 'r',encoding='utf-8') as csvfile: 
reader = csv.reader(csvfile, delimiter=';', quotechar='|')
#Print succes message
print ("> File opened successfully!")

counter = 0
for row in reader:
selectTweets(row)
counter += 1
print (counter,"> Wait a sec for the results...")
word_features = get_word_features(get_words_in_tweets(trainTweets))      

training_set = apply_features(extract_features, trainTweets)
test_training_set=apply_features(extract_features, testTweets)

classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
classifier.show_most_informative_features(5)
print (nltk.classify.util.accuracy(classifier,test_training_set))

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, test_training_set))
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, test_training_set))


LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_training_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_training_set))*100)
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_training_set))*100)
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_training_set))*100)
voted_classifier = VoteClassifier(classifier,
LinearSVC_classifier,
SGDClassifier_classifier,
MNB_classifier,
BNB_classifier,
LogisticRegression_classifier)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)

while True:

tweet =  str(input("Please enter the text of the tweet you want to analize: "))
print (classifier.classify(extract_features(tweet.split())))

while True:
print
repeat =  str(input("> Do you want to check another tweet (y/n)? "))
if repeat == "n":
print ("Exit program")
sys.exit()
if repeat != "y":
print ("Something went wrong")
if repeat == "y":
break         
#If file does not exist, display this"""
except IOError:
print ("File does not exist.")
#Else if file does not end with .csv, do this
else:
print ("Please open a file that ends with .csv")

显示此错误:

Traceback (most recent call last):
File "C:UsersNahidDesktopmain foldernewcheck.py", line 163, in         <module>
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)
File "C:UsersNahidAppDataLocalProgramsPythonPython36-32libsite-packagesnltkclassifyutil.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
File "C:UsersNahidAppDataLocalProgramsPythonPython36-32libsite-packagesnltkclassifyapi.py", line 77, in classify_many
return [self.classify(fs) for fs in featuresets]
File "C:UsersNahidAppDataLocalProgramsPythonPython36-32libsite-packagesnltkclassifyapi.py", line 77, in <listcomp>
return [self.classify(fs) for fs in featuresets]
File "C:UsersNahidDesktopmain foldernewcheck.py", line 35, in classify
return mode(votes)
File "C:UsersNahidAppDataLocalProgramsPythonPython36-32libstatistics.py", line 507, in mode

'没有唯一模式;找到%d个相同的通用值"%len(表("统计数字StatisticsError:没有唯一模式;发现两个相同的值

解决此问题的最简单方法是将Python升级到3.8或更高版本。

在Python 3.7及更高版本中,只能有一个数字在整个集合中出现次数最多。如果一个集合包含两个或更多这样的数字,则模式将变得不确定,并返回您得到的确切错误。

然而,自3.8版本以来,整个数学概念发生了变化。在一组中有两个或多个模式的情况下,会选择最小的模式作为结果。

示例:

result = statistics.mode([1,1,2,2,3,3])

有三种可能且相等的解决方案:123,因为每个数字在集合中出现两次

在Python 3.7中,这会返回一个错误,

在Python 3.8中,这将返回1作为模式

最新更新