如何把整数编码还原成原始数据?我的 model.predict() 不起作用



我有一个 CSV 文件,包含 'gender'(性别)、'diagnosis'(诊断)、'test'(检查)、'physical_exam'(体格检查)和 'medicine'(药物)这些列。我想根据 'gender'、'diagnosis'、'test'、'physical_exam' 这四列来预测 'medicine' 列。我目前的代码如下:

import nltk
import re
import pandas as pd
from io import StringIO
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
# English stop words from the NLTK corpus, kept in a set for fast membership tests.
stop_words = set(stopwords.words("english"))


def stop_words_filtering(wordlist):
    """Return the tokens of *wordlist* that are not in ``stop_words``.

    The comparison is an exact string match, so tokens are only dropped
    when their spelling and case match an entry of ``stop_words``.
    """
    return [w for w in wordlist if w not in stop_words]
def tagg(wordlist):
    """Part-of-speech tag a list of tokens with ``nltk.pos_tag``.

    Args:
        wordlist: list of token strings.

    Returns:
        The list of ``(token, tag)`` pairs produced by ``nltk.pos_tag``.
        If tagging raises, the error is printed and an empty list is
        returned so that the next step of the cleaning pipeline (which
        iterates over the pairs) still receives an iterable instead of None.
    """
    try:
        return nltk.pos_tag(wordlist)
    except Exception as e:
        # Best-effort by design: report the problem and carry on with no tokens.
        print(e)
        return []
# A digit together with the single character that follows it (e.g. "1." or "2)").
_NUMBERING_RE = re.compile(r"\d.")
# Any character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")


def filter_words_by_parts_of_speach(words):
    """Return the tokens from ``(token, tag)`` pairs whose tag is neither 'CD' nor ''."""
    return [word for word, pp in words if pp != 'CD' and pp != '']


def join_words(words):
    """Join an iterable of tokens into one space-separated string."""
    return ' '.join(words)


def remove_numbering(string):
    """Delete every digit plus the one character following it from *string*.

    The pattern needs the backslash (``\\d``); without it the regex matches
    the letter "d" and strips ordinary text instead of list numbering.
    """
    return _NUMBERING_RE.sub("", string)


def remove_punchuation(string):
    """Delete every character of *string* that is not a word character or whitespace.

    The character class needs the backslashes (``[^\\w\\s]``); without them
    it removes everything except the literal letters "w" and "s".
    """
    return _PUNCTUATION_RE.sub('', string)
# Load the prescription records and keep only the columns used below.
df = pd.read_csv('Pescription_details.csv')
col = ['gender', 'diagnosis', 'test', 'physical_exam', 'medicine']
df = df[col]
# Turn missing cells into empty strings so the string pipeline never sees NaN.
df = df.replace(np.nan, '', regex=True)
df.columns = ['gender', 'diagnosis', 'test', 'physical_exam', 'medicine']
for colm in col:
    # Flatten embedded line breaks first ('\n', not the letter "n").
    df[colm] = df[colm].str.replace('\n', ' ')
    # Clean each cell: strip numbering and punctuation, tokenise, drop stop
    # words, POS-tag, drop 'CD'/untagged tokens, then re-join into one string.
    df[colm] = (
        df[colm]
        .apply(remove_numbering)
        .apply(remove_punchuation)
        .apply(word_tokenize)
        .apply(stop_words_filtering)
        .apply(tagg)
        .apply(filter_words_by_parts_of_speach)
        .apply(join_words)
    )
# df.to_csv('keyword.csv')
# Replace every column by integer codes. pd.factorize returns (codes, uniques);
# taking [0] keeps only the codes, so the original strings can no longer be
# recovered from df after this line.
df = df.apply(lambda x: pd.factorize(x)[0])
#print(df.head(10))
# Features are the four input columns; the target is the 'medicine' column.
X=df[['gender','diagnosis','test','physical_exam']]
y=df[['medicine']]

# NOTE(review): the vectorizer is fit on the DataFrame itself, not on a column
# of text. The answer further down reports that this yields a vocabulary made
# of the column names only -- confirm before relying on `vect`.
vect = CountVectorizer()
vect.fit(X)
# simple_train_dtm, percent and rds are assigned here but not used again in
# the visible script.
simple_train_dtm = vect.transform(X)
percent = 0.0
rds=0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=138)
# Flatten the single-column target frames into 1-D arrays.
y_train, y_test=y_train.values.ravel(),y_test.values.ravel()
# A second vectorizer replaces the first; X_train_dtm is not used afterwards.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
# The result of this lookup is discarded.
vect.vocabulary_.get(u'algorithm')
# Both classifiers are fit on X_train (the DataFrame columns), not on X_train_dtm.
nb = MultinomialNB()
nb.fit(X_train,y_train)
y_pred_class = nb.predict(X_test)
# Accuracy on the held-out split, as a percentage.
percentage=metrics.accuracy_score(y_test, y_pred_class)*100
print(percentage)
ber=BernoulliNB()
ber.fit(X_train,y_train)
y_pred_class = ber.predict(X_test)
percentage=metrics.accuracy_score(y_test, y_pred_class)*100
print('bernoli',percentage)
# Attempt to predict for one new record. Each field is vectorised on its own
# and the four resulting sparse matrices are passed to predict() as a Python
# list; the traceback quoted after this script shows predict() rejecting that
# input with "Expected 2D array, got 1D array instead".
# NOTE(review): `ber` was fit on X_train, so predict() presumably needs a
# single 2-D input laid out like X_train -- confirm.
print(ber.predict( [vect.transform(["Male"]),
vect.transform(["Old inferior myocardial infarction.Occasional chest pain on lifting weight at shop."]),
vect.transform(["Electrocardiogram:  Old inferior myocardial infarction. FBS:  95 mg/dL. Creatinine: 1.99 mg/dL. SGPT:  Normal. Fasting lipid profile:  Normal. Echo:  Akinetic inferior wall."]),
vect.transform(["Chest:  Clear. 1st heart sound and 2nd heart sound: Audible."])
] ))

但是这段代码运行时报错,输出如下:


47.02702702702703
bernoli 51.891891891891895
Traceback (most recent call last):
File "backend.py", line 90, in <module>
vect.transform(["Chest:  Clear. 1st heart sound and 2nd heart sound: Audible."])
File "/home/android/.local/lib/python3.6/site-packages/sklearn/naive_bayes.py", line 65, in predict
jll = self._joint_log_likelihood(X)
File "/home/android/.local/lib/python3.6/site-packages/sklearn/naive_bayes.py", line 943, in _joint_log_likelihood
X = check_array(X, accept_sparse='csr')
File "/home/android/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 521, in check_array
"if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=[<1x5 sparse matrix of type '<class 'numpy.int64'>'
with 0 stored elements in Compressed Sparse Row format>
<1x5 sparse matrix of type '<class 'numpy.int64'>'
with 0 stored elements in Compressed Sparse Row format>
<1x5 sparse matrix of type '<class 'numpy.int64'>'
with 0 stored elements in Compressed Sparse Row format>
<1x5 sparse matrix of type '<class 'numpy.int64'>'
with 0 stored elements in Compressed Sparse Row format>
<1x5 sparse matrix of type '<class 'numpy.int64'>'
with 0 stored elements in Compressed Sparse Row format>].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

如何预测 'medicine'(药物)列?这段代码的输出应为"奥美沙坦、氯吡格雷、瑞舒伐他汀 5、伊伐布雷定"。这些是药物的名称。

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Dummy DataFrame with two text columns ('text', 'Gender') and an integer label.
df = pd.DataFrame(
    [
        ['Old inferior myocardial infarction.Occasional chest pain on lifting weight at shop.', 1, 'Male'],
        ['Chest:  Clear. 1st heart sound and 2nd heart sound: Audible.', 0, 'Female'],
    ],
    columns=['text', 'label', 'Gender'],
)

# Assign x the way the question does and pass the DataFrame to CountVectorizer.
x = df[['text', 'Gender']]
vect = CountVectorizer()  # must be an instance, not the bare class
x_train = vect.fit_transform(x)
# The vocabulary now holds only the column names, not the words that appear
# in the 'text' and 'Gender' columns.
print(vect.vocabulary_)
# Output:
# {'text': 1, 'gender': 0}

# To deal with this, concatenate all string columns into one text column.
df['combined_text'] = df['text'] + ' ' + df['Gender']
x_train = vect.fit_transform(df['combined_text'])
# The vocabulary now contains every word present in the combined text column.
print(vect.vocabulary_)
# Output:
# {'old': 15, 'inferior': 10, 'myocardial': 13, 'infarction': 9, 'occasional': 14, 'chest': 5, 'pain': 17, 'on': 16, 'lifting': 11, 'weight': 20, 'at': 3, 'shop': 18, 'male': 12, 'clear': 6, '1st': 0, 'heart': 8, 'sound': 19, 'and': 2, '2nd': 1, 'audible': 4, 'female': 7}

# Any classifier works here; MultinomialNB is one of the two the question uses.
model = MultinomialNB()
model.fit(x_train, df['label'])
# The question calls vect.transform once per field inside predict(), which
# creates separate sparse matrices and raises the error. Pass all texts to a
# single transform() call instead, so predict() receives one 2-D matrix.
model.predict(vect.transform(['old inferior myocardial', 'Clear. 1st heart sound']))
# Two texts were passed, so two predictions come back:
# array([1, 0], dtype=int64)

最新更新