不确定如何最好地说标题,所以我很抱歉...
我是Python机器学习的新手,仍然学习自己。我有此数据集(ml_test):
Sale ID Amount in $ Region Product Salesperson Win_Lose
1 500 North ink Jon 1
2 250 North ink Jon 0
3 250 North ink Jon 0
4 750 North paper Jon 0
5 800 North ink Bill 0
6 250 North paper Bill 1
7 750 North paper Jon 1
8 250 North ink Bill 1
9 250 North paper Dave 0
10 800 North desk chair Bill 1
11 750 South paper Dave 0
12 500 South desk chair Dave 1
13 500 South ink Bill 1
14 500 South ink Bill 0
15 400 South paper Jon 0
16 250 South paper Jon 0
17 250 South ink Jon 1
18 250 East ink Dave 1
19 250 East ink Bill 1
20 400 East ink Jon 0
21 400 East paper Dave 1
22 500 West desk chair Bill 0
23 750 West desk chair Jon 1
24 800 West desk chair Jon 0
25 450 West paper Jon 0
26 250 West ink Dave 1
27 250 West paper Dave 1
28 250 West paper Bill 1
29 250 West paper Bill 0
30 400 West ink Bill 1
我试图了解运行此操作时遇到的错误:
#Load Libraries
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import pyodbc
conn = pyodbc.connect('')
sql = "Select * from TMP.ML_TEST"
dataset = pd.read_sql(sql, conn)
array = dataset.values
X = array[:,0:5]
Y = array[:,5]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
print(Y)
seed = 7
scoring = 'accuracy'
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=12, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
这是我遇到的错误:
ValueError Traceback (most recent call last)
<ipython-input-119-86bed78dded1> in <module>()
12 for name, model in models:
13 kfold = model_selection.KFold(n_splits=12, random_state=seed)
---> 14 cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
15 results.append(cv_results)
16 names.append(name)
C:ProgramDataAnaconda3libsite-packagessklearnmodel_selection_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
138 train, test, verbose, None,
139 fit_params)
--> 140 for train, test in cv_iter)
141 return np.array(scores)[:, 0]
142
C:ProgramDataAnaconda3libsite-packagessklearnexternalsjoblibparallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
C:ProgramDataAnaconda3libsite-packagessklearnexternalsjoblibparallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
C:ProgramDataAnaconda3libsite-packagessklearnexternalsjoblibparallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
C:ProgramDataAnaconda3libsite-packagessklearnexternalsjoblib_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
C:ProgramDataAnaconda3libsite-packagessklearnexternalsjoblib_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
C:ProgramDataAnaconda3libsite-packagessklearnexternalsjoblibparallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:ProgramDataAnaconda3libsite-packagessklearnexternalsjoblibparallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:ProgramDataAnaconda3libsite-packagessklearnmodel_selection_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
236 estimator.fit(X_train, **fit_params)
237 else:
--> 238 estimator.fit(X_train, y_train, **fit_params)
239
240 except Exception as e:
C:ProgramDataAnaconda3libsite-packagessklearnlinear_modellogistic.py in fit(self, X, y, sample_weight)
1171
1172 X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64,
-> 1173 order="C")
1174 check_classification_targets(y)
1175 self.classes_ = np.unique(y)
C:ProgramDataAnaconda3libsite-packagessklearnutilsvalidation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
519 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
520 ensure_2d, allow_nd, ensure_min_samples,
--> 521 ensure_min_features, warn_on_dtype, estimator)
522 if multi_output:
523 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
C:ProgramDataAnaconda3libsite-packagessklearnutilsvalidation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
380 force_all_finite)
381 else:
--> 382 array = np.array(array, dtype=dtype, order=order, copy=copy)
383
384 if ensure_2d:
ValueError: could not convert string to float: 'Jon'
我真的很想使用幼稚的bayes模型,因为我的许多功能都是文本,但是我什至无法克服此错误:(
我正在尝试建立一个模型,以预测销售是基于这些功能而赢或损失的。
您正在尝试在由strings
组成的功能向量上应用ML技术。这是不可能的,因为数学操作仅针对double
/float
值定义。要解决此错误,您必须将此字符串标签转换为数值表示。为此,您可以使用sklearn.preprocessing.LabelEncoder
。
>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"])
array([2, 2, 1]...)
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']
您可以在文档中找到更多信息。