I am using Python 2.7.5 on a Linux CentOS 7 machine. I am trying to apply a Probabilistic Neural Network (PNN) to my dataset in order to solve a binary classification problem.
I am using the following Python packages: numpy, sklearn, neupy.algorithms. I am trying to follow this example for the iris dataset.
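For reference, this is roughly the train/predict pattern that the iris example follows (a minimal sketch, not the exact snippet from the example; the split size and std value here are my own choices):

import numpy as np
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from neupy.algorithms import PNN

# Split the iris data and fit a PNN; "training" a PNN essentially stores the samples
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, train_size=0.7)
pnn = PNN(std=0.1, verbose=False)  # std is the spread of the Gaussian kernel
pnn.train(x_train, y_train)
predictions = pnn.predict(x_test)
print("Accuracy: {}".format(np.mean(predictions == y_test)))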
The problem is that my PNN always predicts zeros (every element is classified as zero), and I cannot understand why…
This is my dataset ("dataset_file.csv"). There are 34 features and 1 target label (the last column, which can be 0 or 1):
47,1,0,1,0,20,1,0,1,24,1,1,0,2,1,8050,9,1,274,60,258,65,7,3.2,105,289,0,0,79,1,0,0,0,34,0
55,1,0,1,0,45,1,0,0,1,1,1,1,3,0,11200,7,0,615,86,531,97,5.4,2.6,96,7541,1.6,0.8,6,1,1,1,1,42,0
29,1,1,1,0,23,0,1,0,1,0,0,0,2,1,5300,12,1,189,30,203,72,7,3.5,93,480,0,0,90,1,0,0,0,43,1
39,1,0,1,0,10,1,0,0,3,0,1,1,0,1,7910,14,1,462,28,197,50,8,4.5,93,459,5,2.8,45,1,1,0,0,21,0
47,1,0,1,0,10,1,1,1,1.5,1,1,0,3,1,9120,4,0,530,71,181,60,6.2,3.8,83,213,3.6,1.95,53,1,1,0,0,11,0
57,1,0,1,0,50,0,1,0,24,1,0,1,3,1,16000,9,0,330,78,172,74,5.9,2.9,112,332,4.1,2.1,82,1,1,0,0,23,1
44,1,0,1,0,15,1,1,0,0.5,1,1,1,2,0,5800,14,0,275,44,155,105,7.2,3.5,84,360,3.44,1.6,55,1,1,0,0,24,0
49,1,3,1,0,25,1,1,1,1,0,1,0,3,1,8200,12,1,441,74,237,111,6.2,3.6,79,211,0,0,91,1,0,0,0,43,0
56,1,0,1,0,5,1,0,0,3,1,0,0,3,1,5100,7,1,188,58,185,62,7.8,3.9,112,472,0,0,83,1,0,0,0,34,0
33,1,4,1,0,20,1,0,1,3,0,0,0,3,1,7300,10,1,329,40,139,80,6.9,3.7,89,122,3.4,1.2,75,1,1,0,0,33,0
22,0,0,1,0,15,1,0,0,1,1,1,1,0,1,3700,8,0,617,53,267,128,6.2,3.8,91,3060,3.1,1.9,63,1,1,0,0,54,0
82,0,5,1,0,60,1,0,1,3,1,1,1,0,0,8900,11,1,275,83,255,93,5.9,3.1,95,455,4.8,1.9,68,1,1,0,0,55,0
49,0,2,1,0,20,1,0,1,2,1,0,0,0,1,8500,6,1,292,84,241,79,6.8,3.9,100,158,3.4,1.25,75,1,1,1,0,65,0
51,1,4,1,0,51,1,1,1,2,1,0,1,0,1,18300,14,1,522,91,268,105,6.1,3.1,98,758,4.2,2.5,19,1,1,1,1,67,0
61,1,2,1,0,20,1,0,0,3,1,0,0,3,1,6600,9,1,563,101,268,78,6.4,3.7,115,694,5.2,3,29,1,1,1,1,77,0
48,0,1,1,0,28,1,0,0,12,1,0,0,3,1,9100,22,0,114,18,165,63,7.2,3.6,103,429,0,0,84,1,0,0,0,34,0
57,0,0,1,0,40,1,0,1,1,0,0,0,3,1,8100,8,0,264,15,120,69,6.8,3.4,91,390,0,0,91,1,0,0,0,23,0
57,0,0,1,0,25,1,0,0,12,0,1,0,0,0,6900,16,0,847,111,289,78,5.3,2.4,105,162,3.1,1.9,68,1,1,1,0,78,0
47,1,4,1,0,40,0,1,1,6,1,1,1,2,1,21500,10,0,632,121,219,108,7.5,2.8,149,1158,3.17,1.77,8,1,1,1,0,58,1
52,0,0,1,0,30,1,1,1,2,1,0,1,0,0,14600,5,1,405,88,280,140,5.8,3.1,121,983,3.9,1.8,17,1,1,1,1,76,0
50,1,2,1,0,16,1,1,0,1,1,1,1,0,1,12200,9,1,280,7,176,71,7.4,4.2,105,293,4.5,2.7,68,1,1,0,0,67,0
63,1,4,1,0,18,1,0,1,0.5,1,1,1,3,0,16400,8,0,479,93,140,64,5.8,3.7,226,1286,6.22,3.6,18,1,1,0,0,19,0
54,0,0,1,0,20,0,0,1,8,0,0,1,0,1,7200,10,0,366,71,284,73,6.4,3.7,114,384,4.1,2.8,65,1,1,0,0,24,1
31,0,3,1,0,10,0,1,0,1,1,1,1,1,1,3800,8,0,568,102,236,59,6.4,3.7,99,387,0,0,78,1,0,0,0,45,1
44,0,6,1,0,10,1,1,0,2,1,1,1,0,1,7700,15,1,274,44,139,62,6.7,4.1,93,129,0,0,76,1,0,0,0,24,0
50,0,6,1,0,20,1,0,0,3,1,1,0,0,1,5200,6,0,403,90,224,79,6.3,3.1,109,151,3.1,1.4,79,1,0,0,0,34,0
61,1,3,1,0,30,0,1,0,3,1,0,1,2,1,11500,7,0,668,88,178,65,6.7,3.08,104,680,4.1,2.5,22,1,1,0,0,23,1
Here is my Python code:
import numpy as np
from sklearn import datasets
from sklearn.metrics import matthews_corrcoef
from sklearn.cross_validation import StratifiedKFold
from neupy.algorithms import PNN
fileName="dataset_file.csv"
TARGET_COLUMN=35
from numpy import genfromtxt
input_dataset_data = genfromtxt(fileName, delimiter=',', skip_header=0, usecols=(range(0, TARGET_COLUMN-1)))
#print(input_dataset_data)
input_dataset_target = genfromtxt(fileName, delimiter=',', skip_header=0, usecols=(TARGET_COLUMN-1))
#print(input_dataset_target)
kfold_number = 2
skfold = StratifiedKFold(input_dataset_target, kfold_number, shuffle=True)
average_result = 0
print("> Start classify input_dataset dataset")
for i, (train, test) in enumerate(skfold, start=1):
    # Train a PNN on this fold and evaluate it on the held-out part
    pnn_network = PNN(std=0.1, step=0.2, verbose=True)
    pnn_network.train(input_dataset_data[train], input_dataset_target[train])
    predictions = pnn_network.predict(input_dataset_data[test])
    print(predictions)
    #print(input_dataset_target[test])
    mcc = matthews_corrcoef(input_dataset_target[test], predictions)
    print("The Matthews correlation coefficient is %f" % mcc)
    print("kfold #{:<2}: Guessed {} out of {}".format(
        i, np.sum(predictions == input_dataset_target[test]), test.size
    ))
Does anyone know why I only get predictions of 0? Can you give me some advice on how to solve this problem?
Thanks!
EDIT: here is the normalized dataset (normalized column by column); a sketch of the rescaling I used follows the data:
0.55,1,0,1,0,0.29,1,0,1,0.46,1,1,0,0.67,1,0.37,0.41,1,0.08,0.47,0.23,0.13,0.82,0.46,0.25,0.04,0,0,0.52,1,0,0,0,0.33,0
0.65,1,0,1,0,0.64,1,0,0,0.02,1,1,1,1,0,0.52,0.32,0,0.18,0.67,0.47,0.2,0.64,0.38,0.23,1,0.24,0.18,0.04,1,1,1,1,0.41,0
0.34,1,0.13,1,0,0.33,0,0.5,0,0.02,0,0,0,0.67,1,0.25,0.55,1,0.06,0.23,0.18,0.15,0.82,0.51,0.22,0.06,0,0,0.6,1,0,0,0,0.42,1
0.46,1,0,1,0,0.14,1,0,0,0.06,0,1,1,0,1,0.37,0.64,1,0.14,0.22,0.17,0.1,0.94,0.65,0.22,0.06,0.75,0.64,0.3,1,1,0,0,0.2,0
0.55,1,0,1,0,0.14,1,0.5,1,0.03,1,1,0,1,1,0.42,0.18,0,0.16,0.55,0.16,0.12,0.73,0.55,0.2,0.03,0.54,0.44,0.35,1,1,0,0,0.11,0
0.67,1,0,1,0,0.71,0,0.5,0,0.46,1,0,1,1,1,0.74,0.41,0,0.1,0.6,0.15,0.15,0.69,0.42,0.27,0.04,0.61,0.48,0.54,1,1,0,0,0.22,1
0.52,1,0,1,0,0.21,1,0.5,0,0.01,1,1,1,0.67,0,0.27,0.64,0,0.08,0.34,0.14,0.21,0.85,0.51,0.2,0.05,0.51,0.36,0.36,1,1,0,0,0.23,0
0.58,1,0.38,1,0,0.36,1,0.5,1,0.02,0,1,0,1,1,0.38,0.55,1,0.13,0.57,0.21,0.23,0.73,0.52,0.19,0.03,0,0,0.6,1,0,0,0,0.42,0
0.66,1,0,1,0,0.07,1,0,0,0.06,1,0,0,1,1,0.24,0.32,1,0.06,0.45,0.16,0.13,0.92,0.57,0.27,0.06,0,0,0.55,1,0,0,0,0.33,0
0.39,1,0.5,1,0,0.29,1,0,1,0.06,0,0,0,1,1,0.34,0.45,1,0.1,0.31,0.12,0.16,0.81,0.54,0.21,0.02,0.51,0.27,0.5,1,1,0,0,0.32,0
0.26,0,0,1,0,0.21,1,0,0,0.02,1,1,1,0,1,0.17,0.36,0,0.19,0.41,0.24,0.26,0.73,0.55,0.22,0.41,0.46,0.43,0.42,1,1,0,0,0.52,0
0.96,0,0.63,1,0,0.86,1,0,1,0.06,1,1,1,0,0,0.41,0.5,1,0.08,0.64,0.23,0.19,0.69,0.45,0.23,0.06,0.72,0.43,0.45,1,1,0,0,0.53,0
0.58,0,0.25,1,0,0.29,1,0,1,0.04,1,0,0,0,1,0.4,0.27,1,0.09,0.65,0.21,0.16,0.8,0.57,0.24,0.02,0.51,0.28,0.5,1,1,1,0,0.63,0
0.6,1,0.5,1,0,0.73,1,0.5,1,0.04,1,0,1,0,1,0.85,0.64,1,0.16,0.71,0.24,0.21,0.72,0.45,0.23,0.1,0.63,0.57,0.13,1,1,1,1,0.65,0
0.72,1,0.25,1,0,0.29,1,0,0,0.06,1,0,0,1,1,0.31,0.41,1,0.17,0.78,0.24,0.16,0.75,0.54,0.27,0.09,0.78,0.68,0.19,1,1,1,1,0.75,0
0.56,0,0.13,1,0,0.4,1,0,0,0.23,1,0,0,1,1,0.42,1,0,0.03,0.14,0.15,0.13,0.85,0.52,0.24,0.06,0,0,0.56,1,0,0,0,0.33,0
0.67,0,0,1,0,0.57,1,0,1,0.02,0,0,0,1,1,0.38,0.36,0,0.08,0.12,0.11,0.14,0.8,0.49,0.22,0.05,0,0,0.6,1,0,0,0,0.22,0
0.67,0,0,1,0,0.36,1,0,0,0.23,0,1,0,0,0,0.32,0.73,0,0.25,0.86,0.26,0.16,0.62,0.35,0.25,0.02,0.46,0.43,0.45,1,1,1,0,0.76,0
0.55,1,0.5,1,0,0.57,0,0.5,1,0.12,1,1,1,0.67,1,1,0.45,0,0.19,0.94,0.19,0.22,0.88,0.41,0.35,0.15,0.47,0.4,0.05,1,1,1,0,0.56,1
0.61,0,0,1,0,0.43,1,0.5,1,0.04,1,0,1,0,0,0.68,0.23,1,0.12,0.68,0.25,0.29,0.68,0.45,0.29,0.13,0.58,0.41,0.11,1,1,1,1,0.74,0
0.59,1,0.25,1,0,0.23,1,0.5,0,0.02,1,1,1,0,1,0.57,0.41,1,0.08,0.05,0.16,0.15,0.87,0.61,0.25,0.04,0.67,0.61,0.45,1,1,0,0,0.65,0
0.74,1,0.5,1,0,0.26,1,0,1,0.01,1,1,1,1,0,0.76,0.36,0,0.14,0.72,0.12,0.13,0.68,0.54,0.54,0.17,0.93,0.82,0.12,1,1,0,0,0.18,0
0.64,0,0,1,0,0.29,0,0,1,0.15,0,0,1,0,1,0.33,0.45,0,0.11,0.55,0.25,0.15,0.75,0.54,0.27,0.05,0.61,0.64,0.43,1,1,0,0,0.23,1
0.36,0,0.38,1,0,0.14,0,0.5,0,0.02,1,1,1,0.33,1,0.18,0.36,0,0.17,0.79,0.21,0.12,0.75,0.54,0.24,0.05,0,0,0.52,1,0,0,0,0.44,1
0.52,0,0.75,1,0,0.14,1,0.5,0,0.04,1,1,1,0,1,0.36,0.68,1,0.08,0.34,0.12,0.13,0.79,0.59,0.22,0.02,0,0,0.5,1,0,0,0,0.23,0
0.59,0,0.75,1,0,0.29,1,0,0,0.06,1,1,0,0,1,0.24,0.27,0,0.12,0.7,0.2,0.16,0.74,0.45,0.26,0.02,0.46,0.32,0.52,1,0,0,0,0.33,0
0.72,1,0.38,1,0,0.43,0,0.5,0,0.06,1,0,1,0.67,1,0.53,0.32,0,0.2,0.68,0.16,0.13,0.79,0.45,0.25,0.09,0.61,0.57,0.15,1,1,0,0,0.22,1
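A minimal sketch of how such a column-wise rescaling can be produced with numpy (the helper name normalize_columns is my own; sklearn.preprocessing.MinMaxScaler would be an equivalent option):

import numpy as np

def normalize_columns(data):
    # Map every column to [0, 1]; constant columns are kept at 0
    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0  # avoid division by zero
    return (data - col_min) / col_range

normalized_data = normalize_columns(input_dataset_data)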
I tried your code, and as far as I can see your network predicts both classes:
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from neupy.algorithms import PNN
fileName="dataset_file.csv"
TARGET_COLUMN=35
from numpy import genfromtxt
input_dataset_data = genfromtxt(fileName, delimiter=',', skip_header=0, usecols=(range(0, TARGET_COLUMN-1)))
input_dataset_target = genfromtxt(fileName, delimiter=',', skip_header=0, usecols=(TARGET_COLUMN-1))
kfold_number = 5
skfold = StratifiedKFold(input_dataset_target, kfold_number, shuffle=True)
print("> Start classify input_dataset dataset")
# Sweep over several std values and report the average accuracy across folds
for std in [0.2, 0.4, 0.6, 0.8, 1]:
    average_results = []
    for i, (train, test) in enumerate(skfold, start=1):
        pnn_network = PNN(std=std, step=0.2, verbose=False, batch_size=2)
        pnn_network.train(input_dataset_data[train], input_dataset_target[train])
        predictions = pnn_network.predict(input_dataset_data[test])
        print("Positive in predictions:", 1 in predictions)
        average_results.append(np.sum(predictions == input_dataset_target[test]) / float(len(predictions)))
    print std, np.average(average_results)
Example output while tuning std:
1 0.881558441558
('Positive in predictions:', True)
('Positive in predictions:', True)
('Positive in predictions:', True)
('Positive in predictions:', True)
('Positive in predictions:', True)
That is, with std=1 the average accuracy is ~0.88, and the positive class (1) shows up in the predictions of every stratified cross-validation fold. I used the normalized data from your edit.
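If you also want the Matthews correlation coefficient from your original code, you can compute it per fold with the tuned std in the same setup; a quick sketch (reusing skfold, input_dataset_data and input_dataset_target defined above, with std=1 taken from the sweep):

from sklearn.metrics import matthews_corrcoef

for i, (train, test) in enumerate(skfold, start=1):
    # Refit on each training fold and score the held-out fold with MCC
    pnn_network = PNN(std=1, step=0.2, verbose=False, batch_size=2)
    pnn_network.train(input_dataset_data[train], input_dataset_target[train])
    predictions = pnn_network.predict(input_dataset_data[test])
    print("kfold #{}: MCC = {:.3f}".format(i, matthews_corrcoef(input_dataset_target[test], predictions)))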