我是ML的新手,曾尝试使用这个GitHub存储库来构建MNIST机器学习模型。
由于我必须从我的计算机导入数据集,我不得不稍微改变一下。我导入的数据集也不包括所有的10位数字,只包括5位。
计算的准确率为96%,但当我将计算机上的.png文件与结果txt进行交叉检查时,这些标签毫无意义。它将一些4标记为7,一些2标记为5,依此类推
这就是我电脑上的文件夹结构:
2
-->001.png
-->002.png
-->003.png
-->...
3
-->001.png
-->002.png
-->003.png
-->...
4
-->001.png
-->002.png
-->003.png
-->...
5
-->001.png
-->002.png
-->003.png
-->...
7
-->001.png
-->002.png
-->003.png
-->...
问题1:我之前有一个错误,它预计有8个不同的类别,因为7是最高数字的标签。我不知道如何解决这个问题,所以我将文件夹从0重命名为4。知道如何解决这个问题,而不必重命名所有文件夹吗?
问题2:你知道为什么结果没有任何意义吗?这似乎不是一个过度拟合的问题,我试着调整了训练测试的分配,但没有任何影响。
from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import time
#x, y = fetch_openml('mnist_784', version=1, return_X_y=True)
import os
from os import listdir
from os.path import isfile, join
import cv2
label_folder_training = []
label_files_training = []
total_size_training = 0
total_size_testing = 0
data_path_training = r"Training_data"
data_path_testing = r"Testing_data"
for root, dirs, files in os.walk(data_path_training):
for dir in dirs:
label_folder_training.append(dir)
total_size_training += len(files)
for file in files:
label_files_training.append(file)
for root, dirs, files in os.walk(data_path_testing):
total_size_testing += len(files)
#to ignore .DS_Store file
total_size_training = total_size_training - 1
total_size_testing = total_size_testing
print("found", total_size_training, "training files and", total_size_testing, "testing files.")
print("folder Training:",label_folder_training)
# Print returns the following:
#found 20000 training files and 5000 testing files.
#folder Training: ['0', '1', '4', '3', '2']
x = []
y = []
for i in range(len(label_folder_training)):
labelPath_training = os.path.join(data_path_training,label_folder_training[i])
FileName = [f for f in listdir(labelPath_training) if isfile(join(labelPath_training, f))]
for j in range(len(FileName)):
path = os.path.join(labelPath_training,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
x.append(img)
y.append(label_folder_training[i])
x = np.array(x)
x = np.reshape(x, (20000, 784))
x = (x/255).astype('float32')
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
import pandas as pd
class Module:
def __init__(self):
self._train = True
def forward(self, input):
raise NotImplementedError
def backward(self, input, grad_output):
raise NotImplementedError
def parameters(self):
"""
Returns list of its parameters
"""
return []
def grad_parameters(self):
"""
Returns list of tensors gradients of its parameters
"""
return []
def train(self):
self._train = True
def eval(self):
self._train = False
class Criterion:
def forward(self, input, target):
raise NotImplementedError
def backward(self, input, target):
raise NotImplementedError
class Linear(Module):
def __init__(self, dim_in, dim_out):
super().__init__()
self.W = np.random.randn(dim_in, dim_out)
self.b = np.random.randn(1, dim_out)
def forward(self, input):
self.output = (np.dot(input, self.W) + self.b)
return self.output
def backward(self, input, grad_output):
self.grad_b = np.mean(grad_output, axis=0)
self.grad_W = np.dot(input.T, grad_output)
self.grad_W /= input.shape[0]
grad_input = np.dot(grad_output, self.W.T)
return grad_input
def parameters(self):
return [self.W, self.b]
def grad_parameters(self):
return [self.grad_W, self.grad_b]
def softmax(xs):
xs = np.subtract(xs, xs.max(axis=1, keepdims=True))
xs = np.exp(xs) / np.sum(np.exp(xs), axis=1, keepdims=True)
return xs
class CrossEntropy(Criterion):
def __init__(self):
super().__init__()
def forward(self, input, target):
eps = 1e-9
predictions = np.clip(input, eps, 1. - eps)
N = predictions.shape[0]
ce = -np.sum(target * np.log(predictions))
return ce / N
def backward(self, input, target):
eps = 1e-9
input_clamp = np.clip(input, eps, 1 - eps)
return softmax(input_clamp) - target
class Sequential(Module):
def __init__(self, *layers):
super().__init__()
self.layers = layers
def forward(self, input):
for layer in self.layers:
input = layer.forward(input)
self.output = input
return self.output
def backward(self, input, grad_output):
for i in range(len(self.layers) - 1, 0, -1):
grad_output = self.layers[i].backward(self.layers[i-1].output, grad_output)
grad_output = self.layers[0].backward(input, grad_output)
return grad_output
def parameters(self):
res = []
for l in self.layers:
res += l.parameters()
return res
def grad_parameters(self):
res = []
for l in self.layers:
res += l.grad_parameters()
return res
def train(self):
for layer in self.layers:
layer.train()
def eval(self):
for layer in self.layers:
layer.eval()
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class Sigmoid(Module):
def __init__(self):
super().__init__()
def forward(self, input):
self.output = sigmoid(input)
return self.output
def backward(self, input, grad_output):
grad_input = sigmoid(input) * (1 - sigmoid(input)) * grad_output
return grad_input
class SoftMax(Module):
def __init__(self):
super().__init__()
def forward(self, input):
self.output = np.subtract(input, input.max(axis=1, keepdims=True))
self.output = np.exp(self.output) / np.sum(np.exp(self.output), axis=1, keepdims=True)
return self.output
def backward(self, input, grad_output):
return grad_output
def DataLoader(X, Y, batch_size=32):
n = X.shape[0]
indices = np.arange(n)
np.random.shuffle(indices)
for start in range(0, n, batch_size):
end = min(start + batch_size, n)
batch_idx = indices[start:end]
yield X[batch_idx], Y[batch_idx]
def accuracy_score(y_true, y_pred):
a = np.argmax(y_true, axis=1)
b = np.argmax(y_pred, axis=1)
return np.count_nonzero(a == b) / y_true.shape[0]
class Adam:
def __init__(self, model):
self.prev_m = None
self.prev_v = None
self.model = model
self.t = 1
def step(self, lr, beta1, beta2):
prev_m_tmp = []
prev_v_tmp = []
eps = 1e-7
for i, (weights, gradient) in enumerate(zip(self.model.parameters(), self.model.grad_parameters())):
if self.prev_m and self.prev_v:
m = beta1 * self.prev_m[i] + (1 - beta1) * gradient
v = beta2 * self.prev_v[i] + (1 - beta2) * gradient ** 2
m_hat = m / (1 - beta1 ** self.t)
v_hat = v / (1 - beta2 ** self.t)
else:
m = beta1 * 0 + (1 - beta1) * gradient
v = beta2 * 0 + (1 - beta2) * gradient ** 2
m_hat = m / (1 - beta1 ** self.t)
v_hat = v / (1 - beta2 ** self.t)
weights -= lr * m_hat / (np.sqrt(v_hat) + eps)
prev_m_tmp.append(m)
prev_v_tmp.append(v)
self.prev_m = prev_m_tmp
self.prev_v = prev_v_tmp
self.t += 1
model = Sequential(
Linear(784, 512),
Sigmoid(),
Linear(512, 256),
Sigmoid(),
Linear(256, 128),
Sigmoid(),
Linear(128, 64),
Sigmoid(),
Linear(64, 5),
SoftMax(),
)
epochs = 20
eval_every = 1
batch_size = 1024
criterion = CrossEntropy()
optimizer = Adam(model)
for epoch in range(epochs):
for x, y in DataLoader(X_train, y_train, batch_size=batch_size):
model.train()
y_pred = model.forward(x)
grad = criterion.backward(y_pred, y)
model.backward(x, grad)
optimizer.step(lr=0.003, beta1=0.9, beta2=0.999)
if (epoch + 1) % eval_every == 0:
model.eval()
y_train_pred = model.forward(X_train)
y_test_pred = model.forward(X_test)
loss_train = criterion.forward(y_train_pred, y_train)
loss_test = criterion.forward(y_test_pred, y_test)
print(f'Epoch: {epoch + 1}/{epochs}')
print(f'Train Loss: {loss_train} Train Accuracy: {accuracy_score(y_train, y_train_pred)}')
print(f'Test Loss: {loss_test} Test Accuracy: {accuracy_score(y_test, y_test_pred)} n')
# Returns the following in epoch 20/20:
# Epoch: 20/20
# Train Loss: 0.151567557756849 Train Accuracy: 0.9905
# Test Loss: 0.706321046620394 Test Accuracy: 0.9563333333333334
test_x=[]
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)
x_val = np.array(test_x)
x_val = np.reshape(x_val, (5000, 784))
x_val = (x_val/255).astype('float32')
df_test = pd.DataFrame(x_val,columns=range(784)).add_prefix('pixels_')
output = model.forward(df_test)
output_arg = np.argmax(output, axis=1)
ImageId = df_test.index +1
submission = pd.DataFrame({'ImageId': ImageId, 'Label': output})
submission['ImageId'] = submission['ImageId'].apply('{:0>4}'.format)
submission.to_csv('export.txt', sep=' ', index=False, header=False)
找到了我的问题的答案。输出没有任何意义,因为python以随机顺序导入测试文件。我所要做的就是在让模型运行之前对FileName
进行排序。
我更改了这个
test_x=[]
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)
到此:
test_x=[]
FileName = sorted( filter( lambda x: os.path.isfile(os.path.join(data_path_testing, x)),
os.listdir(data_path_testing) ) )
for j in range(len(FileName)):
path = os.path.join(data_path_testing,FileName[j])
img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
test_x.append(img)