TensorFlow: incorrect matrix multiplication results (NaN) on GPU



tf.matmul (the u_hat = tf.matmul(W_tiled, u_tiled) call in the code below) returns different results on the CPU and the GPU. After the second batch, the mean values of the TensorFlow and numpy matmul results are 3.6066954e+17 and 2.7731653e-06, respectively. Eventually, when the code runs on the GPU, the matrix products start to contain NaN values.
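In essence, the GPU result of the batched product diverges from a plain numpy reference computed on the same operands. Below is a minimal, self-contained sketch of that comparison; it uses random stand-in tensors with the same shapes as in the custom layer further down, not the actual values from training:

import numpy as np
import tensorflow as tf

# Same shapes as W_tiled and u_tiled in the custom layer below, filled with random stand-in values
W = tf.random.normal([128, 1152, 10, 16, 8], stddev=0.01)
u = tf.random.normal([128, 1152, 10, 8, 1])

tf_result = tf.matmul(W, u)                  # placed on the GPU when one is visible
np_result = np.matmul(W.numpy(), u.numpy())  # CPU/numpy reference

print('TensorFlow/numpy mean value =', tf_result.numpy().mean(), '/', np_result.mean())
print('NaNs in TensorFlow result:', np.isnan(tf_result.numpy()).any())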

Reproduced on (see the version/device check sketch after this list):

  • Win 10 | GTX 1080 | CUDA Toolkit 10.0 | Python 3.7 | GPU driver 431.86 | TensorFlow 2.0

  • Win 10 | GTX 1080 | CUDA Toolkit 10.0 | Python 3.7 | GPU driver 431.86 | TensorFlow 1.15

  • Win 10 | GTX 1080 | CUDA Toolkit 10.0 | Python 3.7 | GPU driver 441.08 | TensorFlow 2.0

  • Win 10 | GTX 1080 | CUDA Toolkit 10.0 | Python 3.7 | GPU driver 441.08 | TensorFlow 1.15
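For reference, the TensorFlow-side details of each environment above can be dumped with a short check like this (a sketch; the driver and CUDA toolkit versions themselves come from nvidia-smi and the installer, not from TensorFlow):

import tensorflow as tf

print('TensorFlow version:', tf.__version__)
print('Visible GPUs:', tf.config.experimental.list_physical_devices('GPU'))

# Optionally log on which device every op is actually placed (TF 2.x)
tf.debugging.set_log_device_placement(True)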

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer
from tensorflow.keras import utils
from tensorflow.keras.datasets import mnist
import numpy as np
from tensorflow.keras.layers import *
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf

def squash(s, axis=-1, epsilon=1e-7, name=None):
    # squash non-linearity: rescales each vector to length < 1 while keeping its direction
    squared_norm = tf.reduce_sum(tf.square(s), axis=axis, keepdims=True)
    safe_norm = tf.sqrt(squared_norm + epsilon)
    squash_factor = squared_norm / (1. + squared_norm)
    unit_vector = s / safe_norm
    return squash_factor * unit_vector

def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)

def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False, name=None):
    squared_norm = tf.reduce_sum(tf.square(s), axis=axis, keepdims=keep_dims)
    return tf.sqrt(squared_norm + epsilon)

class Custom_layer(Layer):
    def __init__(self, **kwargs):
        super(Custom_layer, self).__init__(**kwargs)

    def build(self, input_shape):
        # one (16, 8) transformation matrix per (input capsule, output class) pair
        self.kernel = self.add_weight(
            shape=(1, 1152, 10, 16, 8),
            initializer=tf.keras.initializers.RandomNormal(0.0, stddev=0.01),
            trainable=True)

    def call(self, inputs):
        # conv output (6, 6, 256) flattened into 1152 capsules of dimension 8
        reshaped = tf.reshape(inputs, [-1, 1152, 8])
        inputs = squash(reshaped)
        u_expanded_0 = tf.expand_dims(inputs, -1)
        u_expanded_1 = tf.expand_dims(u_expanded_0, 2)
        u_tiled = tf.tile(u_expanded_1, [1, 1, 10, 1, 1])
        W_tiled = tf.tile(self.kernel, [batch_size, 1, 1, 1, 1])
        # the batched product that differs between CPU and GPU
        u_hat = tf.matmul(W_tiled, u_tiled)
        try:
            # numpy reference; only works while running eagerly
            numpy_result = np.matmul(W_tiled.numpy(), u_tiled.numpy())
            tf.print('\n TensorFlow/numpy max element value=' + str(tf.reduce_max(u_hat).numpy()) + '/' + str(
                numpy_result.max()))
            tf.print('\n TensorFlow/numpy mean value=' + str(tf.reduce_mean(u_hat).numpy()) + '/' + str(
                numpy_result.mean()))
        except:
            pass
        soft = softmax((safe_norm(tf.reduce_sum(u_hat, axis=[1, 3]))))
        # tf.print('\n\nW_tile max=' + str(tf.reduce_max(W_tiled)))
        # tf.print('W_tile min=' + str(tf.reduce_min(W_tiled)))
        # tf.print('u_tiled max=' + str(tf.reduce_max(u_tiled)))
        # tf.print('u_tiled min=' + str(tf.reduce_min(u_tiled)))
        # tf.print('u_hat max=' + str(tf.reduce_max(u_hat)))
        # tf.print('u_hat min=' + str(tf.reduce_min(u_hat)))
        tf.debugging.check_numerics(u_tiled, 'u_tiled')
        tf.debugging.check_numerics(W_tiled, 'W_tiled')
        tf.debugging.check_numerics(u_hat, 'u_hat')
        tf.debugging.check_numerics(soft, 'soft')
        return soft

    def compute_output_shape(self, input_shape):
        return (batch_size, 10, 16)

batch_size = 128
epochs = 100
img_rows, img_cols = 28, 28
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
x_train = x_train.astype('float32')
x_train /= 255
y_train = utils.to_categorical(y_train, 10)
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(256, (9, 9), activation='relu', input_shape=(28, 28, 1)))
model.add(tf.keras.layers.Conv2D(256, (9, 9), strides=(2, 2), activation='relu'))
model.add(Custom_layer())
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.run_eagerly = True
model.summary()
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs
)

I expected the GPU and the CPU to give similar results, but they turned out to be drastically different.
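One way to make that discrepancy visible in a single process is to pin the same multiplication to each device and compare, along the lines of the sketch below (random stand-ins for the tiled tensors; it assumes a GPU is visible, otherwise the GPU branch fails):

import tensorflow as tf

W_tiled = tf.random.normal([128, 1152, 10, 16, 8], stddev=0.01)  # stand-in for the tiled kernel
u_tiled = tf.random.normal([128, 1152, 10, 8, 1])                # stand-in for the tiled capsule inputs

with tf.device('/CPU:0'):
    cpu_out = tf.matmul(W_tiled, u_tiled)
with tf.device('/GPU:0'):
    gpu_out = tf.matmul(W_tiled, u_tiled)

print('max abs CPU/GPU difference:', tf.reduce_max(tf.abs(cpu_out - gpu_out)).numpy())
print('NaNs in GPU result:', bool(tf.reduce_any(tf.math.is_nan(gpu_out)).numpy()))

Alternatively, uncommenting the os.environ["CUDA_VISIBLE_DEVICES"] = "-1" line near the imports forces the whole script onto the CPU, where the results come out as expected.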

Providing the solution here (answer section), even though it was already specified by user12292000 in the question, for the benefit of the community.

The official TF 2.0 builds require CUDA Toolkit 10.0 (10.1 does not work), so he recompiled the TF 2.0 branch against CUDA Toolkit 10.1. His custom TF build now works as expected and is available here.

Moreover, on the official TF 2.0 build (CUDA Toolkit 10.0) he reproduced the tf.matmul bug from the official GitHub repo, whereas on his custom build (CUDA Toolkit 10.1) it no longer occurs.

From the asker's LATEST UPDATE to the question:

The official TF 2.0 builds require CUDA Toolkit 10.0 (10.1 does not work), so I recompiled the TF 2.0 branch against CUDA Toolkit 10.1. My custom TF build now works as expected and is available here.

Moreover, on the official TF 2.0 build (CUDA Toolkit 10.0) I reproduced the tf.matmul bug from the official GitHub repo, whereas on my custom build (CUDA Toolkit 10.1) it no longer occurs.
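When comparing the official wheel with a custom build like this, it can help to confirm which CUDA toolkit a given TensorFlow installation was actually compiled against. A small check sketch follows; note that tf.sysconfig.get_build_info() is only present in newer TF releases (roughly 2.3+), which is why the call is guarded rather than assumed to exist on TF 2.0:

import tensorflow as tf

print('TensorFlow version:', tf.__version__)
print('Built with CUDA support:', tf.test.is_built_with_cuda())

# get_build_info() is not available in TF 2.0 itself, only in newer releases,
# so guard the call instead of assuming it exists
if hasattr(tf.sysconfig, 'get_build_info'):
    print(tf.sysconfig.get_build_info())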