The output feature map of a convolutional layer has shape (Batch, Height, Width, Channels). When we initialize a CNN in TensorFlow, we get a None value in place of Batch. I am trying to implement a Spatial Transformer Network as a custom layer, so I need the layer to be vectorized over the convolutional layer's batch size. When I try to initialize the network, the spatial transformer layer raises an error saying that operations cannot be performed with None values.
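For example (a minimal illustration of where the None comes from, separate from my model):

import tensorflow as tf

x = tf.keras.Input((28, 28, 1))
u = tf.keras.layers.Conv2D(16, (3, 3), padding="same")(x)
print(u.shape)  # (None, 28, 28, 16): the batch size is None while the graph is being built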
My code is as follows:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (Layer, Conv2D, Dense, Flatten,
                                     GlobalAveragePooling2D, Input, Reshape)

class SpatialTransformer(Layer):
    def __init__(self):
        super(SpatialTransformer, self).__init__()

    def affine_transform(self, input_shape, theta):
        N = theta.shape[0]
        H, W = input_shape  # output dimensions of grid
        x_t, y_t = tf.meshgrid(tf.linspace(-1, 1, W), tf.linspace(-1, 1, H))
        x_t = tf.cast(tf.reshape(x_t, [-1]), dtype=tf.float32)
        y_t = tf.cast(tf.reshape(y_t, [-1]), dtype=tf.float32)
        ones = tf.ones(x_t.shape, dtype=tf.float32)
        sampling_grids = tf.stack([x_t, y_t, ones])
        sampling_grids = tf.expand_dims(sampling_grids, axis=0)
        sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1]))
        batch_grids = tf.matmul(theta, sampling_grids)
        batch_grids = tf.reshape(batch_grids, [N, 2, H, W])
        return batch_grids

    def get_pixel_value(self, feature_map, x_s, y_s):
        """Util function to get pixel values from a 4-D feature map given position tensors x_s and y_s."""
        N, H, W = x_s.shape
        batch_idx = tf.range(0, N)
        batch_idx = tf.reshape(batch_idx, (N, 1, 1))
        b = tf.tile(batch_idx, (1, H, W))
        indices = tf.stack([b, y_s, x_s], 3)  # creating indices of shape (N, H, W, 3)
        return tf.gather_nd(feature_map, indices)  # extracting values at those indices

    def bilinear_sampler(self, feature_map, x, y):
        N, H, W, C = feature_map.shape
        max_y = tf.cast(H - 1, dtype=tf.int32)
        max_x = tf.cast(W - 1, dtype=tf.int32)
        zero = tf.zeros([], dtype=tf.int32)
        x = tf.cast(x, dtype=tf.float32)
        y = tf.cast(y, dtype=tf.float32)
        # Rescaling the batch grid from [-1, 1] to [0, W-1] and [0, H-1]
        x = (x + 1.0) * tf.cast(max_x, dtype=tf.float32) / 2.0
        y = (y + 1.0) * tf.cast(max_y, dtype=tf.float32) / 2.0
        # Taking the 4 nearest points to (x_i, y_i) to perform interpolation
        x0 = tf.cast(tf.floor(x), dtype=tf.int32)
        x1 = x0 + 1
        y0 = tf.cast(tf.floor(y), dtype=tf.int32)
        y1 = y0 + 1
        # Clipping the values to lie in [0, W-1] and [0, H-1]
        x0 = tf.clip_by_value(x0, zero, max_x)
        x1 = tf.clip_by_value(x1, zero, max_x)
        y0 = tf.clip_by_value(y0, zero, max_y)
        y1 = tf.clip_by_value(y1, zero, max_y)
        # Getting pixel values at the corner coordinates (x0,y0), (x0,y1), (x1,y0), (x1,y1)
        Ia = self.get_pixel_value(feature_map, x0, y0)
        Ib = self.get_pixel_value(feature_map, x0, y1)
        Ic = self.get_pixel_value(feature_map, x1, y0)
        Id = self.get_pixel_value(feature_map, x1, y1)
        # Changing the data type to float32
        x0 = tf.cast(x0, dtype=tf.float32)
        x1 = tf.cast(x1, dtype=tf.float32)
        y0 = tf.cast(y0, dtype=tf.float32)
        y1 = tf.cast(y1, dtype=tf.float32)
        # Calculating delta (area) weights for interpolation
        Wa = tf.expand_dims((x1 - x) * (y1 - y), axis=3)
        Wb = tf.expand_dims((x1 - x) * (y - y0), axis=3)
        Wc = tf.expand_dims((x - x0) * (y1 - y), axis=3)
        Wd = tf.expand_dims((x - x0) * (y - y0), axis=3)
        out = tf.add_n([Wa * Ia, Wb * Ib, Wc * Ic, Wd * Id])
        return out

    def call(self, feature_map, theta, out_size=None):
        N, H, W, _ = feature_map.shape
        if out_size:
            out_H = out_size[0]
            out_W = out_size[1]
            batch_grids = self.affine_transform([out_H, out_W], theta)
        else:
            batch_grids = self.affine_transform([H, W], theta)
        x_s = batch_grids[:, 0, :, :]
        y_s = batch_grids[:, 1, :, :]
        output_feature_map = self.bilinear_sampler(feature_map, x_s, y_s)
        return output_feature_map


class Localisation_Network(Layer):
    def __init__(self):
        super(Localisation_Network, self).__init__()
        self.conv = Conv2D(4, (3, 3), padding="valid", strides=2, activation="relu", kernel_initializer="he_normal")
        self.flatten = Flatten()
        self.dense_1 = Dense(64, activation="relu", kernel_initializer="he_normal")
        self.dense_2 = Dense(6, activation="linear")
        self.reshape = Reshape((2, 3))

    def call(self, input_tensor):
        x = self.conv(input_tensor)
        x = self.flatten(x)
        x = self.dense_1(x)
        x = self.dense_2(x)
        x = self.reshape(x)
        return x


def get_model():
    x_input = Input((28, 28, 1))
    u = Conv2D(16, (3, 3), padding="same", activation="relu", kernel_initializer="he_normal")(x_input)
    u = Conv2D(16, (3, 3), padding="same", strides=2, activation="relu", kernel_initializer="he_normal")(u)
    theta = Localisation_Network()(u)
    v = SpatialTransformer()(u, theta)
    v = Conv2D(32, (3, 3), padding="same", activation="relu", kernel_initializer="he_normal")(v)
    x = Conv2D(32, (3, 3), padding="same", strides=2, activation="relu", kernel_initializer="he_normal")(v)
    x = GlobalAveragePooling2D()(x)
    x = Flatten()(x)
    x = Dense(10, activation="softmax")(x)
    model = Model(inputs=x_input, outputs=x)
    return model
Error from the code above:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-47-d630585afd1d> in <module>()
4 u = Conv2D(16, (3, 3), padding = "same", strides = 2, activation="relu", kernel_initializer="he_normal")(u)
5 theta = Localisation_Network()(u)
----> 6 v = SpatialTransformer()(u, theta)
7 v = Conv2D(32, (3, 3), padding = "same", activation= "relu", kernel_initializer="he_normal")(v)
8 x = Conv2D(32, (3, 3), padding = "same", strides = 2, activation= "relu", kernel_initializer="he_normal")(v)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
668 except Exception as e: # pylint:disable=broad-except
669 if hasattr(e, 'ag_error_metadata'):
--> 670 raise e.ag_error_metadata.to_exception(e)
671 else:
672 raise
ValueError: in user code:
<ipython-input-7-910b0adb6eb7>:83 call *
batch_grids = self.affine_transform([H, W], theta)
<ipython-input-45-eb5ac5f8f722>:14 affine_transform *
sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1]))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py:201 wrapper **
return target(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py:1405 stack
value_shape = ops.convert_to_tensor(values[0], name=name)._shape_tuple() # pylint: disable=protected-access
/usr/local/lib/python3.6/dist-packages/tensorflow/python/profiler/trace.py:163 wrapped
return func(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py:1540 convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:339 _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:265 constant
allow_broadcast=True)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/constant_op.py:283 _constant_impl
allow_broadcast=allow_broadcast))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/tensor_util.py:445 make_tensor_proto
raise ValueError("None values not supported.")
ValueError: None values not supported.
Hard to tell from here, but based on the stack trace it seems this line is the problem: sampling_grids = tf.tile(sampling_grids, tf.stack([N, 1, 1])) (a None is being forwarded somewhere it isn't expected).
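If so, the usual workaround (a sketch from my side, not tested against your model) is to take the batch size from tf.shape, which returns it as a runtime tensor instead of the static None:

import tensorflow as tf

def tile_over_batch(grid, batch_like):
    """Tile an un-batched (3, H*W) grid to (N, 3, H*W) using the runtime batch size."""
    n = tf.shape(batch_like)[0]                # dynamic batch size, never None
    grid = tf.expand_dims(grid, axis=0)        # (1, 3, H*W)
    return tf.tile(grid, tf.stack([n, 1, 1]))  # (N, 3, H*W)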
The second thing I noticed: I'm not sure your call override in SpatialTransformer should really take three arguments, def call(self, feature_map, theta, out_size=None)? Since it inherits from Layer, it seems it should take only the input_tensor argument.
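Roughly like this (a sketch of the signature only; the body is a placeholder):

import tensorflow as tf

class SpatialTransformer(tf.keras.layers.Layer):
    def call(self, inputs):          # a single positional argument, as Keras expects
        feature_map, theta = inputs  # invoked as SpatialTransformer()([u, theta])
        # ... do the actual transformation here ...
        return feature_map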
I'm also not sure whether you need to override build for your use case, or perform the required initialization there. Beyond that, you could try logging extensively (adding print statements) to see exactly where the None value 'comes in'. Finally, you could upload a code excerpt sufficient to reproduce the error, which would likely attract more help.
I have removed the tf.tile call, since the batched output of the localisation network, of shape (None, 2, 3), already performs the batching trick (broadcasting) during the tf.matmul operation. I have also replaced the tf.reshape calls with the predefined Keras reshape layer tf.keras.layers.Reshape(), since it preserves the batch dimension.
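A quick check of that broadcasting behaviour (illustrative shapes only):

import tensorflow as tf

theta = tf.random.normal([5, 2, 3])    # stands in for the (None, 2, 3) localisation output
grid = tf.random.normal([3, 28 * 28])  # single shared sampling grid, no batch dimension
print(tf.matmul(theta, grid).shape)    # (5, 2, 784): the grid broadcasts across the batch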
class SpatialTransformer(Layer):
    def __init__(self, out_size, name="spatial_transformer"):
        super(SpatialTransformer, self).__init__(name=name)
        self.out_size = out_size
        # Predefined Keras Reshape layers replace tf.reshape so the batch dimension stays symbolic
        self.reshape_1 = Reshape([2, self.out_size[0], self.out_size[1]])
        self.reshape_2 = Reshape([self.out_size[0], self.out_size[1]])
        self.reshape_3 = Reshape([1, 1])
        self.reshape_4 = Reshape([])

    def affine_transform(self, input_shape, theta):
        H, W = input_shape  # output dimensions of grid
        x_t, y_t = tf.meshgrid(tf.linspace(-1, 1, W), tf.linspace(-1, 1, H))
        x_t = tf.cast(tf.reshape(x_t, [-1]), dtype=tf.float32)
        y_t = tf.cast(tf.reshape(y_t, [-1]), dtype=tf.float32)
        ones = tf.ones(x_t.shape, dtype=tf.float32)
        sampling_grids = tf.stack([x_t, y_t, ones])
        # tf.tile removed: tf.matmul broadcasts theta of shape (None, 2, 3) over the shared grid
        batch_grids = tf.matmul(theta, sampling_grids)
        batch_grids = self.reshape_1(batch_grids)
        return batch_grids

    def get_pixel_value(self, feature_map, x_s, y_s):
        """Util function to get pixel values from a 4-D feature map given position tensors x_s and y_s."""
        _, H, W = x_s.shape
        batch_idx = tf.range(tf.shape(x_s)[0])  # runtime batch size, avoids the static None
        batch_idx = self.reshape_3(batch_idx)
        b = tf.tile(batch_idx, (1, H, W))
        indices = tf.stack([b, y_s, x_s], 3)  # creating indices of shape (N, H, W, 3)
        return tf.gather_nd(feature_map, indices)  # extracting values at those indices

    def bilinear_sampler(self, feature_map, x, y):
        _, H, W, _ = feature_map.shape
        max_y = tf.cast(H - 1, dtype=tf.int32)
        max_x = tf.cast(W - 1, dtype=tf.int32)
        zero = tf.zeros([], dtype=tf.int32)
        x = tf.cast(x, dtype=tf.float32)
        y = tf.cast(y, dtype=tf.float32)
        # Rescaling the batch grid from [-1, 1] to [0, W-1] and [0, H-1]
        x = (x + 1.0) * tf.cast(max_x, dtype=tf.float32) / 2.0
        y = (y + 1.0) * tf.cast(max_y, dtype=tf.float32) / 2.0
        # Taking the 4 nearest points to (x_i, y_i) to perform interpolation
        x0 = tf.cast(tf.floor(x), dtype=tf.int32)
        x1 = x0 + 1
        y0 = tf.cast(tf.floor(y), dtype=tf.int32)
        y1 = y0 + 1
        # Clipping the values to lie in [0, W-1] and [0, H-1]
        x0 = tf.clip_by_value(x0, zero, max_x)
        x1 = tf.clip_by_value(x1, zero, max_x)
        y0 = tf.clip_by_value(y0, zero, max_y)
        y1 = tf.clip_by_value(y1, zero, max_y)
        # Getting pixel values at the corner coordinates (x0,y0), (x0,y1), (x1,y0), (x1,y1)
        Ia = self.get_pixel_value(feature_map, x0, y0)
        Ib = self.get_pixel_value(feature_map, x0, y1)
        Ic = self.get_pixel_value(feature_map, x1, y0)
        Id = self.get_pixel_value(feature_map, x1, y1)
        # Changing the data type to float32
        x0 = tf.cast(x0, dtype=tf.float32)
        x1 = tf.cast(x1, dtype=tf.float32)
        y0 = tf.cast(y0, dtype=tf.float32)
        y1 = tf.cast(y1, dtype=tf.float32)
        # Calculating delta (area) weights for interpolation
        Wa = tf.expand_dims((x1 - x) * (y1 - y), axis=3)
        Wb = tf.expand_dims((x1 - x) * (y - y0), axis=3)
        Wc = tf.expand_dims((x - x0) * (y1 - y), axis=3)
        Wd = tf.expand_dims((x - x0) * (y - y0), axis=3)
        out = tf.add_n([Wa * Ia, Wb * Ib, Wc * Ic, Wd * Id])
        return out

    def call(self, input_tensor):
        feature_map, theta = input_tensor
        _, H, W, _ = feature_map.shape
        if self.out_size:
            out_H = self.out_size[0]
            out_W = self.out_size[1]
            batch_grids = self.affine_transform([out_H, out_W], theta)
        else:
            batch_grids = self.affine_transform([H, W], theta)
        x_s = self.reshape_2(batch_grids[:, 0, :, :])
        y_s = self.reshape_2(batch_grids[:, 1, :, :])
        output_feature_map = self.bilinear_sampler(feature_map, x_s, y_s)
        return output_feature_map


class Localisation_Network(Layer):
    def __init__(self):
        super(Localisation_Network, self).__init__()
        self.conv_1 = Conv2D(16, (3, 3), padding="same", strides=1, activation="relu", kernel_initializer="he_normal")
        self.conv_2 = Conv2D(32, (3, 3), padding="same", strides=1, activation="relu", kernel_initializer="he_normal")
        self.flatten = Flatten()
        self.dense_1 = Dense(32, activation="relu", kernel_initializer="he_normal")

        def bias_init(shape, dtype=None):
            # Identity affine transform [[1, 0, 0], [0, 1, 0]], flattened to length 6
            identity = tf.constant([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=dtype)
            return tf.reshape(identity, [-1])

        self.dense_2 = Dense(6, kernel_initializer="zeros", bias_initializer=bias_init)
        self.reshape = Reshape((2, 3))

    def call(self, input_tensor):
        x = self.conv_1(input_tensor)
        x = self.conv_2(x)
        x = tf.reduce_mean(x, axis=[1, 2])  # global average pooling over H and W
        x = self.dense_1(x)
        x = self.dense_2(x)
        x = self.reshape(x)
        return x
def transformer_model_2():
    x_input = Input((28, 28, 1))
    theta = Localisation_Network()(x_input)
    x = SpatialTransformer(x_input.shape[1:3], name="transformer_output")([x_input, theta])
    x = Conv2D(16, (3, 3), padding="same", activation="relu", kernel_initializer="he_normal")(x)
    x = Conv2D(16, (3, 3), padding="same", strides=2, activation="relu", kernel_initializer="he_normal")(x)
    x = Conv2D(32, (3, 3), padding="same", activation="relu", kernel_initializer="he_normal")(x)
    x = Conv2D(32, (3, 3), padding="same", strides=2, activation="relu", kernel_initializer="he_normal")(x)
    x = GlobalAveragePooling2D()(x)
    x = Flatten()(x)
    x = Dense(10, activation="softmax")(x)
    return Model(inputs=x_input, outputs=x)
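A quick smoke test (my addition) that the model now builds with the None batch dimension:

model = transformer_model_2()
model.summary()  # constructs the graph without the "None values not supported" error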
The only place I'm still stuck is the localisation network. Since it is a regression network, a linear activation is used, but its outputs grow large and get clipped during bilinear sampling, eventually producing all-zero outputs, so gradients cannot flow back through the localisation network. I have searched Medium posts and GitHub for a solution; many suggest initializing the last layer of the localisation network with weights set to zeros and the bias set to the identity transform [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], but it doesn't work.
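For reference, that commonly suggested initialization can also be written with the built-in tf.constant_initializer (equivalent in intent to the custom bias_init above; as noted, it has not solved the gradient problem for me):

import tensorflow as tf

# Zero kernel plus identity-affine bias: theta starts at [[1, 0, 0], [0, 1, 0]]
identity_bias = tf.constant_initializer([1.0, 0.0, 0.0, 0.0, 1.0, 0.0])
theta_head = tf.keras.layers.Dense(6, kernel_initializer="zeros",
                                   bias_initializer=identity_bias)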