TensorFlow MirroredStrategy() 不适用于多 GPU 训练



我正在尝试使用 TensorFlow 的 MirroredStrategy(),以便在 2 张 Nvidia Titan RTX 显卡上运行 3D U-Net。该代码已验证可在 1 个 GPU 上正常运行。我的操作系统是 Red Hat Enterprise Linux 8(RHEL8)。错误出现在 model.fit() 处。

我已经安装了适当的 NCCL Nvidia 驱动程序,并验证我可以使用 tensorflow.org 中的示例将训练数据解析到两个 GPU 上。

代码:

def _conv_pair(tensor, filters):
    """Apply two consecutive 3x3x3 ReLU Conv3D layers with `filters` outputs."""
    for _ in range(2):
        tensor = Conv3D(filters, (3, 3, 3), activation='relu', padding='same')(tensor)
    return tensor


def get_model(optimizer, loss_metric, metrics, lr=1e-3):
    """Build and compile the 3D U-Net style model.

    The network takes an input of shape
    (sample_width, sample_height, sample_depth, 1), runs four
    conv-pair / max-pool / dropout stages, a 512-filter bottleneck, then four
    transposed-conv upsampling stages whose outputs are concatenated (axis 4)
    with the matching encoder feature maps, and ends in a 1-filter 1x1x1
    sigmoid convolution.

    Args:
        optimizer: Optimizer class (or factory); it is called as
            ``optimizer(learning_rate=lr)``.
        loss_metric: Loss passed through to ``model.compile``.
        metrics: Metrics list passed through to ``model.compile``.
        lr: Learning rate handed to the optimizer.

    Returns:
        The compiled ``Model``.
    """
    inputs = Input((sample_width, sample_height, sample_depth, 1))

    # Contracting path: remember each stage's pre-pooling output for the
    # skip connections used on the way back up.
    skips = []
    x = inputs
    for filters, drop_rate in ((32, 0.5), (64, 0.5), (128, 0.3), (256, 0.3)):
        x = _conv_pair(x, filters)
        skips.append(x)
        x = MaxPooling3D(pool_size=(2, 2, 2))(x)
        x = Dropout(drop_rate)(x)

    # Bottleneck.
    x = _conv_pair(x, 512)

    # Expanding path: upsample, merge with the matching encoder output
    # (deepest first), then convolve.
    for filters, skip in zip((256, 128, 64, 32), reversed(skips)):
        upsampled = Conv3DTranspose(
            filters, (2, 2, 2), strides=(2, 2, 2), padding='same')(x)
        x = concatenate([upsampled, skip], axis=4)
        x = _conv_pair(x, filters)

    outputs = Conv3D(1, (1, 1, 1), activation='sigmoid')(x)

    model = Model(inputs=[inputs], outputs=[outputs])
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(optimizer=optimizer(learning_rate=lr), loss=loss_metric, metrics=metrics)
    return model
# Create and compile the model under the distribution strategy's scope; the
# tf.distribute guide requires model construction and compile() to happen
# inside `scope()`.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = get_model(optimizer=Adam, loss_metric=dice_coef_loss, metrics=[dice_coef], lr=1e-3)

observe_var = 'dice_coef'
# NOTE(review): `strategy` is not used anywhere in the visible code; it looks
# like it was meant as the checkpoint `mode` -- confirm before removing.
strategy = 'max'
# NOTE(review): period=1000 is larger than epochs=100 below, so this callback
# presumably never writes a checkpoint during this run -- confirm intent.
model_checkpoint = ModelCheckpoint('unet_seg_cs9300_3d_{epoch:04}.model', monitor=observe_var, save_best_only=False, period=1000)

model.fit(train_x, train_y, batch_size=1, epochs=100, verbose=1, shuffle=True, validation_split=0.2, callbacks=[model_checkpoint])

model.save('unet_seg_final_3d_test.model')

错误:

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-3-15c1c64c47ab> in <module>
423 model_checkpoint = ModelCheckpoint('unet_seg_cs9300_3d_{epoch:04}.model', monitor=observe_var, save_best_only=False, period = 1000)
424 
--> 425 model.fit(train_x, train_y, batch_size= 1, epochs= 100, verbose=1, shuffle=True, validation_split=0.2, callbacks=[model_checkpoint])
426 
427 model.save('unet_seg_final_3d_test.model')
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
1211         else:
1212             fit_inputs = x + y + sample_weights
-> 1213         self._make_train_function()
1214         fit_function = self.train_function
1215 
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
314                     training_updates = self.optimizer.get_updates(
315                         params=self._collected_trainable_weights,
--> 316                         loss=self.total_loss)
317                 updates = self.updates + training_updates
318 
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89                 warnings.warn('Update your `' + object_name + '` call to the ' +
90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
92         wrapper._original_function = func
93         return wrapper
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
73         if _SYMBOLIC_SCOPE.value:
74             with get_graph().as_default():
---> 75                 return func(*args, **kwargs)
76         else:
77             return func(*args, **kwargs)
~/anaconda3/envs/gputest/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
548 
549             # Apply constraints.
--> 550             if getattr(p, 'constraint', None) is not None:
551                 new_p = p.constraint(new_p)
552 
~/anaconda3/envs/gputest/lib/python3.7/site-packages/tensorflow_core/python/ops/variables.py in constraint(self)
566       Can be `None` if no constraint was passed.
567     """
--> 568     raise NotImplementedError
569 
570   def assign(self, value, use_locking=False, name=None, read_value=True):
NotImplementedError: 

这个答案是基于对OP问题的评论。

当使用 tf.distribute.MirroredStrategy 进行多 GPU 训练时,应该使用 tf.keras API,而不是以 tensorflow 为后端的独立 keras 包。

一般来说,最好不要混用 tf.keras 和 keras。

也可以尝试使用其他 cross_device_ops,而不依赖 NCCL:

# The two assignments below are alternatives; if both run, the second one
# replaces the first.

# Alternative 1: HierarchicalCopyAllReduce as the cross-device op.
hierarchical_copy_ops = tf.distribute.HierarchicalCopyAllReduce()
strategy = tf.distribute.MirroredStrategy(cross_device_ops=hierarchical_copy_ops)

# Alternative 2: ReductionToOneDevice as the cross-device op.
one_device_ops = tf.distribute.ReductionToOneDevice()
strategy = tf.distribute.MirroredStrategy(cross_device_ops=one_device_ops)

最新更新