How do I fix the "pop from empty list" error when using Keras Tuner's search method with a TPU in Google Colab?



I was previously able to run Keras Tuner's search method on my model using Google Colab's GPU runtime. When I switch to the TPU runtime, however, I get the error below. I have not figured out how to give the TPU runtime access to Google Cloud Storage so that Keras Tuner can save its checkpoint folder (where it writes model checkpoints) there, and I suspect this is what causes the error. Please help me resolve this.

My code:

# Imports (these, together with squeeze_excite_block, num_classes, the gpu/tpu flags,
# strategy, X_train and train_label, come from earlier cells of the notebook)
import tensorflow
from tensorflow import keras
from tensorflow.keras.layers import (Input, Masking, LSTM, Dropout, Permute, Conv1D,
                                     BatchNormalization, Activation,
                                     GlobalAveragePooling1D, Dense, concatenate)
from tensorflow.keras.models import Model
import keras_tuner as kt


def post_se(hp):
    ip = Input(shape=(6, 128))

    # LSTM branch
    x = Masking()(ip)
    x = LSTM(units=hp.Choice('lstm_1', values=[8, 16, 32, 64, 128, 256, 512]), return_sequences=True)(x)
    x = Dropout(hp.Choice(name='Dropout', values=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]))(x)
    x = LSTM(units=hp.Choice('lstm_2', values=[8, 16, 32, 64, 128, 256, 512]))(x)
    x = Dropout(hp.Choice(name='Dropout_2', values=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]))(x)

    # Convolutional branch with squeeze-and-excite blocks
    y = Permute((2, 1))(ip)
    y = Conv1D(hp.Choice('conv_1_filter', values=[32, 64, 128, 256, 512]),
               hp.Choice(name='conv_1_filter_size', values=[3, 5, 7, 8, 9]),
               padding='same', kernel_initializer='he_uniform')(y)
    y = BatchNormalization()(y)
    y = Activation('relu')(y)
    y = squeeze_excite_block(y)

    y = Conv1D(hp.Choice('conv_2_filter', values=[32, 64, 128, 256, 512]),
               hp.Choice(name='conv_2_filter_size', values=[3, 5, 7, 8, 9]),
               padding='same', kernel_initializer='he_uniform')(y)
    y = BatchNormalization()(y)
    y = Activation('relu')(y)
    y = squeeze_excite_block(y)

    y = Conv1D(hp.Choice('conv_3_filter', values=[32, 64, 128, 256, 512]),
               hp.Choice(name='conv_3_filter_size', values=[3, 5, 7, 8, 9]),
               padding='same', kernel_initializer='he_uniform')(y)
    y = BatchNormalization()(y)
    y = Activation('relu')(y)
    y = GlobalAveragePooling1D()(y)

    x = concatenate([x, y])
    # batch_size = hp.Choice('batch_size', values=[32, 64, 128, 256, 512, 1024, 2048, 4096])
    out = Dense(num_classes, activation='softmax')(x)

    model = Model(ip, out)
    if gpu:
        opt = keras.optimizers.Adam(learning_rate=0.001)
    if tpu:
        opt = keras.optimizers.Adam(learning_rate=8 * 0.001)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    # model.summary()
    return model


if gpu:
    tuner = kt.tuners.BayesianOptimization(post_se,
                                           objective='val_accuracy',
                                           max_trials=30,
                                           seed=42,
                                           project_name='Model_gpu')
    # Will stop training if the "val_loss" hasn't improved in 30 epochs.
    tuner.search(X_train, train_label, epochs=200, validation_split=0.1, shuffle=True,
                 callbacks=[tensorflow.keras.callbacks.EarlyStopping('val_loss', patience=30)])

if tpu:
    print("TPU")
    with strategy.scope():
        tuner = kt.tuners.BayesianOptimization(post_se,
                                               objective='val_accuracy',
                                               max_trials=30,
                                               seed=42,
                                               project_name='Model_tpu')
        # Will stop training if the "val_loss" hasn't improved in 30 epochs.
        tuner.search(X_train, train_label, epochs=200, validation_split=0.1, shuffle=True,
                     callbacks=[tensorflow.keras.callbacks.EarlyStopping('val_loss', patience=30)])
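
(For context, the strategy object used above is created in an earlier cell; a minimal sketch of the usual Colab TPU initialization it is assumed to come from, for TensorFlow 2.x:)

import tensorflow as tf

# Detect the TPU provided by the Colab runtime and build a TPUStrategy from it.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)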

Error log

---------------------------------------------------------------------------
UnimplementedError                        Traceback (most recent call last)
/usr/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
129             try:
--> 130                 self.gen.throw(type, value, traceback)
131             except StopIteration as exc:
10 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in resource_creator_scope(resource_type, resource_creator)
2957                                                    resource_creator):
-> 2958     yield
2959 
<ipython-input-15-24c1e1bb603d> in <module>()
17         # Will stop training if the "val_loss" hasn't improved in 30 epochs.
---> 18         tuner.search(X_train, train_label, epochs=200, validation_split=0.1, shuffle=True, callbacks=[tensorflow.keras.callbacks.EarlyStopping('val_loss', patience=30)])
/usr/local/lib/python3.7/dist-packages/keras_tuner/engine/base_tuner.py in search(self, *fit_args, **fit_kwargs)
178             self.on_trial_begin(trial)
--> 179             results = self.run_trial(trial, *fit_args, **fit_kwargs)
180             # `results` is None indicates user updated oracle in `run_trial()`.
/usr/local/lib/python3.7/dist-packages/keras_tuner/engine/tuner.py in run_trial(self, trial, *args, **kwargs)
303             copied_kwargs["callbacks"] = callbacks
--> 304             obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
305 
/usr/local/lib/python3.7/dist-packages/keras_tuner/engine/tuner.py in _build_and_fit_model(self, trial, *args, **kwargs)
233         model = self._try_build(hp)
--> 234         return self.hypermodel.fit(hp, model, *args, **kwargs)
235 
/usr/local/lib/python3.7/dist-packages/keras_tuner/engine/hypermodel.py in fit(self, hp, model, *args, **kwargs)
136         """
--> 137         return model.fit(*args, **kwargs)
138 
/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
66       filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67       raise e.with_traceback(filtered_tb) from None
68     finally:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
1116     except core._NotOkStatusException as e:  # pylint: disable=protected-access
-> 1117       raise core._status_to_exception(e) from None  # pylint: disable=protected-access
1118 
UnimplementedError: File system scheme '[local]' not implemented (file: './untitled_project/trial_78ed6883514d67dc6222064095c134cb/checkpoints/epoch_0/checkpoint_temp/part-00000-of-00001')
Encountered when executing an operation using EagerExecutor. This error cancels all future operations and poisons their output tensors.
During handling of the above exception, another exception occurred:
IndexError                                Traceback (most recent call last)
<ipython-input-15-24c1e1bb603d> in <module>()
16             seed=42)
17         # Will stop training if the "val_loss" hasn't improved in 30 epochs.
---> 18         tuner.search(X_train, train_label, epochs=200, validation_split=0.1, shuffle=True, callbacks=[tensorflow.keras.callbacks.EarlyStopping('val_loss', patience=30)])
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py in __exit__(self, exception_type, exception_value, traceback)
454                          "tf.distribute.set_strategy() out of `with` scope."),
455             e)
--> 456     _pop_per_thread_mode()
457 
458 
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribution_strategy_context.py in _pop_per_thread_mode()
64 
65 def _pop_per_thread_mode():
---> 66   ops.get_default_graph()._distribution_strategy_stack.pop(-1)  # pylint: disable=protected-access
67 
68 
IndexError: pop from empty list

For some additional context, I have attached my code in this post.

This is your error:

UnimplementedError: File system scheme '[local]' not implemented (file: './untitled_project/trial_78ed6883514d67dc6222064095c134cb/checkpoints/epoch_0/checkpoint_temp/part-00000-of-00001')

See https://stackoverflow.com/a/62881833/14043558 for a solution.
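
In short: the TPU workers cannot read or write the Colab VM's local disk (that is what File system scheme '[local]' not implemented means), so the tuner's trial checkpoints have to go to a Google Cloud Storage bucket instead; the trailing IndexError: pop from empty list is just fallout from that first failure happening inside the strategy scope and should disappear once the file-system error is fixed. A minimal sketch of the fix, assuming you own a GCS bucket (the bucket name below is a placeholder) and the TPU's service account has access to it; post_se, strategy, X_train and train_label are the objects from your code above:

from google.colab import auth
import tensorflow
import keras_tuner as kt

# Authenticate this Colab session so it can reach your GCS bucket.
auth.authenticate_user()

with strategy.scope():
    tuner = kt.tuners.BayesianOptimization(
        post_se,
        objective='val_accuracy',
        max_trials=30,
        seed=42,
        directory='gs://your-bucket-name/keras_tuner',  # placeholder: a GCS path instead of the local default './'
        project_name='Model_tpu')

tuner.search(X_train, train_label, epochs=200, validation_split=0.1, shuffle=True,
             callbacks=[tensorflow.keras.callbacks.EarlyStopping('val_loss', patience=30)])

Keras Tuner also accepts a distribution_strategy=strategy argument on the tuner constructor, which may be a cleaner alternative to wrapping the whole search in strategy.scope(); either way, directory must be a gs:// path that both the Colab VM and the TPU can reach.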

Latest update