请参阅下面的代码块(完整源代码可以在链接处找到;不需要阅读整个代码块,我会在下面解释并突出显示重要的部分)
def train(data_loader, model, optimizer, scheduler, total_epochs, save_interval, save_folder, sets):
    """Run the segmentation training loop.

    Args:
        data_loader: iterable yielding ``(volumes, label_masks)`` batches.
        model: the segmentation network; put into train mode here.
        optimizer: torch optimizer over ``model``'s parameters.
        scheduler: LR scheduler, stepped once per epoch.
        total_epochs: number of epochs to train.
        save_interval: checkpoint period, counted in global batches.
        save_folder: path prefix for checkpoint files.
        sets: settings object; reads ``no_cuda`` and ``ci_test`` flags.
    """
    batches_per_epoch = len(data_loader)
    log.info('{} epochs in total, {} batches per epoch'.format(total_epochs, batches_per_epoch))

    # ignore_index=-1 excludes unlabeled voxels from the loss.
    loss_seg = nn.CrossEntropyLoss(ignore_index=-1)
    print("Current setting is:")
    print(sets)
    print("nn")
    if not sets.no_cuda:
        loss_seg = loss_seg.cuda()

    # Enable training-mode behavior (batch norm running stats, dropout, ...).
    model.train()
    # Record training start time for average-batch-time reporting.
    train_time_sp = time.time()

    for epoch in range(total_epochs):
        log.info('Start epoch {}'.format(epoch))
        log.info('lr = {}'.format(scheduler.get_lr()))
        for batch_id, batch_data in enumerate(data_loader):
            # Global batch index across all epochs.
            # BUG FIX: the original omitted `+ batch_id`, so every batch in
            # an epoch shared the same index, which corrupted avg_batch_time.
            batch_id_sp = epoch * batches_per_epoch + batch_id

            volumes, label_masks = batch_data
            if not sets.no_cuda:
                volumes = volumes.cuda()

            optimizer.zero_grad()
            out_masks = model(volumes)

            # Resize the label masks (nearest-neighbor, order=0) to match the
            # spatial shape of the network output before computing the loss.
            [n, _, d, h, w] = out_masks.shape
            new_label_masks = np.zeros([n, d, h, w])
            for label_id in range(n):
                label_mask = label_masks[label_id]
                [ori_c, ori_d, ori_h, ori_w] = label_mask.shape
                label_mask = np.reshape(label_mask, [ori_d, ori_h, ori_w])
                scale = [d * 1.0 / ori_d, h * 1.0 / ori_h, w * 1.0 / ori_w]
                # ndimage.zoom replaces the deprecated ndimage.interpolation.zoom.
                label_mask = ndimage.zoom(label_mask, scale, order=0)
                new_label_masks[label_id] = label_mask

            # CrossEntropyLoss needs integer class targets.
            new_label_masks = torch.tensor(new_label_masks).to(torch.int64)
            if not sets.no_cuda:
                new_label_masks = new_label_masks.cuda()

            # Calculate loss and take an optimization step.
            loss_value_seg = loss_seg(out_masks, new_label_masks)
            loss = loss_value_seg
            loss.backward()
            optimizer.step()

            avg_batch_time = (time.time() - train_time_sp) / (1 + batch_id_sp)
            log.info(
                'Batch: {}-{} ({}), loss = {:.3f}, loss_seg = {:.3f}, avg_batch_time = {:.3f}'
                .format(epoch, batch_id, batch_id_sp, loss.item(), loss_value_seg.item(), avg_batch_time))

            if not sets.ci_test:
                # Save a checkpoint at the start of an epoch whose global
                # batch count is a multiple of save_interval (same schedule
                # as the original code).
                if batch_id == 0 and batch_id_sp != 0 and batch_id_sp % save_interval == 0:
                    model_save_path = '{}_epoch_{}_batch_{}.pth.tar'.format(save_folder, epoch, batch_id)
                    model_save_dir = os.path.dirname(model_save_path)
                    # exist_ok=True avoids the exists()/makedirs() race.
                    os.makedirs(model_save_dir, exist_ok=True)
                    log.info('Save checkpoints: epoch = {}, batch_id = {}'.format(epoch, batch_id))
                    torch.save({
                        # NOTE: 'ecpoch' typo kept deliberately so existing
                        # checkpoint-loading code that reads this key still works.
                        'ecpoch': epoch,
                        'batch_id': batch_id,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict()},
                        model_save_path)

        # BUG FIX: step the scheduler AFTER the epoch's optimizer steps.
        # PyTorch >= 1.1 requires optimizer.step() before scheduler.step();
        # the original called it first, skipping the initial learning rate.
        scheduler.step()

    print('Finished training')
    if sets.ci_test:
        exit()
这是一个定制的训练函数,作者通过代码 loss_seg = loss_seg.cuda() 实现 GPU 运行,其中 loss_seg 是 nn.CrossEntropyLoss 的实例(即损失函数,而不是优化器)。这部分让我感到困惑,因为根据 PyTorch 的官方文档,我只需要把模型和输入数据移动到 GPU 就足够了。我想知道为什么这段代码的作者还要把损失函数也移动到 GPU,以及有关在 GPU 上运行 PyTorch 的更多细节。
通常,为了充分利用 GPU 的性能,应在前向计算之前把每个有状态的 Module 发送到 cuda 设备。所谓有状态的 Module,是指带有内部状态(例如 Parameter,即权重)的模块。
损失函数通常不属于这种情况——一般来说,它只是由已经支持 cuda 设备的运算组成的无状态函数。
总之,如果损失函数是有状态的,把它发送到 cuda 设备才有意义;否则没有必要。
也可以在 PyTorch 论坛中查看这个问题。