Source code for xmodaler.engine.tden_pretrainer

# Copyright 2021 JD.com, Inc., JD AI
"""
@author: Yehao Li
@contact: yehaoli.sysu@gmail.com
"""
import time
import tqdm
import copy
import numpy as np
import itertools
import torch
from .defaults import DefaultTrainer
from xmodaler.scorer import build_scorer
from xmodaler.config import kfg
from xmodaler.losses import build_rl_losses
import xmodaler.utils.comm as comm
from .build import ENGINE_REGISTRY

__all__ = ['TDENPretrainer']

@ENGINE_REGISTRY.register()
class TDENPretrainer(DefaultTrainer):
    def __init__(self, cfg):
        super(TDENPretrainer, self).__init__(cfg)
    def run_step(self):
        """
        Implement the standard training logic described above.
        """
        assert self.model.training, "[TDENPretrainer] model was changed to eval mode!"
        start = time.perf_counter()
        """
        If you want to do something with the data, you can wrap the dataloader.
        """
        try:
            data = next(self._train_data_loader_iter)
        except StopIteration:
            self._train_data_loader_iter = iter(self.train_data_loader)
            data = next(self._train_data_loader_iter)
        data_time = time.perf_counter() - start

        """
        If you want to do something with the losses, you can wrap the model.
        """
        data = comm.unwrap_model(self.model).preprocess_batch(data)
        data_copy = copy.deepcopy(data)
        data[kfg.SS_PROB] = self.ss_prob
        data['tden_pretrain_similarity'] = True

        # First stage: forward pass on the original (masked) tokens.
        outputs_dict = self.model(data)

        losses_dict = {}
        for loss in self.losses:
            loss_dict = loss(outputs_dict)
            losses_dict.update(loss_dict)
        losses = sum(losses_dict.values())

        """
        If you need to accumulate gradients or do something similar, you can
        wrap the optimizer with your custom `zero_grad()` method.
        """
        self.optimizer.zero_grad()
        losses.backward()

        self._write_metrics(losses_dict, data_time)

        """
        If you need gradient clipping/scaling or other processing, you can
        wrap the optimizer with your custom `step()` method. But it is
        suboptimal as explained in https://arxiv.org/abs/2006.15704 Sec 3.2.4
        """
        self.optimizer.step()

        # Build the second-stage inputs: wherever a masked-LM target exists
        # (u_targets >= 0, so u_mask == 0), replace the token with the
        # first-stage argmax prediction; elsewhere keep the original token.
        u_tokens_ids = data[kfg.U_TOKENS_IDS]
        u_tlogits = outputs_dict[kfg.U_LOGITS]
        u_targets = data[kfg.U_TARGET_IDS]
        u_mask = (u_targets < 0).type(torch.cuda.IntTensor)
        _, wt_u = torch.max(u_tlogits.detach(), -1)
        new_u_tokens_ids = u_mask * u_tokens_ids + (1 - u_mask) * wt_u
        data_copy[kfg.U_TOKENS_IDS] = new_u_tokens_ids

        # The decoder predicts token t from the prefix ending at t-1, so its
        # argmax predictions are shifted one position to the right (with a
        # placeholder first column) before the same mask blend is applied.
        g_tlogits = outputs_dict[kfg.G_LOGITS]
        _, wt_g = torch.max(g_tlogits.detach(), -1)
        new_g_tokens_ids = u_mask * u_tokens_ids + (1 - u_mask) * torch.cat(
            [wt_g.new(wt_g.size(0), 1), wt_g[:, 0:-1]], dim=-1)
        data_copy[kfg.G_TOKENS_IDS] = new_g_tokens_ids

        # Second stage: forward/backward on the substituted tokens, repeating
        # the same zero_grad / backward / step procedure as above.
        data = data_copy
        data['tden_pretrain_similarity'] = False
        outputs_dict = self.model(data)

        losses_dict = {}
        for loss in self.losses:
            loss_dict = loss(outputs_dict)
            losses_dict.update(loss_dict)
        losses = sum(losses_dict.values())

        self.optimizer.zero_grad()
        losses.backward()

        self._write_metrics(losses_dict, data_time)

        self.optimizer.step()
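The token substitution in run_step is an element-wise blend, applied once for the bidirectional stream and once for the right-shifted decoder stream. Below is a minimal standalone sketch of that blend on toy CPU tensors; the token ids, vocabulary size, and the use of new_zeros for the placeholder column are illustrative assumptions, not values taken from xmodaler.

import torch

# Toy batch: one sequence of 5 tokens; positions 1 and 3 are MLM targets.
u_tokens_ids = torch.tensor([[101, 103, 7, 103, 102]])  # 103 = [MASK] (toy ids)
u_targets    = torch.tensor([[-1, 42, -1, 17, -1]])     # -1 = no target
u_tlogits    = torch.randn(1, 5, 30522)                 # (batch, seq, vocab)

u_mask = (u_targets < 0).int()               # 1 where there is no target
_, wt_u = torch.max(u_tlogits.detach(), -1)  # first-stage argmax predictions
new_u_tokens_ids = u_mask * u_tokens_ids + (1 - u_mask) * wt_u
# Positions 0, 2, 4 keep their original tokens; 1 and 3 take predictions.

# Decoder stream: predictions shift right by one so the prediction made at
# step t-1 becomes the input at step t; new_zeros stands in here for the
# placeholder column (the real code allocates it with wt_g.new(...)).
g_tlogits = torch.randn(1, 5, 30522)
_, wt_g = torch.max(g_tlogits.detach(), -1)
shifted = torch.cat([wt_g.new_zeros(wt_g.size(0), 1), wt_g[:, :-1]], dim=-1)
new_g_tokens_ids = u_mask * u_tokens_ids + (1 - u_mask) * shifted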
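Because TDENPretrainer is registered in ENGINE_REGISTRY, it is normally selected by name from a config rather than constructed directly. A minimal sketch of that flow, assuming xmodaler's detectron2-style build pattern; the build_engine helper, the ENGINE.NAME key, and the config path are assumptions here, not taken from this file:

from xmodaler.config import get_cfg
from xmodaler.engine import build_engine  # assumed helper around ENGINE_REGISTRY

cfg = get_cfg()
cfg.merge_from_file("configs/tden_pretrain.yaml")  # hypothetical config path
cfg.ENGINE.NAME = "TDENPretrainer"                 # picks this class from the registry

trainer = build_engine(cfg)  # roughly ENGINE_REGISTRY.get(cfg.ENGINE.NAME)(cfg)
trainer.train()              # training loop inherited from DefaultTrainer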