import os
from abc import ABC, abstractmethod
from collections import OrderedDict

import torch

from aitoolbox.experiment.local_save.folder_create import ExperimentFolder
from aitoolbox.torchtrain.schedulers.basic import AbstractScheduler


class AbstractLocalModelLoader(ABC):
    @abstractmethod
    def load_model(self, project_name, experiment_name, experiment_timestamp, model_save_dir,
                   epoch_num=None, **kwargs):
        """Model loading method all the model loaders need to implement

        Args:
            project_name (str): root name of the project
            experiment_name (str): name of the particular experiment
            experiment_timestamp (str): timestamp at the start of training
            model_save_dir (str): name of the folder inside the experiment folder where the model is saved
            epoch_num (int or None): epoch number of the model checkpoint, or None when loading the final model
            **kwargs: additional parameters for the specific framework model loader

        Returns:
            model
        """
        pass


class PyTorchLocalModelLoader(AbstractLocalModelLoader):
    def __init__(self, local_model_result_folder_path):
        """PyTorch saved model loader and initializer

        Args:
            local_model_result_folder_path (str): root local path where the project folder will be created
        """
        self.local_model_result_folder_path = os.path.expanduser(local_model_result_folder_path)
        self.model_representation = None

    def load_model(self, project_name, experiment_name, experiment_timestamp, model_save_dir='checkpoint_model',
                   epoch_num=None, map_location=None):
        """Model loading interface compatible with the experiment folder structure maintained by the AIToolbox TrainLoop

        Args:
            project_name (str): root name of the project
            experiment_name (str): name of the particular experiment
            experiment_timestamp (str): timestamp at the start of training
            model_save_dir (str): name of the folder inside the experiment folder where the model is saved
            epoch_num (int or None): epoch number of the model checkpoint, or None when loading the final model
            map_location (str or None): a function, :class:`torch.device`, string or a dict specifying how to
                remap storage locations

        Returns:
            model
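
        Example (a minimal sketch; the root path, project/experiment names and timestamp below are hypothetical)::

            loader = PyTorchLocalModelLoader('~/model_results')
            model_representation = loader.load_model(
                project_name='my_project',
                experiment_name='my_experiment',
                experiment_timestamp='2023-01-01_10-00-00',
                epoch_num=5
            )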
"""
        _, experiment_dir_path = ExperimentFolder.get_base_folder_paths(project_name, experiment_name,
                                                                        experiment_timestamp,
                                                                        self.local_model_result_folder_path)

        if epoch_num is None:
            model_name = f'model_{experiment_name}_{experiment_timestamp}.pth'
        else:
            model_name = f'model_{experiment_name}_{experiment_timestamp}_E{epoch_num}.pth'

        model_path = os.path.join(experiment_dir_path, model_save_dir, model_name)
        self.model_representation = torch.load(model_path, map_location=map_location)

        # Backwards-compatibility fix: older checkpoints were saved without scheduler snapshots
        if 'schedulers_state_dict' not in self.model_representation:
            self.model_representation['schedulers_state_dict'] = []

        return self.model_representation

    def load_model_from_path(self, model_path, map_location=None):
        """General model loading when the AIToolbox TrainLoop experiment folder structure is not used

        Args:
            model_path (str): full path to the model
            map_location (str or None): a function, :class:`torch.device`, string or a dict specifying how to
                remap storage locations

        Returns:
            model
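
        Example (a minimal sketch; the paths below are hypothetical)::

            loader = PyTorchLocalModelLoader('~/model_results')
            model_representation = loader.load_model_from_path('~/saved_models/my_model.pth')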
"""
        self.model_representation = torch.load(model_path, map_location=map_location)
        return self.model_representation

    def check_if_model_loaded(self):
        """Raise a ValueError if no model representation has been loaded into memory yet"""
        if self.model_representation is None:
            raise ValueError('Model has not yet been loaded. Please call load_model() first.')

    def init_model(self, model, used_data_parallel=False):
        """Initialize the provided PyTorch model with the loaded model weights

        For this function to work, load_model() must be called first to read the model representation into memory.

        Args:
            model (TTModel or torch.nn.Module): PyTorch model
            used_data_parallel (bool): if the saved model was wrapped in nn.DataParallel or was a normal model

        Returns:
            PyTorch model
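
        Example (a minimal sketch; ``MyModel`` is a hypothetical ``torch.nn.Module`` matching the saved
        architecture)::

            loader = PyTorchLocalModelLoader('~/model_results')
            loader.load_model('my_project', 'my_experiment', '2023-01-01_10-00-00')
            model = loader.init_model(MyModel())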
"""
        self.check_if_model_loaded()

        state_dict = self.model_representation['model_state_dict']

        if used_data_parallel:
            # nn.DataParallel prefixes every parameter name with 'module.'; strip the prefix so the
            # state dict matches the parameter names of the unwrapped model
            state_dict = OrderedDict()
            for k, v in self.model_representation['model_state_dict'].items():
                name = k[7:]  # remove `module.`
                state_dict[name] = v

        model.load_state_dict(state_dict)
        return model

    def init_optimizer(self, optimizer, device='cuda'):
        """Initialize the optimizer based on the saved model/optimizer checkpoint

        Args:
            optimizer: PyTorch optimizer
            device (str): device id

        Returns:
            PyTorch optimizer
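
        Example (a minimal sketch; assumes ``loader.load_model(...)`` was already called and ``model`` is the
        initialized model)::

            optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
            optimizer = loader.init_optimizer(optimizer)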
"""
        self.check_if_model_loaded()

        optimizer.load_state_dict(self.model_representation['optimizer_state_dict'])

        # Manually moving the optimizer state to the target device is normally not needed, as
        # load_state_dict() casts the state to the devices of the corresponding parameters
        # for state in optimizer.state.values():
        #     for k, v in state.items():
        #         if isinstance(v, torch.Tensor):
        #             state[k] = v.to(device)

        return optimizer

    def init_scheduler(self, scheduler_callbacks_list, ignore_saved=False, ignore_missing_saved=False):
        """Initialize the list of schedulers based on the saved model/optimizer/scheduler checkpoint

        Args:
            scheduler_callbacks_list (list): list of scheduler (callbacks)
            ignore_saved (bool): if True, suppress the exception that is raised when scheduler snapshots
                are found in the checkpoint but no schedulers are provided to this method
            ignore_missing_saved (bool): if True, suppress the exception that is raised when schedulers are
                provided to this method but no saved scheduler snapshots can be found in the checkpoint

        Returns:
            list: list of initialized scheduler (callbacks)
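
        Example (a minimal sketch; assumes ``scheduler_cb`` is an ``AbstractScheduler`` based callback matching
        the one used during training)::

            schedulers = loader.init_scheduler([scheduler_cb])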
"""
        self.check_if_model_loaded()
        loaded_schedulers = self.model_representation['schedulers_state_dict']

        if len(loaded_schedulers) == 0 and len(scheduler_callbacks_list) > 0:
            if not ignore_missing_saved:
                raise KeyError('Schedulers_state_dict not found in the loaded model representation but you '
                               'provided schedulers to TrainLoop.')
            return scheduler_callbacks_list

        if len(loaded_schedulers) > 0 and len(scheduler_callbacks_list) == 0:
            if not ignore_saved:
                raise ValueError('No schedulers were provided to the TrainLoop, however scheduler state_dicts '
                                 'were found saved in the loaded model representation.')
            return scheduler_callbacks_list

        if len(scheduler_callbacks_list) != len(loaded_schedulers):
            raise ValueError('Number of provided schedulers does not match the number of loaded scheduler '
                             f'state_dicts. Number of given schedulers: {len(scheduler_callbacks_list)} and '
                             f'number of loaded scheduler state_dicts: {len(loaded_schedulers)}')

        # Initialize the scheduler callbacks with the saved scheduler states
        for sch_cb, sch_state_dict in zip(scheduler_callbacks_list, loaded_schedulers):
            if not isinstance(sch_cb, AbstractScheduler):
                raise TypeError('Provided scheduler is not inherited from AbstractScheduler')
            sch_cb.load_state_dict(sch_state_dict)

        return scheduler_callbacks_list

    def init_amp(self, amp_scaler):
        """Initialize the AMP GradScaler based on the saved checkpoint

        Args:
            amp_scaler (torch.cuda.amp.GradScaler): AMP GradScaler

        Returns:
            torch.cuda.amp.GradScaler: initialized AMP GradScaler
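
        Example (a minimal sketch; assumes the loaded checkpoint was saved with AMP enabled)::

            scaler = loader.init_amp(torch.cuda.amp.GradScaler())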
"""
        # Guard against calling this method before load_model(), consistent with the other init_* methods
        self.check_if_model_loaded()

        amp_scaler.load_state_dict(self.model_representation['amp'])
        return amp_scaler