Source code for aitoolbox.torchtrain.train_loop.train_loop_tracking

import os
import inspect

from aitoolbox.torchtrain.train_loop.train_loop import TrainLoop
from aitoolbox.experiment.result_package.abstract_result_packages import AbstractResultPackage
from aitoolbox.torchtrain.callbacks.model_save import ModelCheckpoint, ModelIterationCheckpoint, ModelTrainEndSave
from aitoolbox.torchtrain.train_loop.components.pred_collate_fns import append_predictions, torch_cat_transf


[docs]class TrainLoopCheckpoint(TrainLoop):
    def __init__(self, model,
                 train_loader, validation_loader, test_loader,
                 optimizer, criterion,
                 project_name, experiment_name, local_model_result_folder_path,
                 hyperparams,
                 cloud_save_mode='s3', bucket_name='model-result', cloud_dir_prefix='', source_dirs=(),
                 rm_subopt_local_models=False, num_best_checkpoints_kept=2,
                 iteration_save_freq=0,
                 collate_batch_pred_fn=append_predictions, pred_transform_fn=torch_cat_transf,
                 end_auto_eval=True, lazy_experiment_save=False, print_callbacks=False,
                 gpu_mode='single', cuda_device_idx=None, use_amp=False):
        """TrainLoop with the automatic model check-pointing at the end of each epoch

        Args:
            model (TTModel or ModelWrap or TTDataParallel): neural network model
            train_loader (torch.utils.data.DataLoader): data loader for train data set
            validation_loader (torch.utils.data.DataLoader or None): data loader for validation data set
            test_loader (torch.utils.data.DataLoader or None): data loader for test data set
            optimizer (torch.optim.Optimizer or MultiOptimizer): optimizer algorithm.
            criterion (torch.nn.Module or MultiLoss or None): criterion during the training procedure
            project_name (str): root name of the project
            experiment_name (str): name of the particular experiment
            local_model_result_folder_path (str): root local path where project folder will be created
            hyperparams (dict): used hyper-parameters. When running the TrainLoop from jupyter notebook in order to
                ensure the python experiment file copying to the experiment folder, the user needs to manually
                specify the python file path as the value for the `experiment_file_path` key. If running the training
                directly from the terminal the path deduction is done automatically.
            cloud_save_mode (str or None): Storage destination selector.
                For AWS S3: 's3' / 'aws_s3' / 'aws'
                For Google Cloud Storage: 'gcs' / 'google_storage' / 'google storage'
                Everything else results just in local storage to disk
            bucket_name (str): name of the bucket in the cloud storage
            cloud_dir_prefix (str): path to the folder inside the bucket where the experiments are going to be saved
            source_dirs (list or tuple): paths to the local folders with the source code files used in experiment
            rm_subopt_local_models (bool or str): if True, the deciding metric is set to 'loss'. Give string metric name
                to set it as a deciding metric for suboptimal model removal. If metric name consists of substring 'loss'
                the metric minimization is done otherwise metric maximization is done
            num_best_checkpoints_kept (int): number of best performing models which are kept when removing suboptimal
                model checkpoints
            iteration_save_freq (int): frequency of saving the model checkpoint every specified number of
                training iterations
            collate_batch_pred_fn (callable): collate function transforming batch predictions as they come out from the
                model
            pred_transform_fn (callable): function transforming all the produced predictions after all the batches have
                been run through the model
            end_auto_eval (bool or int): used to optionally disable otherwise automatic end of epoch/training val/test
                loss calculations. This is useful when conducting very costly experiments to save on compute time.
                Specify either True/False boolean to always run or never run after each epoch or specify an int to
                execute only every specified number of epochs.
            lazy_experiment_save (bool): when in lazy mode experiment tracking components will create the experiment
                folder only after some training results are available (possibly at the end of the first epoch) instead
                of at the beginning of training.
            print_callbacks (bool): at the start of training print the list of registered callbacks
                which will be executed during the run of the train loop
            gpu_mode (str): GPU training mode selection. TrainLoop supports different GPU training modes by
                specifying one of the following:

                * ``'single'``: single GPU training
                * ``'dp'``: multi-GPU training via DataParallel
                * ``'ddp'``: multi-GPU training via DistributedDataParallel

            cuda_device_idx (int or None): CUDA device index used when training on multiple GPUs
            use_amp (bool or dict): Use 16-bit Automatic Mixed Precision (AMP).

                To switch to AMP mode either:

                * set this parameter to ``True`` to use default AMP :class:`~torch.cuda.amp.GradScaler`
                  initialization params
                * provide custom AMP :class:`~torch.cuda.amp.GradScaler` initialization parameters as a dict as
                  this parameter
        """
        TrainLoop.__init__(self, model, train_loader, validation_loader, test_loader, optimizer, criterion,
                           collate_batch_pred_fn, pred_transform_fn,
                           end_auto_eval, lazy_experiment_save, print_callbacks,
                           gpu_mode, cuda_device_idx, use_amp)
        self.project_name = project_name
        self.experiment_name = experiment_name
        self.local_model_result_folder_path = os.path.expanduser(local_model_result_folder_path)
        self.hyperparams = hyperparams
        self.cloud_save_mode = cloud_save_mode
        self.bucket_name = bucket_name
        self.cloud_dir_prefix = cloud_dir_prefix
        self.source_dirs = source_dirs
        self.rm_subopt_local_models = rm_subopt_local_models
        self.iteration_save_freq = iteration_save_freq

        if 'experiment_file_path' not in self.hyperparams:
            self.hyperparams['experiment_file_path'] = inspect.getframeinfo(inspect.currentframe().f_back).filename
        if 'source_dirs_paths' not in self.hyperparams:
            self.hyperparams['source_dirs_paths'] = source_dirs

        if iteration_save_freq == 0:
            model_checkpoint_cb = ModelCheckpoint(
                self.project_name, self.experiment_name, self.local_model_result_folder_path,
                self.hyperparams,
                cloud_save_mode=self.cloud_save_mode,
                bucket_name=bucket_name, cloud_dir_prefix=cloud_dir_prefix,
                rm_subopt_local_models=self.rm_subopt_local_models,
                num_best_checkpoints_kept=num_best_checkpoints_kept
            )
        elif iteration_save_freq > 0:
            model_checkpoint_cb = ModelIterationCheckpoint(
                iteration_save_freq,
                self.project_name, self.experiment_name, self.local_model_result_folder_path,
                self.hyperparams,
                cloud_save_mode=self.cloud_save_mode,
                bucket_name=bucket_name, cloud_dir_prefix=cloud_dir_prefix,
                rm_subopt_local_models=self.rm_subopt_local_models,
                num_best_checkpoints_kept=num_best_checkpoints_kept
            )
        else:
            raise ValueError('iteration_save_freq can have values only >= 0. '
                             f'But received value {iteration_save_freq}.')

        self.callbacks_handler.register_callbacks([model_checkpoint_cb], cache_callbacks=True)


[docs]class TrainLoopEndSave(TrainLoop):
    def __init__(self, model,
                 train_loader, validation_loader, test_loader,
                 optimizer, criterion,
                 project_name, experiment_name, local_model_result_folder_path,
                 hyperparams, val_result_package=None, test_result_package=None,
                 cloud_save_mode='s3', bucket_name='model-result', cloud_dir_prefix='', source_dirs=(),
                 collate_batch_pred_fn=append_predictions, pred_transform_fn=torch_cat_transf,
                 end_auto_eval=True, lazy_experiment_save=False, print_callbacks=False,
                 gpu_mode='single', cuda_device_idx=None, use_amp=False):
        """TrainLoop with the model performance evaluation and final model saving at the end of the training process

        Args:
            model (TTModel or ModelWrap or TTDataParallel): neural network model
            train_loader (torch.utils.data.DataLoader): data loader for train data set
            validation_loader (torch.utils.data.DataLoader or None): data loader for validation data set
            test_loader (torch.utils.data.DataLoader or None): data loader for test data set
            optimizer (torch.optim.Optimizer or MultiOptimizer): optimizer algorithm.
            criterion (torch.nn.Module or MultiLoss or None): criterion during the training procedure
            project_name (str): root name of the project
            experiment_name (str): name of the particular experiment
            local_model_result_folder_path (str): root local path where project folder will be created
            hyperparams (dict): used hyper-parameters. When running the TrainLoop from jupyter notebook in order to
                ensure the python experiment file copying to the experiment folder, the user needs to manually
                specify the python file path as the value for the `experiment_file_path` key. If running the training
                directly from the terminal the path deduction is done automatically.
            val_result_package (AbstractResultPackage or None): result package evaluated on validation data at  the end
                of the training
            test_result_package (AbstractResultPackage or None): result package evaluated on test data at the end
                of the training
            cloud_save_mode (str or None): Storage destination selector.
                For AWS S3: 's3' / 'aws_s3' / 'aws'
                For Google Cloud Storage: 'gcs' / 'google_storage' / 'google storage'
                Everything else results just in local storage to disk
            bucket_name (str): name of the bucket in the cloud storage
            cloud_dir_prefix (str): path to the folder inside the bucket where the experiments are going to be saved
            source_dirs (list or tuple): paths to the local folders with the source code files used in experiment
            collate_batch_pred_fn (callable): collate function transforming batch predictions as they come out from the
                model
            pred_transform_fn (callable): function transforming all the produced predictions after all the batches have
                been run through the model
            end_auto_eval (bool or int): used to optionally disable otherwise automatic end of epoch/training val/test
                loss calculations. This is useful when conducting very costly experiments to save on compute time.
                Specify either True/False boolean to always run or never run after each epoch or specify an int to
                execute only every specified number of epochs.
            lazy_experiment_save (bool): when in lazy mode experiment tracking components will create the experiment
                folder only after some training results are available (possibly at the end of the first epoch) instead
                of at the beginning of training.
            print_callbacks (bool): at the start of training print the list of registered callbacks
                which will be executed during the run of the train loop
            gpu_mode (str): GPU training mode selection. TrainLoop supports different GPU training modes by
                specifying one of the following:

                * ``'single'``: single GPU training
                * ``'dp'``: multi-GPU training via DataParallel
                * ``'ddp'``: multi-GPU training via DistributedDataParallel

            cuda_device_idx (int or None): CUDA device index used when training on multiple GPUs
            use_amp (bool or dict): Use 16-bit Automatic Mixed Precision (AMP).

                To switch to AMP mode either:

                * set this parameter to ``True`` to use default AMP :class:`~torch.cuda.amp.GradScaler`
                  initialization params
                * provide custom AMP :class:`~torch.cuda.amp.GradScaler` initialization parameters as a dict as
                  this parameter
        """
        TrainLoop.__init__(self, model, train_loader, validation_loader, test_loader, optimizer, criterion,
                           collate_batch_pred_fn, pred_transform_fn,
                           end_auto_eval, lazy_experiment_save, print_callbacks,
                           gpu_mode, cuda_device_idx, use_amp)
        self.project_name = project_name
        self.experiment_name = experiment_name
        self.local_model_result_folder_path = os.path.expanduser(local_model_result_folder_path)
        self.hyperparams = hyperparams
        self.val_result_package = val_result_package
        self.test_result_package = test_result_package
        self.cloud_save_mode = cloud_save_mode
        self.bucket_name = bucket_name
        self.cloud_dir_prefix = cloud_dir_prefix
        self.source_dirs = source_dirs

        if 'experiment_file_path' not in self.hyperparams:
            self.hyperparams['experiment_file_path'] = inspect.getframeinfo(inspect.currentframe().f_back).filename
        if 'source_dirs_paths' not in self.hyperparams:
            self.hyperparams['source_dirs_paths'] = source_dirs
        self.check_if_result_packages_possible()

        self.callbacks_handler.register_callbacks([
            ModelTrainEndSave(self.project_name, self.experiment_name, self.local_model_result_folder_path,
                              self.hyperparams, self.val_result_package, self.test_result_package,
                              cloud_save_mode=self.cloud_save_mode,
                              bucket_name=bucket_name, cloud_dir_prefix=cloud_dir_prefix)
        ], cache_callbacks=True)

[docs]    def check_if_result_packages_possible(self):
        if self.val_result_package is not None and self.validation_loader is None:
            raise ValueError('Given the val_result_package but not supplied the validation_loader. '
                             'If you want to calculate the val_result_package the validation_loader has to be provided.')

        if self.test_result_package is not None and self.test_loader is None:
            raise ValueError('Given the test_result_package but not supplied the test_loader. '
                             'If you want to calculate the test_result_package the test_loader has to be provided.')

        if self.val_result_package is None and self.test_result_package is None:
            raise ValueError('Both val_result_package and test_result_package are None. '
                             'At least one of these should be not None but actual result package.')

        if self.val_result_package is not None and not isinstance(self.val_result_package, AbstractResultPackage):
            raise TypeError(f'val_result_package {self.val_result_package} is not inherited from AbstractResultPackage')

        if self.test_result_package is not None and not isinstance(self.test_result_package, AbstractResultPackage):
            raise TypeError(f'test_result_package {self.test_result_package} is not inherited from AbstractResultPackage')


[docs]class TrainLoopCheckpointEndSave(TrainLoopEndSave):
    def __init__(self, model,
                 train_loader, validation_loader, test_loader,
                 optimizer, criterion,
                 project_name, experiment_name, local_model_result_folder_path,
                 hyperparams, val_result_package=None, test_result_package=None,
                 cloud_save_mode='s3', bucket_name='model-result', cloud_dir_prefix='', source_dirs=(),
                 rm_subopt_local_models=False, num_best_checkpoints_kept=2,
                 iteration_save_freq=0,
                 collate_batch_pred_fn=append_predictions, pred_transform_fn=torch_cat_transf,
                 end_auto_eval=True, lazy_experiment_save=False, print_callbacks=False,
                 gpu_mode='single', cuda_device_idx=None, use_amp=False):
        """TrainLoop both saving model check-pointing at the end of each epoch and model performance reporting
            and model saving at the end of the training process

        Args:
            model (TTModel or ModelWrap or TTDataParallel): neural network model
            train_loader (torch.utils.data.DataLoader): data loader for train data set
            validation_loader (torch.utils.data.DataLoader or None): data loader for validation data set
            test_loader (torch.utils.data.DataLoader or None): data loader for test data set
            optimizer (torch.optim.Optimizer or MultiOptimizer): optimizer algorithm.
            criterion (torch.nn.Module or MultiLoss or None): criterion during the training procedure
            project_name (str): root name of the project
            experiment_name (str): name of the particular experiment
            local_model_result_folder_path (str): root local path where project folder will be created
            hyperparams (dict): used hyper-parameters. When running the TrainLoop from jupyter notebook in order to
                ensure the python experiment file copying to the experiment folder, the user needs to manually
                specify the python file path as the value for the `experiment_file_path` key. If running the training
                directly from the terminal the path deduction is done automatically.
            val_result_package (AbstractResultPackage or None): result package evaluated on validation data at the end
                of the training
            test_result_package (AbstractResultPackage or None): result package evaluated on test data at the end
                of the training
            cloud_save_mode (str or None): Storage destination selector.
                For AWS S3: 's3' / 'aws_s3' / 'aws'
                For Google Cloud Storage: 'gcs' / 'google_storage' / 'google storage'
                Everything else results just in local storage to disk
            bucket_name (str): name of the bucket in the cloud storage
            cloud_dir_prefix (str): path to the folder inside the bucket where the experiments are going to be saved
            source_dirs (list or tuple): paths to the local folders with the source code files used in experiment
            rm_subopt_local_models (bool or str): if True, the deciding metric is set to 'loss'. Give string metric name
                to set it as a deciding metric for suboptimal model removal. If metric name consists of substring 'loss'
                the metric minimization is done otherwise metric maximization is done
            num_best_checkpoints_kept (int): number of best performing models which are kept when removing suboptimal
                model checkpoints
            iteration_save_freq (int): frequency of saving the model checkpoint every specified number of
                training iterations
            collate_batch_pred_fn (callable): collate function transforming batch predictions as they come out from the
                model
            pred_transform_fn (callable): function transforming all the produced predictions after all the batches have
                been run through the model
            end_auto_eval (bool or int): used to optionally disable otherwise automatic end of epoch/training val/test
                loss calculations. This is useful when conducting very costly experiments to save on compute time.
                Specify either True/False boolean to always run or never run after each epoch or specify an int to
                execute only every specified number of epochs.
            lazy_experiment_save (bool): when in lazy mode experiment tracking components will create the experiment
                folder only after some training results are available (possibly at the end of the first epoch) instead
                of at the beginning of training.
            print_callbacks (bool): at the start of training print the list of registered callbacks
                which will be executed during the run of the train loop
            gpu_mode (str): GPU training mode selection. TrainLoop supports different GPU training modes by
                specifying one of the following:

                * ``'single'``: single GPU training
                * ``'dp'``: multi-GPU training via DataParallel
                * ``'ddp'``: multi-GPU training via DistributedDataParallel

            cuda_device_idx (int or None): CUDA device index used when training on multiple GPUs
            use_amp (bool or dict): Use 16-bit Automatic Mixed Precision (AMP).

                To switch to AMP mode either:

                * set this parameter to ``True`` to use default AMP :class:`~torch.cuda.amp.GradScaler`
                  initialization params
                * provide custom AMP :class:`~torch.cuda.amp.GradScaler` initialization parameters as a dict as
                  this parameter
        """
        if 'experiment_file_path' not in hyperparams:
            hyperparams['experiment_file_path'] = inspect.getframeinfo(inspect.currentframe().f_back).filename
        if 'source_dirs_paths' not in hyperparams:
            hyperparams['source_dirs_paths'] = source_dirs

        TrainLoopEndSave.__init__(self, model, train_loader, validation_loader, test_loader,
                                  optimizer, criterion,
                                  project_name, experiment_name, os.path.expanduser(local_model_result_folder_path),
                                  hyperparams, val_result_package, test_result_package,
                                  cloud_save_mode, bucket_name, cloud_dir_prefix, source_dirs,
                                  collate_batch_pred_fn, pred_transform_fn,
                                  end_auto_eval, lazy_experiment_save, print_callbacks,
                                  gpu_mode, cuda_device_idx, use_amp)
        self.rm_subopt_local_models = rm_subopt_local_models
        self.iteration_save_freq = iteration_save_freq

        if iteration_save_freq == 0:
            model_checkpoint_cb = ModelCheckpoint(
                self.project_name, self.experiment_name, self.local_model_result_folder_path,
                self.hyperparams,
                cloud_save_mode=self.cloud_save_mode,
                bucket_name=bucket_name, cloud_dir_prefix=cloud_dir_prefix,
                rm_subopt_local_models=self.rm_subopt_local_models,
                num_best_checkpoints_kept=num_best_checkpoints_kept
            )
        elif iteration_save_freq > 0:
            model_checkpoint_cb = ModelIterationCheckpoint(
                iteration_save_freq,
                self.project_name, self.experiment_name, self.local_model_result_folder_path,
                self.hyperparams,
                cloud_save_mode=self.cloud_save_mode,
                bucket_name=bucket_name, cloud_dir_prefix=cloud_dir_prefix,
                rm_subopt_local_models=self.rm_subopt_local_models,
                num_best_checkpoints_kept=num_best_checkpoints_kept
            )
        else:
            raise ValueError('iteration_save_freq can have values only >= 0. '
                             f'But received value {iteration_save_freq}.')

        self.callbacks_handler.register_callbacks([model_checkpoint_cb], cache_callbacks=True)