import os
import inspect
from aitoolbox.torchtrain.train_loop.train_loop import TrainLoop
from aitoolbox.experiment.result_package.abstract_result_packages import AbstractResultPackage
from aitoolbox.torchtrain.callbacks.model_save import ModelCheckpoint, ModelIterationCheckpoint, ModelTrainEndSave
from aitoolbox.torchtrain.train_loop.components.pred_collate_fns import append_predictions, torch_cat_transf
class TrainLoopCheckpoint(TrainLoop):
    def __init__(self, model,
                 train_loader, validation_loader, test_loader,
                 optimizer, criterion,
                 project_name, experiment_name, local_model_result_folder_path,
                 hyperparams,
                 cloud_save_mode='s3', bucket_name='model-result', cloud_dir_prefix='', source_dirs=(),
                 rm_subopt_local_models=False, num_best_checkpoints_kept=2,
                 iteration_save_freq=0,
                 collate_batch_pred_fn=append_predictions, pred_transform_fn=torch_cat_transf,
                 end_auto_eval=True, lazy_experiment_save=False, print_callbacks=False,
                 gpu_mode='single', cuda_device_idx=None, use_amp=False):
        """TrainLoop with the automatic model check-pointing at the end of each epoch

        On top of the base ``TrainLoop`` set-up this constructor registers exactly one checkpoint callback:
        ``ModelCheckpoint`` when ``iteration_save_freq`` is 0 or ``ModelIterationCheckpoint`` when it is positive.

        Note:
            The given ``hyperparams`` dict is updated in place: the ``experiment_file_path`` and
            ``source_dirs_paths`` keys are added when they are not already present.

        Args:
            model (TTModel or ModelWrap or TTDataParallel): neural network model
            train_loader (torch.utils.data.DataLoader): data loader for train data set
            validation_loader (torch.utils.data.DataLoader or None): data loader for validation data set
            test_loader (torch.utils.data.DataLoader or None): data loader for test data set
            optimizer (torch.optim.Optimizer or MultiOptimizer): optimizer algorithm.
            criterion (torch.nn.Module or MultiLoss or None): criterion during the training procedure
            project_name (str): root name of the project
            experiment_name (str): name of the particular experiment
            local_model_result_folder_path (str): root local path where project folder will be created.
                A leading ``~`` is expanded to the user's home directory.
            hyperparams (dict): used hyper-parameters. When running the TrainLoop from jupyter notebook in order to
                ensure the python experiment file copying to the experiment folder, the user needs to manually
                specify the python file path as the value for the `experiment_file_path` key. If running the training
                directly from the terminal the path deduction is done automatically.
            cloud_save_mode (str or None): Storage destination selector.
                For AWS S3: 's3' / 'aws_s3' / 'aws'
                For Google Cloud Storage: 'gcs' / 'google_storage' / 'google storage'
                Everything else results just in local storage to disk
            bucket_name (str): name of the bucket in the cloud storage
            cloud_dir_prefix (str): path to the folder inside the bucket where the experiments are going to be saved
            source_dirs (list or tuple): paths to the local folders with the source code files used in experiment
            rm_subopt_local_models (bool or str): if True, the deciding metric is set to 'loss'. Give string metric name
                to set it as a deciding metric for suboptimal model removal. If metric name consists of substring 'loss'
                the metric minimization is done otherwise metric maximization is done
            num_best_checkpoints_kept (int): number of best performing models which are kept when removing suboptimal
                model checkpoints
            iteration_save_freq (int): frequency of saving the model checkpoint every specified number of
                training iterations. The value 0 selects the ``ModelCheckpoint`` callback instead of
                ``ModelIterationCheckpoint``. Negative values are rejected.
            collate_batch_pred_fn (callable): collate function transforming batch predictions as they come out from the
                model
            pred_transform_fn (callable): function transforming all the produced predictions after all the batches have
                been run through the model
            end_auto_eval (bool or int): used to optionally disable otherwise automatic end of epoch/training val/test
                loss calculations. This is useful when conducting very costly experiments to save on compute time.
                Specify either True/False boolean to always run or never run after each epoch or specify an int to
                execute only every specified number of epochs.
            lazy_experiment_save (bool): when in lazy mode experiment tracking components will create the experiment
                folder only after some training results are available (possibly at the end of the first epoch) instead
                of at the beginning of training.
            print_callbacks (bool): at the start of training print the list of registered callbacks
                which will be executed during the run of the train loop
            gpu_mode (str): GPU training mode selection. TrainLoop supports different GPU training modes by
                specifying one of the following:

                * ``'single'``: single GPU training
                * ``'dp'``: multi-GPU training via DataParallel
                * ``'ddp'``: multi-GPU training via DistributedDataParallel

            cuda_device_idx (int or None): CUDA device index used when training on multiple GPUs
            use_amp (bool or dict): Use 16-bit Automatic Mixed Precision (AMP).

                To switch to AMP mode either:

                * set this parameter to ``True`` to use default AMP :class:`~torch.cuda.amp.GradScaler`
                  initialization params
                * provide custom AMP :class:`~torch.cuda.amp.GradScaler` initialization parameters as a dict as
                  this parameter

        Raises:
            ValueError: if ``iteration_save_freq`` is not ``>= 0``. The check runs before the parent constructor
                is called and before ``hyperparams`` is modified.
        """
        # Fail fast: reject an invalid frequency before the parent TrainLoop is set up and before the caller's
        # hyperparams dict is touched. ``not ... >= 0`` also rejects values that are neither == 0 nor > 0.
        if not iteration_save_freq >= 0:
            raise ValueError('iteration_save_freq can have values only >= 0. '
                             f'But received value {iteration_save_freq}.')

        TrainLoop.__init__(self, model, train_loader, validation_loader, test_loader, optimizer, criterion,
                           collate_batch_pred_fn, pred_transform_fn,
                           end_auto_eval, lazy_experiment_save, print_callbacks,
                           gpu_mode, cuda_device_idx, use_amp)
        self.project_name = project_name
        self.experiment_name = experiment_name
        self.local_model_result_folder_path = os.path.expanduser(local_model_result_folder_path)
        self.hyperparams = hyperparams
        self.cloud_save_mode = cloud_save_mode
        self.bucket_name = bucket_name
        self.cloud_dir_prefix = cloud_dir_prefix
        self.source_dirs = source_dirs
        self.rm_subopt_local_models = rm_subopt_local_models
        self.iteration_save_freq = iteration_save_freq

        if 'experiment_file_path' not in self.hyperparams:
            # f_back is the frame which called this __init__, so the lookup has to stay directly in this method
            # (moving it into a helper would make it resolve to this module instead of the caller's file).
            self.hyperparams['experiment_file_path'] = inspect.getframeinfo(inspect.currentframe().f_back).filename
        if 'source_dirs_paths' not in self.hyperparams:
            self.hyperparams['source_dirs_paths'] = source_dirs

        # Keyword arguments shared by both checkpoint callback flavours
        checkpoint_kwargs = dict(
            cloud_save_mode=self.cloud_save_mode,
            bucket_name=bucket_name, cloud_dir_prefix=cloud_dir_prefix,
            rm_subopt_local_models=self.rm_subopt_local_models,
            num_best_checkpoints_kept=num_best_checkpoints_kept
        )

        if iteration_save_freq == 0:
            model_checkpoint_cb = ModelCheckpoint(
                self.project_name, self.experiment_name, self.local_model_result_folder_path,
                self.hyperparams,
                **checkpoint_kwargs
            )
        else:
            # Strictly positive here, negative values were already rejected at the top
            model_checkpoint_cb = ModelIterationCheckpoint(
                iteration_save_freq,
                self.project_name, self.experiment_name, self.local_model_result_folder_path,
                self.hyperparams,
                **checkpoint_kwargs
            )

        self.callbacks_handler.register_callbacks([model_checkpoint_cb], cache_callbacks=True)
class TrainLoopEndSave(TrainLoop):
    def __init__(self, model,
                 train_loader, validation_loader, test_loader,
                 optimizer, criterion,
                 project_name, experiment_name, local_model_result_folder_path,
                 hyperparams, val_result_package=None, test_result_package=None,
                 cloud_save_mode='s3', bucket_name='model-result', cloud_dir_prefix='', source_dirs=(),
                 collate_batch_pred_fn=append_predictions, pred_transform_fn=torch_cat_transf,
                 end_auto_eval=True, lazy_experiment_save=False, print_callbacks=False,
                 gpu_mode='single', cuda_device_idx=None, use_amp=False):
        """TrainLoop with the model performance evaluation and final model saving at the end of the training process

        After the base ``TrainLoop`` set-up the supplied result packages are validated via
        :meth:`check_if_result_packages_possible` and a ``ModelTrainEndSave`` callback is registered.

        Note:
            The given ``hyperparams`` dict is updated in place: the ``experiment_file_path`` and
            ``source_dirs_paths`` keys are added when they are not already present.

        Args:
            model (TTModel or ModelWrap or TTDataParallel): neural network model
            train_loader (torch.utils.data.DataLoader): data loader for train data set
            validation_loader (torch.utils.data.DataLoader or None): data loader for validation data set
            test_loader (torch.utils.data.DataLoader or None): data loader for test data set
            optimizer (torch.optim.Optimizer or MultiOptimizer): optimizer algorithm.
            criterion (torch.nn.Module or MultiLoss or None): criterion during the training procedure
            project_name (str): root name of the project
            experiment_name (str): name of the particular experiment
            local_model_result_folder_path (str): root local path where project folder will be created.
                A leading ``~`` is expanded to the user's home directory.
            hyperparams (dict): used hyper-parameters. When running the TrainLoop from jupyter notebook in order to
                ensure the python experiment file copying to the experiment folder, the user needs to manually
                specify the python file path as the value for the `experiment_file_path` key. If running the training
                directly from the terminal the path deduction is done automatically.
            val_result_package (AbstractResultPackage or None): result package evaluated on validation data at the end
                of the training
            test_result_package (AbstractResultPackage or None): result package evaluated on test data at the end
                of the training
            cloud_save_mode (str or None): Storage destination selector.
                For AWS S3: 's3' / 'aws_s3' / 'aws'
                For Google Cloud Storage: 'gcs' / 'google_storage' / 'google storage'
                Everything else results just in local storage to disk
            bucket_name (str): name of the bucket in the cloud storage
            cloud_dir_prefix (str): path to the folder inside the bucket where the experiments are going to be saved
            source_dirs (list or tuple): paths to the local folders with the source code files used in experiment
            collate_batch_pred_fn (callable): collate function transforming batch predictions as they come out from the
                model
            pred_transform_fn (callable): function transforming all the produced predictions after all the batches have
                been run through the model
            end_auto_eval (bool or int): used to optionally disable otherwise automatic end of epoch/training val/test
                loss calculations. This is useful when conducting very costly experiments to save on compute time.
                Specify either True/False boolean to always run or never run after each epoch or specify an int to
                execute only every specified number of epochs.
            lazy_experiment_save (bool): when in lazy mode experiment tracking components will create the experiment
                folder only after some training results are available (possibly at the end of the first epoch) instead
                of at the beginning of training.
            print_callbacks (bool): at the start of training print the list of registered callbacks
                which will be executed during the run of the train loop
            gpu_mode (str): GPU training mode selection. TrainLoop supports different GPU training modes by
                specifying one of the following:

                * ``'single'``: single GPU training
                * ``'dp'``: multi-GPU training via DataParallel
                * ``'ddp'``: multi-GPU training via DistributedDataParallel

            cuda_device_idx (int or None): CUDA device index used when training on multiple GPUs
            use_amp (bool or dict): Use 16-bit Automatic Mixed Precision (AMP).

                To switch to AMP mode either:

                * set this parameter to ``True`` to use default AMP :class:`~torch.cuda.amp.GradScaler`
                  initialization params
                * provide custom AMP :class:`~torch.cuda.amp.GradScaler` initialization parameters as a dict as
                  this parameter

        Raises:
            ValueError: propagated from :meth:`check_if_result_packages_possible`
            TypeError: propagated from :meth:`check_if_result_packages_possible`
        """
        TrainLoop.__init__(
            self, model, train_loader, validation_loader, test_loader, optimizer, criterion,
            collate_batch_pred_fn, pred_transform_fn,
            end_auto_eval, lazy_experiment_save, print_callbacks,
            gpu_mode, cuda_device_idx, use_amp
        )

        # Experiment identification and storage location
        self.project_name = project_name
        self.experiment_name = experiment_name
        self.local_model_result_folder_path = os.path.expanduser(local_model_result_folder_path)
        self.hyperparams = hyperparams

        # Result packages evaluated at the end of the training
        self.val_result_package = val_result_package
        self.test_result_package = test_result_package

        # Cloud storage settings
        self.cloud_save_mode = cloud_save_mode
        self.bucket_name = bucket_name
        self.cloud_dir_prefix = cloud_dir_prefix
        self.source_dirs = source_dirs

        if 'experiment_file_path' not in self.hyperparams:
            # The caller's frame (f_back) must be taken right here inside __init__ in order to point at the
            # file from which this constructor was invoked.
            caller_frame = inspect.currentframe().f_back
            self.hyperparams['experiment_file_path'] = inspect.getframeinfo(caller_frame).filename
        self.hyperparams.setdefault('source_dirs_paths', source_dirs)

        self.check_if_result_packages_possible()

        train_end_save_cb = ModelTrainEndSave(
            self.project_name, self.experiment_name, self.local_model_result_folder_path,
            self.hyperparams, self.val_result_package, self.test_result_package,
            cloud_save_mode=self.cloud_save_mode,
            bucket_name=bucket_name, cloud_dir_prefix=cloud_dir_prefix
        )
        self.callbacks_handler.register_callbacks([train_end_save_cb], cache_callbacks=True)

    def check_if_result_packages_possible(self):
        """Verify that the given result packages can actually be evaluated

        The checks are executed in the following order and the first failing one raises.

        Raises:
            ValueError: if a result package is given without its corresponding data loader (validation first,
                then test) or if both result packages are None
            TypeError: if a given result package is not an instance of ``AbstractResultPackage``
                (validation first, then test)
        """
        val_package = self.val_result_package
        test_package = self.test_result_package
        has_val_package = val_package is not None
        has_test_package = test_package is not None

        if has_val_package and self.validation_loader is None:
            raise ValueError('Given the val_result_package but not supplied the validation_loader. '
                             'If you want to calculate the val_result_package the validation_loader has to be provided.')

        if has_test_package and self.test_loader is None:
            raise ValueError('Given the test_result_package but not supplied the test_loader. '
                             'If you want to calculate the test_result_package the test_loader has to be provided.')

        if not (has_val_package or has_test_package):
            raise ValueError('Both val_result_package and test_result_package are None. '
                             'At least one of these should be not None but actual result package.')

        if not (val_package is None or isinstance(val_package, AbstractResultPackage)):
            raise TypeError(f'val_result_package {self.val_result_package} is not inherited from AbstractResultPackage')

        if not (test_package is None or isinstance(test_package, AbstractResultPackage)):
            raise TypeError(f'test_result_package {self.test_result_package} is not inherited from AbstractResultPackage')
class TrainLoopCheckpointEndSave(TrainLoopEndSave):
    def __init__(self, model,
                 train_loader, validation_loader, test_loader,
                 optimizer, criterion,
                 project_name, experiment_name, local_model_result_folder_path,
                 hyperparams, val_result_package=None, test_result_package=None,
                 cloud_save_mode='s3', bucket_name='model-result', cloud_dir_prefix='', source_dirs=(),
                 rm_subopt_local_models=False, num_best_checkpoints_kept=2,
                 iteration_save_freq=0,
                 collate_batch_pred_fn=append_predictions, pred_transform_fn=torch_cat_transf,
                 end_auto_eval=True, lazy_experiment_save=False, print_callbacks=False,
                 gpu_mode='single', cuda_device_idx=None, use_amp=False):
        """TrainLoop both saving model check-pointing at the end of each epoch and model performance reporting
            and model saving at the end of the training process

        The end-of-training saving is set up by the parent ``TrainLoopEndSave`` constructor. In addition, this
        constructor registers one checkpoint callback: ``ModelCheckpoint`` when ``iteration_save_freq`` is 0 or
        ``ModelIterationCheckpoint`` when it is positive.

        Note:
            The given ``hyperparams`` dict is updated in place: the ``experiment_file_path`` and
            ``source_dirs_paths`` keys are added when they are not already present.

        Args:
            model (TTModel or ModelWrap or TTDataParallel): neural network model
            train_loader (torch.utils.data.DataLoader): data loader for train data set
            validation_loader (torch.utils.data.DataLoader or None): data loader for validation data set
            test_loader (torch.utils.data.DataLoader or None): data loader for test data set
            optimizer (torch.optim.Optimizer or MultiOptimizer): optimizer algorithm.
            criterion (torch.nn.Module or MultiLoss or None): criterion during the training procedure
            project_name (str): root name of the project
            experiment_name (str): name of the particular experiment
            local_model_result_folder_path (str): root local path where project folder will be created.
                A leading ``~`` is expanded to the user's home directory.
            hyperparams (dict): used hyper-parameters. When running the TrainLoop from jupyter notebook in order to
                ensure the python experiment file copying to the experiment folder, the user needs to manually
                specify the python file path as the value for the `experiment_file_path` key. If running the training
                directly from the terminal the path deduction is done automatically.
            val_result_package (AbstractResultPackage or None): result package evaluated on validation data at the end
                of the training
            test_result_package (AbstractResultPackage or None): result package evaluated on test data at the end
                of the training
            cloud_save_mode (str or None): Storage destination selector.
                For AWS S3: 's3' / 'aws_s3' / 'aws'
                For Google Cloud Storage: 'gcs' / 'google_storage' / 'google storage'
                Everything else results just in local storage to disk
            bucket_name (str): name of the bucket in the cloud storage
            cloud_dir_prefix (str): path to the folder inside the bucket where the experiments are going to be saved
            source_dirs (list or tuple): paths to the local folders with the source code files used in experiment
            rm_subopt_local_models (bool or str): if True, the deciding metric is set to 'loss'. Give string metric name
                to set it as a deciding metric for suboptimal model removal. If metric name consists of substring 'loss'
                the metric minimization is done otherwise metric maximization is done
            num_best_checkpoints_kept (int): number of best performing models which are kept when removing suboptimal
                model checkpoints
            iteration_save_freq (int): frequency of saving the model checkpoint every specified number of
                training iterations. The value 0 selects the ``ModelCheckpoint`` callback instead of
                ``ModelIterationCheckpoint``. Negative values are rejected.
            collate_batch_pred_fn (callable): collate function transforming batch predictions as they come out from the
                model
            pred_transform_fn (callable): function transforming all the produced predictions after all the batches have
                been run through the model
            end_auto_eval (bool or int): used to optionally disable otherwise automatic end of epoch/training val/test
                loss calculations. This is useful when conducting very costly experiments to save on compute time.
                Specify either True/False boolean to always run or never run after each epoch or specify an int to
                execute only every specified number of epochs.
            lazy_experiment_save (bool): when in lazy mode experiment tracking components will create the experiment
                folder only after some training results are available (possibly at the end of the first epoch) instead
                of at the beginning of training.
            print_callbacks (bool): at the start of training print the list of registered callbacks
                which will be executed during the run of the train loop
            gpu_mode (str): GPU training mode selection. TrainLoop supports different GPU training modes by
                specifying one of the following:

                * ``'single'``: single GPU training
                * ``'dp'``: multi-GPU training via DataParallel
                * ``'ddp'``: multi-GPU training via DistributedDataParallel

            cuda_device_idx (int or None): CUDA device index used when training on multiple GPUs
            use_amp (bool or dict): Use 16-bit Automatic Mixed Precision (AMP).

                To switch to AMP mode either:

                * set this parameter to ``True`` to use default AMP :class:`~torch.cuda.amp.GradScaler`
                  initialization params
                * provide custom AMP :class:`~torch.cuda.amp.GradScaler` initialization parameters as a dict as
                  this parameter

        Raises:
            ValueError: if ``iteration_save_freq`` is not ``>= 0``. The check runs before the parent constructor
                is called and before ``hyperparams`` is modified. Also propagated from the parent's result package
                validation.
            TypeError: propagated from the parent's result package validation
        """
        # Fail fast: reject an invalid frequency before the parent constructor registers its end-save callback
        # and before the caller's hyperparams dict is touched. ``not ... >= 0`` also rejects values that are
        # neither == 0 nor > 0.
        if not iteration_save_freq >= 0:
            raise ValueError('iteration_save_freq can have values only >= 0. '
                             f'But received value {iteration_save_freq}.')

        # The experiment file path has to be deduced here, before delegating to the parent constructor: the
        # parent performs the same caller-frame lookup, which would resolve to this __init__ (i.e. this module)
        # instead of the user's experiment script.
        if 'experiment_file_path' not in hyperparams:
            hyperparams['experiment_file_path'] = inspect.getframeinfo(inspect.currentframe().f_back).filename
        if 'source_dirs_paths' not in hyperparams:
            hyperparams['source_dirs_paths'] = source_dirs

        TrainLoopEndSave.__init__(self, model, train_loader, validation_loader, test_loader,
                                  optimizer, criterion,
                                  project_name, experiment_name, os.path.expanduser(local_model_result_folder_path),
                                  hyperparams, val_result_package, test_result_package,
                                  cloud_save_mode, bucket_name, cloud_dir_prefix, source_dirs,
                                  collate_batch_pred_fn, pred_transform_fn,
                                  end_auto_eval, lazy_experiment_save, print_callbacks,
                                  gpu_mode, cuda_device_idx, use_amp)
        self.rm_subopt_local_models = rm_subopt_local_models
        self.iteration_save_freq = iteration_save_freq

        # Keyword arguments shared by both checkpoint callback flavours
        checkpoint_kwargs = dict(
            cloud_save_mode=self.cloud_save_mode,
            bucket_name=bucket_name, cloud_dir_prefix=cloud_dir_prefix,
            rm_subopt_local_models=self.rm_subopt_local_models,
            num_best_checkpoints_kept=num_best_checkpoints_kept
        )

        if iteration_save_freq == 0:
            model_checkpoint_cb = ModelCheckpoint(
                self.project_name, self.experiment_name, self.local_model_result_folder_path,
                self.hyperparams,
                **checkpoint_kwargs
            )
        else:
            # Strictly positive here, negative values were already rejected at the top
            model_checkpoint_cb = ModelIterationCheckpoint(
                iteration_save_freq,
                self.project_name, self.experiment_name, self.local_model_result_folder_path,
                self.hyperparams,
                **checkpoint_kwargs
            )

        self.callbacks_handler.register_callbacks([model_checkpoint_cb], cache_callbacks=True)