Source code for aitoolbox.torchtrain.schedulers.warmup

import math

from aitoolbox.torchtrain.schedulers.basic import LambdaLRScheduler


class ConstantWithWarmupScheduler(LambdaLRScheduler):
    def __init__(self, num_warmup_steps, last_epoch=-1, **kwargs):
        """Constant scheduler with the initial warmup

        Schedule with a constant learning rate preceded by a warmup period during which the learning rate
        increases linearly between 0 and the initial lr set in the optimizer.

        https://huggingface.co/transformers/main_classes/optimizer_schedules.html#transformers.get_constant_schedule_with_warmup

        Args:
            num_warmup_steps (int): The number of steps for the warmup phase
            last_epoch (int): The index of the last epoch when resuming training
            **kwargs: learning rate scheduler additional parameters
        """
        def lr_lambda(current_step: int):
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1.0, num_warmup_steps))
            return 1.0

        super().__init__(lr_lambda=lr_lambda,
                         execute_epoch_end=False, execute_batch_end=True,
                         last_epoch=last_epoch, **kwargs)
        self.callback_name = 'Warmed up constant learning rate scheduler'
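
# --- Illustrative sketch (not part of the library) ---------------------------
# lr_lambda only returns a multiplicative factor applied to the optimizer's
# initial lr. A standalone re-implementation of the factor above, with the
# assumed example value num_warmup_steps=4, ramps up linearly and then stays
# constant at 1.0:

def _constant_warmup_factor_demo(step, num_warmup_steps=4):
    """Demo-only copy of ConstantWithWarmupScheduler's lr_lambda."""
    if step < num_warmup_steps:
        return float(step) / float(max(1.0, num_warmup_steps))
    return 1.0

# [_constant_warmup_factor_demo(s) for s in range(6)]
# -> [0.0, 0.25, 0.5, 0.75, 1.0, 1.0]
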
class CosineWithWarmupScheduler(LambdaLRScheduler):
    def __init__(self, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1, **kwargs):
        """Cosine decreasing scheduler with the initial warmup

        Schedule with a learning rate that decreases following the values of the cosine function from the
        initial lr set in the optimizer to 0, after a warmup period during which it increases linearly
        between 0 and the initial lr set in the optimizer.

        https://huggingface.co/transformers/main_classes/optimizer_schedules.html#transformers.get_cosine_schedule_with_warmup

        Args:
            num_warmup_steps (int): The number of steps for the warmup phase
            num_training_steps (int): The total number of training steps
            num_cycles (float): The number of waves in the cosine schedule (the default is to just decrease
                from the max value to 0 following a half-cosine).
            last_epoch (int): The index of the last epoch when resuming training
            **kwargs: learning rate scheduler additional parameters
        """
        def lr_lambda(current_step):
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
            return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

        super().__init__(lr_lambda=lr_lambda,
                         execute_epoch_end=False, execute_batch_end=True,
                         last_epoch=last_epoch, **kwargs)
        self.callback_name = 'Warmed up cosine decreasing learning rate scheduler'
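
# --- Illustrative sketch (not part of the library) ---------------------------
# With the assumed example values num_warmup_steps=2 and num_training_steps=10
# and the default num_cycles=0.5, the factor above ramps to 1.0 and then
# follows a half-cosine down to 0.0:

def _cosine_warmup_factor_demo(step, num_warmup_steps=2, num_training_steps=10, num_cycles=0.5):
    """Demo-only copy of CosineWithWarmupScheduler's lr_lambda."""
    if step < num_warmup_steps:
        return float(step) / float(max(1, num_warmup_steps))
    progress = float(step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

# _cosine_warmup_factor_demo(2)  ≈ 1.0   (warmup just finished)
# _cosine_warmup_factor_demo(6)  ≈ 0.5   (halfway through the decay)
# _cosine_warmup_factor_demo(10) ≈ 0.0   (end of training)
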
class HardRestartsCosineWithWarmupScheduler(LambdaLRScheduler):
    def __init__(self, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1, **kwargs):
        """Cosine scheduler with hard restarts and the initial warmup

        Schedule with a learning rate that decreases following the values of the cosine function from the
        initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which
        it increases linearly between 0 and the initial lr set in the optimizer.

        https://huggingface.co/transformers/main_classes/optimizer_schedules.html#transformers.get_cosine_with_hard_restarts_schedule_with_warmup

        Args:
            num_warmup_steps (int): The number of steps for the warmup phase
            num_training_steps (int): The total number of training steps
            num_cycles (float): The number of waves in the cosine schedule (the default is to just decrease
                from the max value to 0 following a half-cosine).
            last_epoch (int): The index of the last epoch when resuming training
            **kwargs: learning rate scheduler additional parameters
        """
        def lr_lambda(current_step):
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
            if progress >= 1.0:
                return 0.0
            return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))

        super().__init__(lr_lambda=lr_lambda,
                         execute_epoch_end=False, execute_batch_end=True,
                         last_epoch=last_epoch, **kwargs)
        self.callback_name = 'Warmed up hard restarts cosine learning rate scheduler'
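
# --- Illustrative sketch (not part of the library) ---------------------------
# With the assumed example values num_warmup_steps=2, num_training_steps=10 and
# num_cycles=2, the factor decays along a cosine and then jumps straight back
# to 1.0 at the start of the next cycle (the "hard restart"):

def _hard_restarts_factor_demo(step, num_warmup_steps=2, num_training_steps=10, num_cycles=2):
    """Demo-only copy of HardRestartsCosineWithWarmupScheduler's lr_lambda."""
    if step < num_warmup_steps:
        return float(step) / float(max(1, num_warmup_steps))
    progress = float(step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    if progress >= 1.0:
        return 0.0
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))

# _hard_restarts_factor_demo(5)  ≈ 0.146  (near the end of the first cycle)
# _hard_restarts_factor_demo(6)  ≈ 1.0    (hard restart: second cycle begins)
# _hard_restarts_factor_demo(10) ≈ 0.0    (training finished)
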
class LinearWithWarmupScheduler(LambdaLRScheduler):
    def __init__(self, num_warmup_steps, num_training_steps, last_epoch=-1, **kwargs):
        """Linearly decreasing scheduler with the initial warmup

        Schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0,
        after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.

        Especially useful in the context of BERT-like models. Implementation based on the HuggingFace
        Transformers library's ``get_linear_schedule_with_warmup()`` method.

        https://huggingface.co/transformers/main_classes/optimizer_schedules.html#transformers.get_linear_schedule_with_warmup

        Args:
            num_warmup_steps (int): The number of steps for the warmup phase
            num_training_steps (int): The total number of training steps
            last_epoch (int): The index of the last epoch when resuming training
            **kwargs: learning rate scheduler additional parameters
        """
        def lr_lambda(current_step: int):
            if current_step < num_warmup_steps:
                return float(current_step) / float(max(1, num_warmup_steps))
            return max(
                0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
            )

        super().__init__(lr_lambda=lr_lambda,
                         execute_epoch_end=False, execute_batch_end=True,
                         last_epoch=last_epoch, **kwargs)
        self.callback_name = 'Warmed up linearly decreasing learning rate scheduler'
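
# --- Illustrative sketch (not part of the library) ---------------------------
# With the assumed example values num_warmup_steps=2 and num_training_steps=10,
# the factor ramps to 1.0 and then decreases linearly to 0.0, staying at 0.0
# for any steps beyond num_training_steps:

def _linear_warmup_factor_demo(step, num_warmup_steps=2, num_training_steps=10):
    """Demo-only copy of LinearWithWarmupScheduler's lr_lambda."""
    if step < num_warmup_steps:
        return float(step) / float(max(1, num_warmup_steps))
    return max(
        0.0, float(num_training_steps - step) / float(max(1, num_training_steps - num_warmup_steps))
    )

# [_linear_warmup_factor_demo(s) for s in (0, 1, 2, 6, 10, 12)]
# -> [0.0, 0.5, 1.0, 0.5, 0.0, 0.0]
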