In the context of learning rate (“LR”) scheduling according to an exponential learning rate schedule, i.e. decaying the learning rate exponentially over iterations, we have the following closed-form expression for the LR at time step (iteration) $t$, denoted $\eta_t$, given the base LR (i.e. the LR at time step zero) $\eta_0$ and a per-step scaling factor $\gamma$:

$$\eta_t = \eta_0 \cdot \gamma^t$$
(Reminder: We’re zero-based indexing here, so we start at $t = 0$.)
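For a quick numerical sanity check of this closed form, here is a minimal sketch (plain Python; the concrete base LR and $\gamma$ values are arbitrary and chosen only for illustration) comparing the step-by-step multiplicative decay against the closed-form expression:

```python
# Minimal sketch: iterating lr *= gamma reproduces the closed form lr_0 * gamma**t.
# The concrete values below are arbitrary, for illustration only.
lr_base, gamma = 1e-3, 0.99

lr = lr_base
for t in range(10):
    assert abs(lr - lr_base * gamma**t) < 1e-12  # closed form == iterative decay
    lr *= gamma
```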
Rearranging, we get the expression for the scaling factor, $\gamma$, given a target learning rate $\eta_T$ which we would like to reach after $T$ time steps, which for us will be $\eta_T = 10^{-5}$ (or `1e-5`):

$$\gamma = \exp\left(\frac{\ln\left(\eta_T / \eta_0\right)}{T}\right) = \left(\frac{\eta_T}{\eta_0}\right)^{1/T}$$
```python
#!/usr/bin/env python
from math import exp, log


def compute_exp_lr_decay_gamma(lr_base: float, lr_target: float, timesteps: int) -> float:
    """Return the decay factor gamma such that lr_base * gamma**timesteps == lr_target."""
    return exp(log(lr_target / lr_base) / timesteps)


def get_gamma_ratio(Ts: tuple[int, int], lr_base: float, lr_target: float) -> float:
    """Return gamma(Ts[0]) / gamma(Ts[1]) for the same base and target LR.

    Since gamma(T) = exp(s / T) with s = log(lr_target / lr_base),
    the ratio is exp(s * (1 / Ts[0] - 1 / Ts[1])).
    """
    s = log(lr_target / lr_base)
    return exp(s * (1 / Ts[0] - 1 / Ts[1]))


# Example usage: compute_exp_lr_decay_gamma
LR_BASE = 1e-3
LR_TARGET = 1e-5
TIMESTEP_RANGE = [10, 100, 500, 1000, 2500, 5000, 10_000, 100_000]

for timesteps in TIMESTEP_RANGE:
    gamma = compute_exp_lr_decay_gamma(LR_BASE, LR_TARGET, timesteps)
    print(f"T: {timesteps} -> gamma: {gamma}")

print("-" * 80)

# Example usage: get_gamma_ratio
LR_BASE = 1e-3
LR_TARGET = 1e-5
T1 = 10
T2 = 50

gamma1 = compute_exp_lr_decay_gamma(LR_BASE, LR_TARGET, T1)
gamma2 = compute_exp_lr_decay_gamma(LR_BASE, LR_TARGET, T2)

print(f"T: {T1} -> gamma: {gamma1}")
print(f"T: {T2} -> gamma: {gamma2}")

# The directly computed gamma ratio matches the ratio of the two gammas.
print(f"T ratio: {get_gamma_ratio((T1, T2), LR_BASE, LR_TARGET)}")
print(f"T ratio: {gamma1 / gamma2}")
print(f"T ratio: {get_gamma_ratio((T2, T1), LR_BASE, LR_TARGET)}")
print(f"T ratio: {gamma2 / gamma1}")
```