@@ -30,16 +30,16 @@ def mimic_adam_with_adafactor(hparams):
   Some minor things may be different, like epsilon and beta1 correction.
 
   Args:
-    hparams: model hyperparameters where "Adam" in hparams.optimizer
+    hparams: model hyperparameters where "adam" in hparams.optimizer
   """
-  assert "Adam" in hparams.optimizer
-  hparams.optimizer = "Adafactor"
+  assert "adam" in hparams.optimizer
+  hparams.optimizer = "adafactor"
   hparams.optimizer_adafactor_beta1 = hparams.optimizer_adam_beta1
   hparams.optimizer_adafactor_beta2 = hparams.optimizer_adam_beta2
   hparams.optimizer_adafactor_multiply_by_parameter_scale = False
   hparams.optimizer_adafactor_factored = False
   hparams.optimizer_adafactor_clipping_threshold = None
-  hparams.optimizer_adafactor_decay_type = "Adam"
+  hparams.optimizer_adafactor_decay_type = "adam"
 
 
 @registry.register_hparams
@@ -50,7 +50,7 @@ def afx_adam():
   hparams.optimizer_adam_beta2 = 0.999
   hparams.symbol_modality_num_shards = 1
   hparams.batch_size = 2048
-  hparams.optimizer = "Adam"
+  hparams.optimizer = "adam"
   hparams.learning_rate_schedule = (
       "constant*rsqrt_decay*linear_warmup*rsqrt_hidden_size")
   hparams.learning_rate_constant = 2.0
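
For context, a minimal self-contained sketch of what the renamed values mean in use. The HParams stand-in class and the beta1 value below are illustrative assumptions, not tensor2tensor's own API, but the attribute mapping mirrors mimic_adam_with_adafactor in the diff above.

    class HParams(object):
      """Hypothetical stand-in for an hparams object (simple attribute bag)."""
      def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


    def mimic_adam_with_adafactor(hparams):
      # Same mapping as the diff above: switch the optimizer name to the
      # lowercase "adafactor" and copy the Adam beta settings across.
      assert "adam" in hparams.optimizer
      hparams.optimizer = "adafactor"
      hparams.optimizer_adafactor_beta1 = hparams.optimizer_adam_beta1
      hparams.optimizer_adafactor_beta2 = hparams.optimizer_adam_beta2
      hparams.optimizer_adafactor_multiply_by_parameter_scale = False
      hparams.optimizer_adafactor_factored = False
      hparams.optimizer_adafactor_clipping_threshold = None
      hparams.optimizer_adafactor_decay_type = "adam"


    hp = HParams(optimizer="adam",
                 optimizer_adam_beta1=0.9,   # illustrative value
                 optimizer_adam_beta2=0.999)
    mimic_adam_with_adafactor(hp)
    print(hp.optimizer)                  # -> adafactor
    print(hp.optimizer_adafactor_beta2)  # -> 0.999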