Merged
Changes from 3 commits
2 changes: 1 addition & 1 deletion tensor2tensor/bin/t2t_attack.py
@@ -73,7 +73,7 @@ def create_attack_params():


def create_attack(attack):
return registry.attacks(attack)
return registry.attack(attack)


def create_surrogate_hparams():
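For reference, a minimal sketch of the registration/lookup round trip that the singular accessor above implies. The @registry.register_attack decorator and the attack name are assumptions for illustration, not part of this diff:

# Sketch only: register an attack, then resolve it with the new singular
# accessor. The old plural form registry.attacks(name) is replaced by
# registry.attack(name), which returns one registered entry by name.
from tensor2tensor.utils import registry


@registry.register_attack  # assumed decorator, by analogy with register_hparams
def my_noop_attack(*args, **kwargs):
  """Illustrative placeholder attack."""
  raise NotImplementedError


attack_fn = registry.attack("my_noop_attack")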
5 changes: 3 additions & 2 deletions tensor2tensor/bin/t2t_datagen.py
@@ -147,7 +147,7 @@ def main(_):

# Calculate the list of problems to generate.
problems = sorted(
list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_problems())
list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_base_problems())
for exclude in FLAGS.exclude_problems.split(","):
if exclude:
problems = [p for p in problems if exclude not in p]
@@ -169,7 +169,8 @@ def main(_):

if not problems:
problems_str = "\n * ".join(
sorted(list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_problems()))
sorted(list(_SUPPORTED_PROBLEM_GENERATORS) +
registry.list_base_problems()))
error_msg = ("You must specify one of the supported problems to "
"generate data for:\n * " + problems_str + "\n")
error_msg += ("TIMIT and parsing need data_sets specified with "
2 changes: 1 addition & 1 deletion tensor2tensor/bin/t2t_prune.py
@@ -54,7 +54,7 @@ def create_pruning_params():


def create_pruning_strategy(name):
return registry.pruning_strategies(name)
return registry.pruning_strategy(name)


def main(argv):
4 changes: 2 additions & 2 deletions tensor2tensor/layers/common_hparams.py
@@ -55,7 +55,7 @@ def basic_params1():
initializer="orthogonal",
initializer_gain=1.5,
label_smoothing=0.1,
optimizer="Adam",
optimizer="adam",
optimizer_adam_epsilon=1e-6,
optimizer_adam_beta1=0.85,
optimizer_adam_beta2=0.997,
@@ -466,7 +466,7 @@ def basic_range1(ranged_hparams):
rhp.set_float("optimizer_adam_beta2", 0.995, 0.999)
rhp.set_categorical(
"optimizer",
["Adam", "Adagrad", "Momentum", "RMSProp", "SGD", "YellowFin"])
["adam", "adagrad", "momentum", "rms_prop", "sgd", "yellow_fin"])


@registry.register_ranged_hparams
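As a quick illustration of the naming change above: optimizer hparams are now lowercase snake_case strings that must match the registered optimizer keys. A minimal sketch using the categorical range from this hunk:

# Sketch: "Adam" no longer matches a registry key; "adam" does.
from tensor2tensor.layers import common_hparams

hparams = common_hparams.basic_params1()
assert hparams.optimizer == "adam"
hparams.optimizer = "rms_prop"  # other valid keys: adagrad, momentum, sgd, yellow_fin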
10 changes: 5 additions & 5 deletions tensor2tensor/models/research/adafactor_experiments.py
@@ -30,16 +30,16 @@ def mimic_adam_with_adafactor(hparams):
Some minor things may be different, like epsilon and beta1 correction.

Args:
hparams: model hyperparameters where "Adam" in hparams.optimizer
hparams: model hyperparameters where "adam" in hparams.optimizer
"""
assert "Adam" in hparams.optimizer
hparams.optimizer = "Adafactor"
assert "adam" in hparams.optimizer
hparams.optimizer = "adafactor"
hparams.optimizer_adafactor_beta1 = hparams.optimizer_adam_beta1
hparams.optimizer_adafactor_beta2 = hparams.optimizer_adam_beta2
hparams.optimizer_adafactor_multiply_by_parameter_scale = False
hparams.optimizer_adafactor_factored = False
hparams.optimizer_adafactor_clipping_threshold = None
hparams.optimizer_adafactor_decay_type = "Adam"
hparams.optimizer_adafactor_decay_type = "adam"


@registry.register_hparams
@@ -50,7 +50,7 @@ def afx_adam():
hparams.optimizer_adam_beta2 = 0.999
hparams.symbol_modality_num_shards = 1
hparams.batch_size = 2048
hparams.optimizer = "Adam"
hparams.optimizer = "adam"
hparams.learning_rate_schedule = (
"constant*rsqrt_decay*linear_warmup*rsqrt_hidden_size")
hparams.learning_rate_constant = 2.0
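A short usage sketch of the two helpers above; both are defined in this module, and the assert inside mimic_adam_with_adafactor now checks the lowercase name:

# Sketch: convert Adam-style hparams to Adafactor with an Adam-like decay.
# Assumes the optimizer_adafactor_* hparams exist on these hparams objects.
from tensor2tensor.models.research.adafactor_experiments import (
    afx_adam, mimic_adam_with_adafactor)

hparams = afx_adam()                  # hparams.optimizer == "adam"
mimic_adam_with_adafactor(hparams)    # mutates hparams in place
assert hparams.optimizer == "adafactor"
assert hparams.optimizer_adafactor_decay_type == "adam"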
2 changes: 1 addition & 1 deletion tensor2tensor/models/research/autoencoders.py
@@ -1020,7 +1020,7 @@ def body(self, features):
def autoencoder_basic():
"""Basic autoencoder model."""
hparams = common_hparams.basic_params1()
hparams.optimizer = "Adam"
hparams.optimizer = "adam"
hparams.learning_rate_constant = 0.0002
hparams.learning_rate_warmup_steps = 500
hparams.learning_rate_schedule = "constant * linear_warmup"
2 changes: 1 addition & 1 deletion tensor2tensor/models/research/transformer_nat.py
@@ -392,7 +392,7 @@ def transformer_nat_small():
hparams.filter_size = 2048
hparams.label_smoothing = 0.0
hparams.force_full_predict = True
hparams.optimizer = "Adam"
hparams.optimizer = "adam"
hparams.optimizer_adam_epsilon = 1e-9
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.997
4 changes: 2 additions & 2 deletions tensor2tensor/models/research/transformer_vae.py
@@ -767,7 +767,7 @@ def transformer_ae_small():
hparams.filter_size = 2048
hparams.add_hparam("compress_filter_size", 2048 * 2)
hparams.label_smoothing = 0.0
hparams.optimizer = "Adam" # Can be unstable, maybe try Adam.
hparams.optimizer = "adam" # Can be unstable, maybe try Adam.
hparams.optimizer_adam_epsilon = 1e-9
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.997 # Needs tuning, try 0.98 to 0.999.
@@ -941,7 +941,7 @@ def transformer_ae_a3():
def transformer_ae_a6():
"""Best hparams for transformer with semhash."""
hparams = transformer_ae_a3()
hparams.optimizer = "Adam"
hparams.optimizer = "adam"
hparams.noise_dev = 0.5
return hparams

2 changes: 1 addition & 1 deletion tensor2tensor/models/research/vqa_attention.py
@@ -335,7 +335,7 @@ def vqa_attention_base():
hparams = common_hparams.basic_params1()
hparams.batch_size = 128
hparams.use_fixed_batch_size = True,
hparams.optimizer = "Adam"
hparams.optimizer = "adam"
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.999
hparams.optimizer_adam_epsilon = 1e-8
2 changes: 1 addition & 1 deletion tensor2tensor/models/research/vqa_self_attention.py
@@ -684,7 +684,7 @@ def vqa_self_attention_base():
hparams = common_hparams.basic_params1()
hparams.batch_size = 128
hparams.use_fixed_batch_size = True,
hparams.optimizer = "Adam"
hparams.optimizer = "adam"
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.997
hparams.optimizer_adam_epsilon = 1e-9
2 changes: 1 addition & 1 deletion tensor2tensor/models/shake_shake.py
@@ -189,7 +189,7 @@ def shakeshake_small():
@registry.register_hparams
def shake_shake_quick():
hparams = shakeshake_small()
hparams.optimizer = "Adam"
hparams.optimizer = "adam"
hparams.learning_rate_cosine_cycle_steps = 1000
hparams.learning_rate = 0.5
hparams.batch_size = 100
4 changes: 2 additions & 2 deletions tensor2tensor/models/transformer.py
@@ -1694,7 +1694,7 @@ def transformer_tall_pretrain_lm():
hparams.learning_rate_constant = 2e-4
hparams.learning_rate_schedule = (
"linear_warmup*constant*cosdecay")
hparams.optimizer = "AdamW"
hparams.optimizer = "adam_w"
hparams.optimizer_adam_beta1 = 0.9
hparams.optimizer_adam_beta2 = 0.999
hparams.optimizer_adam_epsilon = 1e-8
@@ -1739,7 +1739,7 @@ def transformer_tall_pretrain_lm_tpu():
# Optimizer gets reset in update_hparams_for_tpu so we set it again here.
hparams.learning_rate_constant = 2e-4
hparams.learning_rate_schedule = ("linear_warmup * constant * cosdecay")
hparams.optimizer = "AdamW"
hparams.optimizer = "adam_w"
return hparams


2 changes: 1 addition & 1 deletion tensor2tensor/models/vanilla_gan.py
@@ -199,7 +199,7 @@ def infer(self, *args, **kwargs): # pylint: disable=arguments-differ
def sliced_gan():
"""Basic parameters for a vanilla_gan."""
hparams = common_hparams.basic_params1()
hparams.optimizer = "Adam"
hparams.optimizer = "adam"
hparams.learning_rate_constant = 0.0002
hparams.learning_rate_warmup_steps = 500
hparams.learning_rate_schedule = "constant * linear_warmup"
2 changes: 1 addition & 1 deletion tensor2tensor/problems.py
@@ -27,7 +27,7 @@ def problem(name):


def available():
return sorted(registry.list_problems())
return registry.list_base_problems()


all_problems.import_modules(all_problems.ALL_MODULES)
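A small usage sketch of the updated helper. It assumes list_base_problems() already returns sorted names, which is presumably why the explicit sorted() wrapper was dropped:

# Sketch: list registered base problems and resolve one by name.
from tensor2tensor import problems

names = problems.available()      # now backed by registry.list_base_problems()
print(names[:5])
p = problems.problem(names[0])    # look up a Problem instance by its name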
2 changes: 1 addition & 1 deletion tensor2tensor/rl/datagen_with_agent.py
@@ -45,7 +45,7 @@ def main(_):

# Create problem if not already defined
problem_name = "gym_discrete_problem_with_agent_on_%s" % FLAGS.game
if problem_name not in registry.list_problems():
if problem_name not in registry.Registries.problems:
gym_env.register_game(FLAGS.game)

# Generate
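The check above relies on registry.Registries.problems supporting membership tests by name. A minimal sketch of the register-if-missing pattern, with the game name and import path assumed for illustration:

# Sketch: register a gym problem only when it is not already in the registry.
from tensor2tensor.data_generators import gym_env  # assumed import path
from tensor2tensor.utils import registry

problem_name = "gym_discrete_problem_with_agent_on_pong"  # illustrative name
if problem_name not in registry.Registries.problems:
  gym_env.register_game("pong")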
2 changes: 1 addition & 1 deletion tensor2tensor/test_data/transformer_test_ckpt/hparams.json
@@ -1 +1 @@
{"daisy_chain_variables": true, "optimizer_adam_beta1": 0.9, "scheduled_sampling_prob": 0.0, "num_hidden_layers": 2, "moe_loss_coef": 0.01, "max_target_seq_length": 0, "clip_grad_norm": 0.0, "pos": "timing", "scheduled_sampling_gold_mixin_prob": 0.5, "initializer": "uniform_unit_scaling", "grad_noise_scale": 0.0, "optimizer_momentum_momentum": 0.9, "nbr_decoder_problems": 1, "attention_key_channels": 0, "eval_drop_long_sequences": false, "learning_rate_cosine_cycle_steps": 250000, "prepend_mode": "none", "weight_decay": 0.0, "symbol_modality_skip_top": false, "weight_noise": 0.0, "target_modality": "default", "attention_dropout": 0.1, "parameter_attention_value_channels": 0, "factored_logits": false, "relu_dropout": 0.1, "no_data_parallelism": false, "layer_preprocess_sequence": "n", "sampling_method": "argmax", "learning_rate": 0.2, "num_heads": 2, "max_length": 256, "summarize_grads": false, "attention_value_channels": 0, "num_encoder_layers": 0, "label_smoothing": 0.1, "use_fixed_batch_size": false, "optimizer": "Adam", "moe_k": 2, "self_attention_type": "dot_product", "learning_rate_decay_scheme": "noam", "sampling_temp": 1.0, "kernel_height": 3, "use_pad_remover": true, "batch_size": 4096, "max_relative_position": 0, "force_full_predict": false, "min_length_bucket": 8, "layer_prepostprocess_dropout": 0.1, "eval_run_autoregressive": false, "shared_embedding_and_softmax_weights": true, "symbol_modality_num_shards": 16, "dropout": 0.2, "compress_steps": 0, "parameter_attention_key_channels": 0, "length_bucket_step": 1.1, "kernel_width": 1, "hidden_size": 16, "num_decoder_layers": 0, "input_modalities": "default", "filter_size": 8, "optimizer_adam_beta2": 0.98, "scheduled_sampling_warmup_steps": 50000, "norm_type": "layer", "min_length": 0, "moe_num_experts": 64, "multiply_embedding_mode": "sqrt_depth", "max_input_seq_length": 0, "learning_rate_warmup_steps": 8000, "proximity_bias": false, "ffn_layer": "dense_relu_dense", "initializer_gain": 1.0, "layer_postprocess_sequence": "da", "moe_hidden_sizes": "2048", "optimizer_adam_epsilon": 1e-09, "norm_epsilon": 1e-06}
{"daisy_chain_variables": true, "optimizer_adam_beta1": 0.9, "scheduled_sampling_prob": 0.0, "num_hidden_layers": 2, "moe_loss_coef": 0.01, "max_target_seq_length": 0, "clip_grad_norm": 0.0, "pos": "timing", "scheduled_sampling_gold_mixin_prob": 0.5, "initializer": "uniform_unit_scaling", "grad_noise_scale": 0.0, "optimizer_momentum_momentum": 0.9, "nbr_decoder_problems": 1, "attention_key_channels": 0, "eval_drop_long_sequences": false, "learning_rate_cosine_cycle_steps": 250000, "prepend_mode": "none", "weight_decay": 0.0, "symbol_modality_skip_top": false, "weight_noise": 0.0, "target_modality": "default", "attention_dropout": 0.1, "parameter_attention_value_channels": 0, "factored_logits": false, "relu_dropout": 0.1, "no_data_parallelism": false, "layer_preprocess_sequence": "n", "sampling_method": "argmax", "learning_rate": 0.2, "num_heads": 2, "max_length": 256, "summarize_grads": false, "attention_value_channels": 0, "num_encoder_layers": 0, "label_smoothing": 0.1, "use_fixed_batch_size": false, "optimizer": "adam", "moe_k": 2, "self_attention_type": "dot_product", "learning_rate_decay_scheme": "noam", "sampling_temp": 1.0, "kernel_height": 3, "use_pad_remover": true, "batch_size": 4096, "max_relative_position": 0, "force_full_predict": false, "min_length_bucket": 8, "layer_prepostprocess_dropout": 0.1, "eval_run_autoregressive": false, "shared_embedding_and_softmax_weights": true, "symbol_modality_num_shards": 16, "dropout": 0.2, "compress_steps": 0, "parameter_attention_key_channels": 0, "length_bucket_step": 1.1, "kernel_width": 1, "hidden_size": 16, "num_decoder_layers": 0, "input_modalities": "default", "filter_size": 8, "optimizer_adam_beta2": 0.98, "scheduled_sampling_warmup_steps": 50000, "norm_type": "layer", "min_length": 0, "moe_num_experts": 64, "multiply_embedding_mode": "sqrt_depth", "max_input_seq_length": 0, "learning_rate_warmup_steps": 8000, "proximity_bias": false, "ffn_layer": "dense_relu_dense", "initializer_gain": 1.0, "layer_postprocess_sequence": "da", "moe_hidden_sizes": "2048", "optimizer_adam_epsilon": 1e-09, "norm_epsilon": 1e-06}
2 changes: 1 addition & 1 deletion tensor2tensor/utils/adafactor.py
@@ -326,7 +326,7 @@ def adafactor_optimizer_from_hparams(hparams, lr):
Raises:
ValueError: on illegal values
"""
if hparams.optimizer_adafactor_decay_type == "Adam":
if hparams.optimizer_adafactor_decay_type == "adam":
decay_rate = adafactor_decay_rate_adam(
hparams.optimizer_adafactor_beta2)
elif hparams.optimizer_adafactor_decay_type == "pow":
2 changes: 1 addition & 1 deletion tensor2tensor/utils/learning_rate.py
@@ -90,7 +90,7 @@ def legacy_learning_rate_schedule(hparams):
warmup = _learning_rate_warmup(warmup_steps, hparams=hparams)
decay = _learning_rate_decay(hparams, warmup_steps)
ret = tf.where(step_num < warmup_steps, warmup, decay)
optimizer_correction = 0.002 if "Adam" in hparams.optimizer else 1.0
optimizer_correction = 0.002 if "adam" in hparams.optimizer else 1.0
tf.logging.info("Base learning rate: %f", hparams.learning_rate)
return ret * optimizer_correction * hparams.learning_rate

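Because optimizer names are lowercase now, the 0.002 correction above keys off an "adam" substring. A tiny sketch of which registered names pick it up:

# Sketch: names containing "adam" get the legacy 0.002 correction.
for name in ["adam", "adam_w", "multistep_adam", "true_adam", "adafactor", "sgd"]:
  correction = 0.002 if "adam" in name else 1.0
  print(name, correction)  # adafactor and sgd stay at 1.0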
19 changes: 11 additions & 8 deletions tensor2tensor/utils/optimize.py
@@ -94,7 +94,7 @@ def optimize(loss, learning_rate, hparams, use_tpu=False, variables=None):
return train_op


@registry.register_optimizer("adam")
@registry.register_optimizer
def adam(learning_rate, hparams):
# We change the default epsilon for Adam.
# Using LazyAdam as it's much faster for large vocabulary embeddings.
@@ -105,7 +105,7 @@ def adam(learning_rate, hparams):
epsilon=hparams.optimizer_adam_epsilon)


@registry.register_optimizer("multistep_adam")
@registry.register_optimizer
def multistep_adam(learning_rate, hparams):
return multistep_optimizer.MultistepAdamOptimizer(
learning_rate,
@@ -115,22 +115,22 @@ def multistep_adam(learning_rate, hparams):
n=hparams.optimizer_multistep_accumulate_steps)


@registry.register_optimizer("momentum")
@registry.register_optimizer
def momentum(learning_rate, hparams):
return tf.train.MomentumOptimizer(
learning_rate,
momentum=hparams.optimizer_momentum_momentum,
use_nesterov=hparams.optimizer_momentum_nesterov)


@registry.register_optimizer("yellow_fin")
@registry.register_optimizer
def yellow_fin(learning_rate, hparams):
return yellowfin.YellowFinOptimizer(
learning_rate=learning_rate,
momentum=hparams.optimizer_momentum_momentum)


@registry.register_optimizer("true_adam")
@registry.register_optimizer
def true_adam(learning_rate, hparams):
return tf.train.AdamOptimizer(
learning_rate,
@@ -139,7 +139,7 @@ def true_adam(learning_rate, hparams):
epsilon=hparams.optimizer_adam_epsilon)


@registry.register_optimizer("adam_w")
@registry.register_optimizer
def adam_w(learning_rate, hparams):
# Openai gpt used weight decay.
# Given the internals of AdamW, weight decay dependent on the
@@ -156,7 +156,7 @@ def adam_w(learning_rate, hparams):
epsilon=hparams.optimizer_adam_epsilon)


@registry.register_optimizer("Adafactor")
@registry.register_optimizer("adafactor")
def register_adafactor(learning_rate, hparams):
return adafactor.adafactor_optimizer_from_hparams(hparams, learning_rate)

@@ -169,8 +169,11 @@ def _register_base_optimizer(key, fn):


for k in tf.contrib.layers.OPTIMIZER_CLS_NAMES:
if k not in registry._OPTIMIZERS: # pylint: disable=protected-access
if k not in registry.Registries.optimizers and k not in ('SGD', 'RMSProp'):
_register_base_optimizer(k, tf.contrib.layers.OPTIMIZER_CLS_NAMES[k])
_register_base_optimizer('sgd', tf.contrib.layers.OPTIMIZER_CLS_NAMES['SGD'])
_register_base_optimizer(
'rms_prop', tf.contrib.layers.OPTIMIZER_CLS_NAMES['RMSProp'])


class ConditionalOptimizer(tf.train.Optimizer):
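To summarize the registration conventions in this file: a bare @registry.register_optimizer derives the key from the function name, an explicit string (as with "adafactor" above) overrides it, and the tf.contrib optimizers are re-registered under lowercase keys such as 'sgd' and 'rms_prop'. A sketch with illustrative names, not part of this change:

# Sketch only: the decorated function's name becomes the registry key.
import tensorflow as tf
from tensor2tensor.utils import registry


@registry.register_optimizer              # registered as "my_momentum"
def my_momentum(learning_rate, hparams):
  return tf.train.MomentumOptimizer(learning_rate, momentum=0.9)


@registry.register_optimizer("my_sgd")    # explicit key, like "adafactor" above
def plain_sgd(learning_rate, hparams):
  return tf.train.GradientDescentOptimizer(learning_rate)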