diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh
index 36590210250d..5fdaa60a3c13 100644
--- a/ci/jenkins_tests/run_rllib_tests.sh
+++ b/ci/jenkins_tests/run_rllib_tests.sh
@@ -2,49 +2,49 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env PongDeterministic-v0 \
     --run A3C \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 2}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env Pong-ram-v4 \
     --run A3C \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 2}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env PongDeterministic-v0 \
     --run A2C \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 2}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v1 \
     --run PPO \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "model": {"free_log_std": true}}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v1 \
     --run PPO \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"simple_optimizer": false, "num_sgd_iter": 2, "model": {"use_lstm": true}}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v1 \
     --run PPO \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"simple_optimizer": true, "num_sgd_iter": 2, "model": {"use_lstm": true}}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v1 \
     --run PPO \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_gpus": 0.1}' \
     --ray-num-gpus 1
@@ -52,187 +52,194 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v1 \
     --run PPO \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "use_gae": false, "batch_mode": "complete_episodes"}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v1 \
     --run PPO \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"remote_worker_envs": true, "num_envs_per_worker": 2, "num_workers": 1, "train_batch_size": 100, "sgd_minibatch_size": 50}'
 
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/python/ray/rllib/tests/run_silent.sh train.py \
+    --env Pendulum-v0 \
+    --run APPO \
+    --stop '{"training_iteration": 1}' \
+    --config '{"num_workers": 2, "num_gpus": 0}'
+
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env Pendulum-v0 \
     --run ES \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"stepsize": 0.01, "episodes_per_batch": 20, "train_batch_size": 100, "num_workers": 2}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env Pong-v0 \
     --run ES \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"stepsize": 0.01, "episodes_per_batch": 20, "train_batch_size": 100, "num_workers": 2}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run A3C \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run DQN \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"lr": 1e-3, "schedule_max_timesteps": 100000, "exploration_fraction": 0.1, "exploration_final_eps": 0.02, "dueling": false, "hiddens": [], "model": {"fcnet_hiddens": [64], "fcnet_activation": "relu"}}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run DQN \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 2}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run APEX \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 2, "timesteps_per_iteration": 1000, "num_gpus": 0, "min_iter_time_s": 1}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env FrozenLake-v0 \
     --run DQN \
-    --stop '{"training_iteration": 2}'
+    --stop '{"training_iteration": 1}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env FrozenLake-v0 \
     --run PPO \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_sgd_iter": 10, "sgd_minibatch_size": 64, "train_batch_size": 1000, "num_workers": 1}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env PongDeterministic-v4 \
     --run DQN \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"lr": 1e-4, "schedule_max_timesteps": 2000000, "buffer_size": 10000, "exploration_fraction": 0.1, "exploration_final_eps": 0.01, "sample_batch_size": 4, "learning_starts": 10000, "target_network_update_freq": 1000, "gamma": 0.99, "prioritized_replay": true}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env MontezumaRevenge-v0 \
     --run PPO \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "model": {"dim": 40, "conv_filters": [[16, [8, 8], 4], [32, [4, 4], 2], [512, [5, 5], 1]]}}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v1 \
     --run A3C \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 2, "model": {"use_lstm": true}}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run DQN \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 2}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run PG \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"sample_batch_size": 500, "num_workers": 1}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run PG \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"sample_batch_size": 500, "use_pytorch": true}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run PG \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"sample_batch_size": 500, "num_workers": 1, "model": {"use_lstm": true, "max_seq_len": 100}}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run PG \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"sample_batch_size": 500, "num_workers": 1, "num_envs_per_worker": 10}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env Pong-v0 \
     --run PG \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"sample_batch_size": 500, "num_workers": 1}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env FrozenLake-v0 \
     --run PG \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"sample_batch_size": 500, "num_workers": 1}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env Pendulum-v0 \
     --run DDPG \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 1}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run IMPALA \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_gpus": 0, "num_workers": 2, "min_iter_time_s": 1}'
 
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/python/ray/rllib/tests/run_silent.sh train.py \
     --env CartPole-v0 \
     --run IMPALA \
-    --stop '{"training_iteration": 2}' \
+    --stop '{"training_iteration": 1}' \
     --config '{"num_gpus": 0, "num_workers": 2, "min_iter_time_s": 1, "model": {"use_lstm": true}}'
 
"num_workers": 2, "min_iter_time_s": 1, "num_data_loader_buffers": 2, "replay_buffer_num_slots": 100, "replay_proportion": 1.0}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run IMPALA \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_gpus": 0, "num_workers": 2, "min_iter_time_s": 1, "num_data_loader_buffers": 2, "replay_buffer_num_slots": 100, "replay_proportion": 1.0, "model": {"use_lstm": true}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env MountainCarContinuous-v0 \ --run DDPG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env MountainCarContinuous-v0 \ --run DDPG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ @@ -240,7 +247,7 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --env Pendulum-v0 \ --run APEX_DDPG \ --ray-num-cpus 8 \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "optimizer": {"num_replay_buffer_shards": 1}, "learning_starts": 100, "min_iter_time_s": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ @@ -248,21 +255,21 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --env Pendulum-v0 \ --run APEX_DDPG \ --ray-num-cpus 8 \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "optimizer": {"num_replay_buffer_shards": 1}, "learning_starts": 100, "min_iter_time_s": 1, "batch_mode": "complete_episodes", "parameter_noise": true}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run MARWIL \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"input": "/ray/python/ray/rllib/tests/data/cartpole_small", "learning_starts": 0, "input_evaluation": ["wis", "is"], "shuffle_buffer_size": 10}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run DQN \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"input": "/ray/python/ray/rllib/tests/data/cartpole_small", "learning_starts": 0, "input_evaluation": ["wis", "is"], "soft_q": true}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ @@ -375,14 +382,14 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env PongDeterministic-v4 \ --run A3C \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "use_pytorch": true, "sample_async": false, "model": {"use_lstm": false, "grayscale": true, "zero_mean": false, "dim": 84}, "preprocessor_pref": "rllib"}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run A3C \ - --stop 
'{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "use_pytorch": true, "sample_async": false}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py index 882c64031388..750f6a7e11cd 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py @@ -50,12 +50,13 @@ def __init__(self, observation_space, action_space, config): tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") + self.prev_actions = ModelCatalog.get_action_placeholder(action_space) + self.prev_rewards = tf.placeholder( + tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model({ "obs": self.observations, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, + "prev_actions": self.prev_actions, + "prev_rewards": self.prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) @@ -83,8 +84,8 @@ def __init__(self, observation_space, action_space, config): loss_in = [ ("obs", self.observations), ("actions", actions), - ("prev_actions", prev_actions), - ("prev_rewards", prev_rewards), + ("prev_actions", self.prev_actions), + ("prev_rewards", self.prev_rewards), ("advantages", advantages), ("value_targets", self.v_target), ] @@ -103,8 +104,8 @@ def __init__(self, observation_space, action_space, config): loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, - prev_action_input=prev_actions, - prev_reward_input=prev_rewards, + prev_action_input=self.prev_actions, + prev_reward_input=self.prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) @@ -138,7 +139,9 @@ def postprocess_trajectory(self, next_state = [] for i in range(len(self.model.state_in)): next_state.append([sample_batch["state_out_{}".format(i)][-1]]) - last_r = self._value(sample_batch["new_obs"][-1], *next_state) + last_r = self._value(sample_batch["new_obs"][-1], + sample_batch["actions"][-1], + sample_batch["rewards"][-1], *next_state) return compute_advantages(sample_batch, last_r, self.config["gamma"], self.config["lambda"]) @@ -159,8 +162,13 @@ def extra_compute_action_fetches(self): TFPolicyGraph.extra_compute_action_fetches(self), **{"vf_preds": self.vf}) - def _value(self, ob, *args): - feed_dict = {self.observations: [ob], self.model.seq_lens: [1]} + def _value(self, ob, prev_action, prev_reward, *args): + feed_dict = { + self.observations: [ob], + self.prev_actions: [prev_action], + self.prev_rewards: [prev_reward], + self.model.seq_lens: [1] + } assert len(args) == len(self.model.state_in), \ (args, self.model.state_in) for k, v in zip(self.model.state_in, args): diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index 362f93b0721a..d613c64a7fd6 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -171,16 +171,17 @@ def __init__(self, if isinstance(action_space, gym.spaces.Discrete): is_multidiscrete = False - actions_shape = [None] output_hidden_shape = 
diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py
index 362f93b0721a..d613c64a7fd6 100644
--- a/python/ray/rllib/agents/ppo/appo_policy_graph.py
+++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py
@@ -171,16 +171,17 @@ def __init__(self,
         if isinstance(action_space, gym.spaces.Discrete):
             is_multidiscrete = False
-            actions_shape = [None]
             output_hidden_shape = [action_space.n]
         elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
             is_multidiscrete = True
-            actions_shape = [None, len(action_space.nvec)]
             output_hidden_shape = action_space.nvec.astype(np.int32)
-        else:
+        elif self.config["vtrace"]:
             raise UnsupportedSpaceException(
-                "Action space {} is not supported for APPO.",
+                "Action space {} is not supported for APPO + VTrace.",
                 format(action_space))
+        else:
+            is_multidiscrete = False
+            output_hidden_shape = 1
 
         # Policy network model
         dist_class, logit_dim = ModelCatalog.get_action_dist(
@@ -200,7 +201,7 @@ def __init__(self,
             existing_state_in = existing_inputs[9:-1]
             existing_seq_lens = existing_inputs[-1]
         else:
-            actions = tf.placeholder(tf.int64, actions_shape, name="ac")
+            actions = ModelCatalog.get_action_placeholder(action_space)
             dones = tf.placeholder(tf.bool, [None], name="dones")
             rewards = tf.placeholder(tf.float32, [None], name="rewards")
             behaviour_logits = tf.placeholder(
diff --git a/python/ray/rllib/optimizers/async_samples_optimizer.py b/python/ray/rllib/optimizers/async_samples_optimizer.py
index 039fea346945..250ec735595f 100644
--- a/python/ray/rllib/optimizers/async_samples_optimizer.py
+++ b/python/ray/rllib/optimizers/async_samples_optimizer.py
@@ -84,9 +84,6 @@ def _init(self,
                            learner_queue_size)
         self.learner.start()
 
-        if len(self.remote_evaluators) == 0:
-            logger.warning("Config num_workers=0 means training will hang!")
-
         # Stats
         self._optimizer_step_timer = TimerStat()
         self.num_weight_syncs = 0
@@ -137,6 +134,8 @@ def get_mean_stats_and_reset(self):
 
     @override(PolicyOptimizer)
     def step(self):
+        if len(self.remote_evaluators) == 0:
+            raise ValueError("Config num_workers=0 means training will hang!")
         assert self.learner.is_alive()
         with self._optimizer_step_timer:
             sample_timesteps, train_timesteps = self._step()
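
The async_samples_optimizer.py hunks above turn the num_workers=0 diagnosis from a construction-time warning into a hard failure at the first step(), since the async optimizer cannot make progress without remote evaluators. A minimal, self-contained sketch of that fail-fast pattern (the class below is a stand-in written for this example, not RLlib's optimizer):

    # Minimal sketch of the fail-fast behaviour adopted above; this class is
    # a stand-in for illustration, not the real AsyncSamplesOptimizer.
    class AsyncSamplesOptimizerSketch(object):
        def __init__(self, remote_evaluators):
            # No warning here any more: misconfiguration surfaces in step().
            self.remote_evaluators = remote_evaluators

        def step(self):
            if len(self.remote_evaluators) == 0:
                raise ValueError(
                    "Config num_workers=0 means training will hang!")
            # ... pull sample batches from evaluators and train here ...
            return len(self.remote_evaluators)

    opt = AsyncSamplesOptimizerSketch(remote_evaluators=[])
    try:
        opt.step()
    except ValueError as exc:
        print("caught:", exc)
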
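Returning to the appo_policy_graph.py hunk: with "vtrace": false, APPO now accepts action spaces beyond Discrete and MultiDiscrete (for example Pendulum-v0's Box space) and builds its action placeholder via ModelCatalog.get_action_placeholder instead of a hard-coded int64 tensor. The branching can be sketched in isolation as follows (the helper name and the plain ValueError are invented for the example; the real code raises UnsupportedSpaceException inline in the policy graph's __init__):

    # Illustrative, self-contained version of the action-space branching the
    # APPO patch introduces. resolve_action_layout() is a hypothetical helper.
    import gym
    import numpy as np

    def resolve_action_layout(action_space, vtrace):
        """Return (is_multidiscrete, output_hidden_shape) as the patch does."""
        if isinstance(action_space, gym.spaces.Discrete):
            return False, [action_space.n]
        elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
            return True, action_space.nvec.astype(np.int32)
        elif vtrace:
            # VTrace still supports only (multi-)discrete action spaces.
            raise ValueError(
                "Action space {} is not supported for APPO + VTrace.".format(
                    action_space))
        else:
            # Non-VTrace APPO falls back to the PPO-style path, so Box spaces
            # such as Pendulum-v0's are allowed and treated as a single output.
            return False, 1

    # Pendulum-v0's Box action space is only accepted when vtrace is disabled.
    box_space = gym.spaces.Box(low=-2.0, high=2.0, shape=(1,), dtype=np.float32)
    print(resolve_action_layout(box_space, vtrace=False))
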