diff --git a/rl_games/algos_torch/sac_agent.py b/rl_games/algos_torch/sac_agent.py
index f17f52f2..30cfd710 100644
--- a/rl_games/algos_torch/sac_agent.py
+++ b/rl_games/algos_torch/sac_agent.py
@@ -38,6 +38,8 @@ def __init__(self, base_name, params):
         self.num_steps_per_episode = config.get("num_steps_per_episode", 1)
         self.normalize_input = config.get("normalize_input", False)
 
+        self.save_freq = config.get('save_frequency', 0)
+
         self.max_env_steps = config.get("max_env_steps", 1000) # temporary, in future we will use other approach
 
         print(self.batch_size, self.num_actors, self.num_agents)
@@ -236,7 +238,11 @@ def set_weights(self, weights):
     def set_full_state_weights(self, weights):
         self.set_weights(weights)
 
-        self.step = weights['step']
+        for key in weights:
+            print("Set full state weights keys:")
+            print(key)
+
+        self.step = weights['steps']
         self.actor_optimizer.load_state_dict(weights['actor_optimizer'])
         self.critic_optimizer.load_state_dict(weights['critic_optimizer'])
         self.log_alpha_optimizer.load_state_dict(weights['log_alpha_optimizer'])
@@ -560,7 +566,7 @@ def train(self):
                 should_exit = False
 
                 if self.save_freq > 0:
-                    if (self.epoch_num % self.save_freq == 0) and (mean_rewards[0] <= self.last_mean_rewards):
+                    if (self.epoch_num % self.save_freq) == 0:
                         self.save(os.path.join(self.nn_dir, 'last_' + checkpoint_name))
 
                 if mean_rewards > self.last_mean_rewards and self.epoch_num >= self.save_best_after:
diff --git a/rl_games/common/a2c_common.py b/rl_games/common/a2c_common.py
index b6bdb687..67fd62e6 100644
--- a/rl_games/common/a2c_common.py
+++ b/rl_games/common/a2c_common.py
@@ -1024,7 +1024,7 @@ def train(self):
                     checkpoint_name = self.config['name'] + '_ep_' + str(epoch_num) + '_rew_' + str(mean_rewards[0])
 
                     if self.save_freq > 0:
-                        if (epoch_num % self.save_freq == 0) and (mean_rewards <= self.last_mean_rewards):
+                        if (epoch_num % self.save_freq) == 0:
                             self.save(os.path.join(self.nn_dir, 'last_' + checkpoint_name))
 
                     if mean_rewards[0] > self.last_mean_rewards and epoch_num >= self.save_best_after:
@@ -1301,7 +1301,7 @@ def train(self):
                     checkpoint_name = self.config['name'] + '_ep_' + str(epoch_num) + '_rew_' + str(mean_rewards[0])
 
                     if self.save_freq > 0:
-                        if (epoch_num % self.save_freq == 0) and (mean_rewards[0] <= self.last_mean_rewards):
+                        if (epoch_num % self.save_freq) == 0:
                             self.save(os.path.join(self.nn_dir, 'last_' + checkpoint_name))
 
                     if mean_rewards[0] > self.last_mean_rewards and epoch_num >= self.save_best_after:
diff --git a/rl_games/common/player.py b/rl_games/common/player.py
index f1a5c35e..98be6501 100644
--- a/rl_games/common/player.py
+++ b/rl_games/common/player.py
@@ -364,9 +364,9 @@ def run(self):
                         cur_rewards_done = cur_rewards/done_count
                         cur_steps_done = cur_steps/done_count
                         if print_game_res:
-                            print(f'reward: {cur_rewards_done:.4} steps: {cur_steps_done:.4} w: {game_res}')
+                            print(f'reward: {cur_rewards_done:.2f} steps: {cur_steps_done:.1f} w: {game_res}')
                         else:
-                            print(f'reward: {cur_rewards_done:.4} steps: {cur_steps_done:.4f}')
+                            print(f'reward: {cur_rewards_done:.2f} steps: {cur_steps_done:.1f}')
 
                     sum_game_res += game_res
                     if batch_size//self.num_agents == 1 or games_played >= n_games:
diff --git a/rl_games/configs/mujoco/sac_ant_envpool.yaml b/rl_games/configs/mujoco/sac_ant_envpool.yaml
index e8390ed5..719e8b8f 100644
--- a/rl_games/configs/mujoco/sac_ant_envpool.yaml
+++ b/rl_games/configs/mujoco/sac_ant_envpool.yaml
@@ -29,7 +29,7 @@ params:
     max_epochs: 10000
     num_steps_per_episode: 8
     save_best_after: 500
-    save_frequency: 10000
+    save_frequency: 1000
    gamma: 0.99
     init_alpha: 1
     alpha_lr: 5e-3
diff --git a/rl_games/envs/envpool.py b/rl_games/envs/envpool.py
index f80ab7a9..8a835118 100644
--- a/rl_games/envs/envpool.py
+++ b/rl_games/envs/envpool.py
@@ -11,6 +11,7 @@ def flatten_dict(obs):
     res = np.column_stack(res)
     return res
 
+
 class Envpool(IVecEnv):
     def __init__(self, config_name, num_actors, **kwargs):
         import envpool
diff --git a/rl_games/torch_runner.py b/rl_games/torch_runner.py
index e8ec70ee..f5f8652a 100644
--- a/rl_games/torch_runner.py
+++ b/rl_games/torch_runner.py
@@ -49,6 +49,7 @@ def __init__(self, algo_observer=None):
 
         self.algo_observer = algo_observer if algo_observer else DefaultAlgoObserver()
         torch.backends.cudnn.benchmark = True
+        ### it didnot help for lots for openai gym envs anyway :(
         #torch.backends.cudnn.deterministic = True
         #torch.use_deterministic_algorithms(True)
 
@@ -57,6 +58,7 @@ def reset(self):
         pass
 
     def load_config(self, params):
+        print("Loading config")
         self.seed = params.get('seed', None)
         if self.seed is None:
             self.seed = int(time.time())
diff --git a/runner.py b/runner.py
index 25f79af4..b2a61b30 100644
--- a/runner.py
+++ b/runner.py
@@ -26,6 +26,8 @@
     os.makedirs("nn", exist_ok=True)
     os.makedirs("runs", exist_ok=True)
 
+    #breakpoint()
+
     args = vars(ap.parse_args())
     config_name = args['file']
 
@@ -50,7 +52,6 @@
     except yaml.YAMLError as exc:
         print(exc)
 
-    #rank = int(os.getenv("LOCAL_RANK", "0"))
     global_rank = int(os.getenv("RANK", "0"))
     if args["track"] and global_rank == 0:
         import wandb