diff --git a/rls/_metadata.py b/rls/_metadata.py
index 56aa847..032e2e4 100644
--- a/rls/_metadata.py
+++ b/rls/_metadata.py
@@ -8,7 +8,7 @@
 # We follow Semantic Versioning (https://semver.org/)
 _MAJOR_VERSION = '4'
 _MINOR_VERSION = '1'
-_PATCH_VERSION = '1'
+_PATCH_VERSION = '2'
 
 # Example: '0.4.2'
 __version__ = '.'.join([_MAJOR_VERSION, _MINOR_VERSION, _PATCH_VERSION])
diff --git a/rls/algorithms/single/a2c.py b/rls/algorithms/single/a2c.py
index dc7a397..51df1a0 100644
--- a/rls/algorithms/single/a2c.py
+++ b/rls/algorithms/single/a2c.py
@@ -33,7 +33,11 @@ def __init__(self,
                  actor_lr=5.0e-4,
                  critic_lr=1.0e-3,
                  network_settings={
-                     'actor_continuous': [32, 32],
+                     'actor_continuous': {
+                         'hidden_units': [64, 64],
+                         'condition_sigma': False,
+                         'log_std_bound': [-20, 2]
+                     },
                      'actor_discrete': [32, 32],
                      'critic': [32, 32]
                  },
diff --git a/rls/algorithms/single/ac.py b/rls/algorithms/single/ac.py
index 6047a1d..278aa50 100644
--- a/rls/algorithms/single/ac.py
+++ b/rls/algorithms/single/ac.py
@@ -31,7 +31,11 @@ def __init__(self,
                  actor_lr=5.0e-4,
                  critic_lr=1.0e-3,
                  network_settings={
-                     'actor_continuous': [32, 32],
+                     'actor_continuous': {
+                         'hidden_units': [64, 64],
+                         'condition_sigma': False,
+                         'log_std_bound': [-20, 2]
+                     },
                      'actor_discrete': [32, 32],
                      'critic': [32, 32]
                  },
diff --git a/rls/algorithms/single/pg.py b/rls/algorithms/single/pg.py
index c0d80c2..c122dff 100644
--- a/rls/algorithms/single/pg.py
+++ b/rls/algorithms/single/pg.py
@@ -30,7 +30,11 @@ def __init__(self,
                  lr=5.0e-4,
                  epoch=5,
                  network_settings={
-                     'actor_continuous': [32, 32],
+                     'actor_continuous': {
+                         'hidden_units': [32, 32],
+                         'condition_sigma': False,
+                         'log_std_bound': [-20, 2]
+                     },
                      'actor_discrete': [32, 32]
                  },
                  **kwargs):
diff --git a/rls/algorithms/single/ppo.py b/rls/algorithms/single/ppo.py
index da335b1..5e2bfb1 100644
--- a/rls/algorithms/single/ppo.py
+++ b/rls/algorithms/single/ppo.py
@@ -79,6 +79,8 @@ def __init__(self,
                  network_settings: Dict = {
                      'share': {
                          'continuous': {
+                             'condition_sigma': False,
+                             'log_std_bound': [-20, 2],
                              'share': [32, 32],
                              'mu': [32, 32],
                              'v': [32, 32]
@@ -89,7 +91,11 @@ def __init__(self,
                              'v': [32, 32]
                          }
                      },
-                     'actor_continuous': [32, 32],
+                     'actor_continuous': {
+                         'hidden_units': [64, 64],
+                         'condition_sigma': False,
+                         'log_std_bound': [-20, 2]
+                     },
                      'actor_discrete': [32, 32],
                      'critic': [32, 32]
                  },
@@ -107,7 +113,7 @@ def __init__(self,
         self.kl_reverse = kl_reverse
         self.kl_target = kl_target
         self.kl_alpha = kl_alpha
-        self.kl_coef = t.tensor(kl_coef).float()
+        self.kl_coef = kl_coef
         self.extra_coef = extra_coef
         self.vf_coef = vf_coef
         self.max_grad_norm = max_grad_norm
@@ -229,13 +235,13 @@ def _train(data, cell_states):
             early_step = 0
             if self.share_net:
                 for i in range(self.policy_epoch):
-                    actor_loss, critic_loss, entropy, kl = self.train_share(data, cell_states, self.kl_coef)
+                    actor_loss, critic_loss, entropy, kl = self.train_share(data, cell_states)
                     if self.use_early_stop and kl > self.kl_stop:
                         early_step = i
                         break
             else:
                 for i in range(self.policy_epoch):
-                    actor_loss, entropy, kl = self.train_actor(data, cell_states, self.kl_coef)
+                    actor_loss, entropy, kl = self.train_actor(data, cell_states)
                     if self.use_early_stop and kl > self.kl_stop:
                         early_step = i
                         break
@@ -283,7 +289,7 @@ def _train(data, cell_states):
                 })
 
     @iTensor_oNumpy
-    def train_share(self, BATCH, cell_states, kl_coef):
+    def train_share(self, BATCH, cell_states):
         feat, _ = self.rep_net(BATCH.obs, cell_state=cell_states['obs'])
         if self.is_continuous:
             mu, log_std, value = self.net(feat)
@@ -327,7 +333,7 @@ def train_share(self, BATCH, cell_states, kl_coef):
         td_square = td_error.square()
 
         if self.use_kl_loss:
-            kl_loss = kl_coef * kl
+            kl_loss = self.kl_coef * kl
             actor_loss += kl_loss
 
         if self.use_extra_loss:
@@ -340,7 +346,7 @@ def train_share(self, BATCH, cell_states, kl_coef):
         return actor_loss, value_loss, entropy, kl
 
     @iTensor_oNumpy
-    def train_actor(self, BATCH, cell_states, kl_coef):
+    def train_actor(self, BATCH, cell_states):
         feat, _ = self.rep_net(BATCH.obs, cell_state=cell_states['obs'])
         if self.is_continuous:
             mu, log_std = self.actor(feat)
@@ -368,7 +374,7 @@ def train_actor(self, BATCH, cell_states, kl_coef):
         actor_loss = -(clipped_surrogate.mean() + self.ent_coef * entropy)
 
         if self.use_kl_loss:
-            kl_loss = kl_coef * kl
+            kl_loss = self.kl_coef * kl
             actor_loss += kl_loss
         if self.use_extra_loss:
             extra_loss = self.extra_coef * t.maximum(t.zeros_like(kl), kl - self.kl_cutoff).square()
diff --git a/rls/algorithms/single/trpo.py b/rls/algorithms/single/trpo.py
index c809a2b..a3f1fee 100644
--- a/rls/algorithms/single/trpo.py
+++ b/rls/algorithms/single/trpo.py
@@ -92,7 +92,11 @@ def __init__(self,
                  epsilon=0.2,
                  critic_lr=1e-3,
                  network_settings={
-                     'actor_continuous': [32, 32],
+                     'actor_continuous': {
+                         'hidden_units': [64, 64],
+                         'condition_sigma': False,
+                         'log_std_bound': [-20, 2]
+                     },
                      'actor_discrete': [32, 32],
                      'critic': [32, 32]
                  },
diff --git a/rls/envs/unity/env.py b/rls/envs/unity/env.py
index fbd3319..aee2b82 100644
--- a/rls/envs/unity/env.py
+++ b/rls/envs/unity/env.py
@@ -17,7 +17,7 @@ class UnityEnv(EnvBase):
 
     def __init__(self,
                  obs_scale=False,
                  **kwargs):
-        self.env = BasicUnityEnvironment(kwargs)
+        self.env = BasicUnityEnvironment(**kwargs)
         if obs_scale:
             self.env = ScaleVisualWrapper(env)
diff --git a/rls/envs/unity/wrappers/wrappers.py b/rls/envs/unity/wrappers/wrappers.py
index d3a9c91..49507ad 100644
--- a/rls/envs/unity/wrappers/wrappers.py
+++ b/rls/envs/unity/wrappers/wrappers.py
@@ -27,13 +27,13 @@ class BasicUnityEnvironment(object):
 
     def __init__(self,
+                 worker_id=0,
                  file_name=None,
                  port=5005,
                  render=False,
                  seed=42,
-                 worker_id=0,
                  timeout_wait=60,
-                 env_copys=1,
+                 env_copys=12,
                  env_name='3DBall',
                  real_done=True,
                  initialize_config={},
@@ -50,15 +50,14 @@ def __init__(self,
         self._n_copys = env_copys
         self._real_done = real_done
-        self._side_channels = self.initialize_all_side_channels(initialize_config)
+        self._side_channels = self.initialize_all_side_channels(initialize_config, engine_config)
 
         env_kwargs = dict(seed=seed,
                           worker_id=worker_id,
                           timeout_wait=timeout_wait,
                           side_channels=list(self._side_channels.values()))    # 注册所有初始化后的通讯频道
-
-        if file_nameis not None:
+        if file_name is not None:
             env_dict = load_config('rls/configs/unity/env_dict.yaml')
-            env_kwargs.update(file_name=file_name
+            env_kwargs.update(file_name=file_name,
                               base_port=port,
                               no_graphics=not render,
                               additional_args=[
@@ -109,13 +108,13 @@ def initialize_environment(self):
             self.behavior_agents[bn] = len(ds)
             self.behavior_ids[bn] = ds.agent_id_to_index
 
-            for i, shape in enumerate(spec.observation_shapes):
-                if len(shape) == 1:
+            for i, obs_spec in enumerate(spec.observation_specs):    # TODO: optimize
+                if len(obs_spec.shape) == 1:
                     self.vector_idxs[bn].append(i)
-                    self.vector_dims[bn].append(shape[0])
-                elif len(shape) == 3:
+                    self.vector_dims[bn].append(obs_spec.shape[0])
+                elif len(obs_spec.shape) == 3:
                     self.visual_idxs[bn].append(i)
-                    self.visual_dims[bn].append(list(shape))
+                    self.visual_dims[bn].append(list(obs_spec.shape))
                 else:
                     raise ValueError("shape of observation cannot be understood.")
             self.vector_info_type[bn] = generate_obs_dataformat(n_copys=self.behavior_agents[bn],
@@ -253,7 +252,7 @@ def get_obs(self, behavior_names=None, only_obs=False):
             all_reward.append(reward[idxs])
             # all_info.append(dict(max_step=info_max_step[idxs]))
         if only_obs:
-            return corrected_obs
+            return all_corrected_obs
         else:
             rets = []
             for corrected_obs, obs, reward in zip(all_corrected_obs, all_obs, all_reward):
diff --git a/rls/nn/mlps.py b/rls/nn/mlps.py
index a9f81d1..dd7cff4 100644
--- a/rls/nn/mlps.py
+++ b/rls/nn/mlps.py
@@ -33,7 +33,7 @@ def __init__(self,
             self.add_module(f'{layer}_{i}', Layer_REGISTER[layer](_in, _out))
             self.add_module(f'{act_fn}_{i}', Act_REGISTER[act_fn]())
 
-        input_dim = outs[-1] or input_dim
+        input_dim = outs[-1] if len(outs) > 0 else input_dim
         if output_shape:
             self.add_module('out_layer', Layer_REGISTER[layer](input_dim, output_shape))
             if out_act:
diff --git a/rls/nn/models.py b/rls/nn/models.py
index d3b03ec..be8d943 100644
--- a/rls/nn/models.py
+++ b/rls/nn/models.py
@@ -43,11 +43,11 @@ def __init__(self, vector_dim, output_shape, network_settings):
             ins = network_settings['hidden_units'][-1]
         else:
             ins = vector_dim
-        self.mu = MLP(ins, output_shape=output_shape, out_act='tanh')
+        self.mu = MLP(ins, [], output_shape=output_shape, out_act='tanh')
         if self.condition_sigma:
             self.log_std = MLP(ins, [], output_shape=output_shape)
         else:
-            self.log_std = -0.5 * t.nn.Parameter(t.ones((1, output_shape)), requires_grad=True)
+            self.log_std = t.nn.Parameter(-0.5 * t.ones((1, output_shape)))
 
     def forward(self, x):
         x = self.share(x)
@@ -56,7 +56,7 @@ def forward(self, x):
             log_std = self.log_std(x)
         else:
             log_std = self.log_std
-        log_std.clamp_(self.log_std_min, self.log_std_max)
+        log_std = log_std.clamp(self.log_std_min, self.log_std_max)
         batch_size = mu.shape[0]
         if batch_size:
             log_std = log_std.repeat(batch_size, 1)    # [1, N] => [B, N]
@@ -335,7 +335,7 @@ def __init__(self, vector_dim, output_shape, network_settings):
         if self.condition_sigma:
             self.log_std = MLP(ins, [], output_shape=output_shape)
         else:
-            self.log_std = -0.5 * t.nn.Parameter(t.ones((1, output_shape)), requires_grad=True)
+            self.log_std = t.nn.Parameter(-0.5 * t.ones((1, output_shape)))
 
     def forward(self, x):
         x = self.share(x)
@@ -349,7 +349,7 @@ def forward(self, x):
         batch_size = mu.shape[0]
         if batch_size:
             log_std = log_std.repeat(batch_size, 1)    # [1, N] => [B, N]
-        log_std.clamp_(self.log_std_min, self.log_std_max)
+        log_std = log_std.clamp(self.log_std_min, self.log_std_max)
 
         return (mu, log_std, v)
diff --git a/rls/utils/np_utils.py b/rls/utils/np_utils.py
index 22e5959..8bc1262 100644
--- a/rls/utils/np_utils.py
+++ b/rls/utils/np_utils.py
@@ -2,7 +2,6 @@
 # encoding: utf-8
 
 import itertools
-import scipy.signal
 
 import numpy as np
diff --git a/setup.py b/setup.py
index f5826c0..b1db6a0 100644
--- a/setup.py
+++ b/setup.py
@@ -84,8 +84,7 @@
         'tqdm',
         'tensorboard',
         'colored_traceback',
-        'pyglet',
-        'imageio'
+        # 'imageio'
     ],
     extras_require=extras,
 )
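
Note on the rls/nn/models.py hunks: a minimal sketch, using only stock PyTorch, of why the log_std rewrite and the switch from clamp_ to clamp matter. The shape (1, 3) and the variable names below are placeholders for illustration, not code from the repo.

    import torch as t

    # Old form: multiplying a Parameter by a scalar returns a plain Tensor, so the
    # module never registers it and the optimizer never updates log_std.
    old_log_std = -0.5 * t.nn.Parameter(t.ones((1, 3)), requires_grad=True)
    print(isinstance(old_log_std, t.nn.Parameter))   # False

    # New form: a real Parameter initialized at -0.5, registered and trainable.
    new_log_std = t.nn.Parameter(-0.5 * t.ones((1, 3)))
    print(isinstance(new_log_std, t.nn.Parameter))   # True

    # In-place clamp_ on a leaf tensor that requires grad is rejected by autograd
    # with a RuntimeError; the out-of-place clamp returns a new, bounded tensor.
    bounded = new_log_std.clamp(-20, 2)
    print(bounded.shape)   # torch.Size([1, 3])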