diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..9b4a023d --- /dev/null +++ b/.flake8 @@ -0,0 +1,23 @@ +[flake8] +show-source=True +statistics=True +per-file-ignores=*/__init__.py:F401 +# E402: Module level import not at top of file +# E501: Line too long +# W503: Line break before binary operator +# E203: Whitespace before ':' -> conflicts with black +# D401: First line should be in imperative mood +# R504: Unnecessary variable assignment before return statement. +# R505: Unnecessary elif after return statement +# SIM102: Use a single if-statement instead of nested if-statements +# SIM117: Merge with statements for context managers that have same scope. +# SIM118: Checks for key-existence checks against dict.keys() calls. +ignore=E402,E501,W503,E203,D401,R504,R505,SIM102,SIM117,SIM118 +max-line-length = 120 +max-complexity = 30 +exclude=_*,.vscode,.git,docs/** +# docstrings +docstring-convention=google +# annotations +suppress-none-returning=True +allow-star-arg-any=True diff --git a/pyproject.toml b/pyproject.toml index e73c4c42..af601629 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ authors = [ ] [tool.poetry.dependencies] -python = ">=3.7.1,<3.11" +python = ">=3.7.1,<3.14" gym = {version = "^0.23.0", extras = ["classic_control"]} tensorboard = "^2.8.0" tensorboardX = "^2.5" diff --git a/rl_games/algos_torch/a2c_continuous.py b/rl_games/algos_torch/a2c_continuous.py index e93ea362..9323f1a7 100644 --- a/rl_games/algos_torch/a2c_continuous.py +++ b/rl_games/algos_torch/a2c_continuous.py @@ -6,7 +6,7 @@ from rl_games.common import datasets from torch import optim -import torch +import torch class A2CAgent(a2c_common.ContinuousA2CBase): @@ -30,11 +30,11 @@ def __init__(self, base_name, params): 'actions_num' : self.actions_num, 'input_shape' : obs_shape, 'num_seqs' : self.num_actors * self.num_agents, - 'value_size': self.env_info.get('value_size',1), + 'value_size': self.env_info.get('value_size', 1), 'normalize_value' : self.normalize_value, 'normalize_input': self.normalize_input, } - + self.model = self.network.build(build_config) self.model.to(self.ppo_device) self.states = None @@ -74,7 +74,7 @@ def __init__(self, base_name, params): def update_epoch(self): self.epoch_num += 1 return self.epoch_num - + def save(self, fn): state = self.get_full_state_weights() torch_ext.save_checkpoint(fn, state) @@ -114,7 +114,7 @@ def calc_gradients(self, input_dict): batch_dict = { 'is_train': True, - 'prev_actions': actions_batch, + 'prev_actions': actions_batch, 'obs' : obs_batch, } @@ -195,7 +195,7 @@ def train_actor_critic(self, input_dict): def reg_loss(self, mu): if self.bounds_loss_coef is not None: - reg_loss = (mu*mu).sum(axis=-1) + reg_loss = (mu * mu).sum(axis=-1) else: reg_loss = 0 return reg_loss @@ -209,5 +209,3 @@ def bound_loss(self, mu): else: b_loss = 0 return b_loss - - diff --git a/rl_games/algos_torch/network_builder.py b/rl_games/algos_torch/network_builder.py index 289812dd..fd35f7de 100644 --- a/rl_games/algos_torch/network_builder.py +++ b/rl_games/algos_torch/network_builder.py @@ -5,9 +5,9 @@ import torch.nn as nn from rl_games.algos_torch.d2rl import D2RLNet -from rl_games.algos_torch.sac_helper import SquashedNormal -from rl_games.common.layers.recurrent import GRUWithDones, LSTMWithDones -from rl_games.common.layers.value import TwoHotEncodedValue, DefaultValue +from rl_games.algos_torch.sac_helper import SquashedNormal +from rl_games.common.layers.recurrent import GRUWithDones, LSTMWithDones +from rl_games.common.layers.value 
import TwoHotEncodedValue, DefaultValue from rl_games.algos_torch.spatial_softmax import SpatialSoftArgmax @@ -44,16 +44,16 @@ def __init__(self, **kwargs): self.activations_factory.register_builder('None', lambda **kwargs : nn.Identity()) self.init_factory = object_factory.ObjectFactory() - #self.init_factory.register_builder('normc_initializer', lambda **kwargs : normc_initializer(**kwargs)) - self.init_factory.register_builder('const_initializer', lambda **kwargs : _create_initializer(nn.init.constant_,**kwargs)) - self.init_factory.register_builder('orthogonal_initializer', lambda **kwargs : _create_initializer(nn.init.orthogonal_,**kwargs)) - self.init_factory.register_builder('glorot_normal_initializer', lambda **kwargs : _create_initializer(nn.init.xavier_normal_,**kwargs)) - self.init_factory.register_builder('glorot_uniform_initializer', lambda **kwargs : _create_initializer(nn.init.xavier_uniform_,**kwargs)) - self.init_factory.register_builder('variance_scaling_initializer', lambda **kwargs : _create_initializer(torch_ext.variance_scaling_initializer,**kwargs)) - self.init_factory.register_builder('random_uniform_initializer', lambda **kwargs : _create_initializer(nn.init.uniform_,**kwargs)) - self.init_factory.register_builder('kaiming_normal', lambda **kwargs : _create_initializer(nn.init.kaiming_normal_,**kwargs)) - self.init_factory.register_builder('orthogonal', lambda **kwargs : _create_initializer(nn.init.orthogonal_,**kwargs)) - self.init_factory.register_builder('default', lambda **kwargs : nn.Identity() ) + # self.init_factory.register_builder('normc_initializer', lambda **kwargs : normc_initializer(**kwargs)) + self.init_factory.register_builder('const_initializer', lambda **kwargs : _create_initializer(nn.init.constant_, **kwargs)) + self.init_factory.register_builder('orthogonal_initializer', lambda **kwargs : _create_initializer(nn.init.orthogonal_, **kwargs)) + self.init_factory.register_builder('glorot_normal_initializer', lambda **kwargs : _create_initializer(nn.init.xavier_normal_, **kwargs)) + self.init_factory.register_builder('glorot_uniform_initializer', lambda **kwargs : _create_initializer(nn.init.xavier_uniform_, **kwargs)) + self.init_factory.register_builder('variance_scaling_initializer', lambda **kwargs : _create_initializer(torch_ext.variance_scaling_initializer, **kwargs)) + self.init_factory.register_builder('random_uniform_initializer', lambda **kwargs : _create_initializer(nn.init.uniform_, **kwargs)) + self.init_factory.register_builder('kaiming_normal', lambda **kwargs : _create_initializer(nn.init.kaiming_normal_, **kwargs)) + self.init_factory.register_builder('orthogonal', lambda **kwargs : _create_initializer(nn.init.orthogonal_, **kwargs)) + self.init_factory.register_builder('default', lambda **kwargs : nn.Identity()) def is_separate_critic(self): return False @@ -70,7 +70,7 @@ def get_default_rnn_state(self): def get_aux_loss(self): return None - def _calc_input_size(self, input_shape,cnn_layers=None): + def _calc_input_size(self, input_shape, cnn_layers=None): if cnn_layers is None: assert(len(input_shape) == 1) return input_shape[0] @@ -78,6 +78,7 @@ def _calc_input_size(self, input_shape,cnn_layers=None): return nn.Sequential(*cnn_layers)(torch.rand(1, *(input_shape))).flatten(1).data.size(1) def _noisy_dense(self, inputs, units): + # TODO: to fix! 
return layers.NoisyFactorizedLinear(inputs, units) def _build_rnn(self, name, input, units, layers): @@ -88,13 +89,13 @@ def _build_rnn(self, name, input, units, layers): if name == 'gru': return GRUWithDones(input_size=input, hidden_size=units, num_layers=layers) - def _build_sequential_mlp(self, - input_size, - units, + def _build_sequential_mlp(self, + input_size, + units, activation, dense_func, - norm_only_first_layer=False, - norm_func_name = None): + norm_only_first_layer=False, + norm_func_name=None): print('build mlp:', input_size) in_size = input_size layers = [] @@ -106,7 +107,7 @@ def _build_sequential_mlp(self, if not need_norm: continue if norm_only_first_layer and norm_func_name is not None: - need_norm = False + need_norm = False if norm_func_name == 'layer_norm': layers.append(torch.nn.LayerNorm(unit)) elif norm_func_name == 'batch_norm': @@ -115,19 +116,19 @@ def _build_sequential_mlp(self, return nn.Sequential(*layers) - def _build_mlp(self, - input_size, - units, + def _build_mlp(self, + input_size, + units, activation, - dense_func, + dense_func, norm_only_first_layer=False, - norm_func_name = None, + norm_func_name=None, d2rl=False): if d2rl: act_layers = [self.activations_factory.create(activation) for i in range(len(units))] return D2RLNet(input_size, units, act_layers, norm_func_name) else: - return self._build_sequential_mlp(input_size, units, activation, dense_func, norm_func_name = None,) + return self._build_sequential_mlp(input_size, units, activation, dense_func, norm_func_name=None,) def _build_conv(self, ctype, **kwargs): print('conv_name:', ctype) @@ -148,11 +149,11 @@ def _build_cnn2d(self, input_shape, convs, activation, conv_func=torch.nn.Conv2d in_channels = input_shape[0] layers = [] for conv in convs: - layers.append(conv_func(in_channels=in_channels, - out_channels=conv['filters'], - kernel_size=conv['kernel_size'], + layers.append(conv_func(in_channels=in_channels, + out_channels=conv['filters'], + kernel_size=conv['kernel_size'], stride=conv['strides'], padding=conv['padding'])) - conv_func=torch.nn.Conv2d + conv_func = torch.nn.Conv2d act = self.activations_factory.create(activation) layers.append(act) in_channels = conv['filters'] @@ -160,6 +161,7 @@ def _build_cnn2d(self, input_shape, convs, activation, conv_func=torch.nn.Conv2d layers.append(torch_ext.LayerNorm2d(in_channels)) elif norm_func_name == 'batch_norm': layers.append(torch.nn.BatchNorm2d(in_channels)) + if add_spatial_softmax: layers.append(SpatialSoftArgmax(normalize=True)) if add_flatten: @@ -178,21 +180,20 @@ def _build_cnn1d(self, input_shape, convs, activation, norm_func_name=None): if norm_func_name == 'layer_norm': layers.append(torch.nn.LayerNorm(in_channels)) elif norm_func_name == 'batch_norm': - layers.append(torch.nn.BatchNorm2d(in_channels)) + layers.append(torch.nn.BatchNorm2d(in_channels)) return nn.Sequential(*layers) def _build_value_layer(self, input_size, output_size, value_type='legacy'): if value_type == 'legacy': return torch.nn.Linear(input_size, output_size) if value_type == 'default': - return DefaultValue(input_size, output_size) + return DefaultValue(input_size, output_size) if value_type == 'twohot_encoded': return TwoHotEncodedValue(input_size, output_size) raise ValueError('value type is not "default", "legacy" or "two_hot_encoded"') - class A2CBuilder(NetworkBuilder): def __init__(self, **kwargs): NetworkBuilder.__init__(self) @@ -213,21 +214,21 @@ def __init__(self, params, **kwargs): self.critic_cnn = nn.Sequential() self.actor_mlp = nn.Sequential() 
self.critic_mlp = nn.Sequential() - + if self.has_cnn: if self.permute_input: input_shape = torch_ext.shape_whc_to_cwh(input_shape) cnn_args = { - 'ctype' : self.cnn['type'], - 'input_shape' : input_shape, - 'convs' :self.cnn['convs'], - 'activation' : self.cnn['activation'], + 'ctype' : self.cnn['type'], + 'input_shape' : input_shape, + 'convs' : self.cnn['convs'], + 'activation' : self.cnn['activation'], 'norm_func_name' : self.normalization, } self.actor_cnn = self._build_conv(**cnn_args) if self.separate: - self.critic_cnn = self._build_conv( **cnn_args) + self.critic_cnn = self._build_conv(**cnn_args) cnn_output_size = self._calc_input_size(input_shape, self.actor_cnn) @@ -266,8 +267,8 @@ def __init__(self, params, **kwargs): mlp_args = { 'input_size' : mlp_input_size, - 'units' : self.units, - 'activation' : self.activation, + 'units' : self.units, + 'activation' : self.activation, 'norm_func_name' : self.normalization, 'dense_func' : torch.nn.Linear, 'd2rl' : self.is_d2rl, @@ -311,14 +312,14 @@ def __init__(self, params, **kwargs): if isinstance(m, nn.Linear): mlp_init(m.weight) if getattr(m, "bias", None) is not None: - torch.nn.init.zeros_(m.bias) + torch.nn.init.zeros_(m.bias) if self.is_continuous: mu_init(self.mu.weight) if self.fixed_sigma: sigma_init(self.sigma) else: - sigma_init(self.sigma.weight) + sigma_init(self.sigma.weight) def forward(self, obs_dict): obs = obs_dict['obs'] @@ -339,7 +340,7 @@ def forward(self, obs_dict): a_out = a_out.contiguous().view(a_out.size(0), -1) c_out = self.critic_cnn(c_out) - c_out = c_out.contiguous().view(c_out.size(0), -1) + c_out = c_out.contiguous().view(c_out.size(0), -1) if self.has_rnn: seq_length = obs_dict.get('seq_length', 1) @@ -359,11 +360,11 @@ def forward(self, obs_dict): a_out = a_out.reshape(num_seqs, seq_length, -1) c_out = c_out.reshape(num_seqs, seq_length, -1) - a_out = a_out.transpose(0,1) - c_out = c_out.transpose(0,1) + a_out = a_out.transpose(0, 1) + c_out = c_out.transpose(0, 1) if dones is not None: dones = dones.reshape(num_seqs, seq_length, -1) - dones = dones.transpose(0,1) + dones = dones.transpose(0, 1) if len(states) == 2: a_states = states[0] @@ -374,8 +375,8 @@ def forward(self, obs_dict): a_out, a_states = self.a_rnn(a_out, a_states, dones, bptt_len) c_out, c_states = self.c_rnn(c_out, c_states, dones, bptt_len) - a_out = a_out.transpose(0,1) - c_out = c_out.transpose(0,1) + a_out = a_out.transpose(0, 1) + c_out = c_out.transpose(0, 1) a_out = a_out.contiguous().reshape(a_out.size()[0] * a_out.size()[1], -1) c_out = c_out.contiguous().reshape(c_out.size()[0] * c_out.size()[1], -1) @@ -398,7 +399,7 @@ def forward(self, obs_dict): else: a_out = self.actor_mlp(a_out) c_out = self.critic_mlp(c_out) - + value = self.value_act(self.value(c_out)) if self.is_discrete: @@ -474,7 +475,7 @@ def forward(self, obs_dict): else: sigma = self.sigma_act(self.sigma(out)) return mu, mu*0 + sigma, value, states - + def is_separate_critic(self): return self.separate @@ -491,19 +492,19 @@ def get_default_rnn_state(self): rnn_units = self.rnn_units if self.rnn_name == 'lstm': if self.separate: - return (torch.zeros((num_layers, self.num_seqs, rnn_units)), + return (torch.zeros((num_layers, self.num_seqs, rnn_units)), + torch.zeros((num_layers, self.num_seqs, rnn_units)), torch.zeros((num_layers, self.num_seqs, rnn_units)), - torch.zeros((num_layers, self.num_seqs, rnn_units)), torch.zeros((num_layers, self.num_seqs, rnn_units))) else: - return (torch.zeros((num_layers, self.num_seqs, rnn_units)), + return 
(torch.zeros((num_layers, self.num_seqs, rnn_units)), torch.zeros((num_layers, self.num_seqs, rnn_units))) else: if self.separate: - return (torch.zeros((num_layers, self.num_seqs, rnn_units)), + return (torch.zeros((num_layers, self.num_seqs, rnn_units)), torch.zeros((num_layers, self.num_seqs, rnn_units))) else: - return (torch.zeros((num_layers, self.num_seqs, rnn_units)),) + return (torch.zeros((num_layers, self.num_seqs, rnn_units)),) def load(self, params): self.separate = params.get('separate', False) @@ -520,9 +521,9 @@ def load(self, params): self.joint_obs_actions_config = params.get('joint_obs_actions', None) if self.has_space: - self.is_multi_discrete = 'multi_discrete'in params['space'] + self.is_multi_discrete = 'multi_discrete' in params['space'] self.is_discrete = 'discrete' in params['space'] - self.is_continuous = 'continuous'in params['space'] + self.is_continuous = 'continuous' in params['space'] if self.is_continuous: self.space_config = params['space']['continuous'] self.fixed_sigma = self.space_config['fixed_sigma'] @@ -555,10 +556,12 @@ def build(self, name, **kwargs): net = A2CBuilder.Network(self.params, **kwargs) return net + class Conv2dAuto(nn.Conv2d): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.padding = (self.kernel_size[0] // 2, self.kernel_size[1] // 2) # dynamic add padding based on the kernel_size + # dynamic add padding based on the kernel_size + self.padding = (self.kernel_size[0] // 2, self.kernel_size[1] // 2) class ConvBlock(nn.Module): @@ -579,15 +582,18 @@ def forward(self, x): class ResidualBlock(nn.Module): def __init__(self, channels, activation='relu', use_bn=False, use_zero_init=False, use_attention=False): super().__init__() - self.use_zero_init=use_zero_init + self.use_zero_init = use_zero_init self.use_attention = use_attention + if use_zero_init: self.alpha = nn.Parameter(torch.zeros(1)) + self.activation = activation self.conv1 = ConvBlock(channels, channels, use_bn) self.conv2 = ConvBlock(channels, channels, use_bn) self.activate1 = nn.ReLU() self.activate2 = nn.ReLU() + if use_attention: self.ca = ChannelAttention(channels) self.sa = SpatialAttention() @@ -623,6 +629,7 @@ def forward(self, x): x = self.res_block2(x) return x + class A2CResnetBuilder(NetworkBuilder): def __init__(self, **kwargs): NetworkBuilder.__init__(self) @@ -655,10 +662,10 @@ def __init__(self, params, **kwargs): if self.has_rnn: if not self.is_rnn_before_mlp: - rnn_in_size = out_size + rnn_in_size = out_size out_size = self.rnn_units else: - rnn_in_size = mlp_input_size + rnn_in_size = mlp_input_size mlp_input_size = self.rnn_units if self.require_rewards: @@ -667,12 +674,12 @@ def __init__(self, params, **kwargs): rnn_in_size += actions_num self.rnn = self._build_rnn(self.rnn_name, rnn_in_size, self.rnn_units, self.rnn_layers) - #self.layer_norm = torch.nn.LayerNorm(self.rnn_units) + # self.layer_norm = torch.nn.LayerNorm(self.rnn_units) mlp_args = { 'input_size' : mlp_input_size, - 'units' :self.units, - 'activation' : self.activation, + 'units' : self.units, + 'activation' : self.activation, 'norm_func_name' : self.normalization, 'dense_func' : torch.nn.Linear } @@ -687,9 +694,9 @@ def __init__(self, params, **kwargs): self.logits = torch.nn.Linear(out_size, actions_num) if self.is_continuous: self.mu = torch.nn.Linear(out_size, actions_num) - self.mu_act = self.activations_factory.create(self.space_config['mu_activation']) + self.mu_act = self.activations_factory.create(self.space_config['mu_activation']) mu_init = 
self.init_factory.create(**self.space_config['mu_init']) - self.sigma_act = self.activations_factory.create(self.space_config['sigma_activation']) + self.sigma_act = self.activations_factory.create(self.space_config['sigma_activation']) sigma_init = self.init_factory.create(**self.space_config['sigma_init']) if self.fixed_sigma: @@ -716,7 +723,7 @@ def __init__(self, params, **kwargs): else: sigma_init(self.sigma.weight) - mlp_init(self.value.weight) + mlp_init(self.value.weight) def forward(self, obs_dict): if self.require_rewards or self.require_last_actions: @@ -740,7 +747,7 @@ def forward(self, obs_dict): out = self.flatten_act(out) if self.has_rnn: - #seq_length = obs_dict['seq_length'] + # seq_length = obs_dict['seq_length'] seq_length = obs_dict.get('seq_length', 1) out_in = out @@ -799,7 +806,7 @@ def load(self, params): self.initializer = params['mlp']['initializer'] self.is_discrete = 'discrete' in params['space'] self.is_continuous = 'continuous' in params['space'] - self.is_multi_discrete = 'multi_discrete'in params['space'] + self.is_multi_discrete = 'multi_discrete' in params['space'] self.value_activation = params.get('value_activation', 'None') self.normalization = params.get('normalization', None) @@ -827,7 +834,7 @@ def load(self, params): def _build_impala(self, input_shape, depths): in_channels = input_shape[0] - layers = nn.ModuleList() + layers = nn.ModuleList() for d in depths: layers.append(ImpalaSequential(in_channels, d)) in_channels = d @@ -842,10 +849,10 @@ def is_rnn(self): def get_default_rnn_state(self): num_layers = self.rnn_layers if self.rnn_name == 'lstm': - return (torch.zeros((num_layers, self.num_seqs, self.rnn_units)), - torch.zeros((num_layers, self.num_seqs, self.rnn_units))) + return (torch.zeros((num_layers, self.num_seqs, self.rnn_units)), + torch.zeros((num_layers, self.num_seqs, self.rnn_units))) else: - return (torch.zeros((num_layers, self.num_seqs, self.rnn_units))) + return (torch.zeros((num_layers, self.num_seqs, self.rnn_units))) def build(self, name, **kwargs): net = A2CResnetBuilder.Network(self.params, **kwargs) @@ -926,9 +933,9 @@ def __init__(self, params, **kwargs): self.load(params) actor_mlp_args = { - 'input_size' : obs_dim, - 'units' : self.units, - 'activation' : self.activation, + 'input_size' : obs_dim, + 'units' : self.units, + 'activation' : self.activation, 'norm_func_name' : self.normalization, 'dense_func' : torch.nn.Linear, 'd2rl' : self.is_d2rl, @@ -936,9 +943,9 @@ def __init__(self, params, **kwargs): } critic_mlp_args = { - 'input_size' : obs_dim + action_dim, - 'units' : self.units, - 'activation' : self.activation, + 'input_size' : obs_dim + action_dim, + 'units' : self.units, + 'activation' : self.activation, 'norm_func_name' : self.normalization, 'dense_func' : torch.nn.Linear, 'd2rl' : self.is_d2rl, @@ -952,7 +959,7 @@ def __init__(self, params, **kwargs): self.critic = self._build_critic(1, **critic_mlp_args) print("Building Critic Target") self.critic_target = self._build_critic(1, **critic_mlp_args) - self.critic_target.load_state_dict(self.critic.state_dict()) + self.critic_target.load_state_dict(self.critic.state_dict()) mlp_init = self.init_factory.create(**self.initializer) for m in self.modules(): @@ -976,7 +983,7 @@ def forward(self, obs_dict): obs = obs_dict['obs'] mu, sigma = self.actor(obs) return mu, sigma - + def is_separate_critic(self): return self.separate @@ -997,7 +1004,7 @@ def load(self, params): if self.has_space: self.is_discrete = 'discrete' in params['space'] - self.is_continuous = 
'continuous'in params['space'] + self.is_continuous = 'continuous' in params['space'] if self.is_continuous: self.space_config = params['space']['continuous'] elif self.is_discrete: @@ -1005,4 +1012,3 @@ def load(self, params): else: self.is_discrete = False self.is_continuous = False - diff --git a/rl_games/common/env_configurations.py b/rl_games/common/env_configurations.py index 08170847..06f73995 100644 --- a/rl_games/common/env_configurations.py +++ b/rl_games/common/env_configurations.py @@ -10,7 +10,6 @@ import math - class HCRewardEnv(gym.RewardWrapper): def __init__(self, env): gym.RewardWrapper.__init__(self, env) @@ -34,8 +33,6 @@ def step(self, action): return observation, reward, done, info - - class DMControlObsWrapper(gym.ObservationWrapper): def __init__(self, env): gym.RewardWrapper.__init__(self, env) @@ -96,15 +93,15 @@ def create_myo(**kwargs): def create_atari_gym_env(**kwargs): #frames = kwargs.pop('frames', 1) name = kwargs.pop('name') - skip = kwargs.pop('skip',4) - episode_life = kwargs.pop('episode_life',True) + skip = kwargs.pop('skip', 4) + episode_life = kwargs.pop('episode_life', True) wrap_impala = kwargs.pop('wrap_impala', False) - env = wrappers.make_atari_deepmind(name, skip=skip,episode_life=episode_life, wrap_impala=wrap_impala, **kwargs) - return env + env = wrappers.make_atari_deepmind(name, skip=skip, episode_life=episode_life, wrap_impala=wrap_impala, **kwargs) + return env def create_dm_control_env(**kwargs): frames = kwargs.pop('frames', 1) - name = 'dm2gym:'+ kwargs.pop('name') + name = 'dm2gym:' + kwargs.pop('name') env = gym.make(name, environment_kwargs=kwargs) env = DMControlWrapper(env) env = DMControlObsWrapper(env) @@ -140,11 +137,11 @@ def create_super_mario_env_stage1(name='SuperMarioBrosRandomStage1-v1'): env = gym_super_mario_bros.make(stage_names[1]) env = JoypadSpace(env, SIMPLE_MOVEMENT) - + env = wrappers.MaxAndSkipEnv(env, skip=4) env = wrappers.wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=True) #env = wrappers.AllowBacktracking(env) - + return env def create_quadrupped_env(): @@ -166,8 +163,7 @@ def create_smac(name, **kwargs): has_cv = kwargs.get('central_value', False) as_single_agent = kwargs.pop('as_single_agent', False) env = SMACEnv(name, **kwargs) - - + if frames > 1: if has_cv: env = wrappers.BatchedFrameStackWithStates(env, frames, transpose=False, flatten=flatten) @@ -185,7 +181,7 @@ def create_smac_v2(name, **kwargs): flatten = kwargs.pop('flatten', True) has_cv = kwargs.get('central_value', False) env = SMACEnvV2(name, **kwargs) - + if frames > 1: if has_cv: env = wrappers.BatchedFrameStackWithStates(env, frames, transpose=False, flatten=flatten) @@ -217,7 +213,6 @@ def create_minigrid_env(name, **kwargs): import gym_minigrid import gym_minigrid.wrappers - state_bonus = kwargs.pop('state_bonus', False) action_bonus = kwargs.pop('action_bonus', False) rgb_fully_obs = kwargs.pop('rgb_fully_obs', False) @@ -225,7 +220,6 @@ def create_minigrid_env(name, **kwargs): view_size = kwargs.pop('view_size', 3) env = gym.make(name, **kwargs) - if state_bonus: env = gym_minigrid.wrappers.StateBonus(env) if action_bonus: @@ -243,7 +237,7 @@ def create_minigrid_env(name, **kwargs): def create_multiwalker_env(**kwargs): from rl_games.envs.multiwalker import MultiWalker - env = MultiWalker('', **kwargs) + env = MultiWalker('', **kwargs) return env @@ -290,19 +284,19 @@ def create_env(name, **kwargs): 'vecenv_type' : 'RAY' }, 'PongNoFrameskip-v4' : { - 'env_creator' : lambda **kwargs : 
wrappers.make_atari_deepmind('PongNoFrameskip-v4', skip=4), + 'env_creator' : lambda **kwargs : wrappers.make_atari_deepmind('PongNoFrameskip-v4', skip=4), 'vecenv_type' : 'RAY' }, 'BreakoutNoFrameskip-v4' : { - 'env_creator' : lambda **kwargs : wrappers.make_atari_deepmind('BreakoutNoFrameskip-v4', skip=4,sticky=False), + 'env_creator' : lambda **kwargs : wrappers.make_atari_deepmind('BreakoutNoFrameskip-v4', skip=4, sticky=False), 'vecenv_type' : 'RAY' }, 'MsPacmanNoFrameskip-v4' : { - 'env_creator' : lambda **kwargs : wrappers.make_atari_deepmind('MsPacmanNoFrameskip-v4', skip=4), + 'env_creator' : lambda **kwargs : wrappers.make_atari_deepmind('MsPacmanNoFrameskip-v4', skip=4), 'vecenv_type' : 'RAY' }, 'CarRacing-v0' : { - 'env_creator' : lambda **kwargs : wrappers.make_car_racing('CarRacing-v0', skip=4), + 'env_creator' : lambda **kwargs : wrappers.make_car_racing('CarRacing-v0', skip=4), 'vecenv_type' : 'RAY' }, 'RoboschoolAnt-v1' : { @@ -310,19 +304,19 @@ def create_env(name, **kwargs): 'vecenv_type' : 'RAY' }, 'SuperMarioBros-v1' : { - 'env_creator' : lambda : create_super_mario_env(), + 'env_creator' : lambda : create_super_mario_env(), 'vecenv_type' : 'RAY' }, 'SuperMarioBrosRandomStages-v1' : { - 'env_creator' : lambda : create_super_mario_env('SuperMarioBrosRandomStages-v1'), + 'env_creator' : lambda : create_super_mario_env('SuperMarioBrosRandomStages-v1'), 'vecenv_type' : 'RAY' }, 'SuperMarioBrosRandomStage1-v1' : { - 'env_creator' : lambda **kwargs : create_super_mario_env_stage1('SuperMarioBrosRandomStage1-v1'), + 'env_creator' : lambda **kwargs : create_super_mario_env_stage1('SuperMarioBrosRandomStage1-v1'), 'vecenv_type' : 'RAY' }, 'RoboschoolHalfCheetah-v1' : { - 'env_creator' : lambda **kwargs : create_roboschool_env('RoboschoolHalfCheetah-v1'), + 'env_creator' : lambda **kwargs : create_roboschool_env('RoboschoolHalfCheetah-v1'), 'vecenv_type' : 'RAY' }, 'RoboschoolHumanoid-v1' : { @@ -330,27 +324,27 @@ def create_env(name, **kwargs): 'vecenv_type' : 'RAY' }, 'LunarLanderContinuous-v2' : { - 'env_creator' : lambda **kwargs : gym.make('LunarLanderContinuous-v2'), + 'env_creator' : lambda **kwargs : gym.make('LunarLanderContinuous-v2'), 'vecenv_type' : 'RAY' }, 'RoboschoolHumanoidFlagrun-v1' : { - 'env_creator' : lambda **kwargs : wrappers.FrameStack(create_roboschool_env('RoboschoolHumanoidFlagrun-v1'), 1, True), + 'env_creator' : lambda **kwargs : wrappers.FrameStack(create_roboschool_env('RoboschoolHumanoidFlagrun-v1'), 1, True), 'vecenv_type' : 'RAY' }, 'BipedalWalker-v3' : { - 'env_creator' : lambda **kwargs : create_env('BipedalWalker-v3', **kwargs), + 'env_creator' : lambda **kwargs : create_env('BipedalWalker-v3', **kwargs), 'vecenv_type' : 'RAY' }, 'BipedalWalkerCnn-v3' : { - 'env_creator' : lambda **kwargs : wrappers.FrameStack(HCRewardEnv(gym.make('BipedalWalker-v3')), 4, False), + 'env_creator' : lambda **kwargs : wrappers.FrameStack(HCRewardEnv(gym.make('BipedalWalker-v3')), 4, False), 'vecenv_type' : 'RAY' }, 'BipedalWalkerHardcore-v3' : { - 'env_creator' : lambda **kwargs : gym.make('BipedalWalkerHardcore-v3'), + 'env_creator' : lambda **kwargs : gym.make('BipedalWalkerHardcore-v3'), 'vecenv_type' : 'RAY' }, 'ReacherPyBulletEnv-v0' : { - 'env_creator' : lambda **kwargs : create_roboschool_env('ReacherPyBulletEnv-v0'), + 'env_creator' : lambda **kwargs : create_roboschool_env('ReacherPyBulletEnv-v0'), 'vecenv_type' : 'RAY' }, 'BipedalWalkerHardcoreCnn-v3' : { @@ -358,19 +352,19 @@ def create_env(name, **kwargs): 'vecenv_type' : 'RAY' }, 'QuadruppedWalk-v1' 
: { - 'env_creator' : lambda **kwargs : create_quadrupped_env(), + 'env_creator' : lambda **kwargs : create_quadrupped_env(), 'vecenv_type' : 'RAY' }, 'FlexAnt' : { - 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/ant.yaml'), + 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/ant.yaml'), 'vecenv_type' : 'ISAAC' }, 'FlexHumanoid' : { - 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid.yaml'), + 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid.yaml'), 'vecenv_type' : 'ISAAC' }, 'FlexHumanoidHard' : { - 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid_hard.yaml'), + 'env_creator' : lambda **kwargs : create_flex(FLEX_PATH + '/demo/gym/cfg/humanoid_hard.yaml'), 'vecenv_type' : 'ISAAC' }, 'smac' : { @@ -423,7 +417,7 @@ def create_env(name, **kwargs): }, 'brax' : { 'env_creator': lambda **kwargs: create_brax_env(**kwargs), - 'vecenv_type': 'BRAX' + 'vecenv_type': 'BRAX' }, 'envpool': { 'env_creator': lambda **kwargs: create_envpool(**kwargs), @@ -439,6 +433,7 @@ def create_env(name, **kwargs): }, } + def get_env_info(env): result_shapes = {} result_shapes['observation_space'] = env.observation_space @@ -450,16 +445,17 @@ def get_env_info(env): ''' if isinstance(result_shapes['observation_space'], gym.spaces.dict.Dict): result_shapes['observation_space'] = observation_space['observations'] - + if isinstance(result_shapes['observation_space'], dict): result_shapes['observation_space'] = observation_space['observations'] result_shapes['state_space'] = observation_space['states'] ''' - if hasattr(env, "value_size"): + if hasattr(env, "value_size"): result_shapes['value_size'] = env.value_size print(result_shapes) return result_shapes + def get_obs_and_action_spaces_from_config(config): env_config = config.get('env_config', {}) env = configurations[config['env_name']]['env_creator'](**env_config) @@ -476,4 +472,4 @@ def register(name, config): config (:obj:`dict`): Dictionary with env type and a creator function. 
""" - configurations[name] = config \ No newline at end of file + configurations[name] = config diff --git a/rl_games/common/player.py b/rl_games/common/player.py index 98be6501..202a9fa9 100644 --- a/rl_games/common/player.py +++ b/rl_games/common/player.py @@ -62,7 +62,7 @@ def __init__(self, params): 'central_value_config') is not None self.device_name = self.config.get('device_name', 'cuda') self.render_env = self.player_config.get('render', False) - self.games_num = self.player_config.get('games_num', 2000) + self.games_num = self.player_config.get('games_num', 1000000000) if 'deterministic' in self.player_config: self.is_deterministic = self.player_config['deterministic'] @@ -73,7 +73,7 @@ def __init__(self, params): self.n_game_life = self.player_config.get('n_game_life', 1) self.print_stats = self.player_config.get('print_stats', True) self.render_sleep = self.player_config.get('render_sleep', 0.002) - self.max_steps = 108000 // 4 + self.max_steps = self.player_config.get('max_steps', 100000000) self.device = torch.device(self.device_name) self.evaluation = self.player_config.get("evaluation", False) @@ -361,15 +361,15 @@ def run(self): game_res = info.get('scores', 0.5) if self.print_stats: - cur_rewards_done = cur_rewards/done_count - cur_steps_done = cur_steps/done_count + cur_rewards_done = cur_rewards / done_count + cur_steps_done = cur_steps / done_count if print_game_res: print(f'reward: {cur_rewards_done:.2f} steps: {cur_steps_done:.1f} w: {game_res}') else: print(f'reward: {cur_rewards_done:.2f} steps: {cur_steps_done:.1f}') sum_game_res += game_res - if batch_size//self.num_agents == 1 or games_played >= n_games: + if batch_size // self.num_agents == 1 or games_played >= n_games: break print(sum_rewards) diff --git a/rl_games/common/vecenv.py b/rl_games/common/vecenv.py index c29fd4be..5eebd4e5 100644 --- a/rl_games/common/vecenv.py +++ b/rl_games/common/vecenv.py @@ -7,6 +7,7 @@ from time import sleep import torch + class RayWorker: """Wrapper around a third-party (gym for example) environment class that enables parallel training. @@ -47,7 +48,7 @@ def step(self, action): """ next_state, reward, is_done, info = self.env.step(action) - + if np.isscalar(is_done): episode_done = is_done else: @@ -64,7 +65,7 @@ def seed(self, seed): np.random.seed(seed) random.seed(seed) self.env.seed(seed) - + def render(self): self.env.render() @@ -93,13 +94,9 @@ def can_concat_infos(self): def get_env_info(self): info = {} - observation_space = self.env.observation_space - - #if isinstance(observation_space, gym.spaces.dict.Dict): - # observation_space = observation_space['observations'] info['action_space'] = self.env.action_space - info['observation_space'] = observation_space + info['observation_space'] = self.env.observation_space info['state_space'] = None info['use_global_observations'] = False info['agents'] = self.get_number_of_agents() @@ -115,12 +112,16 @@ def get_env_info(self): class RayVecEnv(IVecEnv): """Main env class that manages several `rl_games.common.vecenv.Rayworker` objects for parallel training - + The RayVecEnv class manages a set of individual environments and wraps around the methods from RayWorker. Each worker is executed asynchronously. """ - import ray + # Import Ray only when RayVecEnv is used + try: + import ray + except ImportError: + pass def __init__(self, config_name, num_actors, **kwargs): """Initialise the class. Sets up the config for the environment and creates individual workers to manage. 
@@ -135,8 +136,6 @@ def __init__(self, config_name, num_actors, **kwargs): self.num_actors = num_actors self.use_torch = False self.seed = kwargs.pop('seed', None) - - self.remote_worker = self.ray.remote(RayWorker) self.workers = [self.remote_worker.remote(self.config_name, kwargs) for i in range(self.num_actors)] @@ -162,7 +161,7 @@ def __init__(self, config_name, num_actors, **kwargs): self.concat_func = np.stack else: self.concat_func = np.concatenate - + def step(self, actions): """Step all individual environments (using the created workers). Returns a concatenated array of observations, rewards, done states, and infos if the env allows concatenation. @@ -201,7 +200,7 @@ def step(self, actions): if self.use_global_obs: newobsdict = {} newobsdict["obs"] = ret_obs - + if self.state_type_dict: newobsdict["states"] = dicts_to_dict_with_arrays(newstates, True) else: @@ -248,7 +247,7 @@ def reset(self): if self.use_global_obs: newobsdict = {} newobsdict["obs"] = ret_obs - + if self.state_type_dict: newobsdict["states"] = dicts_to_dict_with_arrays(newstates, True) else: @@ -256,8 +255,10 @@ def reset(self): ret_obs = newobsdict return ret_obs + vecenv_config = {} + def register(config_name, func): """Add an environment type (for example RayVecEnv) to the list of available types `rl_games.common.vecenv.vecenv_config` Args: @@ -267,10 +268,12 @@ def register(config_name, func): """ vecenv_config[config_name] = func + def create_vec_env(config_name, num_actors, **kwargs): vec_env_name = configurations[config_name]['vecenv_type'] return vecenv_config[vec_env_name](config_name, num_actors, **kwargs) + register('RAY', lambda config_name, num_actors, **kwargs: RayVecEnv(config_name, num_actors, **kwargs)) from rl_games.envs.brax import BraxEnv diff --git a/rl_games/common/wrappers.py b/rl_games/common/wrappers.py index dab4a648..a4026ef7 100644 --- a/rl_games/common/wrappers.py +++ b/rl_games/common/wrappers.py @@ -1,4 +1,3 @@ -import gymnasium import numpy as np from numpy.random import randint @@ -11,12 +10,11 @@ from copy import copy - class InfoWrapper(gym.Wrapper): def __init__(self, env): gym.RewardWrapper.__init__(self, env) - self.reward = 0 + def reset(self, **kwargs): self.reward = 0 return self.env.reset(**kwargs) @@ -87,7 +85,7 @@ def __init__(self, env): """ gym.Wrapper.__init__(self, env) self.lives = 0 - self.was_real_done = True + self.was_real_done = True def step(self, action): obs, reward, done, info = self.env.step(action) @@ -122,7 +120,7 @@ def __init__(self, env): gym.Wrapper.__init__(self, env) self.max_stacked_steps = 1000 - self.current_steps=0 + self.current_steps = 0 def step(self, action): obs, reward, done, info = self.env.step(action) @@ -140,7 +138,7 @@ def step(self, action): class MaxAndSkipEnv(gym.Wrapper): - def __init__(self, env,skip=4, use_max = True): + def __init__(self, env, skip=4, use_max=True): """Return only every `skip`-th frame""" gym.Wrapper.__init__(self, env) self.use_max = use_max @@ -150,7 +148,7 @@ def __init__(self, env,skip=4, use_max = True): else: self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.float32) self._skip = skip - + def step(self, action): """Repeat action, sum reward, and max over last observations.""" total_reward = 0.0 @@ -211,8 +209,9 @@ def observation(self, frame): frame = np.expand_dims(frame, -1) return frame + class FrameStack(gym.Wrapper): - def __init__(self, env, k, flat = False): + def __init__(self, env, k, flat=False): """ Stack k last frames. 
Returns lazy array, which is much more memory efficient. @@ -262,7 +261,7 @@ def _get_ob(self): class BatchedFrameStack(gym.Wrapper): - def __init__(self, env, k, transpose = False, flatten = False): + def __init__(self, env, k, transpose=False, flatten=False): gym.Wrapper.__init__(self, env) self.k = k self.frames = deque([], maxlen=k) @@ -303,8 +302,9 @@ def _get_ob(self): frames = np.transpose(self.frames, (1, 0, 2)) return frames + class BatchedFrameStackWithStates(gym.Wrapper): - def __init__(self, env, k, transpose = False, flatten = False): + def __init__(self, env, k, transpose=False, flatten=False): gym.Wrapper.__init__(self, env) self.k = k self.obses = deque([], maxlen=k) @@ -363,14 +363,15 @@ def process_data(self, data): obses = np.transpose(data, (1, 0, 2)) return obses + class ProcgenStack(gym.Wrapper): - def __init__(self, env, k = 2, greyscale=True): + def __init__(self, env, k=2, greyscale=True): gym.Wrapper.__init__(self, env) self.k = k self.curr_frame = 0 self.frames = deque([], maxlen=k) - self.greyscale=greyscale + self.greyscale = greyscale self.prev_frame = None shp = env.observation_space.shape if greyscale: @@ -421,6 +422,7 @@ def observation(self, observation): # with smaller replay buffers only. return np.array(observation).astype(np.float32) / 255.0 + class LazyFrames(object): def __init__(self, frames): """This object ensures that common frames between the observations are only stored once. @@ -449,6 +451,7 @@ def __len__(self): def __getitem__(self, i): return self._force()[i] + class ReallyDoneWrapper(gym.Wrapper): def __init__(self, env): """ @@ -471,6 +474,7 @@ def step(self, action): done = lives == 0 return obs, reward, done, info + class AllowBacktracking(gym.Wrapper): """ Use deltas in max(X) as the reward, rather than deltas @@ -506,6 +510,7 @@ def unwrap(env): else: return env + class StickyActionEnv(gym.Wrapper): def __init__(self, env, p=0.25): super(StickyActionEnv, self).__init__(env) @@ -591,7 +596,7 @@ def step(self, action): obs, reward, done, info = self.env.step(action) obs = { 'observation': obs, - 'reward':np.clip(reward, -1, 1), + 'reward': np.clip(reward, -1, 1), 'last_action': action } return obs, reward, done, info @@ -625,7 +630,8 @@ def __init__(self, env, name): raise NotImplementedError def observation(self, observation): - return observation * self.mask + return observation * self.mask + class OldGymWrapper(gym.Env): def __init__(self, env): @@ -636,6 +642,8 @@ def __init__(self, env): self.action_space = self.convert_space(env.action_space) def convert_space(self, space): + import gymnasium + """Recursively convert Gymnasium spaces to Gym spaces.""" if isinstance(space, gymnasium.spaces.Box): return gym.spaces.Box( @@ -686,14 +694,19 @@ def step(self, action): return observation, reward, done, info def render(self, mode='human'): - return self.env.render(mode=mode) + # Fix to allow rendering in the old Gym API using Mujoco's render method + # return self.env.render(mode=mode) + return self.env.mj_render() def close(self): return self.env.close() + # Example usage: if __name__ == "__main__": # Create a MyoSuite environment + import myosuite + env = myosuite.make('myoChallengeDieReorientP2-v0') # Wrap it with the old Gym-style wrapper @@ -714,12 +727,13 @@ def close(self): def make_atari(env_id, timelimit=True, noop_max=0, skip=4, sticky=False, directory=None, **kwargs): env = gym.make(env_id, **kwargs) + if 'Montezuma' in env_id: env = MontezumaInfoWrapper(env, room_address=3 if 'Montezuma' in env_id else 1) env = 
StickyActionEnv(env) env = InfoWrapper(env) - if directory != None: - env = gym.wrappers.Monitor(env,directory=directory,force=True) + if directory is not None: + env = gym.wrappers.Monitor(env, directory=directory, force=True) if sticky: env = StickyActionEnv(env) if not timelimit: @@ -731,6 +745,7 @@ def make_atari(env_id, timelimit=True, noop_max=0, skip=4, sticky=False, directo #env = EpisodeStackedEnv(env) return env + def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True, scale =False, wrap_impala=False): """Configure environment for DeepMind-style Atari. """ @@ -749,6 +764,7 @@ def wrap_deepmind(env, episode_life=False, clip_rewards=True, frame_stack=True, env = ImpalaEnvWrapper(env) return env + def wrap_carracing(env, clip_rewards=True, frame_stack=True, scale=False): """Configure environment for DeepMind-style Atari. """ @@ -761,11 +777,12 @@ def wrap_carracing(env, clip_rewards=True, frame_stack=True, scale=False): env = FrameStack(env, 4) return env + def make_car_racing(env_id, skip=4): env = make_atari(env_id, noop_max=0, skip=skip) return wrap_carracing(env, clip_rewards=False) + def make_atari_deepmind(env_id, noop_max=30, skip=4, sticky=False, episode_life=True, wrap_impala=False, **kwargs): env = make_atari(env_id, noop_max=noop_max, skip=skip, sticky=sticky, **kwargs) return wrap_deepmind(env, episode_life=episode_life, clip_rewards=False, wrap_impala=wrap_impala) - diff --git a/rl_games/configs/myosuite/myo_hand_reorient.yaml b/rl_games/configs/myosuite/myo_hand_reorient.yaml new file mode 100644 index 00000000..4f44a873 --- /dev/null +++ b/rl_games/configs/myosuite/myo_hand_reorient.yaml @@ -0,0 +1,68 @@ +params: + seed: 5 + algo: + name: a2c_continuous + + model: + name: continuous_a2c_logstd + + network: + name: actor_critic + separate: False + space: + continuous: + mu_activation: None + sigma_activation: None + mu_init: + name: default + sigma_init: + name: const_initializer + val: 0 + fixed_sigma: True + mlp: + units: [512, 256, 128] + activation: elu + initializer: + name: default + + config: + name: MyoHandReorient8 + env_name: myosuite_gym + #score_to_win: 20000 + mixed_precision: True + normalize_input: True + normalize_value: True + value_bootstrap: True + reward_shaper: + scale_value: 1 + #shift_value: 1 + log_val: False + normalize_advantage: True + gamma: 0.99 + tau: 0.95 + learning_rate: 3e-4 + lr_schedule: adaptive + kl_threshold: 0.008 + grad_norm: 1.0 + entropy_coef: 0.0 + truncate_grads: True + e_clip: 0.2 + clip_value: True + use_smooth_clamp: False + bound_loss_type: regularisation + bounds_loss_coef: 0.001 + max_epochs: 10000 + #max_frames: 100_000_000 + num_actors: 2 + horizon_length: 128 + minibatch_size: 128 #2048 + mini_epochs: 5 + critic_coef: 4 + use_diagnostics: False + env_config: + env_name: myoFingerPoseRandom-v0 #myoHandReorient8-v0 + flatten_obs: True + + player: + render: False + \ No newline at end of file diff --git a/rl_games/configs/ppo_myo.yaml b/rl_games/configs/myosuite/ppo_myo_elbow.yaml similarity index 73% rename from rl_games/configs/ppo_myo.yaml rename to rl_games/configs/myosuite/ppo_myo_elbow.yaml index 297a014b..4772cb4e 100644 --- a/rl_games/configs/ppo_myo.yaml +++ b/rl_games/configs/myosuite/ppo_myo_elbow.yaml @@ -1,5 +1,5 @@ params: - seed: 8 + seed: 42 algo: name: a2c_continuous @@ -20,7 +20,7 @@ params: val: 0 fixed_sigma: True mlp: - units: [256,128,64] + units: [256, 128, 64] d2rl: False activation: elu initializer: @@ -28,41 +28,41 @@ params: scale: 2 config: env_name: myo_gym - name: 
myo + name: MyoElbowPose1D6MRandom reward_shaper: - min_val: -1 - scale_value: 0.1 - + scale_value: 1.0 + #shift_value: 1.0 + #min_val: 0 + log_val: False + mixed_precision: True + normalize_input: True + normalize_value: True + value_bootstrap: True normalize_advantage: True - gamma: 0.995 + gamma: 0.99 tau: 0.95 learning_rate: 3e-4 lr_schedule: adaptive kl_threshold: 0.008 - save_best_after: 10 - score_to_win: 10000 + save_best_after: 50 grad_norm: 1.5 entropy_coef: 0 truncate_grads: True e_clip: 0.2 clip_value: False - num_actors: 16 + num_actors: 32 horizon_length: 128 - minibatch_size: 1024 + minibatch_size: 2048 mini_epochs: 4 critic_coef: 2 - normalize_input: True - bounds_loss_coef: 0.00 - max_epochs: 10000 - normalize_value: True + bounds_loss_coef: 0.001 + max_epochs: 1000 use_diagnostics: True - value_bootstrap: True - #weight_decay: 0.0001 + weight_decay: 0.0 use_smooth_clamp: True env_config: - name: 'myoElbowPose1D6MRandom-v0' + name: myoElbowPose1D6MRandom-v0 player: - render: True deterministic: True games_num: 200 diff --git a/rl_games/configs/myosuite/ppo_myo_hand_pose.yaml b/rl_games/configs/myosuite/ppo_myo_hand_pose.yaml new file mode 100644 index 00000000..a4df31d9 --- /dev/null +++ b/rl_games/configs/myosuite/ppo_myo_hand_pose.yaml @@ -0,0 +1,76 @@ +params: + seed: 8 + algo: + name: a2c_continuous + + model: + name: continuous_a2c_logstd + + network: + name: actor_critic + separate: False + space: + continuous: + mu_activation: None + sigma_activation: None + mu_init: + name: default + sigma_init: + name: const_initializer + val: 0 + fixed_sigma: True + rnn: + before_mlp: True + name: lstm + units: 512 + layers: 1 + layer_norm: True + concat_output: True + mlp: + units: [256, 128] + d2rl: False + activation: elu + initializer: + name: default + scale: 2 + config: + env_name: myo_gym + name: MyoHandPoseRandomLSTM + reward_shaper: + scale_value: 1.0 + #shift_value: 1.0 + #min_val: 0 + log_val: False + mixed_precision: True + normalize_input: True + normalize_value: True + value_bootstrap: True + normalize_advantage: True + gamma: 0.99 + tau: 0.95 + learning_rate: 3e-4 + lr_schedule: adaptive + kl_threshold: 0.008 + save_best_after: 20 + save_frequency: 500 + grad_norm: 1.0 + entropy_coef: 0.0 + truncate_grads: True + e_clip: 0.2 + clip_value: False + num_actors: 32 + horizon_length: 256 + minibatch_size: 2048 + mini_epochs: 5 + critic_coef: 2 + bounds_loss_coef: 0.001 + max_epochs: 5000 + use_diagnostics: True + weight_decay: 0.0 + use_smooth_clamp: True + env_config: + name: myoHandPoseRandom-v0 + player: + render: True + deterministic: True + games_num: 200 diff --git a/rl_games/configs/myosuite/ppo_myo_hand_reach.yaml b/rl_games/configs/myosuite/ppo_myo_hand_reach.yaml new file mode 100644 index 00000000..0b349323 --- /dev/null +++ b/rl_games/configs/myosuite/ppo_myo_hand_reach.yaml @@ -0,0 +1,68 @@ +params: + seed: 8 + algo: + name: a2c_continuous + + model: + name: continuous_a2c_logstd + + network: + name: actor_critic + separate: False + space: + continuous: + mu_activation: None + sigma_activation: None + mu_init: + name: default + sigma_init: + name: const_initializer + val: 0 + fixed_sigma: True + mlp: + units: [512, 256, 128] + d2rl: False + activation: elu + initializer: + name: default + scale: 2 + + config: + env_name: myo_gym + name: MyoHandReachRandom + reward_shaper: + scale_value: 1.0 + log_val: False + mixed_precision: True + normalize_input: True + normalize_value: True + value_bootstrap: True + normalize_advantage: True + gamma: 0.99 + tau: 0.95 + 
learning_rate: 3e-4 + lr_schedule: adaptive + kl_threshold: 0.008 + save_best_after: 20 + save_frequency: 500 + grad_norm: 1.0 + entropy_coef: 0.0 + truncate_grads: True + e_clip: 0.2 + clip_value: False + num_actors: 32 + horizon_length: 256 + minibatch_size: 2048 + mini_epochs: 5 + critic_coef: 2 + bounds_loss_coef: 0.001 + max_epochs: 10000 + use_diagnostics: True + weight_decay: 0.0 + use_smooth_clamp: True + env_config: + name: myoHandReachRandom-v0 + player: + render: True + deterministic: True + games_num: 200 diff --git a/rl_games/configs/myosuite/ppo_myo_hand_reorient.yaml b/rl_games/configs/myosuite/ppo_myo_hand_reorient.yaml new file mode 100644 index 00000000..3e0f31ec --- /dev/null +++ b/rl_games/configs/myosuite/ppo_myo_hand_reorient.yaml @@ -0,0 +1,67 @@ +params: + seed: 8 + algo: + name: a2c_continuous + + model: + name: continuous_a2c_logstd + + network: + name: actor_critic + separate: False + space: + continuous: + mu_activation: None + sigma_activation: None + mu_init: + name: default + sigma_init: + name: const_initializer + val: 0 + fixed_sigma: True + mlp: + units: [512, 256, 128] + d2rl: False + activation: elu + initializer: + name: default + scale: 2 + config: + env_name: myo_gym + name: MyoChallengeDieReorientP1 + reward_shaper: + scale_value: 1.0 + log_val: False + mixed_precision: True + normalize_input: True + normalize_value: True + value_bootstrap: True + normalize_advantage: True + gamma: 0.99 + tau: 0.95 + learning_rate: 3e-4 + lr_schedule: adaptive + kl_threshold: 0.008 + save_best_after: 20 + save_frequency: 500 + grad_norm: 1.0 + entropy_coef: 0.0 + truncate_grads: True + e_clip: 0.2 + clip_value: False + num_actors: 32 + horizon_length: 128 + minibatch_size: 2048 + mini_epochs: 5 + critic_coef: 2 + bounds_loss_coef: 0.001 + max_epochs: 5000 + use_diagnostics: True + weight_decay: 0.0 + use_smooth_clamp: True + env_config: + name: myoChallengeDieReorientP1-v0 + player: + render: True + deterministic: True + games_num: 200 diff --git a/rl_games/configs/myosuite/ppo_myo_walk.yaml b/rl_games/configs/myosuite/ppo_myo_walk.yaml new file mode 100644 index 00000000..93a8dd34 --- /dev/null +++ b/rl_games/configs/myosuite/ppo_myo_walk.yaml @@ -0,0 +1,68 @@ +params: + seed: 8 + algo: + name: a2c_continuous + + model: + name: continuous_a2c_logstd + + network: + name: actor_critic + separate: False + space: + continuous: + mu_activation: None + sigma_activation: None + mu_init: + name: default + sigma_init: + name: const_initializer + val: 0 + fixed_sigma: True + mlp: + units: [512, 256, 128] + d2rl: False + activation: elu + initializer: + name: default + scale: 2 + config: + env_name: myo_gym + name: MyoLegWalk + reward_shaper: + scale_value: 1.0 + log_val: False + mixed_precision: True + normalize_input: True + normalize_value: True + value_bootstrap: True + normalize_advantage: True + gamma: 0.99 + tau: 0.95 + learning_rate: 3e-4 + lr_schedule: adaptive + kl_threshold: 0.008 + save_best_after: 20 + save_frequency: 500 + grad_norm: 1.0 + entropy_coef: 0.0 + truncate_grads: True + e_clip: 0.2 + clip_value: False + num_actors: 32 + horizon_length: 256 + minibatch_size: 2048 + mini_epochs: 6 + critic_coef: 2 + bounds_loss_coef: 0.001 + max_epochs: 50000 + use_diagnostics: True + weight_decay: 0.0 + use_smooth_clamp: True + env_config: + name: myoLegWalk-v0 + player: + render: True + deterministic: True + render_sleep: 0.01 + games_num: 1000 diff --git a/rl_games/envs/myosuite.py b/rl_games/envs/myosuite.py new file mode 100644 index 00000000..067b1f33 --- 
/dev/null +++ b/rl_games/envs/myosuite.py @@ -0,0 +1,305 @@ +from rl_games.common.ivecenv import IVecEnv +import numpy as np + +import torch +from typing import Dict + +import gymnasium as gym2 +import gymnasium.spaces.utils +from gymnasium.vector.utils import batch_space +from mani_skill.utils import common  # needed by flatten_state_dict in the wrapper below + + +VecEnvObs = Dict[str, torch.Tensor | Dict[str, torch.Tensor]] + +def _process_obs(self, obs_dict: VecEnvObs) -> torch.Tensor | dict[str, torch.Tensor]: + + # process policy obs + obs = obs_dict["policy"] + + # TODO: add state processing for asymmetric case + # TODO: add clamping? + # currently only the single-GPU case is supported + + if not isinstance(obs, dict): + # clip the observations + obs = torch.clamp(obs, -self._clip_obs, self._clip_obs) + # move the buffer to rl-device + obs = obs.to(device=self._rl_device).clone() + + return obs + else: + # clip the observations + for key in obs.keys(): + obs[key] = torch.clamp(obs[key], -self._clip_obs, self._clip_obs) + # move the buffer to rl-device + obs[key] = obs[key].to(device=self._rl_device).clone() + # TODO: add state processing for asymmetric case + return obs + +def save_images_to_file(images: torch.Tensor, file_path: str): + """Save images to file. + + Args: + images: A tensor of shape (N, H, W, C) containing the images. + file_path: The path to save the images to. + """ + from torchvision.utils import make_grid, save_image + + save_image( + make_grid(torch.swapaxes(images.unsqueeze(1), 1, -1).squeeze(-1), nrow=round(images.shape[0] ** 0.5)), file_path + ) + + +class RlgFlattenRGBDObservationWrapper(gym2.ObservationWrapper): + """ + Flattens the rgbd mode observations into a dictionary with two keys, "camera" and "proprio" + + Args: + rgb (bool): Whether to include rgb images in the observation + depth (bool): Whether to include depth images in the observation + state (bool): Whether to include state data in the observation + + Note that the returned observations will have a "rgbd" or "rgb" or "depth" key depending on the rgb/depth bool flags. 
+ """ + + def __init__(self, env, rgb=True, depth=False, state=True, aux_loss=False) -> None: + from mani_skill.envs.sapien_env import BaseEnv + + self.base_env: BaseEnv = env.unwrapped + self.aux_loss = aux_loss + self.write_image_to_file = False + + super().__init__(env) + self.include_rgb = rgb + self.include_depth = depth + self.include_state = state + new_obs = self.observation(self.base_env._init_raw_obs) + self.base_env.update_obs_space(new_obs) + + def observation(self, observation: Dict): + # print("Observation:", observation.keys()) + # for key, value in observation.items(): + # print(key, value.keys()) + if self.aux_loss: + aux_target = observation['extra']['aux_target'] + del observation['extra']['aux_target'] + # print("Input Obs:", observation.keys()) + # print("Input Obs Agent:", observation['agent'].keys()) + # print("Input Obs Extra:", observation['extra'].keys()) + sensor_data = observation.pop("sensor_data") + del observation["sensor_param"] + #del observation["extra"] + images = [] + for cam_data in sensor_data.values(): + if self.include_rgb: + images.append(cam_data["rgb"]) + if self.include_depth: + images.append(cam_data["depth"]) + images = torch.concat(images, axis=-1) + + if self.write_image_to_file: + save_images_to_file(images.float() / 255.0, f"pickup_cube_{'rgb'}.png") + + # flatten the rest of the data which should just be state data + observation = common.flatten_state_dict(observation, use_torch=True) + + ret = dict() + if self.include_state: + ret["proprio"] = observation + if self.aux_loss: + ret['aux_target'] = aux_target + + if not self.include_rgb and self.include_depth: + ret["camera"] = images.float() / 32768.0 + else: + ret["camera"] = images + + return ret + + +class Maniskill(IVecEnv): + + def __init__(self, config_name, num_envs, **kwargs): + import gym.spaces + import gymnasium + import gymnasium as gym2 + import mani_skill.envs + + # Can be any env_id from the list of Rigid-Body envs: https://maniskill.readthedocs.io/en/latest/tasks/index.html + self.env_name = kwargs.pop('env_name', 'PickCube-v1') # can be one of ['PickCube-v1', 'PegInsertionSide-v1', 'StackCube-v1'] + + # an observation type and space, see https://maniskill.readthedocs.io/en/latest/user_guide/concepts/observation.html for details + self.obs_mode = kwargs.pop('obs_mode', 'state') # can be one of ['pointcloud', 'rgbd', 'state_dict', 'state'] + self.aux_loss = kwargs.pop('aux_loss', False) + + # a controller type / action space, see https://maniskill.readthedocs.io/en/latest/user_guide/concepts/controllers.html for a full list + # can be one of ['pd_ee_delta_pose', 'pd_ee_delta_pos', 'pd_joint_delta_pos', 'arm_pd_joint_pos_vel'] + self.control_mode = kwargs.pop('control_mode', 'pd_ee_delta_pose') #"pd_joint_delta_pos" + + self.reward_mode = kwargs.pop('reward_mode', 'dense') # can be one of ['sparse', 'dense'] + self.robot_uids = "panda" # can be one of ['panda', 'fetch'] + + print("Creating Maniskill env with the following parameters:") + print("env_name:", self.env_name) + print("obs_mode:", self.obs_mode) + print("control_mode:", self.control_mode) + print("reward_mode:", self.reward_mode) + print("robot_uids:", self.robot_uids) + + self.env = gym2.make(self.env_name, + num_envs=num_envs, + # render_mode="rgb_array", + obs_mode=self.obs_mode, + reward_mode=self.reward_mode, + control_mode=self.control_mode, + robot_uids=self.robot_uids, + enable_shadow=True # this makes the default lighting cast shadows + ) + + print("Observation Space Before:", self.env.observation_space) + 
+        policy_obs_space = self.env.unwrapped.single_observation_space
+        print("Observation Space Unwrapped Before:", policy_obs_space)
+
+        # TODO: add pointcloud and depth support
+        use_rgb = self.obs_mode == 'rgbd' or self.obs_mode == 'rgb'
+        use_depth = self.obs_mode == 'rgbd' or self.obs_mode == 'depth'
+        if self.obs_mode == 'rgb' or self.obs_mode == 'rgbd' or self.obs_mode == 'depth':
+            self.env = RlgFlattenRGBDObservationWrapper(self.env, aux_loss=self.aux_loss, rgb=use_rgb, depth=use_depth)
+            policy_obs_space = self.env.unwrapped.single_observation_space
+            print("Observation Space Unwrapped After:", policy_obs_space)
+
+            modified_policy_obs_space = {}
+
+            # Copy existing keys and values, renaming as needed
+            for key, value in policy_obs_space.items():
+                print("Key:", key)
+                print("Value:", value)
+                if key == 'rgb' or key == 'rgbd':
+                    print("RGBD Shape:", value.shape)
+                    print("RGBD Dtype:", value.dtype)
+                    print(value)
+                    self.env.unwrapped.single_observation_space[key].dtype = np.uint8
+                    value.dtype = np.uint8
+                    modified_policy_obs_space['camera'] = value
+                elif key == 'state':
+                    modified_policy_obs_space['proprio'] = value
+                else:
+                    modified_policy_obs_space[key] = value
+
+            print("Observation Space Unwrapped Done:", modified_policy_obs_space)
+
+            policy_obs_space = gymnasium.spaces.Dict(modified_policy_obs_space)
+            print("Observation Space After:", policy_obs_space)
+
+        # from mani_skill.utils.wrappers import RecordEpisode
+        # # to make it look a little more realistic, we will enable shadows which make the default lighting cast shadows
+        # self.env = RecordEpisode(
+        #     self.env,
+        #     "./videos",  # the directory to save replay videos and trajectories to
+        #     # on GPU sim we record intervals, not by single episodes as there are multiple envs
+        #     # each 100 steps a new video is saved
+        #     max_steps_per_video=240
+        # )
+
+        self._clip_obs = 5.0
+
+        self.observation_space = gym.spaces.Dict()
+
+        # TODO: single function
+        if isinstance(policy_obs_space, gymnasium.spaces.Dict):
+            # check if we have a dictionary of observations
+            for key in policy_obs_space.keys():
+                if not isinstance(policy_obs_space[key], gymnasium.spaces.Box):
+                    print("Key:", key)
+                    print("Value:", policy_obs_space[key])
+                    raise NotImplementedError(
+                        f"Dictionary-of-dictionary observations are not supported: '{type(policy_obs_space[key])}'."
+                    )
+
+                val = policy_obs_space[key]
+                if val.dtype == np.float16 or val.dtype == np.float32:
+                    self.observation_space[key] = gym.spaces.Box(-self._clip_obs, self._clip_obs, val.shape, dtype=val.dtype)
+                elif val.dtype == np.int16:
+                    # to fix!!!
+                    # self.observation_space[key] = gym.spaces.Box(-32768, 32767, val.shape, dtype=np.int16)
+                    self.observation_space[key] = gym.spaces.Box(-1.0, 1.0, val.shape, dtype=np.float32)
+                elif policy_obs_space[key].dtype == np.uint8:
+                    self.observation_space[key] = gym.spaces.Box(0, 255, val.shape, dtype=np.uint8)
+        else:
+            self.observation_space = gym.spaces.Box(-self._clip_obs, self._clip_obs, policy_obs_space.shape)
+
+        print("Observation Space:", self.observation_space)
+
+        self._clip_actions = 1.0
+
+        action_space = self.env.unwrapped.single_action_space
+        print("Single action space:", action_space)
+        self.action_space = gym.spaces.Box(-self._clip_actions, self._clip_actions, action_space.shape)
+
+    def step(self, actions):
+        # TODO: use env device
+        # TODO: add reward/observation clamping
+        # TODO: move buffers to rl-device
+        # TODO: move actions to sim-device
+        # actions = actions.detach().clone().to(device=self._sim_device)
+        # clip the actions
+        actions = torch.clamp(actions, -self._clip_actions, self._clip_actions)
+
+        obs_dict, rew, terminated, truncated, extras = self.env.step(actions)
+
+        # move time out information to the extras dict
+        # note: only used when `value_bootstrap` is True in the agent configuration
+        extras["time_outs"] = truncated
+
+        obs_and_states = {'obs': obs_dict}
+
+        # dones = (terminated | truncated)
+        dones = torch.logical_or(terminated, truncated)
+        if dones.any():
+            env_idx = torch.arange(0, self.env.unwrapped.num_envs, device=self.env.unwrapped.device)[dones]  # device=self.device
+            reset_obs, _ = self.env.reset(options=dict(env_idx=env_idx))
+            obs_and_states['obs'] = reset_obs
+
+        # remap extras from "log" to "episode"
+        if "log" in extras:
+            extras["episode"] = extras.pop("log")
+
+        if "success" in extras:
+            extras["successes"] = extras["success"].float().mean()
+
+        return obs_and_states, rew, dones, extras
+
+    def reset(self):
+        obs = self.env.reset()
+        obs_dict = {'obs': obs[0]}
+
+        # if self.obs_mode == 'rgbd':
+        #     obs_dict = maniskill_process_obs(obs_dict)
+
+        # print("obs_dict:", obs_dict.keys())
+        # print("obs_dict['obs']:", obs_dict['obs'].keys())
+        # print("obs_dict['obs']['camera']:", obs_dict['obs']['camera'].shape)
+        # print("obs_dict['obs']['camera']:", obs_dict['obs']['camera'].dtype)
+        # print("obs_dict['obs']['camera']:", obs_dict['obs']['camera'])
+
+        return obs_dict
+
+    def render(self, mode='human'):
+        self.env.render_human()
+
+    def get_number_of_agents(self):
+        return 1
+
+    def get_env_info(self):
+        info = {}
+        info['action_space'] = self.action_space
+        info['observation_space'] = self.observation_space
+        print("info:", info)
+        return info
+
+
+def create_maniskill(**kwargs):
+    print("Creating Maniskill env with the following parameters:")
+    print(kwargs)
+    return Maniskill("", num_envs=kwargs.pop('num_actors', 4), **kwargs)
\ No newline at end of file
diff --git a/runner.py b/runner.py
index 4646e892..6c736df2 100644
--- a/runner.py
+++ b/runner.py
@@ -41,6 +41,7 @@ from rl_games.torch_runner import Runner
 
+# Import Ray only when needed
 try:
     import ray
 except ImportError:
diff --git a/setup.py b/setup.py
index d3c36193..e5d2d7b3 100644
--- a/setup.py
+++ b/setup.py
@@ -1,9 +1,6 @@
 """Setup script for rl_games"""
-import sys
-import os
 import pathlib
-
 from setuptools import setup, find_packages
 
 # The directory containing this file
 HERE = pathlib.Path(__file__).parent
@@ -16,34 +13,33 @@
     long_description=README,
     long_description_content_type="text/markdown",
     url="https://github.com/Denys88/rl_games",
-    #packages=[package for package in find_packages() if package.startswith('rl_games')],
-    packages = ['.','rl_games','docs'],
-    package_data={'rl_games':['*','*/*','*/*/*'],'docs':['*','*/*','*/*/*'],},
+    packages=['.', 'rl_games', 'docs'],
+    package_data={'rl_games': ['*', '*/*', '*/*/*'], 'docs': ['*', '*/*', '*/*/*'], },
     version='1.6.1',
     author='Denys Makoviichuk, Viktor Makoviichuk',
     author_email='trrrrr97@gmail.com, victor.makoviychuk@gmail.com',
     license="MIT",
     classifiers=[
-        "License :: OSI Approved :: MIT License",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10"
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
     ],
-    #packages=["rlg"],
     include_package_data=True,
     install_requires=[
-        # this setup is only for pytorch
-        #
-        'gym>=0.17.2',
-        'torch>=1.7.0',
-        'numpy>=1.16.0',
-        'tensorboard>=1.14.0',
-        'tensorboardX>=1.6',
-        'setproctitle',
-        'psutil',
-        'pyyaml',
-        'watchdog>=2.1.9,<3.0.0',  # for evaluation process
+        'gym>=0.17.2',
+        'torch>=2.0.0',
+        'numpy>=1.16.0',
+        'tensorboard>=1.14.0',
+        'tensorboardX>=1.6',
+        'setproctitle',
+        'psutil',
+        'pyyaml',
+        'watchdog>=2.1.9',  # for evaluation process
     ],
 )
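Editor's note (illustrative, not part of the patch): the sketch below shows how the new ManiSkill vec env added in this diff might be driven directly through create_maniskill. The import path rl_games.envs.maniskill is an assumption, as are the chosen kwargs; ManiSkill 3 and a GPU sim backend are assumed to be installed.

# Usage sketch only -- module path and parameter choices are assumptions.
import torch

from rl_games.envs.maniskill import create_maniskill  # assumed import path

num_actors = 16
env = create_maniskill(
    num_actors=num_actors,            # popped by create_maniskill and passed on as num_envs
    env_name='PickCube-v1',
    obs_mode='state',                 # 'rgb', 'rgbd' or 'depth' would enable RlgFlattenRGBDObservationWrapper
    control_mode='pd_ee_delta_pose',
    reward_mode='dense',
)

info = env.get_env_info()
obs = env.reset()

# zero actions; Maniskill.step clamps them to [-1, 1] before passing them to the sim
actions = torch.zeros((num_actors,) + info['action_space'].shape)
obs, rew, dones, extras = env.step(actions)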