[CI, BugFix] Fix CI warnings and errors #1100

Merged · 12 commits · Apr 27, 2023
19 changes: 19 additions & 0 deletions .circleci/unittest/linux_examples/scripts/run_local.sh
@@ -0,0 +1,19 @@
+ #!/bin/bash
+
+ set -e
+
+ # Read script from line 29
+ filename=".circleci/unittest/linux_examples/scripts/run_test.sh"
+ start_line=29
+ script=$(tail -n +$start_line "$filename")
+
+ # Replace "cuda:0" with "cpu"
+ script="${script//cuda:0/cpu}"
+
+ # Remove any instances of ".circleci/unittest/helpers/coverage_run_parallel.py"
+ script="${script//.circleci\/unittest\/helpers\/coverage_run_parallel.py}"
+ script="${script//coverage combine}"
+ script="${script//coverage xml -i}"
+
+ # Execute the modified script
+ echo "$script" | bash
42 changes: 24 additions & 18 deletions .circleci/unittest/linux_examples/scripts/run_test.sh
@@ -35,7 +35,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/ddpg/ddpg.py
  frames_per_batch=16 \
  num_workers=4 \
  env_per_collector=2 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -47,15 +47,16 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/a2c/a2c.py \
  collector.collector_device=cuda:0 \
  logger.backend= \
  logger.log_interval=4 \
- optim.lr_scheduler=False
+ optim.lr_scheduler=False \
+ optim.device=cuda:0
  python .circleci/unittest/helpers/coverage_run_parallel.py examples/dqn/dqn.py \
  total_frames=48 \
  init_random_frames=10 \
  batch_size=10 \
  frames_per_batch=16 \
  num_workers=4 \
  env_per_collector=2 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -67,7 +68,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/redq/redq.py
  frames_per_batch=16 \
  num_workers=4 \
  env_per_collector=2 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -79,7 +80,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/sac/sac.py \
  frames_per_batch=16 \
  num_workers=4 \
  env_per_collector=2 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -89,6 +90,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/ppo/ppo.py \
  collector.total_frames=48 \
  collector.frames_per_batch=16 \
  collector.collector_device=cuda:0 \
+ optim.device=cuda:0 \
  loss.mini_batch_size=10 \
  loss.ppo_epochs=1 \
  logger.backend= \
@@ -101,7 +103,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/dreamer/drea
  frames_per_batch=200 \
  num_workers=4 \
  env_per_collector=2 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -114,16 +116,17 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/td3/td3.py \
  frames_per_batch=16 \
  num_workers=4 \
  env_per_collector=2 \
- collector_devices=cuda:0 \
- mode=offline
+ collector_device=cuda:0 \
+ mode=offline
  python .circleci/unittest/helpers/coverage_run_parallel.py examples/iql/iql_online.py \
  total_frames=48 \
  batch_size=10 \
  frames_per_batch=16 \
  num_workers=4 \
  env_per_collector=2 \
- collector_devices=cuda:0 \
- mode=offline
+ collector_device=cuda:0 \
+ device=cuda:0 \
+ mode=offline

  # With single envs
  python .circleci/unittest/helpers/coverage_run_parallel.py examples/ddpg/ddpg.py \
@@ -133,7 +136,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/ddpg/ddpg.py
  frames_per_batch=16 \
  num_workers=2 \
  env_per_collector=1 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -145,15 +148,16 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/a2c/a2c.py \
  collector.collector_device=cuda:0 \
  logger.backend= \
  logger.log_interval=4 \
- optim.lr_scheduler=False
+ optim.lr_scheduler=False \
+ optim.device=cuda:0
  python .circleci/unittest/helpers/coverage_run_parallel.py examples/dqn/dqn.py \
  total_frames=48 \
  init_random_frames=10 \
  batch_size=10 \
  frames_per_batch=16 \
  num_workers=2 \
  env_per_collector=1 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -165,7 +169,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/redq/redq.py
  frames_per_batch=16 \
  num_workers=2 \
  env_per_collector=1 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -177,7 +181,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/sac/sac.py \
  frames_per_batch=16 \
  num_workers=2 \
  env_per_collector=1 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -187,6 +191,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/ppo/ppo.py \
  collector.total_frames=48 \
  collector.frames_per_batch=16 \
  collector.collector_device=cuda:0 \
+ optim.device=cuda:0 \
  loss.mini_batch_size=10 \
  loss.ppo_epochs=1 \
  logger.backend= \
@@ -199,7 +204,7 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/dreamer/drea
  frames_per_batch=200 \
  num_workers=2 \
  env_per_collector=1 \
- collector_devices=cuda:0 \
+ collector_device=cuda:0 \
  optim_steps_per_batch=1 \
  record_video=True \
  record_frames=4 \
@@ -213,15 +218,16 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/td3/td3.py \
  num_workers=2 \
  env_per_collector=1 \
  mode=offline \
- collector_devices=cuda:0
+ collector_device=cuda:0
  python .circleci/unittest/helpers/coverage_run_parallel.py examples/iql/iql_online.py \
  total_frames=48 \
  batch_size=10 \
  frames_per_batch=16 \
  num_workers=2 \
  env_per_collector=1 \
  mode=offline \
- collector_devices=cuda:0
+ device=cuda:0 \
+ collector_device=cuda:0

  python .circleci/unittest/helpers/coverage_run_parallel.py examples/bandits/dqn.py --n_steps=100
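The example entry points read Hydra-style key=value overrides, so any of the CI invocations above can be reproduced by hand. A sketch of a CPU-only smoke test mirroring the PPO run from this script (same config keys as in the diff; the tiny frame counts are deliberate, and this is illustrative rather than part of the PR):

  python examples/ppo/ppo.py \
    collector.total_frames=48 \
    collector.frames_per_batch=16 \
    collector.collector_device=cpu \
    optim.device=cpu \
    loss.mini_batch_size=10 \
    loss.ppo_epochs=1 \
    logger.backend=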
9 changes: 5 additions & 4 deletions examples/a2c/utils.py
@@ -11,6 +11,7 @@
      CatTensors,
      DoubleToFloat,
      EnvCreator,
+     ExplorationType,
      GrayScale,
      NoopResetEnv,
      ObservationNorm,
@@ -261,7 +262,7 @@ def make_a2c_models(cfg):
              value_operator=value_module,
          )
          actor = actor_critic.get_policy_operator()
-         critic = actor_critic.get_value_operator()
+         critic = actor_critic.get_value_head()  # to avoid duplicate params
      else:
          actor = policy_module
          critic = value_module
@@ -326,7 +327,7 @@ def make_a2c_modules_state(proof_environment):
          distribution_class=distribution_class,
          distribution_kwargs=distribution_kwargs,
          return_log_prob=True,
-         default_interaction_mode="random",
+         default_interaction_type=ExplorationType.RANDOM,
      )

      # Define the value net
@@ -412,7 +413,7 @@ def make_a2c_modules_pixels(proof_environment):
          distribution_class=distribution_class,
          distribution_kwargs=distribution_kwargs,
          return_log_prob=True,
-         default_interaction_mode="random",
+         default_interaction_type=ExplorationType.RANDOM,
      )

      # Define another head for the value
@@ -451,8 +452,8 @@ def make_loss(loss_cfg, actor_network, value_network):
          entropy_coef=loss_cfg.entropy_coef,
          critic_coef=loss_cfg.critic_coef,
          entropy_bonus=True,
-         gamma=loss_cfg.gamma,
      )
+     loss_module.make_value_estimator(gamma=loss_cfg.gamma)
      return loss_module, advantage_module

5 changes: 3 additions & 2 deletions examples/bandits/dqn.py
@@ -75,15 +75,16 @@
      actor(env.reset())
      loss = DistributionalDQNLoss(
          actor,
-         gamma=0.0,
      )
+     loss.make_value_estimator(gamma=0.9)
  else:
      model = MLP(
          out_features=n_actions, depth=3, num_cells=n_cells, activation_class=nn.Tanh
      )
      actor = QValueActor(model, action_space="categorical")
      actor(env.reset())
-     loss = DQNLoss(actor, gamma=0.0, loss_function="smooth_l1")
+     loss = DQNLoss(actor, loss_function="smooth_l1", action_space=env.action_spec)
+     loss.make_value_estimator(gamma=0.0)
  policy = EGreedyWrapper(
      actor, eps_greedy, 0.0, annealing_num_steps=n_steps, spec=env.action_spec
  )
3 changes: 1 addition & 2 deletions examples/ddpg/config.yaml
@@ -18,8 +18,7 @@ init_random_frames: 25000
  activation: elu
  gSDE: 0
  from_pixels: 0
- #collector_devices: [cuda:1,cuda:1,cuda:1,cuda:1]
- collector_devices: [cpu,cpu,cpu,cpu]
+ collector_device: cpu
  env_per_collector: 8
  num_workers: 32
  lr_scheduler: ""
2 changes: 1 addition & 1 deletion examples/discrete_sac/discrete_sac.py
@@ -174,10 +174,10 @@ def env_factory(num_workers):
          qvalue_network=model[1],
          num_actions=num_actions,
          num_qvalue_nets=2,
-         gamma=cfg.gamma,
          target_entropy_weight=cfg.target_entropy_weight,
          loss_function="smooth_l1",
      )
+     loss_module.make_value_estimator(gamma=cfg.gamma)

      # Define Target Network Updater
      target_net_updater = SoftUpdate(loss_module, cfg.target_update_polyak)
2 changes: 1 addition & 1 deletion examples/distributed/collectors/multi_nodes/ray_train.py
@@ -154,9 +154,9 @@
      entropy_coef=entropy_eps,  # these keys match by default but we set this for completeness
      value_target_key=advantage_module.value_target_key,
      critic_coef=1.0,
-     gamma=0.99,
      loss_critic_type="smooth_l1",
  )
+ loss_module.make_value_estimator(gamma=0.99)

  # 7. Define optimizer
  optim = torch.optim.Adam(loss_module.parameters(), lr)
3 changes: 1 addition & 2 deletions examples/dqn/config.yaml
@@ -16,8 +16,7 @@ lr: 3e-4
  multi_step: 1
  init_random_frames: 25000
  from_pixels: 1
- #collector_devices: [cuda:1,cuda:1,cuda:1,cuda:1]
- collector_devices: [cpu,cpu,cpu,cpu]
+ collector_device: cpu
  env_per_collector: 8
  num_workers: 32
  lr_scheduler: ""
3 changes: 1 addition & 2 deletions examples/dreamer/config.yaml
@@ -14,8 +14,7 @@ from_pixels: True
  # we want 50 frames / traj in the replay buffer. Given the frame_skip=2 this makes each traj 100 steps long
  env_per_collector: 8
  num_workers: 8
- # collector_devices: [cuda:1]
- collector_devices: cuda:1 # [cpu,cpu,cpu,cpu,cpu,cpu,cpu,cpu]
+ collector_device: cuda:1
  frames_per_batch: 800
  optim_steps_per_batch: 80
  record_interval: 30
6 changes: 1 addition & 5 deletions examples/dreamer/dreamer.py
@@ -177,10 +177,6 @@ def main(cfg: "DictConfig"):  # noqa: F821
          make_env=create_env_fn,
          actor_model_explore=exploration_policy,
          cfg=cfg,
-         # make_env_kwargs=[
-         #     {"device": device}
-         #     for device in cfg.collector_devices
-         # ],
      )
      print("collector:", collector)

@@ -190,7 +186,7 @@ def main(cfg: "DictConfig"):  # noqa: F821
          record_frames=cfg.record_frames,
          frame_skip=cfg.frame_skip,
          policy_exploration=policy,
-         recorder=make_recorder_env(
+         environment=make_recorder_env(
              cfg=cfg,
              video_tag=video_tag,
              obs_norm_state_dict=obs_norm_state_dict,
10 changes: 5 additions & 5 deletions examples/dreamer/dreamer_utils.py
@@ -195,13 +195,13 @@ def make_transformed_env(**kwargs) -> TransformedEnv:
      from_pixels = cfg.from_pixels

      if custom_env is None and custom_env_maker is None:
-         if isinstance(cfg.collector_devices, str):
-             device = cfg.collector_devices
-         elif isinstance(cfg.collector_devices, Sequence):
-             device = cfg.collector_devices[0]
+         if isinstance(cfg.collector_device, str):
+             device = cfg.collector_device
+         elif isinstance(cfg.collector_device, Sequence):
+             device = cfg.collector_device[0]
          else:
              raise ValueError(
-                 "collector_devices must be either a string or a sequence of strings"
+                 "collector_device must be either a string or a sequence of strings"
              )
          env_kwargs = {
              "env_name": env_name,
12 changes: 3 additions & 9 deletions examples/iql/iql_online.py
@@ -73,13 +73,7 @@ def make_replay_buffer(
  @hydra.main(version_base=None, config_path=".", config_name="online_config")
  def main(cfg: "DictConfig"):  # noqa: F821

-     device = (
-         torch.device("cuda:0")
-         if torch.cuda.is_available()
-         and torch.cuda.device_count() > 0
-         and cfg.device == "cuda:0"
-         else torch.device("cpu")
-     )
+     device = torch.device(cfg.device)

      exp_name = generate_exp_name("Online_IQL", cfg.exp_name)
      logger = get_logger(
@@ -199,11 +193,11 @@ def env_factory(num_workers):
          qvalue_network=model[1],
          value_network=model[2],
          num_qvalue_nets=2,
-         gamma=cfg.gamma,
          temperature=cfg.temperature,
          expectile=cfg.expectile,
          loss_function="smooth_l1",
      )
+     loss_module.make_value_estimator(gamma=cfg.gamma)

      # Define Target Network Updater
      target_net_updater = SoftUpdate(loss_module, cfg.target_update_polyak)
@@ -216,7 +210,7 @@ def env_factory(num_workers):
          frames_per_batch=cfg.frames_per_batch,
          max_frames_per_traj=cfg.max_frames_per_traj,
          total_frames=cfg.total_frames,
-         device=cfg.device,
+         device=cfg.collector_device,
      )
      collector.set_seed(cfg.seed)

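The online IQL example now distinguishes the training device (cfg.device) from the data-collection device (cfg.collector_device). A hypothetical CPU invocation exercising both knobs, using the same keys that appear in run_test.sh above (a sketch, not part of this diff):

  python examples/iql/iql_online.py \
    total_frames=48 \
    device=cpu \
    collector_device=cpu \
    mode=offline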
3 changes: 1 addition & 2 deletions examples/iql/online_config.yaml
@@ -23,8 +23,7 @@ default_policy_scale: 1.0
  scale_lb: 0.1
  activation: elu
  from_pixels: 0
- #collector_devices: [cuda:1,cuda:1,cuda:1,cuda:1]
- collector_devices: [cpu]
+ collector_device: cuda:0
  env_per_collector: 5
  frames_per_batch: 1000 # 5*200
  max_frames_per_traj: 200