diff --git a/docs/source/user_guide/getting_started/quickstart.md b/docs/source/user_guide/getting_started/quickstart.md
index cdde4d873..c912001a1 100644
--- a/docs/source/user_guide/getting_started/quickstart.md
+++ b/docs/source/user_guide/getting_started/quickstart.md
@@ -161,19 +161,9 @@ This will then open up a GUI that looks like so:
 
 ### Additional GPU simulation/rendering customization
 
-Finally on servers with multiple GPUs you can directly pick which devices/backends to use for simulation and rendering.
+Finally, on servers with multiple GPUs, you can directly pick which devices/backends to use for simulation and rendering by setting the `CUDA_VISIBLE_DEVICES` environment variable. For example, run `export CUDA_VISIBLE_DEVICES=1` and then run the same code. Although everything is still labeled as device "cuda:0", the process is now actually using GPU device 1, which you can verify by running `nvidia-smi`.
 
-```python
-import gymnasium as gym
-import mani_skill.envs
-
-env = gym.make(
-    "PickCube-v1",
-    num_envs=16,
-    sim_backend="cuda:1", # selects the GPU with index 1
-    render_backend="cuda", # auto selects a GPU
-)
-```
+We do not yet properly support exposing multiple visible CUDA devices to a single process, as doing so currently causes some rendering bugs.
 
 ## Task Instantiation Options
 
diff --git a/mani_skill/agents/robots/allegro_hand/allegro.py b/mani_skill/agents/robots/allegro_hand/allegro.py
index 19b328570..43d543e15 100644
--- a/mani_skill/agents/robots/allegro_hand/allegro.py
+++ b/mani_skill/agents/robots/allegro_hand/allegro.py
@@ -159,7 +159,9 @@ def tip_poses(self):
         """
         Get the tip pose for each of the finger, four fingers in total
         """
-        tip_poses = [vectorize_pose(link.pose) for link in self.tip_links]
+        tip_poses = [
+            vectorize_pose(link.pose, device=self.device) for link in self.tip_links
+        ]
         return torch.stack(tip_poses, dim=-2)
 
     @property
@@ -167,7 +169,7 @@ def palm_pose(self):
         """
         Get the palm pose for allegro hand
         """
-        return vectorize_pose(self.palm_link.pose)
+        return vectorize_pose(self.palm_link.pose, device=self.device)
 
 
 @register_agent()
diff --git a/mani_skill/agents/robots/dclaw/dclaw.py b/mani_skill/agents/robots/dclaw/dclaw.py
index 1cac67331..097650e6f 100644
--- a/mani_skill/agents/robots/dclaw/dclaw.py
+++ b/mani_skill/agents/robots/dclaw/dclaw.py
@@ -117,5 +117,7 @@ def tip_poses(self):
         """
         Get the tip pose for each of the finger, three fingers in total
         """
-        tip_poses = [vectorize_pose(link.pose) for link in self.tip_links]
+        tip_poses = [
+            vectorize_pose(link.pose, device=self.device) for link in self.tip_links
+        ]
         return torch.stack(tip_poses, dim=-2)
diff --git a/mani_skill/agents/robots/trifingerpro/trifingerpro.py b/mani_skill/agents/robots/trifingerpro/trifingerpro.py
index e3bdae789..c9ee4eafd 100644
--- a/mani_skill/agents/robots/trifingerpro/trifingerpro.py
+++ b/mani_skill/agents/robots/trifingerpro/trifingerpro.py
@@ -8,12 +8,8 @@
 from mani_skill.agents.base_agent import BaseAgent
 from mani_skill.agents.controllers import *
 from mani_skill.agents.registration import register_agent
-from mani_skill.agents.utils import (
-    get_active_joint_indices,
-)
-from mani_skill.utils.sapien_utils import (
-    get_objs_by_names,
-)
+from mani_skill.agents.utils import get_active_joint_indices
+from mani_skill.utils.sapien_utils import get_objs_by_names
 from mani_skill.utils.structs.pose import vectorize_pose
 
 
@@ -23,6 +19,7 @@ class TriFingerPro(BaseAgent):
     Modified from https://github.com/NVIDIA-Omniverse/IsaacGymEnvs/blob/main/isaacgymenvs/tasks/trifinger.py
 
     """
+
     uid = "trifingerpro"
     urdf_path = f"{PACKAGE_ASSET_DIR}/robots/trifinger/trifingerpro.urdf"
     urdf_config = dict(
@@ -66,8 +63,16 @@ def __init__(self, *args, **kwargs):
         self.joint_stiffness = 1e2
         self.joint_damping = 1e1
         self.joint_force_limit = 2e1
-        self.tip_link_names = ["finger_tip_link_0", "finger_tip_link_120", "finger_tip_link_240"]
-        self.root_joint_names = ["finger_base_to_upper_joint_0", "finger_base_to_upper_joint_120", "finger_base_to_upper_joint_240"]
+        self.tip_link_names = [
+            "finger_tip_link_0",
+            "finger_tip_link_120",
+            "finger_tip_link_240",
+        ]
+        self.root_joint_names = [
+            "finger_base_to_upper_joint_0",
+            "finger_base_to_upper_joint_120",
+            "finger_base_to_upper_joint_240",
+        ]
 
         super().__init__(*args, **kwargs)
 
@@ -164,7 +169,9 @@ def tip_poses(self):
         """
         Get the tip pose for each of the finger, three fingers in total
         """
-        tip_poses = [vectorize_pose(link.pose) for link in self.tip_links]
+        tip_poses = [
+            vectorize_pose(link.pose, device=self.device) for link in self.tip_links
+        ]
         return torch.stack(tip_poses, dim=-1)
 
     # @property
diff --git a/mani_skill/envs/sapien_env.py b/mani_skill/envs/sapien_env.py
index ebb2fb9ce..6069026cc 100644
--- a/mani_skill/envs/sapien_env.py
+++ b/mani_skill/envs/sapien_env.py
@@ -2,7 +2,7 @@
 import gc
 import os
 from functools import cached_property
-from typing import Any, Dict, List, Sequence, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 
 import dacite
 import gymnasium as gym
@@ -423,7 +423,7 @@ def elapsed_steps(self):
     def obs_mode(self):
         return self._obs_mode
 
-    def get_obs(self, info: Dict = None):
+    def get_obs(self, info: Optional[Dict] = None):
         """
         Return the current observation of the environment. User may call this directly to get the current observation
         as opposed to taking a step with actions in the environment.
@@ -581,7 +581,6 @@ def _reconfigure(self, options = dict()):
         self._setup_scene()
         self._load_agent(options)
         self._load_scene(options)
-
         self._load_lighting(options)
 
         if physx.is_gpu_enabled():
diff --git a/mani_skill/utils/common.py b/mani_skill/utils/common.py
index a3ba302f2..e384c851e 100644
--- a/mani_skill/utils/common.py
+++ b/mani_skill/utils/common.py
@@ -3,7 +3,7 @@
 """
 
 from collections import defaultdict
-from typing import Dict, Sequence, Tuple, Union
+from typing import Dict, Optional, Sequence, Tuple, Union
 
 import gymnasium as gym
 import numpy as np
@@ -124,7 +124,9 @@ def index_dict_array(x1, idx: Union[int, slice], inplace=True):
 
 
 # TODO (stao): this code can be simplified
-def to_tensor(array: Union[torch.Tensor, np.array, Sequence], device: Device = None):
+def to_tensor(
+    array: Union[torch.Tensor, np.array, Sequence], device: Optional[Device] = None
+):
     """
     Maps any given sequence to a torch tensor on the CPU/GPU. If physx gpu is not enabled then we use CPU, otherwise GPU, unless specified
     by the device argument
@@ -149,7 +151,11 @@ def to_tensor(array: Union[torch.Tensor, np.array, Sequence], device: Device = N
         else:
             ret = torch.tensor(array)
         if device is None:
-            return ret.cuda()
+            if ret.device.type == "cpu":
+                # TODO (stao): note that .cuda() does not move a tensor to the device of the active torch.device context; it moves it to the default (current) CUDA device
+                return ret.cuda()
+            # keep same device if already on GPU
+            return ret
         else:
             return ret.to(device)
     else:
diff --git a/mani_skill/utils/structs/actor.py b/mani_skill/utils/structs/actor.py
index 0a868e6fe..29521fff3 100644
--- a/mani_skill/utils/structs/actor.py
+++ b/mani_skill/utils/structs/actor.py
@@ -327,7 +327,7 @@ def pose(self) -> Pose:
     def pose(self, arg1: Union[Pose, sapien.Pose, Array]) -> None:
         if physx.is_gpu_enabled():
             if not isinstance(arg1, torch.Tensor):
-                arg1 = vectorize_pose(arg1)
+                arg1 = vectorize_pose(arg1, device=self.device)
             if self.hidden:
                 self.before_hide_pose[self.scene._reset_mask[self._scene_idxs]] = arg1
                 return
diff --git a/mani_skill/utils/structs/articulation.py b/mani_skill/utils/structs/articulation.py
index f5d36e2a0..34acfc1dd 100644
--- a/mani_skill/utils/structs/articulation.py
+++ b/mani_skill/utils/structs/articulation.py
@@ -504,7 +504,7 @@ def set_root_pose(self, pose: sapien.Pose) -> None:
 
     @cached_property
     def dof(self) -> torch.tensor:
-        return torch.tensor([obj.dof for obj in self._objs])
+        return torch.tensor([obj.dof for obj in self._objs], device=self.device)
 
     # @property
     # def gpu_index(self) -> int:
@@ -580,10 +580,7 @@ def qlimits(self):
             ]
         )
         padded_qlimits = torch.from_numpy(padded_qlimits).float()
-        if physx.is_gpu_enabled():
-            return padded_qlimits.cuda()
-        else:
-            return padded_qlimits
+        return padded_qlimits.to(self.device)
 
     @property
     def qpos(self):
diff --git a/mani_skill/utils/structs/base.py b/mani_skill/utils/structs/base.py
index 22caee6a6..f3a1816c1 100644
--- a/mani_skill/utils/structs/base.py
+++ b/mani_skill/utils/structs/base.py
@@ -53,10 +53,7 @@ def __maniskill_hash__(self):
 
     @property
     def device(self):
-        if physx.is_gpu_enabled():
-            return torch.device("cuda")
-        else:
-            return torch.device("cpu")
+        return self.scene.device
 
     @property
     def _num_objs(self):
@@ -108,7 +105,7 @@ def _body_data_index(self):
         """a list of indexes of each GPU rigid body in the `px.cuda_rigid_body_data` buffer, one for each element in `self._objs`"""
         if self._body_data_index_internal is None:
             self._body_data_index_internal = torch.tensor(
-                [body.gpu_pose_index for body in self._bodies], device="cuda"
+                [body.gpu_pose_index for body in self._bodies], device=self.device
             )
         return self._body_data_index_internal
 
diff --git a/mani_skill/utils/structs/link.py b/mani_skill/utils/structs/link.py
index 1322cc956..663413614 100644
--- a/mani_skill/utils/structs/link.py
+++ b/mani_skill/utils/structs/link.py
@@ -213,7 +213,7 @@ def pose(self) -> Pose:
     def pose(self, arg1: Union[Pose, sapien.Pose, Array]) -> None:
         if physx.is_gpu_enabled():
             if not isinstance(arg1, torch.Tensor):
-                arg1 = vectorize_pose(arg1)
+                arg1 = vectorize_pose(arg1, device=self.device)
             if self.scene.parallel_in_single_scene:
                 if len(arg1.shape) == 1:
                     arg1 = arg1.view(1, -1)
diff --git a/mani_skill/utils/structs/pose.py b/mani_skill/utils/structs/pose.py
index 69cac825b..84e13aec0 100644
--- a/mani_skill/utils/structs/pose.py
+++ b/mani_skill/utils/structs/pose.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import sapien
@@ -94,11 +94,16 @@ def create_from_pq(
 
     @classmethod
     def create(
-        cls, pose: Union[torch.Tensor, sapien.Pose, List[sapien.Pose], "Pose"]
+        cls,
+        pose: Union[torch.Tensor, sapien.Pose, List[sapien.Pose], "Pose"],
+        device: Optional[Device] = None,
     ) -> "Pose":
         if isinstance(pose, sapien.Pose):
             raw_pose = torch.hstack(
-                [common.to_tensor(pose.p), common.to_tensor(pose.q)]
+                [
+                    common.to_tensor(pose.p, device=device),
+                    common.to_tensor(pose.q, device=device),
+                ]
             )
             return cls(raw_pose=add_batch_dim(raw_pose))
         elif isinstance(pose, cls):
@@ -109,8 +114,8 @@ def create(
             for p in pose:
                 ps.append(p.p)
                 qs.append(p.q)
-            ps = common.to_tensor(ps)
-            qs = common.to_tensor(qs)
+            ps = common.to_tensor(ps, device=device)
+            qs = common.to_tensor(qs, device=device)
             return cls(raw_pose=torch.hstack([ps, qs]))
 
         else:
@@ -136,6 +141,11 @@ def shape(self):
     def device(self):
         return self.raw_pose.device
 
+    def to(self, device: Device):
+        if self.raw_pose.device == device:
+            return self
+        return Pose.create(self.raw_pose.to(device))
+
     # -------------------------------------------------------------------------- #
     # Functions from sapien.Pose
     # -------------------------------------------------------------------------- #
@@ -224,21 +234,26 @@ def q(self, arg1: torch.Tensor):
     #     pass
 
 
-def vectorize_pose(pose: Union[sapien.Pose, Pose, Array]) -> torch.Tensor:
+def vectorize_pose(
+    pose: Union[sapien.Pose, Pose, Array], device: Optional[Device] = None
+) -> torch.Tensor:
     """
     Maps several formats of Pose representation to the appropriate tensor representation
     """
     if isinstance(pose, sapien.Pose):
         if physx.is_gpu_enabled():
             return torch.concatenate(
-                [common.to_tensor(pose.p), common.to_tensor(pose.q)]
+                [
+                    common.to_tensor(pose.p, device=device),
+                    common.to_tensor(pose.q, device=device),
+                ]
             )
         else:
             return np.hstack([pose.p, pose.q])
     elif isinstance(pose, Pose):
         return pose.raw_pose
     else:
-        return common.to_tensor(pose)
+        return common.to_tensor(pose, device=device)
 
 
 def to_sapien_pose(pose: Union[torch.Tensor, sapien.Pose, Pose]) -> sapien.Pose: