From 83f4b6f1a7d73361710debf94052ff8d0c5d04a7 Mon Sep 17 00:00:00 2001 From: mtrepte Date: Fri, 21 Nov 2025 16:42:00 -0800 Subject: [PATCH 1/3] asdf --- apps/rendering_modes/balanced.kit | 2 + apps/rendering_modes/performance.kit | 2 + apps/rendering_modes/quality.kit | 2 + .../isaaclab/sensors/camera/tiled_camera.py | 30 ++- .../direct/shadow_hand/__init__.py | 57 +++++ .../direct/shadow_hand/feature_extractor.py | 116 +++++++--- .../shadow_hand/shadow_hand_vision_env.py | 207 ++++++++++++++++-- 7 files changed, 368 insertions(+), 48 deletions(-) diff --git a/apps/rendering_modes/balanced.kit b/apps/rendering_modes/balanced.kit index ee92625fd7e..d9b793f2915 100644 --- a/apps/rendering_modes/balanced.kit +++ b/apps/rendering_modes/balanced.kit @@ -1,3 +1,5 @@ +rtx.sdg.force.disableColorRender=false + rtx.translucency.enabled = false rtx.reflections.enabled = false diff --git a/apps/rendering_modes/performance.kit b/apps/rendering_modes/performance.kit index 3cfe6e8c0e2..3925a8e1dff 100644 --- a/apps/rendering_modes/performance.kit +++ b/apps/rendering_modes/performance.kit @@ -1,3 +1,5 @@ +rtx.sdg.force.disableColorRender=true # change to false for RGB baselines + rtx.translucency.enabled = false rtx.reflections.enabled = false diff --git a/apps/rendering_modes/quality.kit b/apps/rendering_modes/quality.kit index 8e966ddfd3b..2aa8d8eae98 100644 --- a/apps/rendering_modes/quality.kit +++ b/apps/rendering_modes/quality.kit @@ -1,3 +1,5 @@ +rtx.sdg.force.disableColorRender=false + rtx.translucency.enabled = true rtx.reflections.enabled = true diff --git a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py index 3e9982135c5..bf057d27954 100644 --- a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py +++ b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py @@ -189,12 +189,20 @@ def _initialize_impl(self): ) self._render_product_paths = [rp.path] + rep.AnnotatorRegistry.register_annotator_from_aov(aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4) + rep.AnnotatorRegistry.register_annotator_from_aov(aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4) # Define the annotators based on requested data types self._annotators = dict() for annotator_type in self.cfg.data_types: if annotator_type == "rgba" or annotator_type == "rgb": annotator = rep.AnnotatorRegistry.get_annotator("rgb", device=self.device, do_array_copy=False) self._annotators["rgba"] = annotator + elif annotator_type == "diffuse_albedo": + annotator = rep.AnnotatorRegistry.get_annotator("DiffuseAlbedoSD", device=self.device, do_array_copy=False) + self._annotators["diffuse_albedo"] = annotator + elif annotator_type == "simple_shading": + annotator = rep.AnnotatorRegistry.get_annotator("SimpleShadingSD", device=self.device, do_array_copy=False) + self._annotators["simple_shading"] = annotator elif annotator_type == "depth" or annotator_type == "distance_to_image_plane": # keep depth for backwards compatibility annotator = rep.AnnotatorRegistry.get_annotator( @@ -254,13 +262,16 @@ def _update_buffers_impl(self, env_ids: Sequence[int]): else: tiled_data_buffer = tiled_data_buffer.to(device=self.device) - # process data for different segmentation types + # process data for different segmentation types and custom annotators # Note: Replicator returns raw buffers of dtype uint32 for segmentation types # so we need to convert them to uint8 4 channel images for colorized types + # Note: Custom annotators (diffuse_albedo, simple_shading) 
also return 4 channel data if ( (data_type == "semantic_segmentation" and self.cfg.colorize_semantic_segmentation) or (data_type == "instance_segmentation_fast" and self.cfg.colorize_instance_segmentation) or (data_type == "instance_id_segmentation_fast" and self.cfg.colorize_instance_id_segmentation) + or data_type == "diffuse_albedo" + or data_type == "simple_shading" ): tiled_data_buffer = wp.array( ptr=tiled_data_buffer.ptr, shape=(*tiled_data_buffer.shape, 4), dtype=wp.uint8, device=self.device @@ -271,6 +282,13 @@ def _update_buffers_impl(self, env_ids: Sequence[int]): if data_type == "motion_vectors": tiled_data_buffer = tiled_data_buffer[:, :, :2].contiguous() + # For diffuse albedo, keep only the first three channels (RGB) + if data_type == "diffuse_albedo": + tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous() + # For simple shading, keep only the first three channels (RGB) + if data_type == "simple_shading": + tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous() + wp.launch( kernel=reshape_tiled_image, dim=(self._view.count, self.cfg.height, self.cfg.width), @@ -347,6 +365,16 @@ def _create_buffers(self): if "rgb" in self.cfg.data_types: # RGB is the first 3 channels of RGBA data_dict["rgb"] = data_dict["rgba"][..., :3] + if "diffuse_albedo" in self.cfg.data_types: + data_dict["diffuse_albedo"] = torch.zeros( + (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8 + ).contiguous() + data_dict["diffuse_albedo"] = data_dict["diffuse_albedo"][..., :3] + if "simple_shading" in self.cfg.data_types: + data_dict["simple_shading"] = torch.zeros( + (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8 + ).contiguous() + data_dict["simple_shading"] = data_dict["simple_shading"][..., :3] if "distance_to_image_plane" in self.cfg.data_types: data_dict["distance_to_image_plane"] = torch.zeros( (self._view.count, self.cfg.height, self.cfg.width, 1), device=self.device, dtype=torch.float32 diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py index ed316e6e267..3ce6c42c5b6 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py @@ -64,6 +64,63 @@ }, ) + +gym.register( + id="Isaac-Repose-Cube-Shadow-Segmentation-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionSegmentationEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + + +gym.register( + id="Isaac-Repose-Cube-Shadow-RGB-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionRGBEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + +gym.register( + id="Isaac-Repose-Cube-Shadow-DiffuseAlbedo-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": 
f"{__name__}.shadow_hand_vision_env:ShadowHandVisionDiffuseAlbedoEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + +gym.register( + id="Isaac-Repose-Cube-Shadow-SimpleShading-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionSimpleShadingEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + +gym.register( + id="Isaac-Repose-Cube-Shadow-Depth-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionDepthEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + gym.register( id="Isaac-Repose-Cube-Shadow-Vision-Direct-Play-v0", entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py index 82d76ec7f1e..a9f7785654c 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py @@ -16,9 +16,9 @@ class FeatureExtractorNetwork(nn.Module): """CNN architecture used to regress keypoint positions of the in-hand cube from image data.""" - def __init__(self): + def __init__(self, num_channel): super().__init__() - num_channel = 7 + self.num_channel = num_channel self.cnn = nn.Sequential( nn.Conv2d(num_channel, 16, kernel_size=6, stride=2, padding=0), nn.ReLU(), @@ -45,8 +45,11 @@ def __init__(self): def forward(self, x): x = x.permute(0, 3, 1, 2) - x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :]) - x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :]) + if self.num_channel == 7: + x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :]) + x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :]) + elif self.num_channel == 3: + x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :]) cnn_x = self.cnn(x) out = self.linear(cnn_x.view(-1, 128)) return out @@ -65,6 +68,8 @@ class FeatureExtractorCfg: write_image_to_file: bool = False """If True, the images from the camera sensor are written to file. Default is False.""" + num_channel: int = 7 + class FeatureExtractor: """Class for extracting features from image data. 
@@ -86,7 +91,7 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None = self.device = device # Feature extractor model - self.feature_extractor = FeatureExtractorNetwork() + self.feature_extractor = FeatureExtractorNetwork(self.cfg.num_channel) self.feature_extractor.to(self.device) self.step_count = 0 @@ -112,8 +117,13 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None = self.feature_extractor.eval() def _preprocess_images( - self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + self, + rgb_img: torch.Tensor, + depth_img: torch.Tensor, + segmentation_img: torch.Tensor, + albedo_img: torch.Tensor | None = None, + simple_shading_img: torch.Tensor | None = None, + ) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]: """Preprocesses the input images. Args: @@ -122,20 +132,37 @@ def _preprocess_images( segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3) Returns: - tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Preprocessed RGB, depth, and segmentation + tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]: + Preprocessed RGB, depth, segmentation, albedo, and simple shading images """ - rgb_img = rgb_img / 255.0 + if rgb_img is not None: + rgb_img = rgb_img / 255.0 # process depth image - depth_img[depth_img == float("inf")] = 0 - depth_img /= 5.0 - depth_img /= torch.max(depth_img) + if depth_img is not None: + depth_img[depth_img == float("inf")] = 0 + depth_img /= 5.0 + depth_img /= torch.max(depth_img) # process segmentation image - segmentation_img = segmentation_img / 255.0 - mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True) - segmentation_img -= mean_tensor - return rgb_img, depth_img, segmentation_img - - def _save_images(self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor): + if segmentation_img is not None: + segmentation_img = segmentation_img / 255.0 + mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True) + segmentation_img -= mean_tensor + # process albedo image + if albedo_img is not None: + albedo_img = albedo_img / 255.0 + # process simple shading image + if simple_shading_img is not None: + simple_shading_img = simple_shading_img / 255.0 + return rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img + + def _save_images( + self, + rgb_img: torch.Tensor | None, + depth_img: torch.Tensor | None, + segmentation_img: torch.Tensor | None, + albedo_img: torch.Tensor | None, + simple_shading_img: torch.Tensor | None, + ): """Writes image buffers to file. Args: @@ -143,12 +170,25 @@ def _save_images(self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentat depth_img (torch.Tensor): Depth image tensor. Shape: (N, H, W, 1). segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3). 
""" - save_images_to_file(rgb_img, "shadow_hand_rgb.png") - save_images_to_file(depth_img, "shadow_hand_depth.png") - save_images_to_file(segmentation_img, "shadow_hand_segmentation.png") + if rgb_img is not None: + save_images_to_file(rgb_img, "shadow_hand_rgb.png") + if depth_img is not None: + save_images_to_file(depth_img, "shadow_hand_depth.png") + if segmentation_img is not None: + save_images_to_file(segmentation_img, "shadow_hand_segmentation.png") + if albedo_img is not None: + save_images_to_file(albedo_img, "shadow_hand_diffuse_albedo.png") + if simple_shading_img is not None: + save_images_to_file(simple_shading_img, "shadow_hand_simple_shading.png") def step( - self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor, gt_pose: torch.Tensor + self, + rgb_img: torch.Tensor = None, + depth_img: torch.Tensor = None, + segmentation_img: torch.Tensor = None, + albedo_img: torch.Tensor = None, + simple_shading_img: torch.Tensor = None, + gt_pose: torch.Tensor = None, ) -> tuple[torch.Tensor, torch.Tensor]: """Extracts the features using the images and trains the model if the train flag is set to True. @@ -162,15 +202,28 @@ def step( tuple[torch.Tensor, torch.Tensor]: Pose loss and predicted pose. """ - rgb_img, depth_img, segmentation_img = self._preprocess_images(rgb_img, depth_img, segmentation_img) + rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img = self._preprocess_images( + rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img + ) if self.cfg.write_image_to_file: - self._save_images(rgb_img, depth_img, segmentation_img) + self._save_images(rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img) if self.cfg.train: with torch.enable_grad(): with torch.inference_mode(False): - img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + if rgb_img is not None and depth_img is not None and segmentation_img is not None: + img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + elif albedo_img is not None: + img_input = albedo_img + elif simple_shading_img is not None: + img_input = simple_shading_img + elif rgb_img is not None: + img_input = rgb_img + elif depth_img is not None: + img_input = depth_img + elif segmentation_img is not None: + img_input = segmentation_img self.optimizer.zero_grad() predicted_pose = self.feature_extractor(img_input) @@ -189,6 +242,17 @@ def step( return pose_loss, predicted_pose else: - img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + if albedo_img is not None: + img_input = albedo_img + elif simple_shading_img is not None: + img_input = simple_shading_img + elif rgb_img is not None and depth_img is not None and segmentation_img is not None: + img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + elif rgb_img is not None: + img_input = rgb_img + elif depth_img is not None: + img_input = depth_img + elif segmentation_img is not None: + img_input = segmentation_img predicted_pose = self.feature_extractor(img_input) return None, predicted_pose diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py index 13bc6a55328..12ad0e77985 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py @@ -41,10 +41,124 @@ class ShadowHandVisionEnvCfg(ShadowHandEnvCfg): 
spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=120, - height=120, + width=240, + height=240, ) - feature_extractor = FeatureExtractorCfg() + feature_extractor = FeatureExtractorCfg(num_channel=7) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionRGBEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["rgb"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + +@configclass +class ShadowHandVisionDiffuseAlbedoEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["diffuse_albedo"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionSimpleShadingEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["simple_shading"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionDepthEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["depth"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=1) + + # env + 
observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionSegmentationEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["semantic_segmentation"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) # env observation_space = 164 + 27 # state observation + vision CNN embedding @@ -102,32 +216,82 @@ def _compute_image_observations(self): object_pose = torch.cat([self.object_pos, self.gt_keypoints.view(-1, 24)], dim=-1) - # train CNN to regress on keypoint positions - pose_loss, embeddings = self.feature_extractor.step( - self._tiled_camera.data.output["rgb"], - self._tiled_camera.data.output["depth"], - self._tiled_camera.data.output["semantic_segmentation"][..., :3], - object_pose, - ) + # If requested, write out camera images using the feature extractor's utilities + if getattr(self.feature_extractor.cfg, "write_image_to_file", False): + rgb_img = self._tiled_camera.data.output["rgb"] if "rgb" in self.cfg.tiled_camera.data_types else None + depth_img = self._tiled_camera.data.output["depth"] if "depth" in self.cfg.tiled_camera.data_types else None + segmentation_img = ( + self._tiled_camera.data.output["semantic_segmentation"][..., :3] + if "semantic_segmentation" in self.cfg.tiled_camera.data_types + else None + ) + albedo_img = ( + self._tiled_camera.data.output["diffuse_albedo"] + if "diffuse_albedo" in self.cfg.tiled_camera.data_types + else None + ) + simple_shading_img = ( + self._tiled_camera.data.output["simple_shading"] + if "simple_shading" in self.cfg.tiled_camera.data_types + else None + ) + + pre_rgb, pre_depth, pre_seg, pre_albedo, pre_simple_shading = self.feature_extractor._preprocess_images( + rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img + ) + self.feature_extractor._save_images(pre_rgb, pre_depth, pre_seg, pre_albedo, pre_simple_shading) - self.embeddings = embeddings.clone().detach() - # compute keypoints for goal cube - compute_keypoints( - pose=torch.cat((torch.zeros_like(self.goal_pos), self.goal_rot), dim=-1), out=self.goal_keypoints - ) + # train CNN to regress on keypoint positions + # if ( + # "rgb" in self.cfg.tiled_camera.data_types + # and "depth" in self.cfg.tiled_camera.data_types + # and "semantic_segmentation" in self.cfg.tiled_camera.data_types + # ): + # pose_loss, embeddings = self.feature_extractor.step( + # self._tiled_camera.data.output["rgb"], + # self._tiled_camera.data.output["depth"], + # self._tiled_camera.data.output["semantic_segmentation"][..., :3], + # object_pose, + # ) + # elif "rgb" in self.cfg.tiled_camera.data_types: + # pose_loss, embeddings = self.feature_extractor.step( + # rgb_img=self._tiled_camera.data.output["rgb"], gt_pose=object_pose + # ) + # elif "depth" in self.cfg.tiled_camera.data_types: + # pose_loss, embeddings = self.feature_extractor.step( + # depth_img=self._tiled_camera.data.output["depth"], gt_pose=object_pose + # ) + # elif "semantic_segmentation" in self.cfg.tiled_camera.data_types: + # 
pose_loss, embeddings = self.feature_extractor.step( + # segmentation_img=self._tiled_camera.data.output["semantic_segmentation"][..., :3], gt_pose=object_pose + # ) + # elif "diffuse_albedo" in self.cfg.tiled_camera.data_types: + # pose_loss, embeddings = self.feature_extractor.step( + # albedo_img=self._tiled_camera.data.output["diffuse_albedo"], gt_pose=object_pose + # ) + # elif "simple_shading" in self.cfg.tiled_camera.data_types: + # pose_loss, embeddings = self.feature_extractor.step( + # simple_shading_img=self._tiled_camera.data.output["simple_shading"], gt_pose=object_pose + # ) + + # self.embeddings = embeddings.clone().detach() + # # compute keypoints for goal cube + # compute_keypoints( + # pose=torch.cat((torch.zeros_like(self.goal_pos), self.goal_rot), dim=-1), out=self.goal_keypoints + # ) obs = torch.cat( ( - self.embeddings, + # self.embeddings, + object_pose, self.goal_keypoints.view(-1, 24), ), dim=-1, ) - # log pose loss from CNN training - if "log" not in self.extras: - self.extras["log"] = dict() - self.extras["log"]["pose_loss"] = pose_loss + # if "log" not in self.extras: + # self.extras["log"] = dict() + # self.extras["log"]["pose_loss"] = pose_loss return obs @@ -155,7 +319,8 @@ def _compute_proprio_observations(self): def _compute_states(self): """Asymmetric states for the critic.""" sim_states = self.compute_full_state() - state = torch.cat((sim_states, self.embeddings), dim=-1) + # state = torch.cat((sim_states, self.embeddings), dim=-1) + state = sim_states return state def _get_observations(self) -> dict: From ee0cc9aa8ec561e23a290faba0ce3e689ec2a8dd Mon Sep 17 00:00:00 2001 From: mtrepte Date: Fri, 21 Nov 2025 16:50:46 -0800 Subject: [PATCH 2/3] fix --- .../shadow_hand/shadow_hand_vision_env.py | 111 +++++------------- 1 file changed, 30 insertions(+), 81 deletions(-) diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py index 12ad0e77985..c5a51a34850 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py @@ -41,8 +41,8 @@ class ShadowHandVisionEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=7) @@ -64,8 +64,8 @@ class ShadowHandVisionRGBEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=3) @@ -86,8 +86,8 @@ class ShadowHandVisionDiffuseAlbedoEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=3) @@ -109,8 +109,8 @@ class ShadowHandVisionSimpleShadingEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=3) @@ -132,8 +132,8 @@ class 
ShadowHandVisionDepthEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=1) @@ -155,8 +155,8 @@ class ShadowHandVisionSegmentationEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=3) @@ -216,82 +216,32 @@ def _compute_image_observations(self): object_pose = torch.cat([self.object_pos, self.gt_keypoints.view(-1, 24)], dim=-1) - # If requested, write out camera images using the feature extractor's utilities - if getattr(self.feature_extractor.cfg, "write_image_to_file", False): - rgb_img = self._tiled_camera.data.output["rgb"] if "rgb" in self.cfg.tiled_camera.data_types else None - depth_img = self._tiled_camera.data.output["depth"] if "depth" in self.cfg.tiled_camera.data_types else None - segmentation_img = ( - self._tiled_camera.data.output["semantic_segmentation"][..., :3] - if "semantic_segmentation" in self.cfg.tiled_camera.data_types - else None - ) - albedo_img = ( - self._tiled_camera.data.output["diffuse_albedo"] - if "diffuse_albedo" in self.cfg.tiled_camera.data_types - else None - ) - simple_shading_img = ( - self._tiled_camera.data.output["simple_shading"] - if "simple_shading" in self.cfg.tiled_camera.data_types - else None - ) - - pre_rgb, pre_depth, pre_seg, pre_albedo, pre_simple_shading = self.feature_extractor._preprocess_images( - rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img - ) - self.feature_extractor._save_images(pre_rgb, pre_depth, pre_seg, pre_albedo, pre_simple_shading) - # train CNN to regress on keypoint positions - # if ( - # "rgb" in self.cfg.tiled_camera.data_types - # and "depth" in self.cfg.tiled_camera.data_types - # and "semantic_segmentation" in self.cfg.tiled_camera.data_types - # ): - # pose_loss, embeddings = self.feature_extractor.step( - # self._tiled_camera.data.output["rgb"], - # self._tiled_camera.data.output["depth"], - # self._tiled_camera.data.output["semantic_segmentation"][..., :3], - # object_pose, - # ) - # elif "rgb" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # rgb_img=self._tiled_camera.data.output["rgb"], gt_pose=object_pose - # ) - # elif "depth" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # depth_img=self._tiled_camera.data.output["depth"], gt_pose=object_pose - # ) - # elif "semantic_segmentation" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # segmentation_img=self._tiled_camera.data.output["semantic_segmentation"][..., :3], gt_pose=object_pose - # ) - # elif "diffuse_albedo" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # albedo_img=self._tiled_camera.data.output["diffuse_albedo"], gt_pose=object_pose - # ) - # elif "simple_shading" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # simple_shading_img=self._tiled_camera.data.output["simple_shading"], gt_pose=object_pose - # ) - - # self.embeddings = embeddings.clone().detach() - # # compute keypoints for goal cube - # compute_keypoints( - # 
pose=torch.cat((torch.zeros_like(self.goal_pos), self.goal_rot), dim=-1), out=self.goal_keypoints - # ) + pose_loss, embeddings = self.feature_extractor.step( + self._tiled_camera.data.output["rgb"], + self._tiled_camera.data.output["depth"], + self._tiled_camera.data.output["semantic_segmentation"][..., :3], + object_pose, + ) + + self.embeddings = embeddings.clone().detach() + # compute keypoints for goal cube + compute_keypoints( + pose=torch.cat((torch.zeros_like(self.goal_pos), self.goal_rot), dim=-1), out=self.goal_keypoints + ) obs = torch.cat( ( - # self.embeddings, - object_pose, + self.embeddings, self.goal_keypoints.view(-1, 24), ), dim=-1, ) + # log pose loss from CNN training - # if "log" not in self.extras: - # self.extras["log"] = dict() - # self.extras["log"]["pose_loss"] = pose_loss + if "log" not in self.extras: + self.extras["log"] = dict() + self.extras["log"]["pose_loss"] = pose_loss return obs @@ -319,8 +269,7 @@ def _compute_proprio_observations(self): def _compute_states(self): """Asymmetric states for the critic.""" sim_states = self.compute_full_state() - # state = torch.cat((sim_states, self.embeddings), dim=-1) - state = sim_states + state = torch.cat((sim_states, self.embeddings), dim=-1) return state def _get_observations(self) -> dict: From 8fd06bc0ef26f3b3a17a953d06c4c9f269d317a6 Mon Sep 17 00:00:00 2001 From: mtrepte Date: Fri, 21 Nov 2025 16:52:25 -0800 Subject: [PATCH 3/3] adsf --- .../isaaclab/sensors/camera/tiled_camera.py | 16 ++++++++++++---- .../direct/shadow_hand/shadow_hand_vision_env.py | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py index bf057d27954..a36111853e7 100644 --- a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py +++ b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py @@ -189,8 +189,12 @@ def _initialize_impl(self): ) self._render_product_paths = [rp.path] - rep.AnnotatorRegistry.register_annotator_from_aov(aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4) - rep.AnnotatorRegistry.register_annotator_from_aov(aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4) + rep.AnnotatorRegistry.register_annotator_from_aov( + aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4 + ) + rep.AnnotatorRegistry.register_annotator_from_aov( + aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4 + ) # Define the annotators based on requested data types self._annotators = dict() for annotator_type in self.cfg.data_types: @@ -198,10 +202,14 @@ def _initialize_impl(self): annotator = rep.AnnotatorRegistry.get_annotator("rgb", device=self.device, do_array_copy=False) self._annotators["rgba"] = annotator elif annotator_type == "diffuse_albedo": - annotator = rep.AnnotatorRegistry.get_annotator("DiffuseAlbedoSD", device=self.device, do_array_copy=False) + annotator = rep.AnnotatorRegistry.get_annotator( + "DiffuseAlbedoSD", device=self.device, do_array_copy=False + ) self._annotators["diffuse_albedo"] = annotator elif annotator_type == "simple_shading": - annotator = rep.AnnotatorRegistry.get_annotator("SimpleShadingSD", device=self.device, do_array_copy=False) + annotator = rep.AnnotatorRegistry.get_annotator( + "SimpleShadingSD", device=self.device, do_array_copy=False + ) self._annotators["simple_shading"] = annotator elif annotator_type == "depth" or annotator_type == "distance_to_image_plane": # keep depth for backwards compatibility diff --git 
a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py index c5a51a34850..6a8890c7ed2 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py @@ -73,6 +73,7 @@ class ShadowHandVisionRGBEnvCfg(ShadowHandEnvCfg): observation_space = 164 + 27 # state observation + vision CNN embedding state_space = 187 + 27 # asymettric states + vision CNN embedding + @configclass class ShadowHandVisionDiffuseAlbedoEnvCfg(ShadowHandEnvCfg): # scene
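
For reference, a minimal sketch of exercising the new 3-channel FeatureExtractorNetwork path outside the environment. It assumes the import path implied by the file location in this patch (isaaclab_tasks.direct.shadow_hand.feature_extractor) and the 120x120 camera resolution the configs settle on in PATCH 2/3; num_envs and demo_albedo_batch are illustrative stand-ins for the batch the environment reads from self._tiled_camera.data.output["diffuse_albedo"].

import torch

from isaaclab_tasks.direct.shadow_hand.feature_extractor import FeatureExtractorNetwork

num_envs, height, width = 4, 120, 120

# Synthetic stand-in for a (N, H, W, 3) diffuse-albedo batch, already scaled to [0, 1]
# the way FeatureExtractor._preprocess_images does (img / 255.0).
demo_albedo_batch = torch.rand(num_envs, height, width, 3)

# num_channel=3 mirrors FeatureExtractorCfg(num_channel=3) used by the
# DiffuseAlbedo/SimpleShading/RGB/Segmentation env configs added above.
model = FeatureExtractorNetwork(num_channel=3)
model.eval()

with torch.no_grad():
    # forward() permutes to (N, C, H, W), normalizes the three channels,
    # and regresses the cube pose/keypoint vector used as the policy embedding.
    predicted_pose = model(demo_albedo_batch)

print(predicted_pose.shape)  # last dim matches the 27-dim target (3 pos + 24 keypoints) regressed above

The same (N, H, W, C) convention applies to the other single-modality variants: rgb, simple_shading, and semantic_segmentation use num_channel=3, depth uses num_channel=1, and the original rgb+depth+segmentation pipeline keeps num_channel=7.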