From 83f4b6f1a7d73361710debf94052ff8d0c5d04a7 Mon Sep 17 00:00:00 2001 From: mtrepte Date: Fri, 21 Nov 2025 16:42:00 -0800 Subject: [PATCH 1/3] asdf --- apps/rendering_modes/balanced.kit | 2 + apps/rendering_modes/performance.kit | 2 + apps/rendering_modes/quality.kit | 2 + .../isaaclab/sensors/camera/tiled_camera.py | 30 ++- .../direct/shadow_hand/__init__.py | 57 +++++ .../direct/shadow_hand/feature_extractor.py | 116 +++++++--- .../shadow_hand/shadow_hand_vision_env.py | 207 ++++++++++++++++-- 7 files changed, 368 insertions(+), 48 deletions(-) diff --git a/apps/rendering_modes/balanced.kit b/apps/rendering_modes/balanced.kit index ee92625fd7e..d9b793f2915 100644 --- a/apps/rendering_modes/balanced.kit +++ b/apps/rendering_modes/balanced.kit @@ -1,3 +1,5 @@ +rtx.sdg.force.disableColorRender=false + rtx.translucency.enabled = false rtx.reflections.enabled = false diff --git a/apps/rendering_modes/performance.kit b/apps/rendering_modes/performance.kit index 3cfe6e8c0e2..3925a8e1dff 100644 --- a/apps/rendering_modes/performance.kit +++ b/apps/rendering_modes/performance.kit @@ -1,3 +1,5 @@ +rtx.sdg.force.disableColorRender=true # change to false for RGB baselines + rtx.translucency.enabled = false rtx.reflections.enabled = false diff --git a/apps/rendering_modes/quality.kit b/apps/rendering_modes/quality.kit index 8e966ddfd3b..2aa8d8eae98 100644 --- a/apps/rendering_modes/quality.kit +++ b/apps/rendering_modes/quality.kit @@ -1,3 +1,5 @@ +rtx.sdg.force.disableColorRender=false + rtx.translucency.enabled = true rtx.reflections.enabled = true diff --git a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py index 3e9982135c5..bf057d27954 100644 --- a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py +++ b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py @@ -189,12 +189,20 @@ def _initialize_impl(self): ) self._render_product_paths = [rp.path] + rep.AnnotatorRegistry.register_annotator_from_aov(aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4) + rep.AnnotatorRegistry.register_annotator_from_aov(aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4) # Define the annotators based on requested data types self._annotators = dict() for annotator_type in self.cfg.data_types: if annotator_type == "rgba" or annotator_type == "rgb": annotator = rep.AnnotatorRegistry.get_annotator("rgb", device=self.device, do_array_copy=False) self._annotators["rgba"] = annotator + elif annotator_type == "diffuse_albedo": + annotator = rep.AnnotatorRegistry.get_annotator("DiffuseAlbedoSD", device=self.device, do_array_copy=False) + self._annotators["diffuse_albedo"] = annotator + elif annotator_type == "simple_shading": + annotator = rep.AnnotatorRegistry.get_annotator("SimpleShadingSD", device=self.device, do_array_copy=False) + self._annotators["simple_shading"] = annotator elif annotator_type == "depth" or annotator_type == "distance_to_image_plane": # keep depth for backwards compatibility annotator = rep.AnnotatorRegistry.get_annotator( @@ -254,13 +262,16 @@ def _update_buffers_impl(self, env_ids: Sequence[int]): else: tiled_data_buffer = tiled_data_buffer.to(device=self.device) - # process data for different segmentation types + # process data for different segmentation types and custom annotators # Note: Replicator returns raw buffers of dtype uint32 for segmentation types # so we need to convert them to uint8 4 channel images for colorized types + # Note: Custom annotators (diffuse_albedo, simple_shading) 
also return 4 channel data if ( (data_type == "semantic_segmentation" and self.cfg.colorize_semantic_segmentation) or (data_type == "instance_segmentation_fast" and self.cfg.colorize_instance_segmentation) or (data_type == "instance_id_segmentation_fast" and self.cfg.colorize_instance_id_segmentation) + or data_type == "diffuse_albedo" + or data_type == "simple_shading" ): tiled_data_buffer = wp.array( ptr=tiled_data_buffer.ptr, shape=(*tiled_data_buffer.shape, 4), dtype=wp.uint8, device=self.device @@ -271,6 +282,13 @@ def _update_buffers_impl(self, env_ids: Sequence[int]): if data_type == "motion_vectors": tiled_data_buffer = tiled_data_buffer[:, :, :2].contiguous() + # For diffuse albedo, keep only the first three channels (RGB) + if data_type == "diffuse_albedo": + tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous() + # For simple shading, keep only the first three channels (RGB) + if data_type == "simple_shading": + tiled_data_buffer = tiled_data_buffer[:, :, :3].contiguous() + wp.launch( kernel=reshape_tiled_image, dim=(self._view.count, self.cfg.height, self.cfg.width), @@ -347,6 +365,16 @@ def _create_buffers(self): if "rgb" in self.cfg.data_types: # RGB is the first 3 channels of RGBA data_dict["rgb"] = data_dict["rgba"][..., :3] + if "diffuse_albedo" in self.cfg.data_types: + data_dict["diffuse_albedo"] = torch.zeros( + (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8 + ).contiguous() + data_dict["diffuse_albedo"] = data_dict["diffuse_albedo"][..., :3] + if "simple_shading" in self.cfg.data_types: + data_dict["simple_shading"] = torch.zeros( + (self._view.count, self.cfg.height, self.cfg.width, 4), device=self.device, dtype=torch.uint8 + ).contiguous() + data_dict["simple_shading"] = data_dict["simple_shading"][..., :3] if "distance_to_image_plane" in self.cfg.data_types: data_dict["distance_to_image_plane"] = torch.zeros( (self._view.count, self.cfg.height, self.cfg.width, 1), device=self.device, dtype=torch.float32 diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py index ed316e6e267..3ce6c42c5b6 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/__init__.py @@ -64,6 +64,63 @@ }, ) + +gym.register( + id="Isaac-Repose-Cube-Shadow-Segmentation-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionSegmentationEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + + +gym.register( + id="Isaac-Repose-Cube-Shadow-RGB-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionRGBEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + +gym.register( + id="Isaac-Repose-Cube-Shadow-DiffuseAlbedo-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": 
f"{__name__}.shadow_hand_vision_env:ShadowHandVisionDiffuseAlbedoEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + +gym.register( + id="Isaac-Repose-Cube-Shadow-SimpleShading-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionSimpleShadingEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + +gym.register( + id="Isaac-Repose-Cube-Shadow-Depth-Direct-v0", + entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", + disable_env_checker=True, + kwargs={ + "env_cfg_entry_point": f"{__name__}.shadow_hand_vision_env:ShadowHandVisionDepthEnvCfg", + "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ShadowHandVisionFFPPORunnerCfg", + "rl_games_cfg_entry_point": f"{agents.__name__}:rl_games_ppo_vision_cfg.yaml", + }, +) + gym.register( id="Isaac-Repose-Cube-Shadow-Vision-Direct-Play-v0", entry_point=f"{__name__}.shadow_hand_vision_env:ShadowHandVisionEnv", diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py index 82d76ec7f1e..a9f7785654c 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/feature_extractor.py @@ -16,9 +16,9 @@ class FeatureExtractorNetwork(nn.Module): """CNN architecture used to regress keypoint positions of the in-hand cube from image data.""" - def __init__(self): + def __init__(self, num_channel): super().__init__() - num_channel = 7 + self.num_channel = num_channel self.cnn = nn.Sequential( nn.Conv2d(num_channel, 16, kernel_size=6, stride=2, padding=0), nn.ReLU(), @@ -45,8 +45,11 @@ def __init__(self): def forward(self, x): x = x.permute(0, 3, 1, 2) - x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :]) - x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :]) + if self.num_channel == 7: + x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :]) + x[:, 4:7, :, :] = self.data_transforms(x[:, 4:7, :, :]) + elif self.num_channel == 3: + x[:, 0:3, :, :] = self.data_transforms(x[:, 0:3, :, :]) cnn_x = self.cnn(x) out = self.linear(cnn_x.view(-1, 128)) return out @@ -65,6 +68,8 @@ class FeatureExtractorCfg: write_image_to_file: bool = False """If True, the images from the camera sensor are written to file. Default is False.""" + num_channel: int = 7 + class FeatureExtractor: """Class for extracting features from image data. 
@@ -86,7 +91,7 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None = self.device = device # Feature extractor model - self.feature_extractor = FeatureExtractorNetwork() + self.feature_extractor = FeatureExtractorNetwork(self.cfg.num_channel) self.feature_extractor.to(self.device) self.step_count = 0 @@ -112,8 +117,13 @@ def __init__(self, cfg: FeatureExtractorCfg, device: str, log_dir: str | None = self.feature_extractor.eval() def _preprocess_images( - self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + self, + rgb_img: torch.Tensor, + depth_img: torch.Tensor, + segmentation_img: torch.Tensor, + albedo_img: torch.Tensor | None = None, + simple_shading_img: torch.Tensor | None = None, + ) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]: """Preprocesses the input images. Args: @@ -122,20 +132,37 @@ def _preprocess_images( segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3) Returns: - tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Preprocessed RGB, depth, and segmentation + tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]: + Preprocessed RGB, depth, segmentation, albedo, and simple shading images """ - rgb_img = rgb_img / 255.0 + if rgb_img is not None: + rgb_img = rgb_img / 255.0 # process depth image - depth_img[depth_img == float("inf")] = 0 - depth_img /= 5.0 - depth_img /= torch.max(depth_img) + if depth_img is not None: + depth_img[depth_img == float("inf")] = 0 + depth_img /= 5.0 + depth_img /= torch.max(depth_img) # process segmentation image - segmentation_img = segmentation_img / 255.0 - mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True) - segmentation_img -= mean_tensor - return rgb_img, depth_img, segmentation_img - - def _save_images(self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor): + if segmentation_img is not None: + segmentation_img = segmentation_img / 255.0 + mean_tensor = torch.mean(segmentation_img, dim=(1, 2), keepdim=True) + segmentation_img -= mean_tensor + # process albedo image + if albedo_img is not None: + albedo_img = albedo_img / 255.0 + # process simple shading image + if simple_shading_img is not None: + simple_shading_img = simple_shading_img / 255.0 + return rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img + + def _save_images( + self, + rgb_img: torch.Tensor | None, + depth_img: torch.Tensor | None, + segmentation_img: torch.Tensor | None, + albedo_img: torch.Tensor | None, + simple_shading_img: torch.Tensor | None, + ): """Writes image buffers to file. Args: @@ -143,12 +170,25 @@ def _save_images(self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentat depth_img (torch.Tensor): Depth image tensor. Shape: (N, H, W, 1). segmentation_img (torch.Tensor): Segmentation image tensor. Shape: (N, H, W, 3). 
""" - save_images_to_file(rgb_img, "shadow_hand_rgb.png") - save_images_to_file(depth_img, "shadow_hand_depth.png") - save_images_to_file(segmentation_img, "shadow_hand_segmentation.png") + if rgb_img is not None: + save_images_to_file(rgb_img, "shadow_hand_rgb.png") + if depth_img is not None: + save_images_to_file(depth_img, "shadow_hand_depth.png") + if segmentation_img is not None: + save_images_to_file(segmentation_img, "shadow_hand_segmentation.png") + if albedo_img is not None: + save_images_to_file(albedo_img, "shadow_hand_diffuse_albedo.png") + if simple_shading_img is not None: + save_images_to_file(simple_shading_img, "shadow_hand_simple_shading.png") def step( - self, rgb_img: torch.Tensor, depth_img: torch.Tensor, segmentation_img: torch.Tensor, gt_pose: torch.Tensor + self, + rgb_img: torch.Tensor = None, + depth_img: torch.Tensor = None, + segmentation_img: torch.Tensor = None, + albedo_img: torch.Tensor = None, + simple_shading_img: torch.Tensor = None, + gt_pose: torch.Tensor = None, ) -> tuple[torch.Tensor, torch.Tensor]: """Extracts the features using the images and trains the model if the train flag is set to True. @@ -162,15 +202,28 @@ def step( tuple[torch.Tensor, torch.Tensor]: Pose loss and predicted pose. """ - rgb_img, depth_img, segmentation_img = self._preprocess_images(rgb_img, depth_img, segmentation_img) + rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img = self._preprocess_images( + rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img + ) if self.cfg.write_image_to_file: - self._save_images(rgb_img, depth_img, segmentation_img) + self._save_images(rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img) if self.cfg.train: with torch.enable_grad(): with torch.inference_mode(False): - img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + if rgb_img is not None and depth_img is not None and segmentation_img is not None: + img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + elif albedo_img is not None: + img_input = albedo_img + elif simple_shading_img is not None: + img_input = simple_shading_img + elif rgb_img is not None: + img_input = rgb_img + elif depth_img is not None: + img_input = depth_img + elif segmentation_img is not None: + img_input = segmentation_img self.optimizer.zero_grad() predicted_pose = self.feature_extractor(img_input) @@ -189,6 +242,17 @@ def step( return pose_loss, predicted_pose else: - img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + if albedo_img is not None: + img_input = albedo_img + elif simple_shading_img is not None: + img_input = simple_shading_img + elif rgb_img is not None and depth_img is not None and segmentation_img is not None: + img_input = torch.cat((rgb_img, depth_img, segmentation_img), dim=-1) + elif rgb_img is not None: + img_input = rgb_img + elif depth_img is not None: + img_input = depth_img + elif segmentation_img is not None: + img_input = segmentation_img predicted_pose = self.feature_extractor(img_input) return None, predicted_pose diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py index 13bc6a55328..12ad0e77985 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py @@ -41,10 +41,124 @@ class ShadowHandVisionEnvCfg(ShadowHandEnvCfg): 
spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=120, - height=120, + width=240, + height=240, ) - feature_extractor = FeatureExtractorCfg() + feature_extractor = FeatureExtractorCfg(num_channel=7) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionRGBEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["rgb"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + +@configclass +class ShadowHandVisionDiffuseAlbedoEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["diffuse_albedo"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionSimpleShadingEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["simple_shading"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) + + # env + observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionDepthEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["depth"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=1) + + # env + 
observation_space = 164 + 27 # state observation + vision CNN embedding + state_space = 187 + 27 # asymettric states + vision CNN embedding + + +@configclass +class ShadowHandVisionSegmentationEnvCfg(ShadowHandEnvCfg): + # scene + scene: InteractiveSceneCfg = InteractiveSceneCfg(num_envs=1225, env_spacing=2.0, replicate_physics=True) + + # camera + tiled_camera: TiledCameraCfg = TiledCameraCfg( + prim_path="/World/envs/env_.*/Camera", + offset=TiledCameraCfg.OffsetCfg(pos=(0, -0.35, 1.0), rot=(0.7071, 0.0, 0.7071, 0.0), convention="world"), + data_types=["semantic_segmentation"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) + ), + width=240, + height=240, + ) + feature_extractor = FeatureExtractorCfg(num_channel=3) # env observation_space = 164 + 27 # state observation + vision CNN embedding @@ -102,32 +216,82 @@ def _compute_image_observations(self): object_pose = torch.cat([self.object_pos, self.gt_keypoints.view(-1, 24)], dim=-1) - # train CNN to regress on keypoint positions - pose_loss, embeddings = self.feature_extractor.step( - self._tiled_camera.data.output["rgb"], - self._tiled_camera.data.output["depth"], - self._tiled_camera.data.output["semantic_segmentation"][..., :3], - object_pose, - ) + # If requested, write out camera images using the feature extractor's utilities + if getattr(self.feature_extractor.cfg, "write_image_to_file", False): + rgb_img = self._tiled_camera.data.output["rgb"] if "rgb" in self.cfg.tiled_camera.data_types else None + depth_img = self._tiled_camera.data.output["depth"] if "depth" in self.cfg.tiled_camera.data_types else None + segmentation_img = ( + self._tiled_camera.data.output["semantic_segmentation"][..., :3] + if "semantic_segmentation" in self.cfg.tiled_camera.data_types + else None + ) + albedo_img = ( + self._tiled_camera.data.output["diffuse_albedo"] + if "diffuse_albedo" in self.cfg.tiled_camera.data_types + else None + ) + simple_shading_img = ( + self._tiled_camera.data.output["simple_shading"] + if "simple_shading" in self.cfg.tiled_camera.data_types + else None + ) + + pre_rgb, pre_depth, pre_seg, pre_albedo, pre_simple_shading = self.feature_extractor._preprocess_images( + rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img + ) + self.feature_extractor._save_images(pre_rgb, pre_depth, pre_seg, pre_albedo, pre_simple_shading) - self.embeddings = embeddings.clone().detach() - # compute keypoints for goal cube - compute_keypoints( - pose=torch.cat((torch.zeros_like(self.goal_pos), self.goal_rot), dim=-1), out=self.goal_keypoints - ) + # train CNN to regress on keypoint positions + # if ( + # "rgb" in self.cfg.tiled_camera.data_types + # and "depth" in self.cfg.tiled_camera.data_types + # and "semantic_segmentation" in self.cfg.tiled_camera.data_types + # ): + # pose_loss, embeddings = self.feature_extractor.step( + # self._tiled_camera.data.output["rgb"], + # self._tiled_camera.data.output["depth"], + # self._tiled_camera.data.output["semantic_segmentation"][..., :3], + # object_pose, + # ) + # elif "rgb" in self.cfg.tiled_camera.data_types: + # pose_loss, embeddings = self.feature_extractor.step( + # rgb_img=self._tiled_camera.data.output["rgb"], gt_pose=object_pose + # ) + # elif "depth" in self.cfg.tiled_camera.data_types: + # pose_loss, embeddings = self.feature_extractor.step( + # depth_img=self._tiled_camera.data.output["depth"], gt_pose=object_pose + # ) + # elif "semantic_segmentation" in self.cfg.tiled_camera.data_types: + # 
pose_loss, embeddings = self.feature_extractor.step( + # segmentation_img=self._tiled_camera.data.output["semantic_segmentation"][..., :3], gt_pose=object_pose + # ) + # elif "diffuse_albedo" in self.cfg.tiled_camera.data_types: + # pose_loss, embeddings = self.feature_extractor.step( + # albedo_img=self._tiled_camera.data.output["diffuse_albedo"], gt_pose=object_pose + # ) + # elif "simple_shading" in self.cfg.tiled_camera.data_types: + # pose_loss, embeddings = self.feature_extractor.step( + # simple_shading_img=self._tiled_camera.data.output["simple_shading"], gt_pose=object_pose + # ) + + # self.embeddings = embeddings.clone().detach() + # # compute keypoints for goal cube + # compute_keypoints( + # pose=torch.cat((torch.zeros_like(self.goal_pos), self.goal_rot), dim=-1), out=self.goal_keypoints + # ) obs = torch.cat( ( - self.embeddings, + # self.embeddings, + object_pose, self.goal_keypoints.view(-1, 24), ), dim=-1, ) - # log pose loss from CNN training - if "log" not in self.extras: - self.extras["log"] = dict() - self.extras["log"]["pose_loss"] = pose_loss + # if "log" not in self.extras: + # self.extras["log"] = dict() + # self.extras["log"]["pose_loss"] = pose_loss return obs @@ -155,7 +319,8 @@ def _compute_proprio_observations(self): def _compute_states(self): """Asymmetric states for the critic.""" sim_states = self.compute_full_state() - state = torch.cat((sim_states, self.embeddings), dim=-1) + # state = torch.cat((sim_states, self.embeddings), dim=-1) + state = sim_states return state def _get_observations(self) -> dict: From ee0cc9aa8ec561e23a290faba0ce3e689ec2a8dd Mon Sep 17 00:00:00 2001 From: mtrepte Date: Fri, 21 Nov 2025 16:50:46 -0800 Subject: [PATCH 2/3] fix --- .../shadow_hand/shadow_hand_vision_env.py | 111 +++++------------- 1 file changed, 30 insertions(+), 81 deletions(-) diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py index 12ad0e77985..c5a51a34850 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py @@ -41,8 +41,8 @@ class ShadowHandVisionEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=7) @@ -64,8 +64,8 @@ class ShadowHandVisionRGBEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=3) @@ -86,8 +86,8 @@ class ShadowHandVisionDiffuseAlbedoEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=3) @@ -109,8 +109,8 @@ class ShadowHandVisionSimpleShadingEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=3) @@ -132,8 +132,8 @@ class 
ShadowHandVisionDepthEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=1) @@ -155,8 +155,8 @@ class ShadowHandVisionSegmentationEnvCfg(ShadowHandEnvCfg): spawn=sim_utils.PinholeCameraCfg( focal_length=24.0, focus_distance=400.0, horizontal_aperture=20.955, clipping_range=(0.1, 20.0) ), - width=240, - height=240, + width=120, + height=120, ) feature_extractor = FeatureExtractorCfg(num_channel=3) @@ -216,82 +216,32 @@ def _compute_image_observations(self): object_pose = torch.cat([self.object_pos, self.gt_keypoints.view(-1, 24)], dim=-1) - # If requested, write out camera images using the feature extractor's utilities - if getattr(self.feature_extractor.cfg, "write_image_to_file", False): - rgb_img = self._tiled_camera.data.output["rgb"] if "rgb" in self.cfg.tiled_camera.data_types else None - depth_img = self._tiled_camera.data.output["depth"] if "depth" in self.cfg.tiled_camera.data_types else None - segmentation_img = ( - self._tiled_camera.data.output["semantic_segmentation"][..., :3] - if "semantic_segmentation" in self.cfg.tiled_camera.data_types - else None - ) - albedo_img = ( - self._tiled_camera.data.output["diffuse_albedo"] - if "diffuse_albedo" in self.cfg.tiled_camera.data_types - else None - ) - simple_shading_img = ( - self._tiled_camera.data.output["simple_shading"] - if "simple_shading" in self.cfg.tiled_camera.data_types - else None - ) - - pre_rgb, pre_depth, pre_seg, pre_albedo, pre_simple_shading = self.feature_extractor._preprocess_images( - rgb_img, depth_img, segmentation_img, albedo_img, simple_shading_img - ) - self.feature_extractor._save_images(pre_rgb, pre_depth, pre_seg, pre_albedo, pre_simple_shading) - # train CNN to regress on keypoint positions - # if ( - # "rgb" in self.cfg.tiled_camera.data_types - # and "depth" in self.cfg.tiled_camera.data_types - # and "semantic_segmentation" in self.cfg.tiled_camera.data_types - # ): - # pose_loss, embeddings = self.feature_extractor.step( - # self._tiled_camera.data.output["rgb"], - # self._tiled_camera.data.output["depth"], - # self._tiled_camera.data.output["semantic_segmentation"][..., :3], - # object_pose, - # ) - # elif "rgb" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # rgb_img=self._tiled_camera.data.output["rgb"], gt_pose=object_pose - # ) - # elif "depth" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # depth_img=self._tiled_camera.data.output["depth"], gt_pose=object_pose - # ) - # elif "semantic_segmentation" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # segmentation_img=self._tiled_camera.data.output["semantic_segmentation"][..., :3], gt_pose=object_pose - # ) - # elif "diffuse_albedo" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # albedo_img=self._tiled_camera.data.output["diffuse_albedo"], gt_pose=object_pose - # ) - # elif "simple_shading" in self.cfg.tiled_camera.data_types: - # pose_loss, embeddings = self.feature_extractor.step( - # simple_shading_img=self._tiled_camera.data.output["simple_shading"], gt_pose=object_pose - # ) - - # self.embeddings = embeddings.clone().detach() - # # compute keypoints for goal cube - # compute_keypoints( - # 
pose=torch.cat((torch.zeros_like(self.goal_pos), self.goal_rot), dim=-1), out=self.goal_keypoints - # ) + pose_loss, embeddings = self.feature_extractor.step( + self._tiled_camera.data.output["rgb"], + self._tiled_camera.data.output["depth"], + self._tiled_camera.data.output["semantic_segmentation"][..., :3], + object_pose, + ) + + self.embeddings = embeddings.clone().detach() + # compute keypoints for goal cube + compute_keypoints( + pose=torch.cat((torch.zeros_like(self.goal_pos), self.goal_rot), dim=-1), out=self.goal_keypoints + ) obs = torch.cat( ( - # self.embeddings, - object_pose, + self.embeddings, self.goal_keypoints.view(-1, 24), ), dim=-1, ) + # log pose loss from CNN training - # if "log" not in self.extras: - # self.extras["log"] = dict() - # self.extras["log"]["pose_loss"] = pose_loss + if "log" not in self.extras: + self.extras["log"] = dict() + self.extras["log"]["pose_loss"] = pose_loss return obs @@ -319,8 +269,7 @@ def _compute_proprio_observations(self): def _compute_states(self): """Asymmetric states for the critic.""" sim_states = self.compute_full_state() - # state = torch.cat((sim_states, self.embeddings), dim=-1) - state = sim_states + state = torch.cat((sim_states, self.embeddings), dim=-1) return state def _get_observations(self) -> dict: From 8fd06bc0ef26f3b3a17a953d06c4c9f269d317a6 Mon Sep 17 00:00:00 2001 From: mtrepte Date: Fri, 21 Nov 2025 16:52:25 -0800 Subject: [PATCH 3/3] adsf --- .../isaaclab/sensors/camera/tiled_camera.py | 16 ++++++++++++---- .../direct/shadow_hand/shadow_hand_vision_env.py | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py index bf057d27954..a36111853e7 100644 --- a/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py +++ b/source/isaaclab/isaaclab/sensors/camera/tiled_camera.py @@ -189,8 +189,12 @@ def _initialize_impl(self): ) self._render_product_paths = [rp.path] - rep.AnnotatorRegistry.register_annotator_from_aov(aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4) - rep.AnnotatorRegistry.register_annotator_from_aov(aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4) + rep.AnnotatorRegistry.register_annotator_from_aov( + aov="DiffuseAlbedoSD", output_data_type=np.uint8, output_channels=4 + ) + rep.AnnotatorRegistry.register_annotator_from_aov( + aov="SimpleShadingSD", output_data_type=np.uint8, output_channels=4 + ) # Define the annotators based on requested data types self._annotators = dict() for annotator_type in self.cfg.data_types: @@ -198,10 +202,14 @@ def _initialize_impl(self): annotator = rep.AnnotatorRegistry.get_annotator("rgb", device=self.device, do_array_copy=False) self._annotators["rgba"] = annotator elif annotator_type == "diffuse_albedo": - annotator = rep.AnnotatorRegistry.get_annotator("DiffuseAlbedoSD", device=self.device, do_array_copy=False) + annotator = rep.AnnotatorRegistry.get_annotator( + "DiffuseAlbedoSD", device=self.device, do_array_copy=False + ) self._annotators["diffuse_albedo"] = annotator elif annotator_type == "simple_shading": - annotator = rep.AnnotatorRegistry.get_annotator("SimpleShadingSD", device=self.device, do_array_copy=False) + annotator = rep.AnnotatorRegistry.get_annotator( + "SimpleShadingSD", device=self.device, do_array_copy=False + ) self._annotators["simple_shading"] = annotator elif annotator_type == "depth" or annotator_type == "distance_to_image_plane": # keep depth for backwards compatibility diff --git 
a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py index c5a51a34850..6a8890c7ed2 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/shadow_hand/shadow_hand_vision_env.py @@ -73,6 +73,7 @@ class ShadowHandVisionRGBEnvCfg(ShadowHandEnvCfg): observation_space = 164 + 27 # state observation + vision CNN embedding state_space = 187 + 27 # asymettric states + vision CNN embedding + @configclass class ShadowHandVisionDiffuseAlbedoEnvCfg(ShadowHandEnvCfg): # scene
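
For reference, a minimal sketch of exercising the new 3-channel FeatureExtractorNetwork path outside the environment. It assumes the import path implied by the file location in this patch (isaaclab_tasks.direct.shadow_hand.feature_extractor) and the 120x120 camera resolution the configs settle on in PATCH 2/3; num_envs and demo_albedo_batch are illustrative stand-ins for the batch the environment reads from self._tiled_camera.data.output["diffuse_albedo"].

import torch

from isaaclab_tasks.direct.shadow_hand.feature_extractor import FeatureExtractorNetwork

num_envs, height, width = 4, 120, 120

# Synthetic stand-in for a (N, H, W, 3) diffuse-albedo batch, already scaled to [0, 1]
# the way FeatureExtractor._preprocess_images does (img / 255.0).
demo_albedo_batch = torch.rand(num_envs, height, width, 3)

# num_channel=3 mirrors FeatureExtractorCfg(num_channel=3) used by the
# DiffuseAlbedo/SimpleShading/RGB/Segmentation env configs added above.
model = FeatureExtractorNetwork(num_channel=3)
model.eval()

with torch.no_grad():
    # forward() permutes to (N, C, H, W), normalizes the three channels,
    # and regresses the cube pose/keypoint vector used as the policy embedding.
    predicted_pose = model(demo_albedo_batch)

print(predicted_pose.shape)  # last dim matches the 27-dim target (3 pos + 24 keypoints) regressed above

The same (N, H, W, C) convention applies to the other single-modality variants: rgb, simple_shading, and semantic_segmentation use num_channel=3, depth uses num_channel=1, and the original rgb+depth+segmentation pipeline keeps num_channel=7.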