
Speedup RNN-T greedy decoding #7926

Merged 47 commits on Jan 16, 2024

Commits
9342489
Add structure for batched hypotheses
artbataev Nov 21, 2023
7bcc4c0
Add faster decoding algo
artbataev Nov 21, 2023
7a0942f
Simplify max_symbols support. More speedup
artbataev Nov 22, 2023
26ec40c
Clean up
artbataev Nov 22, 2023
1d556ea
Clean up
artbataev Nov 22, 2023
cf631dd
Filtering only when necessary
artbataev Nov 22, 2023
a50965d
Move max_symbols check to the end of loop
artbataev Nov 22, 2023
510eb90
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 22, 2023
659cfff
Support returning prediction network states
artbataev Nov 22, 2023
40d1568
Support preserve_alignments flag
artbataev Nov 22, 2023
ca2d94b
Support confidence
artbataev Nov 22, 2023
b328fac
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 22, 2023
7997bd6
Partial fix for jit compatibility
artbataev Nov 23, 2023
6f7746b
Merge branch 'main' into speedup_rnnt_greedy_decoding
artbataev Nov 23, 2023
95da9d1
Support switching between decoding algorithms
artbataev Nov 23, 2023
ef35381
Fix switching algorithms
artbataev Nov 23, 2023
ca5779d
Clean up
artbataev Nov 23, 2023
97092ff
Clean up
artbataev Nov 23, 2023
c9785ff
Fix max symbols per step
artbataev Nov 23, 2023
1e09979
Add tests. Preserve torch.jit compatibility for BatchedHyps
artbataev Nov 24, 2023
f4b7b68
Separate projection from Joint calculation in decoding
artbataev Dec 13, 2023
d67b14b
Fix config instantiation
artbataev Dec 13, 2023
c7d298d
Merge remote-tracking branch 'origin/main' into speedup_rnnt_greedy_d…
artbataev Jan 10, 2024
2ea8f7f
Fix after main merge
artbataev Jan 10, 2024
5c8e18e
Add tests for batched hypotheses
artbataev Jan 10, 2024
e8c43d0
Speedup alignments
artbataev Jan 10, 2024
ffe2a67
Test alignments
artbataev Jan 10, 2024
77bf674
Fix alignments
artbataev Jan 10, 2024
02a9bbd
Fix tests for alignments
artbataev Jan 11, 2024
83c4793
Add more tests
artbataev Jan 11, 2024
430e159
Fix confidence tests
artbataev Jan 11, 2024
266be2c
Avoid common package modification
artbataev Jan 11, 2024
ce33493
Support Stateless prediction network
artbataev Jan 11, 2024
9d545ee
Improve stateless decoder support. Separate alignments and confidence
artbataev Jan 11, 2024
9669149
Fix alignments for max_symbols_per_step
artbataev Jan 11, 2024
1dbf29e
Fix alignments for max_symbols_per_step=0
artbataev Jan 11, 2024
b4421cd
Fix tests
artbataev Jan 12, 2024
3e1ca1e
Fix test
artbataev Jan 12, 2024
1b97e33
Add comments
artbataev Jan 12, 2024
4429432
Batched Hyps/Alignments: lengths -> current_lengths
artbataev Jan 12, 2024
b7b83df
Simplify indexing
artbataev Jan 12, 2024
3df991a
Improve type annotations
artbataev Jan 15, 2024
31649fa
Rework test for greedy decoding
artbataev Jan 15, 2024
5f67c66
Document loop_labels
artbataev Jan 16, 2024
df86b17
Raise ValueError if max_symbols_per_step <= 0
artbataev Jan 16, 2024
0f4463b
Add comments
artbataev Jan 16, 2024
c38f222
Fix test
artbataev Jan 16, 2024
14 changes: 4 additions & 10 deletions nemo/collections/asr/modules/hybrid_autoregressive_transducer.py
@@ -138,9 +138,9 @@ def return_hat_ilm(self):
def return_hat_ilm(self, hat_subtract_ilm):
self._return_hat_ilm = hat_subtract_ilm

def joint(self, f: torch.Tensor, g: torch.Tensor) -> Union[torch.Tensor, HATJointOutput]:
def joint_after_projection(self, f: torch.Tensor, g: torch.Tensor) -> Union[torch.Tensor, HATJointOutput]:
Collaborator:
It would be better to have a similar API name across the RNNT Joints; is it necessary to change this?

Collaborator Author:
The API is changed for all Joints, starting from AbstractRNNTJoint (see details in Slack)

Collaborator Author (@artbataev, Jan 16, 2024):
Now it is the following:

class AbstractRNNTJoint(NeuralModule, ABC):
    @abstractmethod
    def project_encoder(self, encoder_output):
        raise NotImplementedError()  # can be Linear or identity

    @abstractmethod
    def project_prednet(self, prednet_output):
        raise NotImplementedError()  # can be Linear or identity

    @abstractmethod
    def joint_after_projection(self, f, g):
        """This is the main method that one should implement for Joint"""
        raise NotImplementedError()

    def joint(self, f, g):
        """Full joint computation. Not abstract anymore!"""
        return self.joint_after_projection(self.project_encoder(f), self.project_prednet(g))
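The refactored contract can be sketched with a minimal mock (`MockJoint`, the identity projections, and the addition-based joint below are hypothetical illustrations, not the actual NeMo classes):

```python
# Minimal sketch of the refactored contract, assuming identity projections.
# MockJoint and its addition-based joint are illustrative only.

class MockJoint:
    def project_encoder(self, encoder_output):
        return encoder_output  # identity stands in for a Linear layer

    def project_prednet(self, prednet_output):
        return prednet_output  # identity stands in for a Linear layer

    def joint_after_projection(self, f, g):
        return f + g  # placeholder for the real joint computation

    def joint(self, f, g):
        # Full joint: project both inputs, then combine.
        return self.joint_after_projection(
            self.project_encoder(f), self.project_prednet(g)
        )

print(MockJoint().joint(2, 3))  # 5
```

The point of the split: greedy decoding can call `project_encoder` once per utterance and `project_prednet` once per step, instead of re-projecting inside every `joint` call.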

"""
Compute the joint step of the network.
Compute the joint step of the network after Encoder/Decoder projection.

Here,
B = Batch size
@@ -169,14 +169,8 @@ def joint(self, f: torch.Tensor, g: torch.Tensor) -> Union[torch.Tensor, HATJoin
Log softmaxed tensor of shape (B, T, U, V + 1).
Internal LM probability (B, 1, U, V) -- in case of return_ilm==True.
"""
# f = [B, T, H1]
f = self.enc(f)
f.unsqueeze_(dim=2) # (B, T, 1, H)

# g = [B, U, H2]
g = self.pred(g)
g.unsqueeze_(dim=1) # (B, 1, U, H)

f = f.unsqueeze(dim=2) # (B, T, 1, H)
Collaborator:
Why remove the preemptive enc() / pred()? This is shown to be equivalent to RNNT and saves a ton of memory.

Collaborator Author:
In-place unsqueeze_ does not save memory.

Due to separating the projections, I needed to replace the in-place unsqueeze_ operation with unsqueeze. There is no memory overhead. According to the documentation (https://pytorch.org/docs/stable/generated/torch.unsqueeze.html):

"The returned tensor shares the same underlying data with this tensor."

You can check it manually:

import torch

device = torch.device('cuda:0')

def print_allocated(device, prefix=""):
    allocated_mb = torch.cuda.max_memory_allocated(device) / 1024 / 1024
    print(f"{prefix}{allocated_mb:.0f}MB")


print_allocated(device, prefix="Before: ")  # Should be 0MB

# allocate memory ~projection result
data = torch.rand([128, 30 * 1000 // 10 // 8, 640], device=device)
print_allocated(device, prefix="After project encoder output: ")  # 118MB

# apply unsqueeze
data2 = data.unsqueeze(-1)  # unsqueeze returns a new tensor, but storage is the same (only metadata is new!)
print_allocated(device, prefix="After Unsqueeze: ")  # same, 118MB
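A quick way to confirm the storage sharing (a sketch, runnable on CPU as well) is to compare the data pointers of the original tensor and the unsqueezed view:

```python
import torch

data = torch.rand(128, 375, 640)
data2 = data.unsqueeze(-1)  # a view: [128, 375, 640, 1]

# Same underlying storage, so no extra allocation -- only metadata differs.
print(data.data_ptr() == data2.data_ptr())  # True
print(tuple(data2.shape))                   # (128, 375, 640, 1)
```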

g = g.unsqueeze(dim=1) # (B, 1, U, H)
inp = f + g # [B, T, U, H]

del f
71 changes: 60 additions & 11 deletions nemo/collections/asr/modules/rnnt.py
@@ -398,6 +398,22 @@ def batch_copy_states(

return old_states

def mask_select_states(
self, states: Optional[List[torch.Tensor]], mask: torch.Tensor
) -> Optional[List[torch.Tensor]]:
"""
Return states by mask selection
Args:
states: states for the batch
mask: boolean mask for selecting states; batch dimension should be the same as for states

Returns:
states filtered by mask
"""
if states is None:
return None
return [states[0][mask]]

def batch_score_hypothesis(
self, hypotheses: List[rnnt_utils.Hypothesis], cache: Dict[Tuple[int], Any], batch_states: List[torch.Tensor]
) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
@@ -1047,6 +1063,21 @@ def batch_copy_states(

return old_states

def mask_select_states(
self, states: Tuple[torch.Tensor, torch.Tensor], mask: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Return states by mask selection
Args:
states: states for the batch
mask: boolean mask for selecting states; batch dimension should be the same as for states

Returns:
states filtered by mask
"""
# LSTM in PyTorch returns a tuple of 2 tensors as a state
return states[0][:, mask], states[1][:, mask]

# Adapter method overrides
def add_adapter(self, name: str, cfg: DictConfig):
# Update the config with correct input dim
@@ -1382,9 +1413,33 @@ def forward(

return losses, wer, wer_num, wer_denom

def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor:
def project_encoder(self, encoder_output: torch.Tensor) -> torch.Tensor:
"""
Project the encoder output to the joint hidden dimension.

Args:
encoder_output: A torch.Tensor of shape [B, T, D]

Returns:
A torch.Tensor of shape [B, T, H]
"""
return self.enc(encoder_output)

def project_prednet(self, prednet_output: torch.Tensor) -> torch.Tensor:
"""
Project the Prediction Network (Decoder) output to the joint hidden dimension.

Args:
prednet_output: A torch.Tensor of shape [B, U, D]

Returns:
A torch.Tensor of shape [B, U, H]
"""
return self.pred(prednet_output)

def joint_after_projection(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor:
Collaborator:
Revert name change

Collaborator Author:
It is essential to separate the projections from the rest of the joint computation. It introduces no memory or computational overhead. See details in Slack.

"""
Compute the joint step of the network.
Compute the joint step of the network after projection.

Here,
B = Batch size
@@ -1412,14 +1467,8 @@ def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor:
Returns:
Logits / log softmaxed tensor of shape (B, T, U, V + 1).
"""
# f = [B, T, H1]
f = self.enc(f)
f.unsqueeze_(dim=2) # (B, T, 1, H)

# g = [B, U, H2]
g = self.pred(g)
g.unsqueeze_(dim=1) # (B, 1, U, H)

f = f.unsqueeze(dim=2) # (B, T, 1, H)
g = g.unsqueeze(dim=1) # (B, 1, U, H)
inp = f + g # [B, T, U, H]

del f, g
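The unsqueeze-and-add step above relies on standard broadcasting; a small sketch of the shape arithmetic (shapes chosen arbitrarily):

```python
import torch

B, T, U, H = 2, 6, 4, 8
f = torch.rand(B, T, H)  # projected encoder output
g = torch.rand(B, U, H)  # projected prediction-network output

# [B, T, 1, H] + [B, 1, U, H] broadcasts to [B, T, U, H]
inp = f.unsqueeze(dim=2) + g.unsqueeze(dim=1)
print(tuple(inp.shape))  # (2, 6, 4, 8)
```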
@@ -1536,7 +1585,7 @@ def set_fuse_loss_wer(self, fuse_loss_wer, loss=None, metric=None):

@property
def fused_batch_size(self):
return self._fuse_loss_wer
return self._fused_batch_size

def set_fused_batch_size(self, fused_batch_size):
self._fused_batch_size = fused_batch_size
53 changes: 52 additions & 1 deletion nemo/collections/asr/modules/rnnt_abstract.py
@@ -28,6 +28,45 @@ class AbstractRNNTJoint(NeuralModule, ABC):
"""

@abstractmethod
def joint_after_projection(self, f: torch.Tensor, g: torch.Tensor) -> Any:
Collaborator:
Revert name change. It's fine to keep joint.

Collaborator Author:
See the comments above.

"""
Compute the joint step of the network after the projection step.
Args:
f: Output of the Encoder model after projection. A torch.Tensor of shape [B, T, H]
g: Output of the Decoder model (Prediction Network) after projection. A torch.Tensor of shape [B, U, H]

Returns:
Logits / log softmaxed tensor of shape (B, T, U, V + 1).
Arbitrary return type, preferably torch.Tensor, but not limited to (e.g., see HatJoint)
"""
raise NotImplementedError()

@abstractmethod
def project_encoder(self, encoder_output: torch.Tensor) -> torch.Tensor:
"""
Project the encoder output to the joint hidden dimension.

Args:
encoder_output: A torch.Tensor of shape [B, T, D]

Returns:
A torch.Tensor of shape [B, T, H]
"""
raise NotImplementedError()

@abstractmethod
def project_prednet(self, prednet_output: torch.Tensor) -> torch.Tensor:
"""
Project the Prediction Network (Decoder) output to the joint hidden dimension.

Args:
prednet_output: A torch.Tensor of shape [B, U, D]

Returns:
A torch.Tensor of shape [B, U, H]
"""
raise NotImplementedError()

def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor:
"""
Compute the joint step of the network.
@@ -58,7 +97,7 @@ def joint(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tensor:
Returns:
Logits / log softmaxed tensor of shape (B, T, U, V + 1).
"""
raise NotImplementedError()
return self.joint_after_projection(self.project_encoder(f), self.project_prednet(g))

@property
def num_classes_with_blank(self):
@@ -277,3 +316,15 @@ def batch_copy_states(
(L x B x H, L x B x H)
"""
raise NotImplementedError()

def mask_select_states(self, states: Any, mask: torch.Tensor) -> Any:
"""
Return states by mask selection
Args:
states: states for the batch (preferably a list of tensors, but not limited to)
mask: boolean mask for selecting states; batch dimension should be the same as for states

Returns:
states filtered by mask (same type as `states`)
"""
raise NotImplementedError()
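The mask-selection contract can be sketched for LSTM-style states, where PyTorch stores each state tensor as [num_layers, B, H], so the batch dimension to index is 1 (shapes below are illustrative):

```python
import torch

num_layers, B, H = 2, 4, 8
h = torch.rand(num_layers, B, H)  # LSTM hidden state
c = torch.rand(num_layers, B, H)  # LSTM cell state

# Keep only the hypotheses that are still active in the batched loop.
mask = torch.tensor([True, False, True, False])
h_sel, c_sel = h[:, mask], c[:, mask]  # batch dim is 1 for LSTM states
print(tuple(h_sel.shape))  # (2, 2, 8)
```

This is why the LSTM decoder indexes `states[0][:, mask]` while the stateless decoder, whose state is batch-first, indexes `states[0][mask]`.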
5 changes: 3 additions & 2 deletions nemo/collections/asr/parts/submodules/rnnt_decoding.py
@@ -316,6 +316,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int):
preserve_alignments=self.preserve_alignments,
preserve_frame_confidence=self.preserve_frame_confidence,
confidence_method_cfg=self.confidence_method_cfg,
loop_labels=self.cfg.greedy.get('loop_labels', True),
)
else:
self.decoding = rnnt_greedy_decoding.GreedyBatchedTDTInfer(
@@ -1495,8 +1496,8 @@ class RNNTDecodingConfig:
rnnt_timestamp_type: str = "all" # can be char, word or all for both

# greedy decoding config
greedy: rnnt_greedy_decoding.GreedyRNNTInferConfig = field(
default_factory=lambda: rnnt_greedy_decoding.GreedyRNNTInferConfig()
greedy: rnnt_greedy_decoding.GreedyBatchedRNNTInferConfig = field(
default_factory=rnnt_greedy_decoding.GreedyBatchedRNNTInferConfig
)

# beam decoding config