Adjust stochastic depth dropout probability calculation (NVIDIA#6120)

Signed-off-by: Ante Jukić <[email protected]> Signed-off-by: hsiehjackson <[email protected]>
hsiehjackson · Jun 2, 2023 · c648c90 · c648c90
1 parent 1b835ba
commit c648c90
Show file tree

Hide file tree

Showing 12 changed files with 34 additions and 27 deletions.
diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst
@@ -592,7 +592,7 @@ use it, specify the following parameters in the encoder config file to reproduce
         # ...
         stochastic_depth_drop_prob: 0.3
         stochastic_depth_mode: linear  # linear or uniform
-        stochastic_depth_start_layer: 0
+        stochastic_depth_start_layer: 1
 
 See :ref:`documentation of ConformerEncoder <conformer-encoder-api>` for more details. Note that stochastic depth
 is supported for both CTC and Transducer model variations (or any other kind of model/loss that's using

diff --git a/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml b/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml
@@ -131,7 +131,7 @@ model:
     # set to non-zero to enable stochastic depth
     stochastic_depth_drop_prob: 0.0
     stochastic_depth_mode: linear  # linear or uniform
-    stochastic_depth_start_layer: 0
+    stochastic_depth_start_layer: 1
 
   decoder:
     _target_: nemo.collections.asr.modules.ConvASRDecoder

diff --git a/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml b/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml
@@ -141,7 +141,7 @@ model:
     # set to non-zero to enable stochastic depth
     stochastic_depth_drop_prob: 0.0
     stochastic_depth_mode: linear  # linear or uniform
-    stochastic_depth_start_layer: 0
+    stochastic_depth_start_layer: 1
 
   decoder:
     _target_: nemo.collections.asr.modules.RNNTDecoder

diff --git a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml
@@ -140,7 +140,7 @@ model:
     # set to non-zero to enable stochastic depth
     stochastic_depth_drop_prob: 0.0
     stochastic_depth_mode: linear  # linear or uniform
-    stochastic_depth_start_layer: 0
+    stochastic_depth_start_layer: 1
 
   decoder:
     _target_: nemo.collections.asr.modules.ConvASRDecoder

diff --git a/examples/asr/conf/conformer/conformer_ctc_char.yaml b/examples/asr/conf/conformer/conformer_ctc_char.yaml
@@ -115,7 +115,7 @@ model:
     # set to non-zero to enable stochastic depth
     stochastic_depth_drop_prob: 0.0
     stochastic_depth_mode: linear  # linear or uniform
-    stochastic_depth_start_layer: 0
+    stochastic_depth_start_layer: 1
 
   decoder:
     _target_: nemo.collections.asr.modules.ConvASRDecoder

diff --git a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml
@@ -144,7 +144,7 @@ model:
     # set to non-zero to enable stochastic depth
     stochastic_depth_drop_prob: 0.0
     stochastic_depth_mode: linear  # linear or uniform
-    stochastic_depth_start_layer: 0
+    stochastic_depth_start_layer: 1
 
   decoder:
     _target_: nemo.collections.asr.modules.RNNTDecoder

diff --git a/examples/asr/conf/conformer/conformer_transducer_char.yaml b/examples/asr/conf/conformer/conformer_transducer_char.yaml
@@ -139,7 +139,7 @@ model:
     # set to non-zero to enable stochastic depth
     stochastic_depth_drop_prob: 0.0
     stochastic_depth_mode: linear  # linear or uniform
-    stochastic_depth_start_layer: 0
+    stochastic_depth_start_layer: 1
 
   decoder:
     _target_: nemo.collections.asr.modules.RNNTDecoder

diff --git a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml
@@ -122,7 +122,7 @@ model:
     # set to non-zero to enable stochastic depth
     stochastic_depth_drop_prob: 0.0
     stochastic_depth_mode: linear  # linear or uniform
-    stochastic_depth_start_layer: 0
+    stochastic_depth_start_layer: 1
 
   decoder:
     _target_: nemo.collections.asr.modules.ConvASRDecoder

diff --git a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml
@@ -131,7 +131,7 @@ model:
     # set to non-zero to enable stochastic depth
     stochastic_depth_drop_prob: 0.0
     stochastic_depth_mode: linear  # linear or uniform
-    stochastic_depth_start_layer: 0
+    stochastic_depth_start_layer: 1
 
   decoder:
     _target_: nemo.collections.asr.modules.RNNTDecoder

diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py
@@ -117,7 +117,7 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin):
         stochastic_depth_start_layer (int): starting layer for stochastic depth.
             All layers before this will never be dropped. Note that drop
             probability will be adjusted accordingly if mode is "linear" when
-            start layer is > 0. Defaults to 0.
+            start layer is > 1. Defaults to 1.
     """
 
     def input_example(self, max_batch=1, max_dim=256):
@@ -207,7 +207,7 @@ def __init__(
         dropout_att=0.0,
         stochastic_depth_drop_prob: float = 0.0,
         stochastic_depth_mode: str = "linear",
-        stochastic_depth_start_layer: int = 0,
+        stochastic_depth_start_layer: int = 1,
     ):
         super().__init__()
         d_ff = d_model * ff_expansion_factor

diff --git a/nemo/collections/asr/parts/utils/regularization_utils.py b/nemo/collections/asr/parts/utils/regularization_utils.py
@@ -19,9 +19,11 @@ def compute_stochastic_depth_drop_probs(
     num_layers: int,
     stochastic_depth_drop_prob: float = 0.0,
     stochastic_depth_mode: str = "linear",
-    stochastic_depth_start_layer: int = 0,
+    stochastic_depth_start_layer: int = 1,
 ) -> List[float]:
     """Computes drop probabilities for stochastic depth regularization technique.
+    The first layer is never dropped and the starting layer needs to be greater
+    than or equal to 1.
 
     Args:
         num_layers (int): number of layers in the network.
@@ -36,22 +38,27 @@ def compute_stochastic_depth_drop_probs(
         stochastic_depth_start_layer (int): starting layer for stochastic depth.
             All layers before this will never be dropped. Note that drop
             probability will be adjusted accordingly if mode is "linear" when
-            start layer is > 0. Defaults to 0.
+            start layer is > 1. Defaults to 1.
     Returns:
         List[float]: list of drop probabilities for all layers
     """
     if not (0 <= stochastic_depth_drop_prob < 1.0):
         raise ValueError("stochastic_depth_drop_prob has to be in [0, 1).")
-    if not (0 <= stochastic_depth_start_layer <= num_layers):
-        raise ValueError("stochastic_depth_start_layer has to be in [0, num layers].")
-    L = num_layers - stochastic_depth_start_layer
+    if not (1 <= stochastic_depth_start_layer <= num_layers):
+        raise ValueError("stochastic_depth_start_layer has to be in [1, num layers].")
+
+    # Layers before `stochastic_depth_start_layer` are never dropped
     layer_drop_probs = [0.0] * stochastic_depth_start_layer
-    if stochastic_depth_mode == "linear":
-        # we are dividing by L - 1 to ensure we start from 0 probability
-        # (never drop the first layer) and end with desired drop probability.
-        layer_drop_probs += [l / (L - 1) * stochastic_depth_drop_prob for l in range(L)]
-    elif stochastic_depth_mode == "uniform":
-        layer_drop_probs += [stochastic_depth_drop_prob] * L
-    else:
-        raise ValueError('stochastic_depth_mode has to be one of ["linear", "uniform"].')
+
+    # Layers starting with `stochastic_depth_start_layer` may be dropped
+    if (L := num_layers - stochastic_depth_start_layer) > 0:
+        if stochastic_depth_mode == "linear":
+            # we start with 1/L * drop_prob and end with the desired drop probability.
+            layer_drop_probs += [l / L * stochastic_depth_drop_prob for l in range(1, L + 1)]
+        elif stochastic_depth_mode == "uniform":
+            layer_drop_probs += [stochastic_depth_drop_prob] * L
+        else:
+            raise ValueError(
+                f'stochastic_depth_mode has to be one of ["linear", "uniform"]. Current value: {stochastic_depth_mode}'
+            )
     return layer_drop_probs
diff --git a/tests/collections/asr/test_conformer_encoder.py b/tests/collections/asr/test_conformer_encoder.py
@@ -30,7 +30,7 @@ def test_stochastic_depth_model_creation(self):
 
         # linear mode
         for drop_prob in [0.3, 0.5, 0.9]:
-            for start_layer in [0, 2]:
+            for start_layer in [1, 3]:
                 model = ConformerEncoder(
                     feat_in=10,
                     n_layers=n_layers,
@@ -40,7 +40,7 @@ def test_stochastic_depth_model_creation(self):
                     stochastic_depth_start_layer=start_layer,
                 )
                 L = n_layers - start_layer
-                assert model.layer_drop_probs == [0.0] * start_layer + [drop_prob * l / (L - 1) for l in range(L)]
+                assert model.layer_drop_probs == [0.0] * start_layer + [drop_prob * l / L for l in range(1, L + 1)]
 
         # uniform mode
         for drop_prob in [0.3, 0.5, 0.9]:
@@ -71,7 +71,7 @@ def test_stochastic_depth_model_creation(self):
         with pytest.raises(ValueError, match="stochastic_depth_mode has to be one of"):
             ConformerEncoder(feat_in=10, n_layers=n_layers, d_model=4, feat_out=8, stochastic_depth_mode="weird")
 
-        for start_layer in [-1, 5]:
+        for start_layer in [-1, 0, 5]:
             with pytest.raises(ValueError, match="stochastic_depth_start_layer has to be in"):
                 ConformerEncoder(
                     feat_in=10, n_layers=n_layers, d_model=4, feat_out=8, stochastic_depth_start_layer=start_layer,