diff --git a/docs/source/asr/configs.rst b/docs/source/asr/configs.rst index 3302f635a84f..80ec488fe0c3 100644 --- a/docs/source/asr/configs.rst +++ b/docs/source/asr/configs.rst @@ -592,7 +592,7 @@ use it, specify the following parameters in the encoder config file to reproduce # ... stochastic_depth_drop_prob: 0.3 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 See :ref:`documentation of ConformerEncoder ` for more details. Note that stochastic depth is supported for both CTC and Transducer model variations (or any other kind of model/loss that's using diff --git a/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml b/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml index 023b9d22f1d3..0ad49f4c7261 100644 --- a/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/cache_aware_streaming/conformer_ctc_bpe_streaming.yaml @@ -131,7 +131,7 @@ model: # set to non-zero to enable stochastic depth stochastic_depth_drop_prob: 0.0 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 decoder: _target_: nemo.collections.asr.modules.ConvASRDecoder diff --git a/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml b/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml index 472c4abc6099..cd6b7d4a6d45 100644 --- a/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml +++ b/examples/asr/conf/conformer/cache_aware_streaming/conformer_transducer_bpe_streaming.yaml @@ -141,7 +141,7 @@ model: # set to non-zero to enable stochastic depth stochastic_depth_drop_prob: 0.0 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 decoder: _target_: 
nemo.collections.asr.modules.RNNTDecoder diff --git a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml index 3f4ba0da194c..e9b22996ddac 100644 --- a/examples/asr/conf/conformer/conformer_ctc_bpe.yaml +++ b/examples/asr/conf/conformer/conformer_ctc_bpe.yaml @@ -140,7 +140,7 @@ model: # set to non-zero to enable stochastic depth stochastic_depth_drop_prob: 0.0 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 decoder: _target_: nemo.collections.asr.modules.ConvASRDecoder diff --git a/examples/asr/conf/conformer/conformer_ctc_char.yaml b/examples/asr/conf/conformer/conformer_ctc_char.yaml index 70f3fdbaa30b..e56057bd5588 100644 --- a/examples/asr/conf/conformer/conformer_ctc_char.yaml +++ b/examples/asr/conf/conformer/conformer_ctc_char.yaml @@ -115,7 +115,7 @@ model: # set to non-zero to enable stochastic depth stochastic_depth_drop_prob: 0.0 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 decoder: _target_: nemo.collections.asr.modules.ConvASRDecoder diff --git a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml index a7a012de2c4a..bb92ce5cb0a6 100644 --- a/examples/asr/conf/conformer/conformer_transducer_bpe.yaml +++ b/examples/asr/conf/conformer/conformer_transducer_bpe.yaml @@ -144,7 +144,7 @@ model: # set to non-zero to enable stochastic depth stochastic_depth_drop_prob: 0.0 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 decoder: _target_: nemo.collections.asr.modules.RNNTDecoder diff --git a/examples/asr/conf/conformer/conformer_transducer_char.yaml b/examples/asr/conf/conformer/conformer_transducer_char.yaml index 789653641f19..977287fd7890 100644 --- a/examples/asr/conf/conformer/conformer_transducer_char.yaml +++ 
b/examples/asr/conf/conformer/conformer_transducer_char.yaml @@ -139,7 +139,7 @@ model: # set to non-zero to enable stochastic depth stochastic_depth_drop_prob: 0.0 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 decoder: _target_: nemo.collections.asr.modules.RNNTDecoder diff --git a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml index d67c83c30873..f67ede9a5073 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml @@ -122,7 +122,7 @@ model: # set to non-zero to enable stochastic depth stochastic_depth_drop_prob: 0.0 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 decoder: _target_: nemo.collections.asr.modules.ConvASRDecoder diff --git a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml index 4f6803b795f6..746b8a11109a 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml @@ -131,7 +131,7 @@ model: # set to non-zero to enable stochastic depth stochastic_depth_drop_prob: 0.0 stochastic_depth_mode: linear # linear or uniform - stochastic_depth_start_layer: 0 + stochastic_depth_start_layer: 1 decoder: _target_: nemo.collections.asr.modules.RNNTDecoder diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py index bbbc40733205..8d3ac260258e 100644 --- a/nemo/collections/asr/modules/conformer_encoder.py +++ b/nemo/collections/asr/modules/conformer_encoder.py @@ -117,7 +117,7 @@ class ConformerEncoder(NeuralModule, StreamingEncoder, Exportable, AccessMixin): stochastic_depth_start_layer (int): starting layer for stochastic depth. 
All layers before this will never be dropped. Note that drop probability will be adjusted accordingly if mode is "linear" when - start layer is > 0. Defaults to 0. + start layer is > 1. Defaults to 1. """ def input_example(self, max_batch=1, max_dim=256): @@ -207,7 +207,7 @@ def __init__( dropout_att=0.0, stochastic_depth_drop_prob: float = 0.0, stochastic_depth_mode: str = "linear", - stochastic_depth_start_layer: int = 0, + stochastic_depth_start_layer: int = 1, ): super().__init__() d_ff = d_model * ff_expansion_factor diff --git a/nemo/collections/asr/parts/utils/regularization_utils.py b/nemo/collections/asr/parts/utils/regularization_utils.py index 20a19c079c46..871b4889ad68 100644 --- a/nemo/collections/asr/parts/utils/regularization_utils.py +++ b/nemo/collections/asr/parts/utils/regularization_utils.py @@ -19,9 +19,11 @@ def compute_stochastic_depth_drop_probs( num_layers: int, stochastic_depth_drop_prob: float = 0.0, stochastic_depth_mode: str = "linear", - stochastic_depth_start_layer: int = 0, + stochastic_depth_start_layer: int = 1, ) -> List[float]: """Computes drop probabilities for stochastic depth regularization technique. + The first layer is never dropped and the starting layer needs to be greater than + or equal to 1. Args: num_layers (int): number of layers in the network. @@ -36,22 +38,27 @@ def compute_stochastic_depth_drop_probs( stochastic_depth_start_layer (int): starting layer for stochastic depth. All layers before this will never be dropped. Note that drop probability will be adjusted accordingly if mode is "linear" when - start layer is > 0. Defaults to 0. + start layer is > 1. Defaults to 1. 
Returns: List[float]: list of drop probabilities for all layers """ if not (0 <= stochastic_depth_drop_prob < 1.0): raise ValueError("stochastic_depth_drop_prob has to be in [0, 1).") - if not (0 <= stochastic_depth_start_layer <= num_layers): - raise ValueError("stochastic_depth_start_layer has to be in [0, num layers].") - L = num_layers - stochastic_depth_start_layer + if not (1 <= stochastic_depth_start_layer <= num_layers): + raise ValueError("stochastic_depth_start_layer has to be in [1, num layers].") + + # Layers before `stochastic_depth_start_layer` are never dropped layer_drop_probs = [0.0] * stochastic_depth_start_layer - if stochastic_depth_mode == "linear": - # we are dividing by L - 1 to ensure we start from 0 probability - # (never drop the first layer) and end with desired drop probability. - layer_drop_probs += [l / (L - 1) * stochastic_depth_drop_prob for l in range(L)] - elif stochastic_depth_mode == "uniform": - layer_drop_probs += [stochastic_depth_drop_prob] * L - else: - raise ValueError('stochastic_depth_mode has to be one of ["linear", "uniform"].') + + # Layers starting with `stochastic_depth_start_layer` may be dropped + if (L := num_layers - stochastic_depth_start_layer) > 0: + if stochastic_depth_mode == "linear": + # we start with 1/L * drop_prob and end with the desired drop probability. + layer_drop_probs += [l / L * stochastic_depth_drop_prob for l in range(1, L + 1)] + elif stochastic_depth_mode == "uniform": + layer_drop_probs += [stochastic_depth_drop_prob] * L + else: + raise ValueError( + f'stochastic_depth_mode has to be one of ["linear", "uniform"]. 
Current value: {stochastic_depth_mode}' + ) return layer_drop_probs diff --git a/tests/collections/asr/test_conformer_encoder.py b/tests/collections/asr/test_conformer_encoder.py index 9cfe0747fa37..aa972ae6fd47 100644 --- a/tests/collections/asr/test_conformer_encoder.py +++ b/tests/collections/asr/test_conformer_encoder.py @@ -30,7 +30,7 @@ def test_stochastic_depth_model_creation(self): # linear mode for drop_prob in [0.3, 0.5, 0.9]: - for start_layer in [0, 2]: + for start_layer in [1, 3]: model = ConformerEncoder( feat_in=10, n_layers=n_layers, @@ -40,7 +40,7 @@ def test_stochastic_depth_model_creation(self): stochastic_depth_start_layer=start_layer, ) L = n_layers - start_layer - assert model.layer_drop_probs == [0.0] * start_layer + [drop_prob * l / (L - 1) for l in range(L)] + assert model.layer_drop_probs == [0.0] * start_layer + [drop_prob * l / L for l in range(1, L + 1)] # uniform mode for drop_prob in [0.3, 0.5, 0.9]: @@ -71,7 +71,7 @@ def test_stochastic_depth_model_creation(self): with pytest.raises(ValueError, match="stochastic_depth_mode has to be one of"): ConformerEncoder(feat_in=10, n_layers=n_layers, d_model=4, feat_out=8, stochastic_depth_mode="weird") - for start_layer in [-1, 5]: + for start_layer in [-1, 0, 5]: with pytest.raises(ValueError, match="stochastic_depth_start_layer has to be in"): ConformerEncoder( feat_in=10, n_layers=n_layers, d_model=4, feat_out=8, stochastic_depth_start_layer=start_layer,