wenet-e2e · robin1001 · Feb 1, 2024 · Feb 1, 2024
diff --git a/examples/wenetspeech/s0/README.md b/examples/wenetspeech/s0/README.md
@@ -28,14 +28,15 @@
 
 * Feature info: using fbank feature, with dither 1.0, with cmvn
 * Training info: lr 0.002, batch size dynamic24000, 24 gpus on 3090, acc_grad 16, 80 epochs, 4.5 days
-* Decoding info: ctc_weight 0.5, reverse_weight 0.0, average_num 10, blank penalty 2.5
+* Decoding info: ctc_weight 0.5, reverse_weight 0.0, average_num 10, blank penalty 2.5, length penalty (attention decoding only) 8.5 for Dev/Test\_Meeting and 0.0 for Test\_Net
 
 | Decoding mode - Chunk size    | Dev  | Test\_Net | Test\_Meeting |
 |:-----------------------------:|:----:|:---------:|:-------------:|
 | ctc prefix beam search - full      | 7.21 % N=328207 C=309358 S=14175 D=4674 I=4801 | 9.46 % N=414285 C=381373 S=26013 D=6899 I=6295 | 14.02 % N=220358 C=195224 S=17266 D=7868 I=5754 |
 | ctc prefix beam search - 16        | 7.93 % N=328207 C=307192 S=16529 D=4486 I=5000 | 11.14 % N=414285 C=374733 S=30241 D=9311 I=6596 | 16.37 % N=220358 C=191394 S=22435 D=6529 I=7116 |
 | attention rescoring - full    | 7.10 % N=328207 C=308457 S=13215 D=6535 I=3537 | 8.83 % N=414285 C=381936 S=24808 D=7541 I=4215 | 13.64 % N=220358 C=194438 S=16238 D=9682 I=4133 |
 | attention rescoring - 16      | 7.57 % N=328207 C=307065 S=15169 D=5973 I=3687 | 10.13 % N=414285 C=376854 S=28486 D=8945 I=4541 | 15.55 % N=220358 C=191270 S=21136 D=7952 I=5184 |
+| attention - full    | 7.73 % N=328207 C=306688 S=13166 D=8353 I=3845 | 9.44 % N=414285 C=378096 S=24532 D=11657 I=2908 | 14.98 % N=220358 C=191881 S=15303 D=13174 I=4540 |
 
 ## U2++ conformer (wenetspeech plus aishell4)
 

diff --git a/wenet/bin/recognize.py b/wenet/bin/recognize.py
@@ -48,7 +48,7 @@ def get_args():
                         type=int,
                         default=10,
                         help='beam size for search')
-    parser.add_argument('--penalty',
+    parser.add_argument('--length_penalty',
                         type=float,
                         default=0.0,
                         help='length penalty')
@@ -256,7 +256,8 @@ def main():
                 reverse_weight=args.reverse_weight,
                 context_graph=context_graph,
                 blank_id=blank_id,
-                blank_penalty=args.blank_penalty)
+                blank_penalty=args.blank_penalty,
+                length_penalty=args.length_penalty)
             for i, key in enumerate(keys):
                 for mode, hyps in results.items():
                     tokens = hyps[i].tokens

diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py
@@ -255,6 +255,7 @@ def decode(
         context_graph: ContextGraph = None,
         blank_id: int = 0,
         blank_penalty: float = 0.0,
+        length_penalty: float = 0.0,
     ) -> Dict[str, List[DecodeResult]]:
         """ Decode input speech
 
@@ -291,7 +292,7 @@ def decode(
         results = {}
         if 'attention' in methods:
             results['attention'] = attention_beam_search(
-                self, encoder_out, encoder_mask, beam_size)
+                self, encoder_out, encoder_mask, beam_size, length_penalty)
         if 'ctc_greedy_search' in methods:
             results['ctc_greedy_search'] = ctc_greedy_search(
                 ctc_probs, encoder_lens, blank_id)

diff --git a/wenet/transformer/search.py b/wenet/transformer/search.py
@@ -252,6 +252,7 @@ def attention_beam_search(
     encoder_out: torch.Tensor,
     encoder_mask: torch.Tensor,
     beam_size: int = 10,
+    length_penalty: float = 0.0,
 ) -> List[DecodeResult]:
     device = encoder_out.device
     batch_size = encoder_out.shape[0]
@@ -336,7 +337,8 @@ def attention_beam_search(
 
     # 3. Select best of best
     scores = scores.view(batch_size, beam_size)
-    # TODO: length normalization
+    lengths = hyps.ne(model.eos).sum(dim=1).view(batch_size, beam_size).float()
+    scores = scores / lengths.pow(length_penalty)
     best_scores, best_index = scores.max(dim=-1)
     best_hyps_index = best_index + torch.arange(
         batch_size, dtype=torch.long, device=device) * beam_size