Skip to content

Commit

Permalink
fix(whisper): support arbitrary ctc blank id
Browse files: browse the repository at this point in the history
  • Loading branch information
xingchensong committed Nov 23, 2023
1 parent eafd44a commit 0b26a62
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 2 deletions.
4 changes: 3 additions & 1 deletion wenet/transformer/ctc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,23 @@ def __init__(
encoder_output_size: int,
dropout_rate: float = 0.0,
reduce: bool = True,
blank_id: int = 0,
):
""" Construct CTC module
Args:
odim: dimension of outputs
encoder_output_size: number of encoder projection units
dropout_rate: dropout rate (0.0 ~ 1.0)
reduce: reduce the CTC loss into a scalar
blank_id: blank label.
"""
super().__init__()
eprojs = encoder_output_size
self.dropout_rate = dropout_rate
self.ctc_lo = torch.nn.Linear(eprojs, odim)

reduction_type = "sum" if reduce else "none"
self.ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)
self.ctc_loss = torch.nn.CTCLoss(blank=blank_id, reduction=reduction_type)

def forward(self, hs_pad: torch.Tensor, hlens: torch.Tensor,
ys_pad: torch.Tensor, ys_lens: torch.Tensor) -> torch.Tensor:
Expand Down
3 changes: 2 additions & 1 deletion wenet/utils/init_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def init_model(args, configs):
assert configs['decoder_conf']['r_num_blocks'] > 0
decoder = BiTransformerDecoder(vocab_size, encoder.output_size(),
**configs['decoder_conf'])
ctc = CTC(vocab_size, encoder.output_size())
ctc = CTC(vocab_size, encoder.output_size(),
blank_id=configs['ctc_conf']['ctc_blank_id'])

# Init joint CTC/Attention or Transducer model
if 'predictor' in configs:
Expand Down
11 changes: 11 additions & 0 deletions wenet/utils/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,17 @@ def check_modify_and_save_config(args, configs):
symbol_table = read_symbol_table(args.symbol_table)
vocab_size = len(symbol_table)

if 'ctc_conf' not in configs:
configs['ctc_conf'] = {}

if '<blank>' in symbol_table:
if 'ctc_blank_id' in configs['ctc_conf']:
assert configs['ctc_conf']['ctc_blank_id'] == symbol_table['<blank>']
else:
configs['ctc_conf']['ctc_blank_id'] = symbol_table['<blank>']
else:
assert 'ctc_blank_id' in configs['ctc_conf'], "PLZ set ctc_blank_id in yaml"

configs['input_dim'] = input_dim
configs['output_dim'] = configs.get('output_dim', vocab_size)
configs['cmvn_file'] = args.cmvn
Expand Down
3 changes: 3 additions & 0 deletions wenet/whisper/convert_whisper_to_wenet_config_and_ckpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ def convert_to_wenet_yaml(tokenizer, dims, wenet_yaml_path: str):
configs['decoder_conf']['key_bias'] = False
configs['decoder_conf']['activation_type'] = "gelu"

configs['ctc_conf'] = {}
configs['ctc_conf']['ctc_blank_id'] = 50362 # <nospeech>

configs['model_conf'] = {}
configs['model_conf']['ctc_weight'] = 0.3
configs['model_conf']['lsm_weight'] = 0.1
Expand Down

0 comments on commit 0b26a62

Please sign in to comment.