Skip to content

Commit

Permalink
[text] rm WenetTokenizer (#2218)
Browse files: browse the repository at this point in the history
* [text] rm WenetTokenizer

* [text] fix ut
  • Loading branch information
Mddct authored Dec 12, 2023
1 parent d2b337d commit 92d9b66
Show file tree
Hide file tree
Showing 4 changed files with 4 additions and 272 deletions.
7 changes: 3 additions & 4 deletions test/wenet/dataset/test_processor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest

import wenet.dataset.processor as processor
from wenet.text.wenet_tokenizer import WenetTokenizer
from wenet.utils.init_tokenizer import init_tokenizer


@pytest.mark.parametrize("symbol_table_path", [
Expand Down Expand Up @@ -139,9 +139,8 @@ def test_tokenize(symbol_table_path):
"label": [24, 46, 2, 43, 1, 35, 27, 7, 56]
}]

tokenizer = WenetTokenizer(symbol_table_path,
bpe_model,
split_with_space=False)
configs = {'split_with_space': False}
tokenizer = init_tokenizer(configs, symbol_table_path, bpe_model)
outs = processor.tokenize(txts, tokenizer)
for (hyp, ref) in zip(outs, refs):
assert (len(hyp["tokens"]) == len(ref["tokens"]))
Expand Down
176 changes: 0 additions & 176 deletions test/wenet/text/test_wenet_tokenzier.py

This file was deleted.

90 changes: 0 additions & 90 deletions wenet/text/wenet_tokenizer.py

This file was deleted.

3 changes: 1 addition & 2 deletions wenet/utils/init_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ def init_tokenizer(configs,
bpe_model=None,
non_lang_syms=None) -> BaseTokenizer:
# TODO:
# 1 huggingface tokenizer
# 2 paraformer tokenizer
# 1 paraformer tokenizer

if configs.get("whisper", False):
tokenizer = WhisperTokenizer(
Expand Down

0 comments on commit 92d9b66

Please sign in to comment.