Skip to content

Commit

Permalink
Delete some duplicate code (#832)
Browse files Browse the repository at this point in the history
- Delete some duplicate code
- Fix the handling of out-of-vocabulary ("unlogged") words, which previously could not be processed
  • Loading branch information
T-baby authored Aug 25, 2020
1 parent 2afdd19 commit a2d3359
Showing 1 changed file with 13 additions and 9 deletions.
22 changes: 13 additions & 9 deletions paddlehub/tokenizer/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,11 @@ def get_vocab(self):

def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, None)
v = self.vocab.get(token, None)
if v:
return v
else:
return 0

def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
Expand Down Expand Up @@ -123,8 +127,8 @@ def convert_tokens_to_ids(self, tokens):
ids = []
for token in tokens:
wid = self._convert_token_to_id(token)
if wid:
ids.append(self._convert_token_to_id(token))
if wid is not None:
ids.append(wid)
return ids

def tokenize(self, text):
Expand Down Expand Up @@ -204,14 +208,14 @@ def get_input_ids(text):
if isinstance(text, str):
tokens = self.tokenize(text)
ids = self.convert_tokens_to_ids(tokens)
return self.convert_tokens_to_ids(tokens)
return ids
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
text[0], str):
return self.convert_tokens_to_ids(text)
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], int):
text[0], int):
return text
else:
raise ValueError(
Expand Down Expand Up @@ -350,7 +354,7 @@ def clean_up_tokenization(self, out_string: str) -> str:
"""
out_string = (out_string.replace(" .", ".").replace(" ?", "?").replace(
" !", "!").replace(" ,", ",").replace(" ' ", "'").replace(
" n't",
"n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
" 've", "'ve").replace(" 're", "'re"))
" n't",
"n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
" 've", "'ve").replace(" 're", "'re"))
return out_string

0 comments on commit a2d3359

Please sign in to comment.