Skip to content

Commit

Permalink
fix for emojis (#2675)
Browse the repository at this point in the history
* fix for emojis

Signed-off-by: ekmb <[email protected]>

* remove redundant line

Signed-off-by: ekmb <[email protected]>

* raise error

Signed-off-by: ekmb <[email protected]>

* use app_state

Signed-off-by: ekmb <[email protected]>

Co-authored-by: Eric Harper <[email protected]>
Branch information:
ekmb and ericharper authored Aug 19, 2021
1 parent 595dc4d commit 1f0bf96
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, Optional
from typing import Dict, Optional

import numpy as np

Expand Down Expand Up @@ -59,6 +59,11 @@ def get_features(

for j, word in enumerate(words):
word_tokens = tokenizer.text_to_tokens(word)

# to handle emojis that could be neglected during tokenization
if len(word.strip()) > 0 and len(word_tokens) == 0:
word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)]

subtokens.extend(word_tokens)

loss_mask.append(1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ def get_features(

for j, word in enumerate(words):
word_tokens = tokenizer.text_to_tokens(word)

# to handle emojis that could be neglected during tokenization
if len(word.strip()) > 0 and len(word_tokens) == 0:
word_tokens = [tokenizer.ids_to_tokens(tokenizer.unk_id)]

subtokens.extend(word_tokens)

loss_mask.append(1)
Expand Down
2 changes: 0 additions & 2 deletions nemo/collections/nlp/models/nlp_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import hashlib
import json
import os
Expand Down Expand Up @@ -40,7 +39,6 @@
from nemo.core.classes.exportable import Exportable
from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
from nemo.utils import AppState, logging
from nemo.utils.exp_manager import configure_checkpointing
from nemo.utils.get_rank import is_global_rank_zero

__all__ = ['NLPModel']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import os
from typing import Dict, List, Optional, Union

import onnx
import torch
from omegaconf import DictConfig, OmegaConf
from pytorch_lightning import Trainer
Expand Down
8 changes: 5 additions & 3 deletions nemo/collections/nlp/modules/common/lm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from dataclasses import dataclass
from typing import List, Optional, Union

from attr import asdict
Expand All @@ -36,7 +35,7 @@
get_megatron_transformer,
get_nemo_transformer,
)
from nemo.utils import logging
from nemo.utils import AppState, logging

__all__ = ['get_pretrained_lm_models_list', 'get_lm_model']

Expand Down Expand Up @@ -102,7 +101,10 @@ def get_lm_model(
config_dict=config_dict, config_file=config_file, pretrained_model_name=pretrained_model_name,
)

if checkpoint_file and os.path.exists(checkpoint_file):
if checkpoint_file:
app_state = AppState()
if not app_state.is_model_being_restored and not os.path.exists(checkpoint_file):
raise ValueError(f'{checkpoint_file} not found')
model.restore_weights(restore_path=checkpoint_file)

return model
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):

graph = delete_space + graph + delete_space
self.fst = graph.optimize()
generator_main(far_file, {"tokenize_and_classify": self.fst})

if far_file:
generator_main(far_file, {"tokenize_and_classify": self.fst})
Expand Down

0 comments on commit 1f0bf96

Please sign in to comment.