Remove wordtokenizer example from NLP tokenizer notebook (#3477)
* nb fix

Signed-off-by: Abhinav Khattar <[email protected]>

* keep token change for later

Signed-off-by: Abhinav Khattar <[email protected]>

* fix

Signed-off-by: Abhinav Khattar <[email protected]>
aklife97 authored and fayejf committed Mar 2, 2022
1 parent 4c9e2ed commit 9256016
Showing 1 changed file with 5 additions and 35 deletions.
40 changes: 5 additions & 35 deletions tutorials/nlp/02_NLP_Tokenizers.ipynb
@@ -514,36 +514,6 @@
 "print(tokenizer_spe.vocab_size)"
 ]
 },
-{
-"cell_type": "markdown",
-"metadata": {
-"colab_type": "text",
-"id": "FM_ei7OSLn1X"
-},
-"source": [
-"## Example: WordTokenizer from Vocabulary"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"colab": {},
-"colab_type": "code",
-"id": "RmcQLoouME8k"
-},
-"outputs": [],
-"source": [
-"# If you want to use a simple tokenizer like WordTokenizer without first generating the tokenizer.model first \n",
-"# we provide the alternative class WordTokenizer or CharTokenizer that takes a user vocabulary as input\n",
-"\n",
-"# initialize tokenizer with vocabulary and specify optional special tokens\n",
-"tokenizer_word = nemo_nlp.modules.get_tokenizer(tokenizer_name=\"word\", vocab_file=vocab_file, special_tokens=special_tokens_dict)\n",
-"\n",
-"# specified special tokens are added to the vocabulary\n",
-"print(tokenizer_word.vocab_size)"
-]
-},
 {
 "cell_type": "markdown",
 "metadata": {
@@ -567,12 +537,12 @@
 "text=\"hello world\"\n",
 "\n",
 "# create tokens\n",
-"tokenized = [tokenizer_word.bos_token] + tokenizer_word.text_to_tokens(text) + [tokenizer_word.eos_token]\n",
+"tokenized = [tokenizer_spe.bos_token] + tokenizer_spe.text_to_tokens(text) + [tokenizer_spe.eos_token]\n",
 "print(tokenized)\n",
 "\n",
 "# turn token into input_ids for a neural model, such as BERTModule\n",
 "\n",
-"print(tokenizer_word.tokens_to_ids(tokenized))"
+"print(tokenizer_spe.tokens_to_ids(tokenized))"
 ]
 }
 ],
@@ -585,7 +555,7 @@
 "toc_visible": true
 },
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
@@ -599,7 +569,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.7.4"
+"version": "3.8.12"
 },
 "pycharm": {
 "stem_cell": {
@@ -613,4 +583,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 1
-}
+}
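For reference, a minimal runnable sketch of the tokenizer flow the notebook keeps after this change: build a SentencePiece tokenizer through NeMo's get_tokenizer, then run the surviving BOS/EOS tokenization cell against it. The "tokenizer.model" path and the contents of special_tokens_dict are assumptions standing in for values produced by earlier notebook cells that are outside this diff.

from nemo.collections import nlp as nemo_nlp

# Hypothetical setup: the SentencePiece model trained earlier in the notebook
# and a special-token mapping registered at construction time.
special_tokens_dict = {"bos_token": "<BOS>", "eos_token": "<EOS>"}
tokenizer_spe = nemo_nlp.modules.get_tokenizer(
    tokenizer_name="sentencepiece",
    tokenizer_model="tokenizer.model",  # placeholder path, not from this diff
    special_tokens=special_tokens_dict,
)

text = "hello world"

# create tokens, wrapped in the special BOS/EOS tokens (as in the updated cell)
tokenized = [tokenizer_spe.bos_token] + tokenizer_spe.text_to_tokens(text) + [tokenizer_spe.eos_token]
print(tokenized)

# turn tokens into input_ids for a neural model, such as a BERTModule
print(tokenizer_spe.tokens_to_ids(tokenized))

Because the special tokens are registered when the tokenizer is constructed, bos_token and eos_token resolve to entries in the tokenizer's vocabulary, so tokens_to_ids can map the wrapped sequence without unknown-token fallbacks.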
