diff --git a/changelog/7089.bugfix.md b/changelog/7089.bugfix.md new file mode 100644 index 000000000000..ca3d70c2412d --- /dev/null +++ b/changelog/7089.bugfix.md @@ -0,0 +1,11 @@ +Fix [ConveRTTokenizer](components.mdx#converttokenizer) failing because of wrong model URL by making the `model_url` parameter of `ConveRTTokenizer` mandatory. + +Since the ConveRT model was taken [offline](https://github.com/RasaHQ/rasa/issues/6806), we can no longer use +the earlier public URL of the model. Additionally, since the licence for the model is unknown, +we cannot host it ourselves. Users can still use the component by setting `model_url` to a community/self-hosted +model URL or path to a local directory containing model files. For example: +```yaml +pipeline: + - name: ConveRTTokenizer + model_url: +``` \ No newline at end of file diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index d94ad71cf26a..67aaf3b23ba1 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -453,10 +453,16 @@ word vectors in your pipeline. "intent_split_symbol": "_" # Regular expression to detect tokens "token_pattern": None - # Remote URL of hosted model - "model_url": TF_HUB_MODULE_URL + # Remote URL/Local directory of model files(Required) + "model_url": None ``` + :::note + Since the public URL of the ConveRT model was taken offline recently, it is now mandatory + to set the parameter `model_url` to a community/self-hosted URL or path to a local directory containing model files. + + ::: + ### LanguageModelTokenizer diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index f439b0cf0630..3fda08e6b5f5 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -7,17 +7,27 @@ from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.shared.nlu.training_data.message import Message from rasa.utils import common +from rasa.nlu import utils as nlu_utils import rasa.utils.train_utils as train_utils +from rasa.exceptions import RasaException import tensorflow as tf +import os -TF_HUB_MODULE_URL = ( +# URL to the old remote location of the model which +# users might use. The model is no longer hosted here. +ORIGINAL_TF_HUB_MODULE_URL = ( "https://github.com/PolyAI-LDN/polyai-models/releases/download/v1.0/model.tar.gz" ) +# Warning: This URL is only intended for running pytests on ConveRT +# related components. This URL should not be allowed to be used by the user. +RESTRICTED_ACCESS_URL = "https://storage.googleapis.com/continuous-integration-model-storage/convert_tf2.tar.gz" + class ConveRTTokenizer(WhitespaceTokenizer): """Tokenizer using ConveRT model. + Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert) model from TFHub and computes sub-word tokens for dense featurizable attributes of each message object. @@ -30,25 +40,129 @@ class ConveRTTokenizer(WhitespaceTokenizer): "intent_split_symbol": "_", # Regular expression to detect tokens "token_pattern": None, - # Remote URL of hosted model - "model_url": TF_HUB_MODULE_URL, + # Remote URL/Local path to model files + "model_url": None, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: - """Construct a new tokenizer using the WhitespaceTokenizer framework.""" + """Construct a new tokenizer using the WhitespaceTokenizer framework. + Args: + component_config: User configuration for the component + """ super().__init__(component_config) - self.model_url = self.component_config.get("model_url", TF_HUB_MODULE_URL) + self.model_url = self._get_validated_model_url() self.module = train_utils.load_tf_hub_model(self.model_url) self.tokenize_signature = self.module.signatures["tokenize"] + @staticmethod + def _validate_model_files_exist(model_directory: Text) -> None: + """Check if essential model files exist inside the model_directory. + + Args: + model_directory: Directory to investigate + """ + files_to_check = [ + os.path.join(model_directory, "saved_model.pb"), + os.path.join(model_directory, "variables/variables.index"), + os.path.join(model_directory, "variables/variables.data-00001-of-00002"), + os.path.join(model_directory, "variables/variables.data-00000-of-00002"), + ] + + for file_path in files_to_check: + if not os.path.exists(file_path): + raise RasaException( + f"""File {file_path} does not exist. + Re-check the files inside the directory {model_directory}. + It should contain the following model + files - [{", ".join(files_to_check)}]""" + ) + + def _get_validated_model_url(self) -> Text: + """Validates the specified `model_url` parameter. + + The `model_url` parameter cannot be left empty. It can either + be set to a remote URL where the model is hosted or it can be + a path to a local directory. + + Returns: + Validated path to model + """ + model_url = self.component_config.get("model_url", None) + + if not model_url: + raise RasaException( + f"""Parameter "model_url" was not specified in the configuration + of "{ConveRTTokenizer.__name__}". + You can either use a community hosted URL of the model + or if you have a local copy of the model, pass the + path to the directory containing the model files.""" + ) + + if model_url == ORIGINAL_TF_HUB_MODULE_URL: + # Can't use the originally hosted URL + raise RasaException( + f"""Parameter "model_url" of "{ConveRTTokenizer.__name__}" was + set to "{model_url}" which does not contain the model any longer. + You can either use a community hosted URL or if you have a + local copy of the model, pass the path to the directory + containing the model files.""" + ) + + if model_url == RESTRICTED_ACCESS_URL: + # Can't use the URL that is reserved for tests only + raise RasaException( + f"""Parameter "model_url" of "{ConveRTTokenizer.__name__}" was + set to "{model_url}" which is strictly reserved for pytests of Rasa Open Source only. + Due to licensing issues you are not allowed to use the model from this URL. + You can either use a community hosted URL or if you have a + local copy of the model, pass the path to the directory + containing the model files.""" + ) + + if os.path.isfile(model_url): + # Definitely invalid since the specified path should be a directory + raise RasaException( + f"""Parameter "model_url" of "{ConveRTTokenizer.__name__}" was + set to the path of a file which is invalid. You + can either use a community hosted URL or if you have a + local copy of the model, pass the path to the directory + containing the model files.""" + ) + + if nlu_utils.is_url(model_url): + return model_url + + if os.path.isdir(model_url): + # Looks like a local directory. Inspect the directory + # to see if model files exist. + self._validate_model_files_exist(model_url) + # Convert the path to an absolute one since + # TFHUB doesn't like relative paths + return os.path.abspath(model_url) + + raise RasaException( + f"""{model_url} is neither a valid remote URL nor a local directory. + You can either use a community hosted URL or if you have a + local copy of the model, pass the path to + the directory containing the model files.""" + ) + @classmethod def cache_key( cls, component_meta: Dict[Text, Any], model_metadata: Metadata ) -> Optional[Text]: + """Cache the component for future use. + + Args: + component_meta: configuration for the component. + model_metadata: configuration for the whole pipeline. + + Returns: key of the cache for future retrievals. + """ _config = common.update_existing_keys(cls.defaults, component_meta) return f"{cls.name}-{get_dict_hash(_config)}" diff --git a/rasa/nlu/utils/__init__.py b/rasa/nlu/utils/__init__.py index b1d00249b613..e68a3839c675 100644 --- a/rasa/nlu/utils/__init__.py +++ b/rasa/nlu/utils/__init__.py @@ -47,11 +47,24 @@ def is_model_dir(model_dir: Text) -> bool: def is_url(resource_name: Text) -> bool: - """Return True if string is an http, ftp, or file URL path. - - This implementation is the same as the one used by matplotlib""" - - URL_REGEX = re.compile(r"http://|https://|ftp://|file://|file:\\") + """Check whether the url specified is a well formed one. + + Regex adapted from https://stackoverflow.com/a/7160778/3001665 + + Args: + resource_name: Remote URL to validate + + Returns: `True` if valid, otherwise `False`. + """ + URL_REGEX = re.compile( + r"^(?:http|ftp|file)s?://" # http:// or https:// or file:// + r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" # domain + r"localhost|" # localhost + r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # or ip + r"(?::\d+)?" # optional port + r"(?:/?|[/?]\S+)$", + re.IGNORECASE, + ) return URL_REGEX.match(resource_name) is not None diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index a2f170b9fe57..e4b90d5d1347 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -1,7 +1,12 @@ import numpy as np import pytest +from typing import Text +from _pytest.monkeypatch import MonkeyPatch -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer +from rasa.nlu.tokenizers.convert_tokenizer import ( + ConveRTTokenizer, + RESTRICTED_ACCESS_URL, +) from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message from rasa.nlu.constants import TOKENS_NAMES @@ -10,13 +15,15 @@ from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer -# TODO -# skip tests as the ConveRT model is not publicly available anymore (see https://github.com/RasaHQ/rasa/issues/6806) +@pytest.mark.skip_on_windows +def test_convert_featurizer_process(component_builder, monkeypatch: MonkeyPatch): + monkeypatch.setattr( + ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL + ) -@pytest.mark.skip -def test_convert_featurizer_process(component_builder): - tokenizer = component_builder.create_component_from_class(ConveRTTokenizer) + component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL} + tokenizer = ConveRTTokenizer(component_config) featurizer = component_builder.create_component_from_class(ConveRTFeaturizer) sentence = "Hey how are you today ?" @@ -41,9 +48,14 @@ def test_convert_featurizer_process(component_builder): assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5) -@pytest.mark.skip -def test_convert_featurizer_train(component_builder): - tokenizer = component_builder.create_component_from_class(ConveRTTokenizer) +@pytest.mark.skip_on_windows +def test_convert_featurizer_train(component_builder, monkeypatch: MonkeyPatch): + + monkeypatch.setattr( + ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL + ) + component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL} + tokenizer = ConveRTTokenizer(component_config) featurizer = component_builder.create_component_from_class(ConveRTFeaturizer) sentence = "Hey how are you today ?" @@ -88,6 +100,7 @@ def test_convert_featurizer_train(component_builder): assert sent_vecs is None +@pytest.mark.skip_on_windows @pytest.mark.parametrize( "sentence, expected_text", [ @@ -98,9 +111,15 @@ def test_convert_featurizer_train(component_builder): ("ńöñàśçií", "ńöñàśçií"), ], ) -@pytest.mark.skip -def test_convert_featurizer_tokens_to_text(component_builder, sentence, expected_text): - tokenizer = component_builder.create_component_from_class(ConveRTTokenizer) +def test_convert_featurizer_tokens_to_text( + sentence: Text, expected_text: Text, monkeypatch: MonkeyPatch +): + + monkeypatch.setattr( + ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL + ) + component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL} + tokenizer = ConveRTTokenizer(component_config) tokens = tokenizer.tokenize(Message(data={TEXT: sentence}), attribute=TEXT) actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0] diff --git a/tests/nlu/test_utils.py b/tests/nlu/test_utils.py index cbb7928ecf09..ec8258146606 100644 --- a/tests/nlu/test_utils.py +++ b/tests/nlu/test_utils.py @@ -4,6 +4,7 @@ import pytest import tempfile import shutil +from typing import Text from rasa.shared.exceptions import RasaException import rasa.shared.nlu.training_data.message @@ -103,6 +104,19 @@ def test_remove_model_invalid(empty_model_dir): os.remove(test_file_path) -def test_is_url(): - assert not utils.is_url("./some/file/path") - assert utils.is_url("https://rasa.com/") +@pytest.mark.parametrize( + "url, result", + [ + ("a/b/c", False), + ("a", False), + ("https://192.168.1.1", True), + ("http://192.168.1.1", True), + ("https://google.com", True), + ("https://www.google.com", True), + ("http://google.com", True), + ("http://www.google.com", True), + ("http://a/b/c", False), + ], +) +def test_is_url(url: Text, result: bool): + assert result == utils.is_url(url) diff --git a/tests/nlu/tokenizers/test_convert_tokenizer.py b/tests/nlu/tokenizers/test_convert_tokenizer.py index e53bb7ecd8e0..ca2770cae6b9 100644 --- a/tests/nlu/tokenizers/test_convert_tokenizer.py +++ b/tests/nlu/tokenizers/test_convert_tokenizer.py @@ -1,15 +1,22 @@ import pytest +from typing import Text, List, Tuple, Optional +from pathlib import Path +import os +from _pytest.monkeypatch import MonkeyPatch from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message from rasa.nlu.constants import TOKENS_NAMES, NUMBER_OF_SUB_TOKENS from rasa.shared.nlu.constants import TEXT, INTENT -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer - -# TODO -# skip tests as the ConveRT model is not publicly available anymore (see https://github.com/RasaHQ/rasa/issues/6806) +from rasa.nlu.tokenizers.convert_tokenizer import ( + ConveRTTokenizer, + RESTRICTED_ACCESS_URL, + ORIGINAL_TF_HUB_MODULE_URL, +) +from rasa.exceptions import RasaException +@pytest.mark.skip_on_windows @pytest.mark.parametrize( "text, expected_tokens, expected_indices", [ @@ -25,19 +32,28 @@ ("ńöñàśçií", ["ńöñàśçií"], [(0, 8)]), ], ) -@pytest.mark.skip def test_convert_tokenizer_edge_cases( - component_builder, text, expected_tokens, expected_indices + text: Text, + expected_tokens: List[Text], + expected_indices: List[Tuple[int]], + monkeypatch: MonkeyPatch, ): - tk = component_builder.create_component_from_class(ConveRTTokenizer) - tokens = tk.tokenize(Message(data={TEXT: text}), attribute=TEXT) + monkeypatch.setattr( + ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL + ) + + component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL} + tokenizer = ConveRTTokenizer(component_config) + + tokens = tokenizer.tokenize(Message(data={TEXT: text}), attribute=TEXT) assert [t.text for t in tokens] == expected_tokens assert [t.start for t in tokens] == [i[0] for i in expected_indices] assert [t.end for t in tokens] == [i[1] for i in expected_indices] +@pytest.mark.skip_on_windows @pytest.mark.parametrize( "text, expected_tokens", [ @@ -45,35 +61,109 @@ def test_convert_tokenizer_edge_cases( ("Forecast for LUNCH", ["Forecast for LUNCH"]), ], ) -@pytest.mark.skip -def test_custom_intent_symbol(component_builder, text, expected_tokens): - tk = component_builder.create_component_from_class( - ConveRTTokenizer, intent_tokenization_flag=True, intent_split_symbol="+" +def test_custom_intent_symbol( + text: Text, expected_tokens: List[Text], monkeypatch: MonkeyPatch +): + + monkeypatch.setattr( + ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL ) + component_config = { + "name": "ConveRTTokenizer", + "model_url": RESTRICTED_ACCESS_URL, + "intent_tokenization": True, + "intent_split_symbol": "+", + } + + tokenizer = ConveRTTokenizer(component_config) + message = Message(data={TEXT: text}) message.set(INTENT, text) - tk.train(TrainingData([message])) + tokenizer.train(TrainingData([message])) assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens +@pytest.mark.skip_on_windows @pytest.mark.parametrize( "text, expected_number_of_sub_tokens", [("Aarhus is a city", [2, 1, 1, 1]), ("sentence embeddings", [1, 3])], ) -@pytest.mark.skip def test_convert_tokenizer_number_of_sub_tokens( - component_builder, text, expected_number_of_sub_tokens + text: Text, expected_number_of_sub_tokens: List[int], monkeypatch: MonkeyPatch ): - tk = component_builder.create_component_from_class(ConveRTTokenizer) + monkeypatch.setattr( + ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL + ) + component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL} + tokenizer = ConveRTTokenizer(component_config) message = Message(data={TEXT: text}) message.set(INTENT, text) - tk.train(TrainingData([message])) + tokenizer.train(TrainingData([message])) assert [ t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT]) ] == expected_number_of_sub_tokens + + +@pytest.mark.skip_on_windows +@pytest.mark.parametrize( + "model_url, exception_phrase", + [ + (ORIGINAL_TF_HUB_MODULE_URL, "which does not contain the model any longer"), + ( + RESTRICTED_ACCESS_URL, + "which is strictly reserved for pytests of Rasa Open Source only", + ), + (None, """"model_url" was not specified in the configuration"""), + ("", """"model_url" was not specified in the configuration"""), + ], +) +def test_raise_invalid_urls(model_url: Optional[Text], exception_phrase: Text): + + component_config = {"name": "ConveRTTokenizer", "model_url": model_url} + with pytest.raises(RasaException) as excinfo: + _ = ConveRTTokenizer(component_config) + + assert exception_phrase in str(excinfo.value) + + +@pytest.mark.skip_on_windows +def test_raise_wrong_model_directory(tmp_path: Path): + + component_config = {"name": "ConveRTTokenizer", "model_url": str(tmp_path)} + + with pytest.raises(RasaException) as excinfo: + _ = ConveRTTokenizer(component_config) + + assert "Re-check the files inside the directory" in str(excinfo.value) + + +@pytest.mark.skip_on_windows +def test_raise_wrong_model_file(tmp_path: Path): + + # create a dummy file + temp_file = os.path.join(tmp_path, "saved_model.pb") + f = open(temp_file, "wb") + f.close() + component_config = {"name": "ConveRTTokenizer", "model_url": temp_file} + + with pytest.raises(RasaException) as excinfo: + _ = ConveRTTokenizer(component_config) + + assert "set to the path of a file which is invalid" in str(excinfo.value) + + +@pytest.mark.skip_on_windows +def test_raise_invalid_path(): + + component_config = {"name": "ConveRTTokenizer", "model_url": "saved_model.pb"} + + with pytest.raises(RasaException) as excinfo: + _ = ConveRTTokenizer(component_config) + + assert "neither a valid remote URL nor a local directory" in str(excinfo.value)