Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restrict acceptable values for model_url parameter of ConveRTTokenizer #7089

Merged
merged 25 commits into from
Oct 29, 2020
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ed54e96
add checks inside class
dakshvar22 Oct 22, 2020
a54f499
Merge branch '2.0.x' into convert_url_fix
dakshvar22 Oct 22, 2020
2108645
Merge branch '2.0.x' into convert_url_fix
dakshvar22 Oct 26, 2020
08180b2
add all checks for graceful exits
dakshvar22 Oct 27, 2020
aadee55
Merge branch '2.0.x' into convert_url_fix
dakshvar22 Oct 27, 2020
92ad2be
add warning in docs and changelog
dakshvar22 Oct 27, 2020
cc492c8
add warning in docs and changelog
dakshvar22 Oct 27, 2020
99dda04
comments
dakshvar22 Oct 27, 2020
10831c4
change URL
dakshvar22 Oct 27, 2020
f0a4fcf
fix whitespace
dakshvar22 Oct 27, 2020
65649e2
fix tests
dakshvar22 Oct 28, 2020
0da65a5
Merge branch '2.0.x' into convert_url_fix
dakshvar22 Oct 28, 2020
f88b309
Update rasa/nlu/tokenizers/convert_tokenizer.py
dakshvar22 Oct 28, 2020
209c760
skip windows tests
dakshvar22 Oct 28, 2020
d22c17f
Merge branch '2.0.x' into convert_url_fix
dakshvar22 Oct 28, 2020
40f17c8
fix docstring issues
dakshvar22 Oct 28, 2020
eaeb2e9
Merge branch 'convert_url_fix' of github.com:RasaHQ/rasa into convert…
dakshvar22 Oct 28, 2020
3473415
linter
dakshvar22 Oct 28, 2020
c54c537
partial review comments
dakshvar22 Oct 29, 2020
e2dd59d
added extra test
dakshvar22 Oct 29, 2020
4ad3c4c
monkeypatch
dakshvar22 Oct 29, 2020
bdcfdc5
last review comments
dakshvar22 Oct 29, 2020
1a49b52
Update rasa/utils/io.py
dakshvar22 Oct 29, 2020
263c336
make linter happy
dakshvar22 Oct 29, 2020
07ae279
Merge branch 'convert_url_fix' of github.com:RasaHQ/rasa into convert…
dakshvar22 Oct 29, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions changelog/7089.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Fix [ConveRTTokenizer](https://rasa.com/docs/rasa/components#converttokenizer) failing because of wrong model URL by making the `model_url` parameter of `ConveRTTokenizer` mandatory.
dakshvar22 marked this conversation as resolved.
Show resolved Hide resolved

Since the ConveRT model was taken [offline](https://github.com/RasaHQ/rasa/issues/6806), we can no longer use
dakshvar22 marked this conversation as resolved.
Show resolved Hide resolved
the earlier public URL of the model. Additionally, since the license for the model is unknown,
we cannot host it ourselves. Users can still use the component by setting `model_url` to a community/self-hosted
model URL or path to a local directory containing model files. For example:
```yaml
pipeline:
- name: ConveRTTokenizer
model_url: <remote/local path to model>
```
10 changes: 8 additions & 2 deletions docs/docs/components.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -453,10 +453,16 @@ word vectors in your pipeline.
"intent_split_symbol": "_"
# Regular expression to detect tokens
"token_pattern": None
# Remote URL of hosted model
"model_url": TF_HUB_MODULE_URL
# Remote URL/local directory of model files (required)
"model_url": None
```

:::note
Since the public URL of the ConveRT model was taken offline recently, it is now mandatory
to set the parameter `model_url` to a community/self-hosted URL or path to a local directory containing model files.

:::


### LanguageModelTokenizer

Expand Down
117 changes: 111 additions & 6 deletions rasa/nlu/tokenizers/convert_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,27 @@
from rasa.nlu.tokenizers.tokenizer import Token
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.shared.nlu.training_data.message import Message
from rasa.utils import common
from rasa.utils import common, io
import rasa.utils.train_utils as train_utils
from rasa.exceptions import RasaException
import tensorflow as tf
import os


# URL to the old remote location of the model which
# users might use. The model is no longer hosted here.
ORIGINAL_TF_HUB_MODULE_URL = (
    "https://github.com/PolyAI-LDN/polyai-models/releases/download/v1.0/model.tar.gz"
)

# Warning: This URL is only intended for running pytests on ConveRT
# related components. This URL should not be allowed to be used by the user.
RESTRICTED_ACCESS_URL = "https://storage.googleapis.com/continuous-integration-model-storage/convert_tf2.tar.gz"


class ConveRTTokenizer(WhitespaceTokenizer):
"""Tokenizer using ConveRT model.

Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert)
model from TFHub and computes sub-word tokens for dense
featurizable attributes of each message object.
Expand All @@ -30,21 +39,117 @@ class ConveRTTokenizer(WhitespaceTokenizer):
"intent_split_symbol": "_",
# Regular expression to detect tokens
"token_pattern": None,
# Remote URL of hosted model
"model_url": TF_HUB_MODULE_URL,
# Remote URL/Local path to model files
"model_url": None,
}

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        """Construct a new tokenizer using the WhitespaceTokenizer framework.

        Args:
            component_config: User configuration for the component
        """
        super().__init__(component_config)

        # Validate `model_url` up front so misconfiguration fails fast
        # with a helpful error instead of a cryptic TFHub download failure.
        self.model_url = self._get_validated_model_url()

        # Load the ConveRT model from the validated remote URL or local path.
        self.module = train_utils.load_tf_hub_model(self.model_url)

        # Model signature used to compute sub-word tokens.
        self.tokenize_signature = self.module.signatures["tokenize"]

@staticmethod
def _validate_model_files_exist(model_directory: Text) -> None:
"""Check if essential model files exist inside the model_directory.

Args:
model_directory: Directory to investigate
"""
files_to_check = [
os.path.join(model_directory, "saved_model.pb"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

optional: let's use the pathlib to concatenate /deal with the paths.

os.path.join(model_directory, "variables/variables.index"),
os.path.join(model_directory, "variables/variables.data-00001-of-00002"),
os.path.join(model_directory, "variables/variables.data-00000-of-00002"),
]

for file_path in files_to_check:
if not os.path.exists(file_path):
raise RasaException(
f"""File {file_path} does not exist.
Re-check the files inside the directory {model_directory}.
It should contain the following model
files - [{", ".join(files_to_check)}]"""
)

def _get_validated_model_url(self) -> Text:
dakshvar22 marked this conversation as resolved.
Show resolved Hide resolved
"""Validates the specified `model_url` parameter.

The `model_url` parameter cannot be left empty. It can either
be set to a remote URL where the model is hosted or it can be
dakshvar22 marked this conversation as resolved.
Show resolved Hide resolved
a path to a local directory.

Returns:
Validated path to model
"""
model_url = self.component_config.get("model_url", None)

if not model_url:
raise RasaException(
"""Parameter "model_url" was not specified in the configuration of "ConveRTTokenizer".
dakshvar22 marked this conversation as resolved.
Show resolved Hide resolved
You can either use a community hosted URL of the model
or if you have a local copy of the model, pass the
path to the directory containing the model files."""
)

if io.is_valid_remote_url(model_url):

if model_url == ORIGINAL_TF_HUB_MODULE_URL:
# Can't use the originally hosted URL
raise RasaException(
f"""Parameter "model_url" of "ConveRTTokenizer" was
set to "{model_url}" which does not contain the model any longer.
You can either use a community hosted URL or if you have a
local copy of the model, pass the path to the directory
containing the model files."""
)

if model_url == RESTRICTED_ACCESS_URL:
# Can't use the URL that is reserved for tests only
raise RasaException(
f"""Parameter "model_url" of "ConveRTTokenizer" was
set to "{model_url}" which is strictly reserved for pytests of Rasa Open Source only.
Due to licensing issues you are not allowed to use the model from this URL.
You can either use a community hosted URL or if you have a
local copy of the model, pass the path to the directory
containing the model files."""
)

elif os.path.isfile(model_url):
# Definitely invalid since the specified path should be a directory
raise RasaException(
"""Parameter "model_url" of "ConveRTTokenizer" was
set to the path of a file which is invalid. You
can either use a community hosted URL or if you have a
local copy of the model, pass the path to the directory
containing the model files."""
)

elif os.path.isdir(model_url):
# Looks like a local directory. Inspect the directory
# to see if model files exist.
self._validate_model_files_exist(model_url)
# Convert the path to an absolute one since
# TFHUB doesn't like relative paths
model_url = os.path.abspath(model_url)

else:
raise RasaException(
f"""{model_url} is neither a valid remote URL nor a local directory.
You can either use a community hosted URL or if you have a
local copy of the model, pass the path to
the directory containing the model files."""
)
return model_url

@classmethod
def cache_key(
cls, component_meta: Dict[Text, Any], model_metadata: Metadata
Expand Down
24 changes: 24 additions & 0 deletions rasa/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from io import BytesIO as IOReader
from pathlib import Path
from typing import Text, Any, Union, List, Type, Callable, TYPE_CHECKING
import re
dakshvar22 marked this conversation as resolved.
Show resolved Hide resolved

import rasa.shared.constants
import rasa.shared.utils.io
Expand Down Expand Up @@ -212,3 +213,26 @@ def json_pickle(file_name: Union[Text, Path], obj: Any) -> None:
jsonpickle_numpy.register_handlers()

rasa.shared.utils.io.write_text_file(jsonpickle.dumps(obj), file_name)


def is_valid_remote_url(url: Text) -> bool:
    """Check whether the url specified is a well formed one.

    Regex taken from https://stackoverflow.com/a/7160778/3001665

    Args:
        url: Remote URL to validate

    Returns:
        `True` if `url` is a well-formed http(s)/ftp(s) URL, else `False`.
    """
    regex = re.compile(
        r"^(?:http|ftp)s?://"  # http:// or https://
        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
        r"localhost|"  # localhost...
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
        r"(?::\d+)?"  # optional port
        r"(?:/?|[/?]\S+)$",
        re.IGNORECASE,
    )

    # Match directly on the compiled pattern rather than via re.match.
    return regex.match(url) is not None
42 changes: 30 additions & 12 deletions tests/nlu/featurizers/test_convert_featurizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import numpy as np
import pytest
from typing import Text

from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
from rasa.nlu.tokenizers.convert_tokenizer import (
ConveRTTokenizer,
RESTRICTED_ACCESS_URL,
)
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.shared.nlu.training_data.message import Message
from rasa.nlu.constants import TOKENS_NAMES
Expand All @@ -10,13 +14,15 @@
from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer


# TODO
# skip tests as the ConveRT model is not publicly available anymore (see https://github.com/RasaHQ/rasa/issues/6806)
@pytest.mark.skip_on_windows
def test_convert_featurizer_process(component_builder, monkeypatch):

monkeypatch.setattr(
ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
)

@pytest.mark.skip
def test_convert_featurizer_process(component_builder):
tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL}
tokenizer = ConveRTTokenizer(component_config)
featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

sentence = "Hey how are you today ?"
Expand All @@ -41,9 +47,14 @@ def test_convert_featurizer_process(component_builder):
assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)


@pytest.mark.skip
def test_convert_featurizer_train(component_builder):
tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
@pytest.mark.skip_on_windows
def test_convert_featurizer_train(component_builder, monkeypatch):

monkeypatch.setattr(
ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
)
component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL}
tokenizer = ConveRTTokenizer(component_config)
featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

sentence = "Hey how are you today ?"
Expand Down Expand Up @@ -88,6 +99,7 @@ def test_convert_featurizer_train(component_builder):
assert sent_vecs is None


@pytest.mark.skip_on_windows
@pytest.mark.parametrize(
"sentence, expected_text",
[
Expand All @@ -98,9 +110,15 @@ def test_convert_featurizer_train(component_builder):
("ńöñàśçií", "ńöñàśçií"),
],
)
@pytest.mark.skip
def test_convert_featurizer_tokens_to_text(component_builder, sentence, expected_text):
tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
def test_convert_featurizer_tokens_to_text(
sentence: Text, expected_text: Text, monkeypatch
dakshvar22 marked this conversation as resolved.
Show resolved Hide resolved
):

monkeypatch.setattr(
ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
)
component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL}
tokenizer = ConveRTTokenizer(component_config)
tokens = tokenizer.tokenize(Message(data={TEXT: sentence}), attribute=TEXT)

actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]
Expand Down
Loading