From 0e2975af0543fe0b22bf2ed0cbed1b44b23c77c2 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Thu, 11 Feb 2021 17:32:51 +0100 Subject: [PATCH 1/5] add tests for profiling --- tests/shared/importers/test_importer.py | 69 +++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/shared/importers/test_importer.py b/tests/shared/importers/test_importer.py index 1f8d98c12bb2..392ddd0851ba 100644 --- a/tests/shared/importers/test_importer.py +++ b/tests/shared/importers/test_importer.py @@ -1,8 +1,11 @@ +import cProfile import os +import sys from pathlib import Path from typing import Text, Dict, Type, List, Any import pytest +from _pytest.monkeypatch import MonkeyPatch from rasa.shared.constants import ( DEFAULT_CONFIG_PATH, @@ -24,6 +27,7 @@ from rasa.shared.importers.rasa import RasaFileImporter from rasa.shared.nlu.constants import ACTION_TEXT, ACTION_NAME, INTENT, TEXT from rasa.shared.nlu.training_data.message import Message +from rasa.train import train_async @pytest.fixture() @@ -350,3 +354,68 @@ async def test_nlu_data_domain_sync_responses(project: Text): # Responses were sync between "test_responses.yml" and the "domain.yml" assert "utter_rasa" in domain.templates.keys() + + +async def test_profile_training_data_loading(): + dataset = os.environ.get( + "DATASET_PATH", + str( + Path.home() / "Workspace" / "training-data" / "public" / "MultiWOZ" / "rasa" + ), + ) + + dataset = Path(dataset) + + profile = cProfile.Profile() + profile.enable() + + importer = TrainingDataImporter.load_from_dict( + config_path=str(dataset / "config.yml"), + domain_path=str(dataset / "domain.yml"), + training_data_paths=[str(dataset / "data")], + ) + + # Access data to make sure all steps were performed + domain = await importer.get_domain() + stories = await importer.get_stories() + nlu_data = await importer.get_nlu_data() + config = await importer.get_config() + + profile.disable() + + profile.dump_stats("./test_inference.prof") + + +async def test_profile_training_data_loading2(monkeypatch: MonkeyPatch): + dataset = os.environ.get( + "DATASET_PATH", + str( + Path.home() / "Workspace" / "training-data" / "public" / "MultiWOZ" / "rasa" + ), + ) + + dataset = Path(dataset) + + async def _do_training( + file_importer: TrainingDataImporter, *args: Any, **kwargs + ) -> None: + # Access data to make sure all steps were performed + domain = await file_importer.get_domain() + stories = await file_importer.get_stories() + nlu_data = await file_importer.get_nlu_data() + config = await file_importer.get_config() + + # skip actual training + monkeypatch.setattr(sys.modules["rasa.train"], "_do_training", _do_training) + + profile = cProfile.Profile() + profile.enable() + + await train_async( + domain=str(dataset / "domain.yml"), + config=str(dataset / "config.yml"), + training_files=str(dataset / "data"), + ) + + profile.disable() + profile.dump_stats("./test_inference2.prof") From 4d5693cd3ebb99047567b728e75f3a241bdeafe1 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Thu, 11 Feb 2021 17:40:14 +0100 Subject: [PATCH 2/5] deactivate timeout --- tests/shared/importers/test_importer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/shared/importers/test_importer.py b/tests/shared/importers/test_importer.py index 392ddd0851ba..c6198173f23a 100644 --- a/tests/shared/importers/test_importer.py +++ b/tests/shared/importers/test_importer.py @@ -356,6 +356,7 @@ async def test_nlu_data_domain_sync_responses(project: Text): assert "utter_rasa" in domain.templates.keys() +@pytest.mark.timeout(0) async def test_profile_training_data_loading(): dataset = os.environ.get( "DATASET_PATH", @@ -386,6 +387,7 @@ async def test_profile_training_data_loading(): profile.dump_stats("./test_inference.prof") +@pytest.mark.timeout(0) async def test_profile_training_data_loading2(monkeypatch: MonkeyPatch): dataset = os.environ.get( "DATASET_PATH", From f63723d86e40c4a1074c08bed4719f8e988678cb Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Fri, 12 Feb 2021 10:03:53 +0100 Subject: [PATCH 3/5] include multiwoz loading with huggingface --- tests/shared/importers/test_importer.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/shared/importers/test_importer.py b/tests/shared/importers/test_importer.py index c6198173f23a..a9f91076e437 100644 --- a/tests/shared/importers/test_importer.py +++ b/tests/shared/importers/test_importer.py @@ -1,6 +1,7 @@ import cProfile import os import sys +import time from pathlib import Path from typing import Text, Dict, Type, List, Any @@ -421,3 +422,22 @@ async def _do_training( profile.disable() profile.dump_stats("./test_inference2.prof") + + +def test_load_multiwoz_with_huggingface(): + # You need to install the library for this first: `pip install datasets` + import datasets + + # You can view all datasets using + # all_datasets = datasets.list_datasets() + + start = time.time() + profile = cProfile.Profile() + profile.enable() + + woz = datasets.load_dataset("multi_woz_v22", ignore_verifications=True) + + profile.disable() + profile.dump_stats("./profiling_multiwoz_huggingface.prof") + + print(f"Loading this took {time.time() - start} seconds.") From dc9f06d709317164a69df8dc238bbfa6af1ef486 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Fri, 12 Feb 2021 16:56:56 +0100 Subject: [PATCH 4/5] speed up yaml reading with hacks --- rasa/model.py | 51 ++++++++++++------- .../story_reader/yaml_story_reader.py | 11 +++- .../nlu/training_data/formats/rasa_yaml.py | 10 +++- rasa/shared/utils/io.py | 4 +- tests/shared/importers/test_importer.py | 13 ++++- 5 files changed, 64 insertions(+), 25 deletions(-) diff --git a/rasa/model.py b/rasa/model.py index ff057cf08596..90215f24ba3d 100644 --- a/rasa/model.py +++ b/rasa/model.py @@ -323,28 +323,43 @@ async def model_fingerprint(file_importer: "TrainingDataImporter") -> Fingerprin domain.templates = {} return { - FINGERPRINT_CONFIG_KEY: _get_fingerprint_of_config( - config, exclude_keys=CONFIG_KEYS - ), - FINGERPRINT_CONFIG_CORE_KEY: _get_fingerprint_of_config( - config, include_keys=CONFIG_KEYS_CORE - ), - FINGERPRINT_CONFIG_NLU_KEY: _get_fingerprint_of_config( - config, include_keys=CONFIG_KEYS_NLU - ), - FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY: _get_fingerprint_of_config_without_epochs( - config - ), - FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY: domain.fingerprint(), - FINGERPRINT_NLG_KEY: rasa.shared.utils.io.deep_container_fingerprint(responses), - FINGERPRINT_PROJECT: project_fingerprint(), - FINGERPRINT_NLU_DATA_KEY: nlu_data.fingerprint(), - FINGERPRINT_NLU_LABELS_KEY: nlu_data.label_fingerprint(), - FINGERPRINT_STORIES_KEY: stories.fingerprint(), + FINGERPRINT_CONFIG_KEY: 1, + FINGERPRINT_CONFIG_CORE_KEY: 2, + FINGERPRINT_CONFIG_NLU_KEY: 3, + FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY: 4, + FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY: 5, + FINGERPRINT_NLG_KEY: 5, + FINGERPRINT_PROJECT: 6, + FINGERPRINT_NLU_DATA_KEY: 6, + FINGERPRINT_NLU_LABELS_KEY: 7, + FINGERPRINT_STORIES_KEY: 8, FINGERPRINT_TRAINED_AT_KEY: time.time(), FINGERPRINT_RASA_VERSION_KEY: rasa.__version__, } + # return { + # FINGERPRINT_CONFIG_KEY: _get_fingerprint_of_config( + # config, exclude_keys=CONFIG_KEYS + # ), + # FINGERPRINT_CONFIG_CORE_KEY: _get_fingerprint_of_config( + # config, include_keys=CONFIG_KEYS_CORE + # ), + # FINGERPRINT_CONFIG_NLU_KEY: _get_fingerprint_of_config( + # config, include_keys=CONFIG_KEYS_NLU + # ), + # FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY: _get_fingerprint_of_config_without_epochs( + # config + # ), + # FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY: domain.fingerprint(), + # FINGERPRINT_NLG_KEY: rasa.shared.utils.io.deep_container_fingerprint(responses), + # FINGERPRINT_PROJECT: project_fingerprint(), + # FINGERPRINT_NLU_DATA_KEY: nlu_data.fingerprint(), + # FINGERPRINT_NLU_LABELS_KEY: nlu_data.label_fingerprint(), + # FINGERPRINT_STORIES_KEY: stories.fingerprint(), + # FINGERPRINT_TRAINED_AT_KEY: time.time(), + # FINGERPRINT_RASA_VERSION_KEY: rasa.__version__, + # } + def _get_fingerprint_of_config( config: Optional[Dict[Text, Any]], diff --git a/rasa/shared/core/training_data/story_reader/yaml_story_reader.py b/rasa/shared/core/training_data/story_reader/yaml_story_reader.py index e1cb446e8971..d901becd3faf 100644 --- a/rasa/shared/core/training_data/story_reader/yaml_story_reader.py +++ b/rasa/shared/core/training_data/story_reader/yaml_story_reader.py @@ -178,8 +178,15 @@ def is_key_in_yaml(cls, file_path: Union[Text, Path], *keys: Text) -> bool: YamlException: if the file seems to be a YAML file (extension) but can not be read / parsed. """ - content = rasa.shared.utils.io.read_yaml_file(file_path) - return any(key in content for key in keys) + + return cls.is_key_in_yaml2(file_path, *keys) + # content = rasa.shared.utils.io.read_yaml_file(file_path) + # return any(key in content for key in keys) + + @classmethod + def is_key_in_yaml2(cls, file_path: Union[Text, Path], *keys: Text) -> bool: + with open(file_path) as f: + return any(any(line.startswith(key) for key in keys) for line in f) @classmethod def _has_test_prefix(cls, file_path: Text) -> bool: diff --git a/rasa/shared/nlu/training_data/formats/rasa_yaml.py b/rasa/shared/nlu/training_data/formats/rasa_yaml.py index d8d6f480e5a2..f8718fbc36e2 100644 --- a/rasa/shared/nlu/training_data/formats/rasa_yaml.py +++ b/rasa/shared/nlu/training_data/formats/rasa_yaml.py @@ -350,9 +350,15 @@ def is_yaml_nlu_file(filename: Text) -> bool: if not rasa.shared.data.is_likely_yaml_file(filename): return False - content = rasa.shared.utils.io.read_yaml_file(filename) + return RasaYAMLReader.is_key_in_yaml2(filename, KEY_NLU, KEY_RESPONSES) + # content = rasa.shared.utils.io.read_yaml_file(filename) - return any(key in content for key in {KEY_NLU, KEY_RESPONSES}) + # return any(key in content for key in {KEY_NLU, KEY_RESPONSES}) + + @classmethod + def is_key_in_yaml2(cls, file_path: Union[Text, Path], *keys: Text) -> bool: + with open(file_path) as f: + return any(any(line.startswith(key) for key in keys) for line in f) class RasaYAMLWriter(TrainingDataWriter): diff --git a/rasa/shared/utils/io.py b/rasa/shared/utils/io.py index 008b5dc068a1..5316b1ec5948 100644 --- a/rasa/shared/utils/io.py +++ b/rasa/shared/utils/io.py @@ -328,9 +328,9 @@ def read_yaml(content: Text, reader_type: Union[Text, List[Text]] = "safe") -> A Raises: ruamel.yaml.parser.ParserError: If there was an error when parsing the YAML. """ - fix_yaml_loader() + # fix_yaml_loader() - replace_environment_variables() + # replace_environment_variables() yaml_parser = yaml.YAML(typ=reader_type) yaml_parser.version = YAML_VERSION diff --git a/tests/shared/importers/test_importer.py b/tests/shared/importers/test_importer.py index a9f91076e437..14ab072524be 100644 --- a/tests/shared/importers/test_importer.py +++ b/tests/shared/importers/test_importer.py @@ -1,9 +1,11 @@ +import asyncio import cProfile import os import sys import time from pathlib import Path from typing import Text, Dict, Type, List, Any +from unittest.mock import Mock import pytest from _pytest.monkeypatch import MonkeyPatch @@ -408,9 +410,18 @@ async def _do_training( nlu_data = await file_importer.get_nlu_data() config = await file_importer.get_config() + # YAML validation is slow as fuck + import rasa.shared.utils.validation + + monkeypatch.setattr( + rasa.shared.utils.validation, + rasa.shared.utils.validation.validate_yaml_schema.__name__, + Mock(), + ) + # skip actual training + # monkeypatch.setattr(sys.modules["rasa.train"], "_do_training", _do_training) monkeypatch.setattr(sys.modules["rasa.train"], "_do_training", _do_training) - profile = cProfile.Profile() profile.enable() From 66ff48672d02b1983dd146da5bb9adbcc5dd803c Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Mon, 15 Mar 2021 16:17:17 +0100 Subject: [PATCH 5/5] revert custom changes --- rasa/model.py | 51 +++++++------------ .../nlu/training_data/formats/rasa_yaml.py | 10 +--- 2 files changed, 20 insertions(+), 41 deletions(-) diff --git a/rasa/model.py b/rasa/model.py index ecf014bb9989..6a5e2a2e683b 100644 --- a/rasa/model.py +++ b/rasa/model.py @@ -323,43 +323,28 @@ async def model_fingerprint(file_importer: "TrainingDataImporter") -> Fingerprin domain.responses = {} return { - FINGERPRINT_CONFIG_KEY: 1, - FINGERPRINT_CONFIG_CORE_KEY: 2, - FINGERPRINT_CONFIG_NLU_KEY: 3, - FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY: 4, - FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY: 5, - FINGERPRINT_NLG_KEY: 5, - FINGERPRINT_PROJECT: 6, - FINGERPRINT_NLU_DATA_KEY: 6, - FINGERPRINT_NLU_LABELS_KEY: 7, - FINGERPRINT_STORIES_KEY: 8, + FINGERPRINT_CONFIG_KEY: _get_fingerprint_of_config( + config, exclude_keys=CONFIG_KEYS + ), + FINGERPRINT_CONFIG_CORE_KEY: _get_fingerprint_of_config( + config, include_keys=CONFIG_KEYS_CORE + ), + FINGERPRINT_CONFIG_NLU_KEY: _get_fingerprint_of_config( + config, include_keys=CONFIG_KEYS_NLU + ), + FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY: _get_fingerprint_of_config_without_epochs( + config + ), + FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY: domain.fingerprint(), + FINGERPRINT_NLG_KEY: rasa.shared.utils.io.deep_container_fingerprint(responses), + FINGERPRINT_PROJECT: project_fingerprint(), + FINGERPRINT_NLU_DATA_KEY: nlu_data.fingerprint(), + FINGERPRINT_NLU_LABELS_KEY: nlu_data.label_fingerprint(), + FINGERPRINT_STORIES_KEY: stories.fingerprint(), FINGERPRINT_TRAINED_AT_KEY: time.time(), FINGERPRINT_RASA_VERSION_KEY: rasa.__version__, } - # return { - # FINGERPRINT_CONFIG_KEY: _get_fingerprint_of_config( - # config, exclude_keys=CONFIG_KEYS - # ), - # FINGERPRINT_CONFIG_CORE_KEY: _get_fingerprint_of_config( - # config, include_keys=CONFIG_KEYS_CORE - # ), - # FINGERPRINT_CONFIG_NLU_KEY: _get_fingerprint_of_config( - # config, include_keys=CONFIG_KEYS_NLU - # ), - # FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY: _get_fingerprint_of_config_without_epochs( - # config - # ), - # FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY: domain.fingerprint(), - # FINGERPRINT_NLG_KEY: rasa.shared.utils.io.deep_container_fingerprint(responses), - # FINGERPRINT_PROJECT: project_fingerprint(), - # FINGERPRINT_NLU_DATA_KEY: nlu_data.fingerprint(), - # FINGERPRINT_NLU_LABELS_KEY: nlu_data.label_fingerprint(), - # FINGERPRINT_STORIES_KEY: stories.fingerprint(), - # FINGERPRINT_TRAINED_AT_KEY: time.time(), - # FINGERPRINT_RASA_VERSION_KEY: rasa.__version__, - # } - def _get_fingerprint_of_config( config: Optional[Dict[Text, Any]], diff --git a/rasa/shared/nlu/training_data/formats/rasa_yaml.py b/rasa/shared/nlu/training_data/formats/rasa_yaml.py index ee5f8ba9ecab..ed54e58a8f23 100644 --- a/rasa/shared/nlu/training_data/formats/rasa_yaml.py +++ b/rasa/shared/nlu/training_data/formats/rasa_yaml.py @@ -351,15 +351,9 @@ def is_yaml_nlu_file(filename: Text) -> bool: if not rasa.shared.data.is_likely_yaml_file(filename): return False - return RasaYAMLReader.is_key_in_yaml2(filename, KEY_NLU, KEY_RESPONSES) - # content = rasa.shared.utils.io.read_yaml_file(filename) + content = rasa.shared.utils.io.read_yaml_file(filename) - # return any(key in content for key in {KEY_NLU, KEY_RESPONSES}) - - @classmethod - def is_key_in_yaml2(cls, file_path: Union[Text, Path], *keys: Text) -> bool: - with open(file_path) as f: - return any(any(line.startswith(key) for key in keys) for line in f) + return any(key in content for key in {KEY_NLU, KEY_RESPONSES}) class RasaYAMLWriter(TrainingDataWriter):