Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

profile training data loading #7944

Closed
wants to merge 6 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions tests/shared/importers/test_importer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
import asyncio
import cProfile
import os
import sys
import time
from pathlib import Path
from typing import Text, Dict, Type, List, Any
from unittest.mock import Mock

import pytest
from _pytest.monkeypatch import MonkeyPatch

from rasa.shared.constants import (
DEFAULT_CONFIG_PATH,
Expand All @@ -24,6 +30,7 @@
from rasa.shared.importers.rasa import RasaFileImporter
from rasa.shared.nlu.constants import ACTION_TEXT, ACTION_NAME, INTENT, TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.train import train_async


@pytest.fixture()
Expand Down Expand Up @@ -350,3 +357,98 @@ async def test_nlu_data_domain_sync_responses(project: Text):

# Responses were synced between "test_responses.yml" and "domain.yml"
assert "utter_rasa" in domain.responses.keys()


@pytest.mark.timeout(0)
async def test_profile_training_data_loading():
    """Profile loading a full training data set through the importer.

    The data set directory is taken from the ``DATASET_PATH`` environment
    variable and falls back to ``~/Workspace/training-data/public/MultiWOZ/rasa``.
    It is expected to contain ``config.yml``, ``domain.yml`` and a ``data``
    entry. The collected profile is written to ``./test_inference.prof`` in
    the current working directory.
    """
    dataset = Path(
        os.environ.get(
            "DATASET_PATH",
            str(
                Path.home()
                / "Workspace"
                / "training-data"
                / "public"
                / "MultiWOZ"
                / "rasa"
            ),
        )
    )

    profile = cProfile.Profile()
    profile.enable()
    try:
        importer = TrainingDataImporter.load_from_dict(
            config_path=str(dataset / "config.yml"),
            domain_path=str(dataset / "domain.yml"),
            training_data_paths=[str(dataset / "data")],
        )

        # Access data to make sure all steps were performed. The results are
        # not needed; only the work of producing them should be profiled.
        await importer.get_domain()
        await importer.get_stories()
        await importer.get_nlu_data()
        await importer.get_config()
    finally:
        # Always switch the profiler off so that a failing load does not leave
        # it enabled for whatever runs afterwards in the same process.
        profile.disable()

    profile.dump_stats("./test_inference.prof")


@pytest.mark.timeout(0)
async def test_profile_training_data_loading2(monkeypatch: MonkeyPatch):
    """Profile the data loading done by ``train_async`` without training.

    ``rasa.train._do_training`` is replaced by a stub that only reads the
    domain, stories, NLU data and config from the importer it receives, and
    YAML schema validation is replaced by a mock. The data set directory is
    taken from the ``DATASET_PATH`` environment variable and falls back to
    ``~/Workspace/training-data/public/MultiWOZ/rasa``. The collected profile
    is written to ``./test_inference2.prof`` in the current working directory.

    Args:
        monkeypatch: Pytest fixture used to install the replacements; they are
            undone automatically when the test finishes.
    """
    dataset = Path(
        os.environ.get(
            "DATASET_PATH",
            str(
                Path.home()
                / "Workspace"
                / "training-data"
                / "public"
                / "MultiWOZ"
                / "rasa"
            ),
        )
    )

    async def _do_training(
        file_importer: TrainingDataImporter, *args: Any, **kwargs: Any
    ) -> None:
        """Stand-in for the real training step: only load the data."""
        # Access data to make sure all steps were performed
        await file_importer.get_domain()
        await file_importer.get_stories()
        await file_importer.get_nlu_data()
        await file_importer.get_config()

    # YAML schema validation is very slow, so it is replaced by a no-op mock
    # to keep it out of the profile.
    import rasa.shared.utils.validation

    monkeypatch.setattr(
        rasa.shared.utils.validation,
        rasa.shared.utils.validation.validate_yaml_schema.__name__,
        Mock(),
    )

    # Skip actual training
    monkeypatch.setattr(sys.modules["rasa.train"], "_do_training", _do_training)

    profile = cProfile.Profile()
    profile.enable()
    try:
        await train_async(
            domain=str(dataset / "domain.yml"),
            config=str(dataset / "config.yml"),
            training_files=str(dataset / "data"),
        )
    finally:
        # Always switch the profiler off so that a failing run does not leave
        # it enabled for whatever runs afterwards in the same process.
        profile.disable()

    profile.dump_stats("./test_inference2.prof")


def test_load_multiwoz_with_huggingface():
    """Profile and time loading ``multi_woz_v22`` via the ``datasets`` library.

    The test is skipped when the optional ``datasets`` package is not
    installed (``pip install datasets``). The collected profile is written to
    ``./profiling_multiwoz_huggingface.prof`` and the elapsed loading time is
    printed.
    """
    # `datasets` is an optional dependency: skip rather than error out with an
    # ImportError when it is not available.
    datasets = pytest.importorskip("datasets")

    # Tip: `datasets.list_datasets()` lists all available data sets.

    profile = cProfile.Profile()
    # Use a monotonic high-resolution clock and time only the load itself,
    # not the writing of the profile file.
    start = time.perf_counter()
    profile.enable()
    try:
        datasets.load_dataset("multi_woz_v22", ignore_verifications=True)
    finally:
        # Always switch the profiler off, even if loading fails.
        profile.disable()
    elapsed = time.perf_counter() - start

    profile.dump_stats("./profiling_multiwoz_huggingface.prof")

    print(f"Loading this took {elapsed} seconds.")