From c4199571742756d7c4295401fa63753dcd1d9de1 Mon Sep 17 00:00:00 2001
From: ClemDoum
Date: Wed, 21 Nov 2018 11:23:13 +0100
Subject: [PATCH 01/24] Improve logging in CLI

---
 snips_nlu/cli/inference.py | 12 ++++++++++--
 snips_nlu/cli/metrics.py   | 21 +++++++++++++++++----
 snips_nlu/cli/training.py  |  2 +-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/snips_nlu/cli/inference.py b/snips_nlu/cli/inference.py
index cfcfe2819..345fb2a5d 100644
--- a/snips_nlu/cli/inference.py
+++ b/snips_nlu/cli/inference.py
@@ -1,19 +1,27 @@
 from __future__ import unicode_literals, print_function

 import json
+import logging
+
 from builtins import input

 import plac

 from snips_nlu import SnipsNLUEngine
+from snips_nlu.cli.utils import set_nlu_logger


 @plac.annotations(
     training_path=("Path to a trained engine", "positional", None, str),
     query=("Query to parse. If provided, it disables the interactive "
-           "behavior.", "option", "q", str))
-def parse(training_path, query):
+           "behavior.", "option", "q", str),
+    verbose=("Print logs", "flag", "v"),
+)
+def parse(training_path, query, verbose=False):
     """Load a trained NLU engine and play with its parsing API
     interactively"""
+    if verbose:
+        set_nlu_logger(logging.DEBUG)
+
     engine = SnipsNLUEngine.from_path(training_path)

     if query:

diff --git a/snips_nlu/cli/metrics.py b/snips_nlu/cli/metrics.py
index a383970ab..e76ea3766 100644
--- a/snips_nlu/cli/metrics.py
+++ b/snips_nlu/cli/metrics.py
@@ -1,11 +1,14 @@
 from __future__ import print_function, unicode_literals

 import json
+import logging
+
 from pathlib import Path

 import plac

 from snips_nlu import SnipsNLUEngine, load_resources
+from snips_nlu.cli.utils import set_nlu_logger
 from snips_nlu.utils import json_string


@@ -38,10 +41,15 @@ def parse(self, text):
                       "(between 0 and 1)", "option", "t", float),
     exclude_slot_metrics=("Exclude slot metrics and slot errors in the output",
                           "flag", "s", bool),
-    include_errors=("Include parsing errors in the output", "flag", "i", bool))
+    include_errors=("Include parsing errors in the output", "flag", "i", bool),
+    verbose=("Print logs", "flag", "v"),
+)
 def cross_val_metrics(dataset_path, output_path, config_path=None, nb_folds=5,
                       train_size_ratio=1.0, exclude_slot_metrics=False,
-                      include_errors=False):
+                      include_errors=False, verbose=False):
+    if verbose:
+        set_nlu_logger(logging.DEBUG)
+
     def progression_handler(progress):
         print("%d%%" % int(progress * 100))

@@ -84,10 +92,15 @@ def progression_handler(progress):
     config_path=("Path to a NLU engine config file", "option", "c", str),
     exclude_slot_metrics=("Exclude slot metrics and slot errors in the output",
                           "flag", "s", bool),
-    include_errors=("Include parsing errors in the output", "flag", "i", bool))
+    include_errors=("Include parsing errors in the output", "flag", "i", bool),
+    verbose=("Print logs", "flag", "v"),
+)
 def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
                        config_path=None, exclude_slot_metrics=False,
-                       include_errors=False):
+                       include_errors=False, verbose=False):
+    if verbose:
+        set_nlu_logger(logging.DEBUG)
+
     if config_path is not None:
         with Path(config_path).open("r", encoding="utf-8") as f:
             config = json.load(f)

diff --git a/snips_nlu/cli/training.py b/snips_nlu/cli/training.py
index 682309220..72a12d106 100644
--- a/snips_nlu/cli/training.py
+++ b/snips_nlu/cli/training.py
@@ -20,7 +20,7 @@ def train(dataset_path, output_path, config_path, verbose):
     """Train an NLU engine on the provided dataset"""
     if verbose:
-        set_nlu_logger(logging.INFO)
+        set_nlu_logger(logging.DEBUG)

     with Path(dataset_path).open("r", encoding="utf8") as f:
         dataset = json.load(f)
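Usage sketch for the new `verbose` flags above — a minimal standalone plac
program showing how a ("...", "flag", "v") annotation maps to a boolean
parameter. This illustrates the plac API only; the script name is
hypothetical and it is not part of the patch series:

    import plac

    @plac.annotations(verbose=("Print logs", "flag", "v"))
    def main(verbose=False):
        # plac turns the annotation into a -v switch; the parameter
        # stays False unless the flag is passed on the command line.
        print("verbose:", verbose)

    if __name__ == "__main__":
        plac.call(main)  # e.g. `python main.py -v` prints "verbose: True"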
From ba36802c3d732d1f40067ab274ae2ba9b79f7d75 Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Wed, 21 Nov 2018 17:51:25 +0100
Subject: [PATCH 02/24] Stream logs to stdout instead of stderr

---
 snips_nlu/cli/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snips_nlu/cli/utils.py b/snips_nlu/cli/utils.py
index a31f19e2a..e04106029 100644
--- a/snips_nlu/cli/utils.py
+++ b/snips_nlu/cli/utils.py
@@ -105,6 +105,6 @@ def check_resources_alias(resource_name, shortcuts):
 def set_nlu_logger(level=logging.INFO):
     logger = logging.getLogger(snips_nlu.__name__)
     logger.setLevel(level)
-    handler = logging.StreamHandler()
+    handler = logging.StreamHandler(sys.stdout)
     handler.setLevel(level)
     logger.addHandler(handler)
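The change above only affects where log records go. A minimal sketch of the
resulting behavior, using only the standard logging module (the
"snips_nlu" logger name comes from snips_nlu.__name__ in the code above):

    import logging
    import sys

    logger = logging.getLogger("snips_nlu")
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler(sys.stdout)  # stdout, so logs can be piped
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)

    logger.debug("this record is streamed to stdout, not stderr")

Without the sys.stdout argument, logging.StreamHandler defaults to
sys.stderr, which is why the patch passes the stream explicitly.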
From 72f67e0e0fc99e0fd68ae22b616ac1aa2f5836fe Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Wed, 14 Nov 2018 16:05:01 +0100
Subject: [PATCH 03/24] Move dataset utils into dedicated package

---
 snips_nlu/dataset/__init__.py                 |  4 ++
 snips_nlu/dataset/utils.py                    | 38 ++++++++++++
 .../{dataset.py => dataset/validation.py}     | 61 +++++--------------
 snips_nlu/tests/test_dataset.py               | 17 +++---
 4 files changed, 65 insertions(+), 55 deletions(-)
 create mode 100644 snips_nlu/dataset/__init__.py
 create mode 100644 snips_nlu/dataset/utils.py
 rename snips_nlu/{dataset.py => dataset/validation.py} (76%)

diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py
new file mode 100644
index 000000000..20b840aa2
--- /dev/null
+++ b/snips_nlu/dataset/__init__.py
@@ -0,0 +1,4 @@
+from snips_nlu.dataset.utils import (
+    extract_intent_entities, extract_utterance_entities,
+    get_dataset_gazetteer_entities, get_text_from_chunks)
+from snips_nlu.dataset.validation import validate_and_format_dataset

diff --git a/snips_nlu/dataset/utils.py b/snips_nlu/dataset/utils.py
new file mode 100644
index 000000000..2047bf716
--- /dev/null
+++ b/snips_nlu/dataset/utils.py
@@ -0,0 +1,38 @@
+from future.utils import iteritems, itervalues
+
+from snips_nlu.constants import (
+    DATA, ENTITIES, ENTITY, INTENTS, TEXT, UTTERANCES)
+from snips_nlu.entity_parser.builtin_entity_parser import is_gazetteer_entity
+
+
+def extract_utterance_entities(dataset):
+    entities_values = {ent_name: set() for ent_name in dataset[ENTITIES]}
+
+    for intent in itervalues(dataset[INTENTS]):
+        for utterance in intent[UTTERANCES]:
+            for chunk in utterance[DATA]:
+                if ENTITY in chunk:
+                    entities_values[chunk[ENTITY]].add(chunk[TEXT].strip())
+    return {k: list(v) for k, v in iteritems(entities_values)}
+
+
+def extract_intent_entities(dataset, entity_filter=None):
+    intent_entities = {intent: set() for intent in dataset[INTENTS]}
+    for intent_name, intent_data in iteritems(dataset[INTENTS]):
+        for utterance in intent_data[UTTERANCES]:
+            for chunk in utterance[DATA]:
+                if ENTITY in chunk:
+                    if entity_filter and not entity_filter(chunk[ENTITY]):
+                        continue
+                    intent_entities[intent_name].add(chunk[ENTITY])
+    return intent_entities
+
+
+def get_text_from_chunks(chunks):
+    return "".join(chunk[TEXT] for chunk in chunks)
+
+
+def get_dataset_gazetteer_entities(dataset, intent=None):
+    if intent is not None:
+        return extract_intent_entities(dataset, is_gazetteer_entity)[intent]
+    return {e for e in dataset[ENTITIES] if is_gazetteer_entity(e)}
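A usage sketch for the relocated helpers — the dataset dict below is a
hypothetical minimal example, assuming the usual snips-nlu JSON layout with
lowercase "intents"/"utterances"/"data"/"entity" keys (matching the
constants imported above):

    from snips_nlu.dataset import extract_intent_entities

    dataset = {
        "entities": {"location": {}},
        "intents": {
            "getWeather": {
                "utterances": [
                    {"data": [
                        {"text": "weather in "},
                        {"text": "Paris", "entity": "location",
                         "slot_name": "weatherLocation"},
                    ]}
                ]
            }
        },
    }
    # Maps each intent to the set of entities used in its utterances
    print(extract_intent_entities(dataset))  # {'getWeather': {'location'}}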
diff --git a/snips_nlu/dataset.py b/snips_nlu/dataset/validation.py
similarity index 76%
rename from snips_nlu/dataset.py
rename to snips_nlu/dataset/validation.py
index 22fba7c30..ec2c135c2 100644
--- a/snips_nlu/dataset.py
+++ b/snips_nlu/dataset/validation.py
@@ -12,36 +12,14 @@
     AUTOMATICALLY_EXTENSIBLE, CAPITALIZE, DATA, ENTITIES, ENTITY, INTENTS,
     LANGUAGE, MATCHING_STRICTNESS, SLOT_NAME, SYNONYMS, TEXT, USE_SYNONYMS,
     UTTERANCES, VALIDATED, VALUE)
+from snips_nlu.dataset import extract_utterance_entities
 from snips_nlu.entity_parser.builtin_entity_parser import (
-    BuiltinEntityParser, is_builtin_entity, is_gazetteer_entity)
+    BuiltinEntityParser, is_builtin_entity)
 from snips_nlu.preprocessing import tokenize_light
 from snips_nlu.string_variations import get_string_variations
 from snips_nlu.utils import validate_key, validate_keys, validate_type


-def extract_utterance_entities(dataset):
-    entities_values = {ent_name: set() for ent_name in dataset[ENTITIES]}
-
-    for intent in itervalues(dataset[INTENTS]):
-        for utterance in intent[UTTERANCES]:
-            for chunk in utterance[DATA]:
-                if ENTITY in chunk:
-                    entities_values[chunk[ENTITY]].add(chunk[TEXT].strip())
-    return {k: list(v) for k, v in iteritems(entities_values)}
-
-
-def extract_intent_entities(dataset, entity_filter=None):
-    intent_entities = {intent: set() for intent in dataset[INTENTS]}
-    for intent_name, intent_data in iteritems(dataset[INTENTS]):
-        for utterance in intent_data[UTTERANCES]:
-            for chunk in utterance[DATA]:
-                if ENTITY in chunk:
-                    if entity_filter and not entity_filter(chunk[ENTITY]):
-                        continue
-                    intent_entities[intent_name].add(chunk[ENTITY])
-    return intent_entities
-
-
 def validate_and_format_dataset(dataset):
     """Checks that the dataset is valid and formats it"""
     # Make this function idempotent
@@ -61,7 +39,7 @@ def validate_and_format_dataset(dataset):
         raise ValueError("Unknown language: '%s'" % language)

     for intent in itervalues(dataset[INTENTS]):
-        validate_and_format_intent(intent, dataset[ENTITIES])
+        _validate_and_format_intent(intent, dataset[ENTITIES])

     utterance_entities_values = extract_utterance_entities(dataset)
     builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)
@@ -70,15 +48,16 @@
         utterance_entities = utterance_entities_values[entity_name]
         if is_builtin_entity(entity_name):
             dataset[ENTITIES][entity_name] = \
-                validate_and_format_builtin_entity(entity, utterance_entities)
+                _validate_and_format_builtin_entity(entity, utterance_entities)
         else:
-            dataset[ENTITIES][entity_name] = validate_and_format_custom_entity(
+            dataset[ENTITIES][
+                entity_name] = _validate_and_format_custom_entity(
                 entity, utterance_entities, language, builtin_entity_parser)
     dataset[VALIDATED] = True
     return dataset


-def validate_and_format_intent(intent, entities):
+def _validate_and_format_intent(intent, entities):
     validate_type(intent, dict)
     validate_key(intent, UTTERANCES, object_label="intent dict")
     validate_type(intent[UTTERANCES], list)
@@ -100,11 +79,7 @@ def _validate_and_format_intent(intent, entities):
     return intent


-def get_text_from_chunks(chunks):
-    return "".join(chunk[TEXT] for chunk in chunks)
-
-
-def has_any_capitalization(entity_utterances, language):
+def _has_any_capitalization(entity_utterances, language):
     for utterance in entity_utterances:
         tokens = tokenize_light(utterance, language)
         if any(t.isupper() or t.istitle() for t in tokens):
@@ -112,7 +87,7 @@
     return False


-def add_entity_variations(utterances, entity_variations, entity_value):
+def _add_entity_variations(utterances, entity_variations, entity_value):
     utterances[entity_value] = entity_value
     for variation in entity_variations[entity_value]:
         if variation:
@@ -129,8 +104,8 @@ def _extract_entity_values(entity):
     return values


-def validate_and_format_custom_entity(entity, queries_entities, language,
-                                      builtin_entity_parser):
+def _validate_and_format_custom_entity(entity, queries_entities, language,
+                                       builtin_entity_parser):
     validate_type(entity, dict)

     # TODO: this is here temporarily, only to allow backward compatibility
@@ -169,8 +144,8 @@
     # Compute capitalization before normalizing
     # Normalization lowercases and hence leads to bad capitalization
     # calculation
-    formatted_entity[CAPITALIZE] = has_any_capitalization(queries_entities,
-                                                          language)
+    formatted_entity[CAPITALIZE] = _has_any_capitalization(queries_entities,
+                                                           language)

     validated_utterances = dict()
     # Map original values and synonyms
@@ -208,7 +183,7 @@
     for entry in entity[DATA]:
         entry_value = entry[VALUE]
-        validated_utterances = add_entity_variations(
+        validated_utterances = _add_entity_variations(
             validated_utterances, non_colliding_variations, entry_value)

     # Merge queries entities
@@ -227,12 +202,6 @@
     return formatted_entity


-def validate_and_format_builtin_entity(entity, queries_entities):
+def _validate_and_format_builtin_entity(entity, queries_entities):
     validate_type(entity, dict)
     return {UTTERANCES: set(queries_entities)}
-
-
-def get_dataset_gazetteer_entities(dataset, intent=None):
-    if intent is not None:
-        return extract_intent_entities(dataset, is_gazetteer_entity)[intent]
-    return {e for e in dataset[ENTITIES] if is_gazetteer_entity(e)}
@mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_remove_empty_entities_value_and_empty_synonyms( self, mocked_get_string_variations): # Given @@ -576,7 +575,7 @@ def mock_get_string_variations(variation, language, # Then self.assertEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_add_capitalize_field( self, mocked_get_string_variations): # Given @@ -752,7 +751,7 @@ def mock_get_string_variations(variation, language, # Then self.assertDictEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_normalize_synonyms( self, mocked_get_string_variations): # Given @@ -827,7 +826,7 @@ def mock_get_string_variations(variation, language, # Then self.assertDictEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_dataset_should_handle_synonyms( self, mocked_get_string_variations): # Given From 48eeffebf80a2d478aadcc20ba278c755df45f21 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Thu, 15 Nov 2018 18:31:02 +0100 Subject: [PATCH 04/24] Simplify dataset parsing --- snips_nlu/cli/dataset/assistant_dataset.py | 11 +- snips_nlu/dataset/__init__.py | 2 + .../dataset/entities.py => dataset/entity.py} | 104 ++++++++------ .../intent_dataset.py => dataset/intent.py} | 132 ++++++------------ snips_nlu/tests/test_cli.py | 20 +-- 5 files changed, 120 insertions(+), 149 deletions(-) rename snips_nlu/{cli/dataset/entities.py => dataset/entity.py} (50%) rename snips_nlu/{cli/dataset/intent_dataset.py => dataset/intent.py} (61%) diff --git a/snips_nlu/cli/dataset/assistant_dataset.py b/snips_nlu/cli/dataset/assistant_dataset.py index 4a1982115..7961c34bc 100644 --- a/snips_nlu/cli/dataset/assistant_dataset.py +++ b/snips_nlu/cli/dataset/assistant_dataset.py @@ -1,10 +1,9 @@ # coding=utf-8 -from __future__ import unicode_literals, print_function +from __future__ import print_function, unicode_literals from pathlib import Path -from snips_nlu.cli.dataset.entities import CustomEntity, create_entity -from snips_nlu.cli.dataset.intent_dataset import IntentDataset +from snips_nlu.dataset import Entity, Intent class AssistantDataset(object): @@ -53,10 +52,10 @@ def from_files(cls, language, filenames): "'intent_' or 'entity_' but found: %s" % stem) - intents_datasets = [IntentDataset.from_file(f) + intents_datasets = [Intent.from_file(f) for f in intent_filepaths] - entities = [CustomEntity.from_file(f) for f in entity_filepaths] + entities = [Entity.from_file(f) for f in entity_filepaths] entity_names = set(e.name for e in entities) # Add entities appearing only in the intents data @@ -64,7 +63,7 @@ def from_files(cls, language, filenames): for entity_name in intent_data.entities_names: if entity_name not in entity_names: entity_names.add(entity_name) - entities.append(create_entity(entity_name)) + entities.append(Entity(name=entity_name)) return cls(language, intents_datasets, entities) @property diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py index 20b840aa2..89c0c9744 100644 --- a/snips_nlu/dataset/__init__.py +++ b/snips_nlu/dataset/__init__.py @@ -1,3 +1,5 @@ +from snips_nlu.dataset.entity import Entity, EntityFormatError +from snips_nlu.dataset.intent 
From 48eeffebf80a2d478aadcc20ba278c755df45f21 Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Thu, 15 Nov 2018 18:31:02 +0100
Subject: [PATCH 04/24] Simplify dataset parsing

---
 snips_nlu/cli/dataset/assistant_dataset.py    |  11 +-
 snips_nlu/dataset/__init__.py                 |   2 +
 .../dataset/entities.py => dataset/entity.py} | 104 ++++++++------
 .../intent_dataset.py => dataset/intent.py}   | 132 ++++++------------
 snips_nlu/tests/test_cli.py                   |  20 +--
 5 files changed, 120 insertions(+), 149 deletions(-)
 rename snips_nlu/{cli/dataset/entities.py => dataset/entity.py} (50%)
 rename snips_nlu/{cli/dataset/intent_dataset.py => dataset/intent.py} (61%)

diff --git a/snips_nlu/cli/dataset/assistant_dataset.py b/snips_nlu/cli/dataset/assistant_dataset.py
index 4a1982115..7961c34bc 100644
--- a/snips_nlu/cli/dataset/assistant_dataset.py
+++ b/snips_nlu/cli/dataset/assistant_dataset.py
@@ -1,10 +1,9 @@
 # coding=utf-8
-from __future__ import unicode_literals, print_function
+from __future__ import print_function, unicode_literals

 from pathlib import Path

-from snips_nlu.cli.dataset.entities import CustomEntity, create_entity
-from snips_nlu.cli.dataset.intent_dataset import IntentDataset
+from snips_nlu.dataset import Entity, Intent


 class AssistantDataset(object):
@@ -53,10 +52,10 @@ def from_files(cls, language, filenames):
                 "'intent_' or 'entity_' but found: %s"
                 % stem)

-        intents_datasets = [IntentDataset.from_file(f)
+        intents_datasets = [Intent.from_file(f)
                             for f in intent_filepaths]

-        entities = [CustomEntity.from_file(f) for f in entity_filepaths]
+        entities = [Entity.from_file(f) for f in entity_filepaths]
         entity_names = set(e.name for e in entities)

         # Add entities appearing only in the intents data
@@ -64,7 +63,7 @@
             for entity_name in intent_data.entities_names:
                 if entity_name not in entity_names:
                     entity_names.add(entity_name)
-                    entities.append(create_entity(entity_name))
+                    entities.append(Entity(name=entity_name))
         return cls(language, intents_datasets, entities)

     @property

diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py
index 20b840aa2..89c0c9744 100644
--- a/snips_nlu/dataset/__init__.py
+++ b/snips_nlu/dataset/__init__.py
@@ -1,3 +1,5 @@
+from snips_nlu.dataset.entity import Entity, EntityFormatError
+from snips_nlu.dataset.intent import Intent, IntentFormatError
 from snips_nlu.dataset.utils import (
     extract_intent_entities, extract_utterance_entities,
     get_dataset_gazetteer_entities, get_text_from_chunks)
diff --git a/snips_nlu/cli/dataset/entities.py b/snips_nlu/dataset/entity.py
similarity index 50%
rename from snips_nlu/cli/dataset/entities.py
rename to snips_nlu/dataset/entity.py
index bb854f300..3668fc132 100644
--- a/snips_nlu/cli/dataset/entities.py
+++ b/snips_nlu/dataset/entity.py
@@ -3,58 +3,95 @@
 import csv
 import re
-from abc import ABCMeta, abstractmethod
 from pathlib import Path

 import six
-from future.utils import with_metaclass
+from snips_nlu_ontology import get_all_builtin_entities

 from snips_nlu.constants import (
     AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS, SYNONYMS,
     USE_SYNONYMS, VALUE)
-from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity

 AUTO_EXT_REGEX = re.compile(r'^#\sautomatically_extensible=(true|false)\s*$')


-class Entity(with_metaclass(ABCMeta, object)):
-    def __init__(self, name):
-        self.name = name
+class EntityFormatError(TypeError):
+    pass

-    @abstractmethod
-    def json(self):
-        pass

-
-class CustomEntity(Entity):
-    """Custom entity of an :class:`.AssistantDataset`
+class Entity(object):
+    """Entity of an :class:`.AssistantDataset`
+
+    This class can represent both a custom entity and a builtin entity

     Attributes:
+        name (str): name of the entity
         utterances (list of :class:`.EntityUtterance`): entity utterances
+            (only for custom entities)
         automatically_extensible (bool): whether or not the entity can be
-            extended to values not present in the dataset
+            extended to values not present in the dataset (only for custom
+            entities)
         use_synonyms (bool): whether or not to map entity values using
-            synonyms
+            synonyms (only for custom entities)
+        matching_strictness (float): controls the matching strictness of the
+            entity (only for custom entities)
     """

-    def __init__(self, name, utterances, automatically_extensible,
-                 use_synonyms, matching_strictness=1.0):
-        super(CustomEntity, self).__init__(name)
+    def __init__(self, name, utterances=None, automatically_extensible=True,
+                 use_synonyms=True, matching_strictness=1.0):
+        if utterances is None:
+            utterances = []
+        self.name = name
         self.utterances = utterances
         self.automatically_extensible = automatically_extensible
         self.use_synonyms = use_synonyms
         self.matching_strictness = matching_strictness

+    @property
+    def is_builtin(self):
+        return self.name in get_all_builtin_entities()
+
+    @classmethod
+    def from_yaml(cls, yaml_dict):
+        """Build an :class:`.Entity` from its YAML definition dict"""
+        object_type = yaml_dict.get("type")
+        if object_type and object_type != "entity":
+            raise EntityFormatError("Wrong type: '%s'" % object_type)
+        entity_name = yaml_dict.get("name")
+        if not entity_name:
+            raise EntityFormatError("No 'name' attribute found")
+        auto_extensible = yaml_dict.get(AUTOMATICALLY_EXTENSIBLE, True)
+        use_synonyms = yaml_dict.get(USE_SYNONYMS, True)
+        matching_strictness = yaml_dict.get("matching_strictness", 1.0)
+        utterances = []
+        for entity_value in yaml_dict.get("values", []):
+            if isinstance(entity_value, list):
+                utterance = EntityUtterance(entity_value[0], entity_value[1:])
+            elif isinstance(entity_value, str):
+                utterance = EntityUtterance(entity_value)
+            else:
+                raise EntityFormatError(
+                    "YAML entity values must be either strings or lists, but "
+                    "found: %s" % type(entity_value))
+            utterances.append(utterance)
+
+        cls(name=entity_name,
+            utterances=utterances,
+            automatically_extensible=auto_extensible,
+            use_synonyms=use_synonyms,
+            matching_strictness=matching_strictness)
+
     @classmethod
     def from_file(cls, filepath):
         filepath = Path(filepath)
         stem = filepath.stem
         if not stem.startswith("entity_"):
-            raise AssertionError("Entity filename should start with 'entity_' "
-                                 "but found: %s" % stem)
+            raise EntityFormatError(
+                "Entity filename should start with 'entity_' but found: %s"
+                % stem)
         entity_name = stem[7:]
         if not entity_name:
-            raise AssertionError("Entity name must not be empty")
+            raise EntityFormatError("Entity name must not be empty")
         utterances = []
         with filepath.open(encoding="utf-8") as f:
             it = f
@@ -82,6 +119,8 @@ def from_file(cls, filepath):
     @property
     def json(self):
         """Returns the entity in json format"""
+        if self.is_builtin:
+            return dict()
         return {
             AUTOMATICALLY_EXTENSIBLE: self.automatically_extensible,
             USE_SYNONYMS: self.use_synonyms,
@@ -109,25 +148,6 @@ def json(self):
         return {VALUE: self.value, SYNONYMS: self.synonyms}


-class BuiltinEntity(Entity):
-    """Builtin entity of an :class:`.AssistantDataset`"""
-
-    @property
-    def json(self):
-        return dict()
-
-
 def utf_8_encoder(f):
     for line in f:
         yield line.encode("utf-8")
-
-
-def create_entity(entity_name, utterances=None, automatically_extensible=True,
-                  use_synonyms=True):
-    if is_builtin_entity(entity_name):
-        return BuiltinEntity(entity_name)
-    else:
-        if utterances is None:
-            utterances = []
-        return CustomEntity(entity_name, utterances, automatically_extensible,
-                            use_synonyms)
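For reference, the text format read by Entity.from_file above is CSV-like:
one value per row, synonyms in the remaining columns, plus an optional
header matched by AUTO_EXT_REGEX. A hypothetical entity_city.txt:

    # automatically_extensible=false
    new york,big apple
    paris,city of lights
    london

which could then be loaded with (a sketch, assuming the file exists on disk):

    from snips_nlu.dataset import Entity

    entity = Entity.from_file("entity_city.txt")
    print(entity.name)                      # "city" (filename minus "entity_")
    print(entity.automatically_extensible)  # False, from the header line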
diff --git a/snips_nlu/cli/dataset/intent_dataset.py b/snips_nlu/dataset/intent.py
similarity index 61%
rename from snips_nlu/cli/dataset/intent_dataset.py
rename to snips_nlu/dataset/intent.py
index bebb87575..83cdfdf89 100644
--- a/snips_nlu/cli/dataset/intent_dataset.py
+++ b/snips_nlu/dataset/intent.py
@@ -1,4 +1,4 @@
-from __future__ import print_function, absolute_import
+from __future__ import absolute_import, print_function

 from abc import ABCMeta, abstractmethod
 from builtins import object
@@ -6,13 +6,18 @@

 from future.utils import with_metaclass

-from snips_nlu.constants import UTTERANCES, SLOT_NAME, ENTITY, TEXT, DATA
+from snips_nlu.constants import DATA, ENTITY, SLOT_NAME, TEXT, UTTERANCES

-INTENT_FORMATTING_ERROR = AssertionError(
+
+class IntentFormatError(TypeError):
+    pass
+
+
+INTENT_FORMATTING_ERROR = IntentFormatError(
     "Intent file is not properly formatted")


-class IntentDataset(object):
+class Intent(object):
     """Dataset of an intent

     Can parse utterances from a text file or an iterator.
@@ -33,17 +38,19 @@ class Intent(object):
     def __init__(self, intent_name):
         self.intent_name = intent_name
         self.utterances = []
+        self.slot_mapping = dict()

     @classmethod
     def from_file(cls, filepath):
         filepath = Path(filepath)
         stem = filepath.stem
         if not stem.startswith("intent_"):
-            raise AssertionError("Intent filename should start with 'intent_' "
-                                 "but found: %s" % stem)
+            raise IntentFormatError(
+                "Intent filename should start with 'intent_' but found: %s"
+                % stem)
         intent_name = stem[7:]
         if not intent_name:
-            raise AssertionError("Intent name must not be empty")
+            raise IntentFormatError("Intent name must not be empty")
         with filepath.open(encoding="utf-8") as f:
             lines = iter(l.strip() for l in f if l.strip())
             return cls.from_iter(intent_name, lines)
@@ -59,6 +66,9 @@ def from_iter(cls, intent_name, samples_iter):

     def add(self, utterance):
         """Adds an :class:`.IntentUtterance` to the dataset"""
+        for chunk in utterance.slot_chunks:
+            if chunk.name not in self.slot_mapping:
+                self.slot_mapping[chunk.name] = chunk.entity
         self.utterances.append(utterance)

     @property
@@ -79,58 +89,19 @@ def entities_names(self):

 class IntentUtterance(object):
-    def __init__(self, input, chunks):
-        self.input = input
+    def __init__(self, chunks):
         self.chunks = chunks

     @property
-    def annotated(self):
-        """Annotates the sentence with stars
+    def text(self):
+        return "".join((chunk.text for chunk in self.chunks))

-        Returns: The sentence annotated just with stars
-
-        Examples:
+    @property
+    def slot_chunks(self):
+        return (chunk for chunk in self.chunks if isinstance(chunk, SlotChunk))

-            >>> from snips_nlu.cli.dataset.intent_dataset import \
-                IntentUtterance
-            >>> p = "the [role:role](president) of [country:country](France)"
-            >>> u = IntentUtterance.parse(p)
-            >>> u.annotated
-            'the *president* of *France*'
-        """
-        binput = bytearray(self.input, 'utf-8')
-        acc = 0
-        star = ord('*')
-        for chunk in self.chunks:
-            if isinstance(chunk, SlotChunk):
-                binput.insert(chunk.range.start + acc, star)
-                binput.insert(chunk.range.end + acc + 1, star)
-                acc += 2
-        return binput.decode('utf-8')
-
-    @staticmethod
-    def stripped(input, chunks):
-        acc = 0
-        s = ''
-        new_chunks = []
-        for chunk in chunks:
-            start = chunk.range.start
-            end = chunk.range.end
-            s += input[start:end]
-            if isinstance(chunk, SlotChunk):
-                acc += chunk.tag_range.size
-                rng = Range(start - acc, end - acc)
-                new_chunk = SlotChunk(chunk.name, chunk.entity, rng,
-                                      chunk.text, chunk.tag_range)
-                new_chunks.append(new_chunk)
-                acc += 1
-            else:
-                rng = Range(start - acc, end - acc)
-                new_chunks.append(TextChunk(chunk.text, rng))
-        return s, new_chunks
-
-    @staticmethod
-    def parse(string):
+    @classmethod
+    def parse(cls, string):
         """Parses an utterance

         Args:
@@ -138,29 +109,28 @@ def parse(string):

         Examples:

-            >>> from snips_nlu.cli.dataset.intent_dataset import \
-                IntentUtterance
+            >>> from snips_nlu.dataset.intent import IntentUtterance
             >>> u = IntentUtterance.\
                parse("president of [country:default](France)")
+            >>> u.text
+            'president of France'
             >>> len(u.chunks)
             2
             >>> u.chunks[0].text
             'president of '
-            >>> u.chunks[0].range.start
-            0
-            >>> u.chunks[0].range.end
-            13
+            >>> u.chunks[1].name
+            'country'
+            >>> u.chunks[1].entity
+            'default'
         """
         sm = SM(string)
         capture_text(sm)
-        string, chunks = IntentUtterance.stripped(string, sm.chunks)
-        return IntentUtterance(string, chunks)
+        return cls(sm.chunks)


 class Chunk(with_metaclass(ABCMeta, object)):
-    def __init__(self, text, range):
+    def __init__(self, text):
         self.text = text
-        self.range = range

     @abstractmethod
     def json(self):
         pass
class SlotChunk(Chunk):
-    def __init__(self, slot_name, entity, range, text, tag_range):
-        super(SlotChunk, self).__init__(text, range)
+    def __init__(self, slot_name, entity, text):
+        super(SlotChunk, self).__init__(text)
         self.name = slot_name
         self.entity = entity
-        self.tag_range = tag_range

     @property
     def json(self):
@@ -191,16 +160,6 @@ def json(self):
         }


-class Range(object):
-    def __init__(self, start, end=None):
-        self.start = start
-        self.end = end
-
-    @property
-    def size(self):
-        return self.end - self.start + 1
-
-
 class SM(object):
     """State Machine for parsing"""

@@ -209,24 +168,19 @@ def __init__(self, input):
         self.chunks = []
         self.current = 0

-    def add_slot(self, slot_start, name, entity):
+    def add_slot(self, name, entity):
         """Adds a named slot

         Args:
-            slot_start (int): position where the slot tag started
             name (str): slot name
             entity (str): entity name
         """
-        tag_range = Range(slot_start - 1)
-        chunk = SlotChunk(slot_name=name, entity=entity, range=None, text=None,
-                          tag_range=tag_range)
+        chunk = SlotChunk(slot_name=name, entity=entity, text=None)
         self.chunks.append(chunk)

     def add_text(self, text):
         """Adds a simple text chunk using the current position"""
-        start = self.current
-        end = start + len(text)
-        chunk = TextChunk(text=text, range=Range(start=start, end=end))
+        chunk = TextChunk(text=text)
         self.chunks.append(chunk)

     def add_tagged(self, text):
@@ -234,10 +188,7 @@ def add_tagged(self, text):
         if not self.chunks:
             raise AssertionError("Cannot add tagged text because chunks list "
                                  "is empty")
-        chunk = self.chunks[-1]
-        chunk.text = text
-        chunk.tag_range.end = self.current - 1
-        chunk.range = Range(start=self.current, end=self.current + len(text))
+        self.chunks[-1].text = text

     def find(self, s):
         return self.input.find(s, self.current)
@@ -280,7 +231,6 @@ def capture_text(state):


 def capture_slot(state):
-    slot_start = state.current
     next_pos = state.find(':')
     if next_pos < 0:
         raise INTENT_FORMATTING_ERROR
@@ -292,7 +242,7 @@ def capture_slot(state):
         raise INTENT_FORMATTING_ERROR
     entity = state[:next_pos]
     state.move(next_pos)
-    state.add_slot(slot_start, slot_name, entity)
+    state.add_slot(slot_name, entity)
     if state.read() != '(':
         raise INTENT_FORMATTING_ERROR
     capture_tagged(state)
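A quick interactive sketch of the simplified chunk model above, mirroring
the doctest in parse() (chunk attributes per the new range-free SlotChunk):

    >>> from snips_nlu.dataset.intent import IntentUtterance
    >>> u = IntentUtterance.parse(
    ...     "what is the weather in [weatherLocation:location](Paris)?")
    >>> u.text
    'what is the weather in Paris?'
    >>> [(c.name, c.entity, c.text) for c in u.slot_chunks]
    [('weatherLocation', 'location', 'Paris')]

Dropping the Range bookkeeping is possible because chunks now carry only
their text, and the full sentence is rebuilt by joining chunk texts.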
diff --git a/snips_nlu/tests/test_cli.py b/snips_nlu/tests/test_cli.py
index 57542d64b..d640ecb1c 100644
--- a/snips_nlu/tests/test_cli.py
+++ b/snips_nlu/tests/test_cli.py
@@ -7,10 +7,10 @@
 from snips_nlu import SnipsNLUEngine
 from snips_nlu.cli import cross_val_metrics, parse, train, train_test_metrics
 from snips_nlu.cli.dataset import AssistantDataset
-from snips_nlu.cli.dataset.entities import CustomEntity
-from snips_nlu.cli.dataset.intent_dataset import IntentDataset
 from snips_nlu.constants import PACKAGE_PATH
-from snips_nlu.dataset import validate_and_format_dataset
+from snips_nlu.dataset import (
+    Entity, EntityFormatError, Intent, IntentFormatError,
+    validate_and_format_dataset)
 from snips_nlu.tests.utils import BEVERAGE_DATASET_PATH, SnipsTest, TEST_PATH

@@ -81,7 +81,7 @@ def test_should_generate_intent_from_file(self):
         intent_file = examples_path / "intent_getWeather.txt"

         # When
-        intent_dataset = IntentDataset.from_file(intent_file)
+        intent_dataset = Intent.from_file(intent_file)
         intent_dict = intent_dataset.json

         # Then
@@ -156,7 +156,7 @@ def test_should_generate_entity_from_file(self):
         entity_file = examples_path / "entity_location.txt"

         # When
-        entity_dataset = CustomEntity.from_file(entity_file)
+        entity_dataset = Entity.from_file(entity_file)
         entity_dict = entity_dataset.json

         # Then
@@ -191,7 +191,7 @@ def test_should_generate_entity_from_file_with_autoextensible(self):
         entity_file = examples_path / "entity_location_autoextent_false.txt"

         # When
-        entity_dataset = CustomEntity.from_file(entity_file)
+        entity_dataset = Entity.from_file(entity_file)
         entity_dict = entity_dataset.json

         # Then
@@ -419,8 +419,8 @@ def test_should_fail_generating_intent_with_wrong_file_name(self):
         intent_file = examples_path / "getWeather.txt"

         # When / Then
-        with self.assertRaises(AssertionError):
-            IntentDataset.from_file(intent_file)
+        with self.assertRaises(IntentFormatError):
+            Intent.from_file(intent_file)

     def test_should_fail_generating_entity_with_wrong_file_name(self):
         # Given
@@ -428,5 +428,5 @@ def test_should_fail_generating_entity_with_wrong_file_name(self):
         entity_file = examples_path / "location.txt"

         # When / Then
-        with self.assertRaises(AssertionError):
-            CustomEntity.from_file(entity_file)
+        with self.assertRaises(EntityFormatError):
+            Entity.from_file(entity_file)
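With AssertionError replaced by dedicated exception types, callers can now
catch dataset format problems explicitly rather than trapping a generic
assertion. A small sketch of the resulting calling pattern:

    from snips_nlu.dataset import Intent, IntentFormatError

    try:
        # the filename lacks the required "intent_" prefix
        Intent.from_file("getWeather.txt")
    except IntentFormatError as e:
        print("invalid intent file:", e)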
From 6442404ff3f3545a5f52cb42564809ed4dbb1bef Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Thu, 15 Nov 2018 19:32:54 +0100
Subject: [PATCH 05/24] Add entity and intent loading from yaml files

---
 snips_nlu/__init__.py                         |   2 -
 snips_nlu/cli/dataset/__init__.py             |   1 -
 .../dataset/examples/intent_getWeather.txt    |   4 +-
 snips_nlu/cli/generate_dataset.py             |   2 +-
 snips_nlu/dataset/__init__.py                 |   1 +
 .../dataset.py}                               |   3 +-
 snips_nlu/dataset/entity.py                   |  10 +-
 snips_nlu/dataset/intent.py                   |  53 ++-
 snips_nlu/nlu_engine/__init__.py              |   2 +-
 snips_nlu/tests/test_cli.py                   | 361 ------------------
 snips_nlu/tests/test_dataset_loading.py       | 199 ++++++++++
 ..._dataset.py => test_dataset_validation.py} |   2 +-
 snips_nlu/tests/test_entity_loading.py        | 212 ++++++++++
 snips_nlu/tests/test_intent_loading.py        |  90 +++++
 14 files changed, 554 insertions(+), 388 deletions(-)
 delete mode 100644 snips_nlu/cli/dataset/__init__.py
 rename snips_nlu/{cli/dataset/assistant_dataset.py => dataset/dataset.py} (97%)
 create mode 100644 snips_nlu/tests/test_dataset_loading.py
 rename snips_nlu/tests/{test_dataset.py => test_dataset_validation.py} (99%)
 create mode 100644 snips_nlu/tests/test_entity_loading.py
 create mode 100644 snips_nlu/tests/test_intent_loading.py

diff --git a/snips_nlu/__init__.py b/snips_nlu/__init__.py
index e782bbfba..6414de42d 100644
--- a/snips_nlu/__init__.py
+++ b/snips_nlu/__init__.py
@@ -1,5 +1,3 @@
-import logging
-
 from snips_nlu_ontology import get_ontology_version

 from snips_nlu.__about__ import __model_version__, __version__

diff --git a/snips_nlu/cli/dataset/__init__.py b/snips_nlu/cli/dataset/__init__.py
deleted file mode 100644
index 3a8808233..000000000
--- a/snips_nlu/cli/dataset/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from snips_nlu.cli.dataset.assistant_dataset import AssistantDataset

diff --git a/snips_nlu/cli/dataset/examples/intent_getWeather.txt b/snips_nlu/cli/dataset/examples/intent_getWeather.txt
index bc611e565..960ce52bb 100644
--- a/snips_nlu/cli/dataset/examples/intent_getWeather.txt
+++ b/snips_nlu/cli/dataset/examples/intent_getWeather.txt
@@ -1,3 +1,3 @@
 what is the weather in [weatherLocation:location](Paris)?
-Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation:location](Moscow)?
-How is the weather in [weatherLocation:location](San Francisco) [weatherDate:snips/datetime](today)?
\ No newline at end of file
+Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation](Moscow)?
+How is the weather in [weatherLocation:location](San Francisco) [weatherDate] please?
\ No newline at end of file

diff --git a/snips_nlu/cli/generate_dataset.py b/snips_nlu/cli/generate_dataset.py
index ffb0cea89..ac923f0dd 100644
--- a/snips_nlu/cli/generate_dataset.py
+++ b/snips_nlu/cli/generate_dataset.py
@@ -4,7 +4,7 @@

 import plac

-from snips_nlu.cli.dataset.assistant_dataset import AssistantDataset
+from snips_nlu.dataset import AssistantDataset


 @plac.annotations(

diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py
index 89c0c9744..9dd099c85 100644
--- a/snips_nlu/dataset/__init__.py
+++ b/snips_nlu/dataset/__init__.py
@@ -1,3 +1,4 @@
+from snips_nlu.dataset.dataset import AssistantDataset
 from snips_nlu.dataset.entity import Entity, EntityFormatError
 from snips_nlu.dataset.intent import Intent, IntentFormatError
 from snips_nlu.dataset.utils import (

diff --git a/snips_nlu/cli/dataset/assistant_dataset.py b/snips_nlu/dataset/dataset.py
similarity index 97%
rename from snips_nlu/cli/dataset/assistant_dataset.py
rename to snips_nlu/dataset/dataset.py
index 7961c34bc..4630e53a2 100644
--- a/snips_nlu/cli/dataset/assistant_dataset.py
+++ b/snips_nlu/dataset/dataset.py
@@ -3,7 +3,8 @@

 from pathlib import Path

-from snips_nlu.dataset import Entity, Intent
+from snips_nlu.dataset.entity import Entity
+from snips_nlu.dataset.intent import Intent


 class AssistantDataset(object):

diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py
index 3668fc132..c3b60b87e 100644
--- a/snips_nlu/dataset/entity.py
+++ b/snips_nlu/dataset/entity.py
@@ -75,11 +75,11 @@ def from_yaml(cls, yaml_dict):
                     "found: %s" % type(entity_value))
             utterances.append(utterance)

-        cls(name=entity_name,
-            utterances=utterances,
-            automatically_extensible=auto_extensible,
-            use_synonyms=use_synonyms,
-            matching_strictness=matching_strictness)
+        return cls(name=entity_name,
+                   utterances=utterances,
+                   automatically_extensible=auto_extensible,
+                   use_synonyms=use_synonyms,
+                   matching_strictness=matching_strictness)
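A usage sketch of the YAML entity loading introduced here, with a YAML
document mirroring the fixtures in test_entity_loading.py below (the list
join is just a way to build the YAML source without indentation issues):

    import yaml

    from snips_nlu.dataset import Entity

    yaml_source = "\n".join([
        "type: entity",
        "name: location",
        "automatically_extensible: no",
        "values:",
        "- [new york, big apple]",
        "- london",
    ])
    entity = Entity.from_yaml(yaml.safe_load(yaml_source))
    print(entity.json["automatically_extensible"])  # False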
must not be empty") + dataset = cls(intent_name) with filepath.open(encoding="utf-8") as f: lines = iter(l.strip() for l in f if l.strip()) - return cls.from_iter(intent_name, lines) + dataset.add_utterances(lines) + return dataset - @classmethod - def from_iter(cls, intent_name, samples_iter): - """Generates a dataset from an iterator of samples""" - dataset = cls(intent_name) + def add_utterances(self, samples_iter): for sample in samples_iter: utterance = IntentUtterance.parse(sample) - dataset.add(utterance) - return dataset + self.add(utterance) def add(self, utterance): """Adds an :class:`.IntentUtterance` to the dataset""" @@ -168,7 +188,7 @@ def __init__(self, input): self.chunks = [] self.current = 0 - def add_slot(self, name, entity): + def add_slot(self, name, entity=None): """Adds a named slot Args: @@ -233,7 +253,12 @@ def capture_text(state): def capture_slot(state): next_pos = state.find(':') if next_pos < 0: - raise INTENT_FORMATTING_ERROR + next_pos = state.find(']') + if next_pos < 0: + raise INTENT_FORMATTING_ERROR + slot_name = state[:next_pos] + state.move(next_pos) + state.add_slot(slot_name) else: slot_name = state[:next_pos] state.move(next_pos) @@ -243,9 +268,11 @@ def capture_slot(state): entity = state[:next_pos] state.move(next_pos) state.add_slot(slot_name, entity) - if state.read() != '(': - raise INTENT_FORMATTING_ERROR + if state.peek() == '(': + state.read() capture_tagged(state) + else: + capture_text(state) def capture_tagged(state): diff --git a/snips_nlu/nlu_engine/__init__.py b/snips_nlu/nlu_engine/__init__.py index a4ec8db24..07c8ebc73 100644 --- a/snips_nlu/nlu_engine/__init__.py +++ b/snips_nlu/nlu_engine/__init__.py @@ -1 +1 @@ -from .nlu_engine import SnipsNLUEngine +from snips_nlu.nlu_engine.nlu_engine import SnipsNLUEngine diff --git a/snips_nlu/tests/test_cli.py b/snips_nlu/tests/test_cli.py index d640ecb1c..d55d2f87b 100644 --- a/snips_nlu/tests/test_cli.py +++ b/snips_nlu/tests/test_cli.py @@ -6,11 +6,6 @@ from snips_nlu import SnipsNLUEngine from snips_nlu.cli import cross_val_metrics, parse, train, train_test_metrics -from snips_nlu.cli.dataset import AssistantDataset -from snips_nlu.constants import PACKAGE_PATH -from snips_nlu.dataset import ( - Entity, EntityFormatError, Intent, IntentFormatError, - validate_and_format_dataset) from snips_nlu.tests.utils import BEVERAGE_DATASET_PATH, SnipsTest, TEST_PATH @@ -74,359 +69,3 @@ def test_train_test_metrics(self): # Then if not self.tmp_file_path.exists(): self.fail("No metrics found") - - def test_should_generate_intent_from_file(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file = examples_path / "intent_getWeather.txt" - - # When - intent_dataset = Intent.from_file(intent_file) - intent_dict = intent_dataset.json - - # Then - expected_intent_dict = { - "utterances": [ - { - "data": [ - { - "text": "what is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Paris" - }, - { - "text": "?" - } - ] - }, - { - "data": [ - { - "text": "Will it rain " - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "tomorrow" - }, - { - "text": " in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Moscow" - }, - { - "text": "?" 
diff --git a/snips_nlu/nlu_engine/__init__.py b/snips_nlu/nlu_engine/__init__.py
index a4ec8db24..07c8ebc73 100644
--- a/snips_nlu/nlu_engine/__init__.py
+++ b/snips_nlu/nlu_engine/__init__.py
@@ -1 +1 @@
-from .nlu_engine import SnipsNLUEngine
+from snips_nlu.nlu_engine.nlu_engine import SnipsNLUEngine

diff --git a/snips_nlu/tests/test_cli.py b/snips_nlu/tests/test_cli.py
index d640ecb1c..d55d2f87b 100644
--- a/snips_nlu/tests/test_cli.py
+++ b/snips_nlu/tests/test_cli.py
@@ -6,11 +6,6 @@
 from snips_nlu import SnipsNLUEngine
 from snips_nlu.cli import cross_val_metrics, parse, train, train_test_metrics
-from snips_nlu.cli.dataset import AssistantDataset
-from snips_nlu.constants import PACKAGE_PATH
-from snips_nlu.dataset import (
-    Entity, EntityFormatError, Intent, IntentFormatError,
-    validate_and_format_dataset)
 from snips_nlu.tests.utils import BEVERAGE_DATASET_PATH, SnipsTest, TEST_PATH

@@ -74,359 +69,3 @@ def test_train_test_metrics(self):
         # Then
         if not self.tmp_file_path.exists():
             self.fail("No metrics found")
-
-    def test_should_generate_intent_from_file(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        intent_file = examples_path / "intent_getWeather.txt"
-
-        # When
-        intent_dataset = Intent.from_file(intent_file)
-        intent_dict = intent_dataset.json
-
-        # Then
-        expected_intent_dict = {
-            "utterances": [
-                {"data": [
-                    {"text": "what is the weather in "},
-                    {"entity": "location", "slot_name": "weatherLocation",
-                     "text": "Paris"},
-                    {"text": "?"}]},
-                {"data": [
-                    {"text": "Will it rain "},
-                    {"entity": "snips/datetime", "slot_name": "weatherDate",
-                     "text": "tomorrow"},
-                    {"text": " in "},
-                    {"entity": "location", "slot_name": "weatherLocation",
-                     "text": "Moscow"},
-                    {"text": "?"}]},
-                {"data": [
-                    {"text": "How is the weather in "},
-                    {"entity": "location", "slot_name": "weatherLocation",
-                     "text": "San Francisco"},
-                    {"entity": "snips/datetime", "slot_name": "weatherDate",
-                     "text": "today"},
-                    {"text": "?"}]}
-            ]
-        }
-        self.assertDictEqual(expected_intent_dict, intent_dict)
-
-    def test_should_generate_entity_from_file(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        entity_file = examples_path / "entity_location.txt"
-
-        # When
-        entity_dataset = Entity.from_file(entity_file)
-        entity_dict = entity_dataset.json
-
-        # Then
-        expected_entity_dict = {
-            "automatically_extensible": True,
-            "data": [
-                {"synonyms": ["big apple"], "value": "new york"},
-                {"synonyms": ["city of lights"], "value": "paris"},
-                {"synonyms": [], "value": "london"}
-            ],
-            "use_synonyms": True,
-            "matching_strictness": 1.0
-        }
-        self.assertDictEqual(expected_entity_dict, entity_dict)
-
-    def test_should_generate_entity_from_file_with_autoextensible(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        entity_file = examples_path / "entity_location_autoextent_false.txt"
-
-        # When
-        entity_dataset = Entity.from_file(entity_file)
-        entity_dict = entity_dataset.json
-
-        # Then
-        expected_entity_dict = {
-            "automatically_extensible": False,
-            "data": [
-                {"synonyms": ["big apple"], "value": "new york"},
-                {"synonyms": ["city of lights"], "value": "paris"},
-                {"synonyms": [], "value": "london"}
-            ],
-            "use_synonyms": True,
-            "matching_strictness": 1.0
-        }
-        self.assertDictEqual(expected_entity_dict, entity_dict)
-
-    def test_should_generate_dataset_from_files(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        intent_file_1 = examples_path / "intent_whoIsGame.txt"
-        intent_file_2 = examples_path / "intent_getWeather.txt"
-        entity_file_1 = examples_path / "entity_location.txt"
-
-        dataset = AssistantDataset.from_files(
-            "en", [intent_file_1, intent_file_2, entity_file_1])
-        dataset_dict = dataset.json
-
-        # When / Then
-        expected_dataset_dict = {
-            "entities": {
-                "company": {"automatically_extensible": True, "data": [],
-                            "use_synonyms": True, "matching_strictness": 1.0},
-                "country": {"automatically_extensible": True, "data": [],
-                            "use_synonyms": True, "matching_strictness": 1.0},
-                "location": {
-                    "automatically_extensible": True,
-                    "data": [
-                        {"synonyms": ["big apple"], "value": "new york"},
-                        {"synonyms": ["city of lights"], "value": "paris"},
-                        {"synonyms": [], "value": "london"}
-                    ],
-                    "use_synonyms": True,
-                    "matching_strictness": 1.0
-                },
-                "role": {"automatically_extensible": True, "data": [],
-                         "use_synonyms": True, "matching_strictness": 1.0},
-                "snips/datetime": {}
-            },
-            "intents": {
-                "getWeather": {
-                    "utterances": [
-                        {"data": [
-                            {"text": "what is the weather in "},
-                            {"entity": "location",
-                             "slot_name": "weatherLocation", "text": "Paris"},
-                            {"text": "?"}]},
-                        {"data": [
-                            {"text": "Will it rain "},
-                            {"entity": "snips/datetime",
-                             "slot_name": "weatherDate", "text": "tomorrow"},
-                            {"text": " in "},
-                            {"entity": "location",
-                             "slot_name": "weatherLocation", "text": "Moscow"},
-                            {"text": "?"}]},
-                        {"data": [
-                            {"text": "How is the weather in "},
-                            {"entity": "location",
-                             "slot_name": "weatherLocation",
-                             "text": "San Francisco"},
-                            {"entity": "snips/datetime",
-                             "slot_name": "weatherDate", "text": "today"},
-                            {"text": "?"}]}
-                    ]
-                },
-                "whoIsGame": {
-                    "utterances": [
-                        {"data": [
-                            {"text": "who is the "},
-                            {"entity": "role", "slot_name": "role",
-                             "text": "president"},
-                            {"text": " of "},
-                            {"entity": "country", "slot_name": "country",
-                             "text": "France"}]},
-                        {"data": [
-                            {"text": "who is the "},
-                            {"entity": "role", "slot_name": "role",
-                             "text": "prime minister"},
-                            {"text": " of "},
-                            {"entity": "country", "slot_name": "country",
-                             "text": "UK"}]},
-                        {"data": [
-                            {"text": "who is the "},
-                            {"entity": "role", "slot_name": "role",
-                             "text": "CEO"},
-                            {"text": " of "},
-                            {"entity": "company", "slot_name": "company",
-                             "text": "Google"},
-                            {"text": " please"}]}
-                    ]
-                }
-            },
-            "language": "en"
-        }
-        validate_and_format_dataset(dataset_dict)
-        self.assertDictEqual(expected_dataset_dict, dataset_dict)
-
-    def test_should_fail_generating_intent_with_wrong_file_name(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        intent_file = examples_path / "getWeather.txt"
-
-        # When / Then
-        with self.assertRaises(IntentFormatError):
-            Intent.from_file(intent_file)
-
-    def test_should_fail_generating_entity_with_wrong_file_name(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        entity_file = examples_path / "location.txt"
-
-        # When / Then
-        with self.assertRaises(EntityFormatError):
-            Entity.from_file(entity_file)
diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py
new file mode 100644
index 000000000..8bde8550d
--- /dev/null
+++ b/snips_nlu/tests/test_dataset_loading.py
@@ -0,0 +1,199 @@
+from unittest import TestCase
+
+from snips_nlu.constants import PACKAGE_PATH
+from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset
+
+
+class TestDatasetLoading(TestCase):
+    def test_should_generate_dataset_from_files(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        intent_file_1 = examples_path / "intent_whoIsGame.txt"
+        intent_file_2 = examples_path / "intent_getWeather.txt"
+        entity_file_1 = examples_path / "entity_location.txt"
+
+        dataset = AssistantDataset.from_files(
+            "en", [intent_file_1, intent_file_2, entity_file_1])
+        dataset_dict = dataset.json
+
+        # When / Then
+        expected_dataset_dict = {
+            "entities": {
+                "company": {"automatically_extensible": True, "data": [],
+                            "use_synonyms": True, "matching_strictness": 1.0},
+                "country": {"automatically_extensible": True, "data": [],
+                            "use_synonyms": True, "matching_strictness": 1.0},
+                "location": {
+                    "automatically_extensible": True,
+                    "data": [
+                        {"synonyms": ["big apple"], "value": "new york"},
+                        {"synonyms": ["city of lights"], "value": "paris"},
+                        {"synonyms": [], "value": "london"}
+                    ],
+                    "use_synonyms": True,
+                    "matching_strictness": 1.0
+                },
+                "role": {"automatically_extensible": True, "data": [],
+                         "use_synonyms": True, "matching_strictness": 1.0},
+                "snips/datetime": {}
+            },
+            "intents": {
+                "getWeather": {
+                    "utterances": [
+                        {"data": [
+                            {"text": "what is the weather in "},
+                            {"entity": "location",
+                             "slot_name": "weatherLocation", "text": "Paris"},
+                            {"text": "?"}]},
+                        {"data": [
+                            {"text": "Will it rain "},
+                            {"entity": "snips/datetime",
+                             "slot_name": "weatherDate", "text": "tomorrow"},
+                            {"text": " in "},
+                            {"entity": "location",
+                             "slot_name": "weatherLocation", "text": "Moscow"},
+                            {"text": "?"}]},
+                        {"data": [
+                            {"text": "How is the weather in "},
+                            {"entity": "location",
+                             "slot_name": "weatherLocation",
+                             "text": "San Francisco"},
+                            {"entity": "snips/datetime",
+                             "slot_name": "weatherDate", "text": "today"},
+                            {"text": "?"}]}
+                    ]
+                },
+                "whoIsGame": {
+                    "utterances": [
+                        {"data": [
+                            {"text": "who is the "},
+                            {"entity": "role", "slot_name": "role",
+                             "text": "president"},
+                            {"text": " of "},
+                            {"entity": "country", "slot_name": "country",
+                             "text": "France"}]},
+                        {"data": [
+                            {"text": "who is the "},
+                            {"entity": "role", "slot_name": "role",
+                             "text": "prime minister"},
+                            {"text": " of "},
+                            {"entity": "country", "slot_name": "country",
+                             "text": "UK"}]},
+                        {"data": [
+                            {"text": "who is the "},
+                            {"entity": "role", "slot_name": "role",
+                             "text": "CEO"},
+                            {"text": " of "},
+                            {"entity": "company", "slot_name": "company",
+                             "text": "Google"},
+                            {"text": " please"}]}
+                    ]
+                }
+            },
+            "language": "en"
+        }
+        validate_and_format_dataset(dataset_dict)
+        self.assertDictEqual(expected_dataset_dict, dataset_dict)
"text": "Paris" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "tomorrow" + }, + { + "text": " in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "Moscow" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "How is the weather in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "San Francisco" + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "today" + }, + { + "text": "?" + } + ] + } + ] + }, + "whoIsGame": { + "utterances": [ + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "president" + }, + { + "text": " of " + }, + { + "entity": "country", + "slot_name": "country", + "text": "France" + } + ] + }, + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "prime minister" + }, + { + "text": " of " + }, + { + "entity": "country", + "slot_name": "country", + "text": "UK" + } + ] + }, + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "CEO" + }, + { + "text": " of " + }, + { + "entity": "company", + "slot_name": "company", + "text": "Google" + }, + { + "text": " please" + } + ] + } + ] + } + }, + "language": "en" + } + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(expected_dataset_dict, dataset_dict) diff --git a/snips_nlu/tests/test_dataset.py b/snips_nlu/tests/test_dataset_validation.py similarity index 99% rename from snips_nlu/tests/test_dataset.py rename to snips_nlu/tests/test_dataset_validation.py index 6f9e0aa10..1aa9f457d 100644 --- a/snips_nlu/tests/test_dataset.py +++ b/snips_nlu/tests/test_dataset_validation.py @@ -10,7 +10,7 @@ from snips_nlu.tests.utils import SnipsTest -class TestDataset(SnipsTest): +class TestDatasetValidation(SnipsTest): def test_missing_intent_key_should_raise_exception(self): # Given dataset = { diff --git a/snips_nlu/tests/test_entity_loading.py b/snips_nlu/tests/test_entity_loading.py new file mode 100644 index 000000000..d49b1a4c3 --- /dev/null +++ b/snips_nlu/tests/test_entity_loading.py @@ -0,0 +1,212 @@ +import io +from unittest import TestCase + +import yaml + +from snips_nlu.constants import PACKAGE_PATH +from snips_nlu.dataset import Entity, EntityFormatError + + +class TestEntityLoading(TestCase): + def test_from_yaml_file(self): + # Given + yaml_stream = io.StringIO(""" +# Location Entity +--- +type: entity +name: location +automatically_extensible: no +use_synonyms: yes +matching_strictness: 0.5 +values: +- [new york, big apple] +- [paris, city of lights] +- london + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + entity = Entity.from_yaml(yaml_dict) + entity_dict = entity.json + + # Then + expected_entity_dict = { + "automatically_extensible": False, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" + }, + { + "synonyms": [ + "city of lights" + ], + "value": "paris" + }, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True, + "matching_strictness": 0.5 + } + self.assertDictEqual(expected_entity_dict, entity_dict) + + def test_from_yaml_file_with_defaults(self): + # Given + yaml_stream = io.StringIO(""" +# Location Entity +--- +name: location +values: +- [new york, big apple] +- [paris, city of lights] +- london + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + entity = 
+        entity_dict = entity.json
+
+        # Then
+        expected_entity_dict = {
+            "automatically_extensible": True,
+            "data": [
+                {"synonyms": ["big apple"], "value": "new york"},
+                {"synonyms": ["city of lights"], "value": "paris"},
+                {"synonyms": [], "value": "london"}
+            ],
+            "use_synonyms": True,
+            "matching_strictness": 1.0
+        }
+        self.assertDictEqual(expected_entity_dict, entity_dict)
+
+    def test_fail_from_yaml_file_when_wrong_type(self):
+        # Given
+        yaml_stream = io.StringIO("""
+# Location Entity
+---
+type: intent
+name: location
+values:
+- [new york, big apple]
+- [paris, city of lights]
+- london
+        """)
+        yaml_dict = yaml.safe_load(yaml_stream)
+
+        # When / Then
+        with self.assertRaises(EntityFormatError):
+            Entity.from_yaml(yaml_dict)
+
+    def test_fail_from_yaml_file_when_no_name(self):
+        # Given
+        yaml_stream = io.StringIO("""
+# Location Entity
+---
+values:
+- [new york, big apple]
+- [paris, city of lights]
+- london
+        """)
+        yaml_dict = yaml.safe_load(yaml_stream)
+
+        # When / Then
+        with self.assertRaises(EntityFormatError):
+            Entity.from_yaml(yaml_dict)
+
+    def test_from_text_file(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        entity_file = examples_path / "entity_location.txt"
+
+        # When
+        entity = Entity.from_file(entity_file)
+        entity_dict = entity.json
+
+        # Then
+        expected_entity_dict = {
+            "automatically_extensible": True,
+            "data": [
+                {"synonyms": ["big apple"], "value": "new york"},
+                {"synonyms": ["city of lights"], "value": "paris"},
+                {"synonyms": [], "value": "london"}
+            ],
+            "use_synonyms": True,
+            "matching_strictness": 1.0
+        }
+        self.assertDictEqual(expected_entity_dict, entity_dict)
+
+    def test_from_file_with_autoextensible(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        entity_file = examples_path / "entity_location_autoextent_false.txt"
+
+        # When
+        entity_dataset = Entity.from_file(entity_file)
+        entity_dict = entity_dataset.json
+
+        # Then
+        expected_entity_dict = {
+            "automatically_extensible": False,
+            "data": [
+                {"synonyms": ["big apple"], "value": "new york"},
+                {"synonyms": ["city of lights"], "value": "paris"},
+                {"synonyms": [], "value": "london"}
+            ],
+            "use_synonyms": True,
+            "matching_strictness": 1.0
+        }
+        self.assertDictEqual(expected_entity_dict, entity_dict)
+
+    def test_should_fail_generating_entity_with_wrong_file_name(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        entity_file = examples_path / "location.txt"
+
+        # When / Then
+        with self.assertRaises(EntityFormatError):
+            Entity.from_file(entity_file)

diff --git a/snips_nlu/tests/test_intent_loading.py b/snips_nlu/tests/test_intent_loading.py
new file mode 100644
index 000000000..3e732edbf
--- /dev/null
+++ b/snips_nlu/tests/test_intent_loading.py
@@ -0,0 +1,90 @@
+from unittest import TestCase
+
+from snips_nlu.constants import PACKAGE_PATH
+from snips_nlu.dataset import Intent, IntentFormatError
+
+
+class TestIntentLoading(TestCase):
+    def test_should_generate_intent_from_text_file(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        intent_file = examples_path / "intent_getWeather.txt"
+
+        # When
+        intent_dataset = Intent.from_file(intent_file)
+        intent_dict = intent_dataset.json
"slot_name": "weatherLocation", + "text": "Paris" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "tomorrow" + }, + { + "text": " in " + }, + { + "entity": None, + "slot_name": "weatherLocation", + "text": "Moscow" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "How is the weather in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "San Francisco" + }, + { + "entity": None, + "slot_name": "weatherDate", + "text": None + }, + { + "text": " please?" + } + ] + } + ] + } + + self.assertDictEqual(expected_intent_dict, intent_dict) + + def test_should_fail_generating_intent_with_wrong_file_name(self): + # Given + examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" + intent_file = examples_path / "getWeather.txt" + + # When / Then + with self.assertRaises(IntentFormatError): + Intent.from_file(intent_file) From e97af48c033d53b18ce578b0448bc1f565717ceb Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Mon, 19 Nov 2018 17:29:39 +0100 Subject: [PATCH 06/24] Add dataset loading from yaml files --- setup.py | 1 + .../cli/dataset/examples/entity_location.txt | 3 - .../entity_location_autoextent_false.txt | 4 - .../dataset/examples/intent_getWeather.txt | 3 - .../cli/dataset/examples/intent_whoIsGame.txt | 3 - snips_nlu/dataset/dataset.py | 99 +++- snips_nlu/dataset/entity.py | 14 +- snips_nlu/dataset/intent.py | 90 ++-- snips_nlu/dataset/validation.py | 7 +- snips_nlu/tests/test_dataset_loading.py | 430 +++++++++++------- snips_nlu/tests/test_entity_loading.py | 48 +- snips_nlu/tests/test_intent_loading.py | 223 ++++++++- snips_nlu/tests/utils.py | 9 +- 13 files changed, 665 insertions(+), 269 deletions(-) delete mode 100644 snips_nlu/cli/dataset/examples/entity_location.txt delete mode 100644 snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt delete mode 100644 snips_nlu/cli/dataset/examples/intent_getWeather.txt delete mode 100644 snips_nlu/cli/dataset/examples/intent_whoIsGame.txt diff --git a/setup.py b/setup.py index 6459693c4..9829a11e2 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", "pathlib==1.0.1; python_version < '3.4'", + "pyaml>=17,<18" ] extras_require = { diff --git a/snips_nlu/cli/dataset/examples/entity_location.txt b/snips_nlu/cli/dataset/examples/entity_location.txt deleted file mode 100644 index c6453a733..000000000 --- a/snips_nlu/cli/dataset/examples/entity_location.txt +++ /dev/null @@ -1,3 +0,0 @@ -new york,big apple -paris,city of lights -london \ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt b/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt deleted file mode 100644 index 243c4d290..000000000 --- a/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt +++ /dev/null @@ -1,4 +0,0 @@ -# automatically_extensible=false -new york,big apple -paris,city of lights -london \ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/intent_getWeather.txt b/snips_nlu/cli/dataset/examples/intent_getWeather.txt deleted file mode 100644 index 960ce52bb..000000000 --- a/snips_nlu/cli/dataset/examples/intent_getWeather.txt +++ /dev/null @@ -1,3 +0,0 @@ -what is the weather in [weatherLocation:location](Paris)? -Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation](Moscow)? 
-How is the weather in [weatherLocation:location](San Francisco) [weatherDate] please? \ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt b/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt deleted file mode 100644 index 03f369d50..000000000 --- a/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt +++ /dev/null @@ -1,3 +0,0 @@ -who is the [role:role](president) of [country:country](France) -who is the [role:role](prime minister) of [country:country](UK) -who is the [role:role](CEO) of [company:company](Google) please diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index 4630e53a2..364e67364 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -1,29 +1,58 @@ # coding=utf-8 from __future__ import print_function, unicode_literals +import io +from itertools import cycle from pathlib import Path +import yaml +from snips_nlu_ontology import get_builtin_entity_examples + from snips_nlu.dataset.entity import Entity from snips_nlu.dataset.intent import Intent +class DatasetFormatError(TypeError): + pass + + class AssistantDataset(object): - """Dataset of an assistant + """Dataset used in the main NLU training API - Merges a list of :class:`.AssistantDataset` into a single dataset ready to - be used by Snips NLU + Consists of intents and entities data. This object can be built either from + text files (:meth:`.AssistantDataset.from_files`) or from YAML files + (:meth:`.AssistantDataset.from_yaml_files`). Attributes: - language (str): language of the assistant - intents_datasets (list of :class:`.IntentDataset`): data of the - assistant intents - entities (list of :class:`.Entity`): data of the assistant entities + language (str): language of the intents + intents (list of :class:`.Intent`): intents data + entities (list of :class:`.Entity`): entities data """ - def __init__(self, language, intent_datasets, entities): + def __init__(self, language, intents, entities): self.language = language - self.intents_datasets = intent_datasets + self.intents = intents self.entities = entities + self._add_missing_entities() + self._ensure_entity_values() + + @classmethod + def from_yaml_files(cls, language, filenames): + entities = [] + intents = [] + for filename in filenames: + with io.open(filename, encoding="utf8") as f: + for doc in yaml.safe_load_all(f): + doc_type = doc.get("type") + if doc_type == "entity": + entities.append(Entity.from_yaml(doc)) + elif doc_type == "intent": + intents.append(Intent.from_yaml(doc)) + else: + raise DatasetFormatError( + "Invalid 'type' value in YAML file '%s': '%s'" + % (filename, doc_type)) + return cls(language, intents, entities) @classmethod def from_files(cls, language, filenames): @@ -53,23 +82,57 @@ def from_files(cls, language, filenames): "'intent_' or 'entity_' but found: %s" % stem) - intents_datasets = [Intent.from_file(f) - for f in intent_filepaths] + intents = [Intent.from_file(f) for f in intent_filepaths] entities = [Entity.from_file(f) for f in entity_filepaths] - entity_names = set(e.name for e in entities) + return cls(language, intents, entities) + + def _add_missing_entities(self): + entity_names = set(e.name for e in self.entities) - # Add entities appearing only in the intents data - for intent_data in intents_datasets: - for entity_name in intent_data.entities_names: + # Add entities appearing only in the intents utterances + for intent in self.intents: + for entity_name in intent.entities_names: if entity_name not in entity_names: 
entity_names.add(entity_name)
-                    entities.append(Entity(name=entity_name))
-        return cls(language, intents_datasets, entities)
+                    self.entities.append(Entity(name=entity_name))
+
+    def _ensure_entity_values(self):
+        entities_values = {entity.name: self._get_entity_values(entity)
+                           for entity in self.entities}
+        for intent in self.intents:
+            for utterance in intent.utterances:
+                for chunk in utterance.slot_chunks:
+                    if chunk.text is not None:
+                        continue
+                    try:
+                        chunk.text = next(entities_values[chunk.entity])
+                    except StopIteration:
+                        raise DatasetFormatError(
+                            "At least one entity value must be provided for "
+                            "entity '%s'" % chunk.entity)
+        return self
+
+    def _get_entity_values(self, entity):
+        if entity.is_builtin:
+            return cycle(get_builtin_entity_examples(
+                entity.name, self.language))
+        values = [v for utterance in entity.utterances
+                  for v in utterance.variations]
+        values_set = set(values)
+        for intent in self.intents:
+            for utterance in intent.utterances:
+                for chunk in utterance.slot_chunks:
+                    if not chunk.text or chunk.entity != entity.name:
+                        continue
+                    if chunk.text not in values_set:
+                        values_set.add(chunk.text)
+                        values.append(chunk.text)
+        return cycle(values)
 
     @property
     def json(self):
         intents = {intent_data.intent_name: intent_data.json
-                   for intent_data in self.intents_datasets}
+                   for intent_data in self.intents}
         entities = {entity.name: entity.json for entity in self.entities}
         return dict(language=self.language, intents=intents,
                     entities=entities)
diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py
index c3b60b87e..1d4923fff 100644
--- a/snips_nlu/dataset/entity.py
+++ b/snips_nlu/dataset/entity.py
@@ -22,19 +22,19 @@ class EntityFormatError(TypeError):
 class Entity(object):
     """Entity of an :class:`.AssistantDataset`
 
-    This class can represents both a custom entity and a builtin entity
+    This class can represent both a custom and a builtin entity
 
     Attributes:
         name (str): name of the entity
         utterances (list of :class:`.EntityUtterance`): entity utterances
             (only for custom entities)
         automatically_extensible (bool): whether or not the entity can be
-            extended to values not present in the dataset (only for custom
+            extended to values not present in the data (only for custom
             entities)
         use_synonyms (bool): whether or not to map entity values using
            synonyms (only for custom entities)
         matching_strictness (float): controls the matching strictness of the
-            entity (only for custom entities)
+            entity (only for custom entities). Must be between 0.0 and 1.0.
""" def __init__(self, name, utterances=None, automatically_extensible=True, @@ -59,7 +59,7 @@ def from_yaml(cls, yaml_dict): raise EntityFormatError("Wrong type: '%s'" % object_type) entity_name = yaml_dict.get("name") if not entity_name: - raise EntityFormatError("No 'name' attribute found") + raise EntityFormatError("Missing 'name' attribute") auto_extensible = yaml_dict.get(AUTOMATICALLY_EXTENSIBLE, True) use_synonyms = yaml_dict.get(USE_SYNONYMS, True) matching_strictness = yaml_dict.get("matching_strictness", 1.0) @@ -100,6 +100,8 @@ def from_file(cls, filepath): reader = csv.reader(list(it)) autoextent = True for row in reader: + if not row or not row[0].strip(): + continue if six.PY2: row = [cell.decode("utf-8") for cell in row] value = row[0] @@ -143,6 +145,10 @@ def __init__(self, value, synonyms=None): synonyms = [] self.synonyms = synonyms + @property + def variations(self): + return [self.value] + self.synonyms + @property def json(self): return {VALUE: self.value, SYNONYMS: self.synonyms} diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index 2ed7f3957..650bf59e0 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -33,14 +33,17 @@ class Intent(object): Attributes: intent_name (str): name of the intent utterances (list of :class:`.IntentUtterance`): intent utterances + slot_mapping (dict): mapping between slot names and entities """ - def __init__(self, intent_name, slot_mapping=None): + def __init__(self, intent_name, utterances, slot_mapping=None): if slot_mapping is None: slot_mapping = dict() self.intent_name = intent_name - self.utterances = [] + self.utterances = utterances self.slot_mapping = slot_mapping + self._complete_slot_name_mapping() + self._ensure_entity_names() @classmethod def from_yaml(cls, yaml_dict): @@ -50,17 +53,16 @@ def from_yaml(cls, yaml_dict): raise IntentFormatError("Wrong type: '%s'" % object_type) intent_name = yaml_dict.get("name") if not intent_name: - raise IntentFormatError("No 'name' attribute found") + raise IntentFormatError("Missing 'name' attribute") slot_mapping = dict() for slot in yaml_dict.get("slots", []): slot_mapping[slot["name"]] = slot["entity"] - dataset = cls(intent_name, slot_mapping) - utterances = (u.strip() for u in yaml_dict["utterances"] if u.strip()) + utterances = [IntentUtterance.parse(u.strip()) + for u in yaml_dict["utterances"] if u.strip()] if not utterances: raise IntentFormatError( "Intent must contain at least one utterance") - dataset.add_utterances(utterances) - return dataset + return cls(intent_name, utterances, slot_mapping) @classmethod def from_file(cls, filepath): @@ -73,27 +75,30 @@ def from_file(cls, filepath): intent_name = stem[7:] if not intent_name: raise IntentFormatError("Intent name must not be empty") - dataset = cls(intent_name) with filepath.open(encoding="utf-8") as f: lines = iter(l.strip() for l in f if l.strip()) - dataset.add_utterances(lines) - return dataset - - def add_utterances(self, samples_iter): - for sample in samples_iter: - utterance = IntentUtterance.parse(sample) - self.add(utterance) - - def add(self, utterance): - """Adds an :class:`.IntentUtterance` to the dataset""" - for chunk in utterance.slot_chunks: - if chunk.name not in self.slot_mapping: - self.slot_mapping[chunk.name] = chunk.entity - self.utterances.append(utterance) + utterances = [IntentUtterance.parse(sample) for sample in lines] + return cls(intent_name, utterances) + + def _complete_slot_name_mapping(self): + for utterance in self.utterances: + for chunk in 
utterance.slot_chunks: + if chunk.entity and chunk.slot_name not in self.slot_mapping: + self.slot_mapping[chunk.slot_name] = chunk.entity + return self + + def _ensure_entity_names(self): + for utterance in self.utterances: + for chunk in utterance.slot_chunks: + if chunk.entity: + continue + chunk.entity = self.slot_mapping.get( + chunk.slot_name, chunk.slot_name) + return self @property def json(self): - """Intent dataset in json format""" + """Intent data in json format""" return { UTTERANCES: [ {DATA: [chunk.json for chunk in utterance.chunks]} @@ -103,7 +108,7 @@ def json(self): @property def entities_names(self): - """Set of entity names present in the intent dataset""" + """Set of entity names present in the intent utterances""" return set(chunk.entity for u in self.utterances for chunk in u.chunks if isinstance(chunk, SlotChunk)) @@ -138,7 +143,7 @@ def parse(cls, string): 2 >>> u.chunks[0].text 'president of ' - >>> u.chunks[1].name + >>> u.chunks[1].slot_name 'country' >>> u.chunks[1].entity 'default' @@ -160,14 +165,14 @@ def json(self): class SlotChunk(Chunk): def __init__(self, slot_name, entity, text): super(SlotChunk, self).__init__(text) - self.name = slot_name + self.slot_name = slot_name self.entity = entity @property def json(self): return { TEXT: self.text, - SLOT_NAME: self.name, + SLOT_NAME: self.slot_name, ENTITY: self.entity, } @@ -188,6 +193,10 @@ def __init__(self, input): self.chunks = [] self.current = 0 + @property + def end_of_input(self): + return self.current >= len(self.input) + def add_slot(self, name, entity=None): """Adds a named slot @@ -222,6 +231,8 @@ def move(self, pos): self.current = pos + 1 def peek(self): + if self.end_of_input: + return None return self[0] def read(self): @@ -251,22 +262,19 @@ def capture_text(state): def capture_slot(state): - next_pos = state.find(':') - if next_pos < 0: - next_pos = state.find(']') - if next_pos < 0: - raise INTENT_FORMATTING_ERROR - slot_name = state[:next_pos] - state.move(next_pos) + next_colon_pos = state.find(':') + next_square_bracket_pos = state.find(']') + if next_square_bracket_pos < 0: + raise INTENT_FORMATTING_ERROR + if next_colon_pos < 0 or next_square_bracket_pos < next_colon_pos: + slot_name = state[:next_square_bracket_pos] + state.move(next_square_bracket_pos) state.add_slot(slot_name) else: - slot_name = state[:next_pos] - state.move(next_pos) - next_pos = state.find(']') - if next_pos < 0: - raise INTENT_FORMATTING_ERROR - entity = state[:next_pos] - state.move(next_pos) + slot_name = state[:next_colon_pos] + state.move(next_colon_pos) + entity = state[:next_square_bracket_pos] + state.move(next_square_bracket_pos) state.add_slot(slot_name, entity) if state.peek() == '(': state.read() diff --git a/snips_nlu/dataset/validation.py b/snips_nlu/dataset/validation.py index ec2c135c2..5f5dc7e16 100644 --- a/snips_nlu/dataset/validation.py +++ b/snips_nlu/dataset/validation.py @@ -50,9 +50,10 @@ def validate_and_format_dataset(dataset): dataset[ENTITIES][entity_name] = \ _validate_and_format_builtin_entity(entity, uterrance_entities) else: - dataset[ENTITIES][ - entity_name] = _validate_and_format_custom_entity( - entity, uterrance_entities, language, builtin_entity_parser) + dataset[ENTITIES][entity_name] = \ + _validate_and_format_custom_entity( + entity, uterrance_entities, language, + builtin_entity_parser) dataset[VALIDATED] = True return dataset diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py index 8bde8550d..34bcbc967 100644 --- 
a/snips_nlu/tests/test_dataset_loading.py +++ b/snips_nlu/tests/test_dataset_loading.py @@ -1,199 +1,293 @@ -from unittest import TestCase +from __future__ import unicode_literals -from snips_nlu.constants import PACKAGE_PATH -from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset +import io +from unittest import TestCase +import mock +from mock import patch -class TestDatasetLoading(TestCase): - def test_should_generate_dataset_from_files(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file_1 = examples_path / "intent_whoIsGame.txt" - intent_file_2 = examples_path / "intent_getWeather.txt" - entity_file_1 = examples_path / "entity_location.txt" - - dataset = AssistantDataset.from_files( - "en", [intent_file_1, intent_file_2, entity_file_1]) - dataset_dict = dataset.json +from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset - # When / Then - expected_dataset_dict = { - "entities": { - "company": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, - }, - "country": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, +EXPECTED_DATASET_DICT = { + "entities": { + "company": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "country": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "location": { + "automatically_extensible": True, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" }, - "location": { - "automatically_extensible": True, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "role": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "snips/datetime": {} + }, + "intents": { + "getWeather": { + "utterances": [ + { "data": [ { - "synonyms": [ - "big apple" - ], - "value": "new york" + "text": "what is the weather in " }, { - "synonyms": [ - "city of lights" - ], - "value": "paris" + "entity": "location", + "slot_name": "weatherLocation", + "text": "Paris" }, { - "synonyms": [], - "value": "london" + "text": "?" } - ], - "use_synonyms": True, - "matching_strictness": 1.0, - }, - "role": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, + ] }, - "snips/datetime": {} - }, - "intents": { - "getWeather": { - "utterances": [ + { + "data": [ { - "data": [ - { - "text": "what is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Paris" - }, - { - "text": "?" - } - ] + "text": "is it raining in " }, { - "data": [ - { - "text": "Will it rain " - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "tomorrow" - }, - { - "text": " in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Moscow" - }, - { - "text": "?" - } - ] + "entity": "location", + "slot_name": "weatherLocation", + "text": "new york" }, { - "data": [ - { - "text": "How is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "San Francisco" - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "today" - }, - { - "text": "?" 
- } - ] + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "Today" + } + ] + } + ] + }, + "whoIsGame": { + "utterances": [ + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "president" + }, + { + "text": " of " + }, + { + "entity": "country", + "slot_name": "country", + "text": "France" } ] }, - "whoIsGame": { - "utterances": [ + { + "data": [ + { + "text": "who is the " + }, { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "president" - }, - { - "text": " of " - }, - { - "entity": "country", - "slot_name": "country", - "text": "France" - } - ] + "entity": "role", + "slot_name": "role", + "text": "CEO" }, { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "prime minister" - }, - { - "text": " of " - }, - { - "entity": "country", - "slot_name": "country", - "text": "UK" - } - ] + "text": " of " }, { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "CEO" - }, - { - "text": " of " - }, - { - "entity": "company", - "slot_name": "company", - "text": "Google" - }, - { - "text": " please" - } - ] + "entity": "company", + "slot_name": "company", + "text": "Google" + }, + { + "text": " please" } ] } - }, - "language": "en" + ] } + }, + "language": "en" +} + + +class TestDatasetLoading(TestCase): + @patch("snips_nlu.dataset.dataset.io") + def test_should_generate_dataset_from_yaml_files(self, mock_io): + # Given + intent_file_1 = "whoIsGame.yaml" + intent_file_2 = "getWeather.yaml" + entity_file_1 = "location.yaml" + + who_is_game_yaml = """ +# whoIsGame Intent +--- +type: intent +name: whoIsGame +utterances: + - who is the [role](president) of [country](France) + - who is the [role](CEO) of [company](Google) please + """ + + get_weather_yaml = """ +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [weatherLocation:location](Paris)? + - is it raining in [weatherLocation] [weatherDate:snips/datetime] + """ + + location_yaml = """ +# Location Entity +--- +type: entity +name: location +automatically_extensible: true +values: +- [new york, big apple] +- london + """ + + # pylint:disable=unused-argument + def mock_open(filename, **kwargs): + if filename == intent_file_1: + return io.StringIO(who_is_game_yaml) + if filename == intent_file_2: + return io.StringIO(get_weather_yaml) + if filename == entity_file_1: + return io.StringIO(location_yaml) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open + dataset_files = [intent_file_1, intent_file_2, entity_file_1] + + # When + dataset = AssistantDataset.from_yaml_files("en", dataset_files) + dataset_dict = dataset.json + + # Then + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) + + @mock.patch("snips_nlu.dataset.dataset.io") + def test_should_generate_dataset_from_merged_yaml_file(self, mock_io): + # Given + dataset_file = "dataset.yaml" + dataset_yaml = """ +# whoIsGame Intent +--- +type: intent +name: whoIsGame +utterances: + - who is the [role](president) of [country](France) + - who is the [role](CEO) of [company](Google) please + +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [weatherLocation:location](Paris)? 
+ - is it raining in [weatherLocation] [weatherDate:snips/datetime] + +# Location Entity +--- +type: entity +name: location +automatically_extensible: true +values: +- [new york, big apple] +- london + """ + + # pylint:disable=unused-argument + def mock_open(filename, **kwargs): + if filename == dataset_file: + return io.StringIO(dataset_yaml) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open + + # When + dataset = AssistantDataset.from_yaml_files("en", [dataset_file]) + dataset_dict = dataset.json + + # Then + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) + + def test_should_generate_dataset_from_files(self): + # Given + intent_file_1 = "intent_whoIsGame.txt" + intent_file_2 = "intent_getWeather.txt" + entity_file_1 = "entity_location.txt" + + who_is_game_txt = """ +who is the [role:role](president) of [country:country](France) +who is the [role:role](CEO) of [company:company](Google) please +""" + + get_weather_txt = """ +what is the weather in [weatherLocation:location](Paris)? +is it raining in [weatherLocation] [weatherDate:snips/datetime] +""" + + location_txt = """ +new york,big apple +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == intent_file_1: + return io.StringIO(who_is_game_txt) + if str(self_) == intent_file_2: + return io.StringIO(get_weather_txt) + if str(self_) == entity_file_1: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + + dataset_files = [intent_file_1, intent_file_2, entity_file_1] + + # When + with patch("pathlib.io") as mock_io: + mock_io.open.side_effect = mock_open + dataset = AssistantDataset.from_files("en", dataset_files) + dataset_dict = dataset.json + + # When / Then validate_and_format_dataset(dataset_dict) - self.assertDictEqual(expected_dataset_dict, dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) diff --git a/snips_nlu/tests/test_entity_loading.py b/snips_nlu/tests/test_entity_loading.py index d49b1a4c3..09105da00 100644 --- a/snips_nlu/tests/test_entity_loading.py +++ b/snips_nlu/tests/test_entity_loading.py @@ -1,9 +1,11 @@ +from __future__ import unicode_literals + import io from unittest import TestCase import yaml +from mock import patch -from snips_nlu.constants import PACKAGE_PATH from snips_nlu.dataset import Entity, EntityFormatError @@ -132,10 +134,24 @@ def test_fail_from_yaml_file_when_no_name(self): with self.assertRaises(EntityFormatError): Entity.from_yaml(yaml_dict) - def test_from_text_file(self): + @patch("pathlib.io") + def test_from_text_file(self, mock_io): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / "entity_location.txt" + entity_file = "entity_location.txt" + location_txt = """ +new york,big apple +paris,city of lights +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == entity_file: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + mock_io.open.side_effect = mock_open # When entity = Entity.from_file(entity_file) @@ -167,10 +183,25 @@ def test_from_text_file(self): } self.assertDictEqual(expected_entity_dict, entity_dict) - def test_from_file_with_autoextensible(self): + @patch("pathlib.io") + def test_from_file_with_autoextensible(self, mock_io): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / 
"entity_location_autoextent_false.txt" + entity_file = "entity_location.txt" + location_txt = """# automatically_extensible=false +new york,big apple +paris,city of lights +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == entity_file: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open # When entity_dataset = Entity.from_file(entity_file) @@ -204,8 +235,7 @@ def test_from_file_with_autoextensible(self): def test_should_fail_generating_entity_with_wrong_file_name(self): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / "location.txt" + entity_file = "location.txt" # When / Then with self.assertRaises(EntityFormatError): diff --git a/snips_nlu/tests/test_intent_loading.py b/snips_nlu/tests/test_intent_loading.py index 3e732edbf..1dfcc8282 100644 --- a/snips_nlu/tests/test_intent_loading.py +++ b/snips_nlu/tests/test_intent_loading.py @@ -1,14 +1,203 @@ +from __future__ import unicode_literals + +import io from unittest import TestCase -from snips_nlu.constants import PACKAGE_PATH +import yaml +from mock import patch + from snips_nlu.dataset import Intent, IntentFormatError class TestIntentLoading(TestCase): - def test_should_generate_intent_from_text_file(self): + def test_should_load_from_yaml_file(self): + # Given + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [weatherLocation:location](paris) ? + - "Will it rain [date:snips/datetime](tomorrow) in + [weatherLocation:location](london)?" + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": "paris", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": " ?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "text": "tomorrow", + "entity": "snips/datetime", + "slot_name": "date" + }, + { + "text": " in " + }, + { + "text": "london", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": "?" + } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + def test_should_load_from_yaml_file_using_slot_mapping(self): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file = examples_path / "intent_getWeather.txt" + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent +name: getWeather +slots: + - name: date + entity: snips/datetime + - name: weatherLocation + entity: location +utterances: + - what is the weather in [weatherLocation](paris) ? + - Will it rain [date] in [weatherLocation](london)? + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": "paris", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": " ?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "text": None, + "entity": "snips/datetime", + "slot_name": "date" + }, + { + "text": " in " + }, + { + "text": "london", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": "?" 
+ } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + def test_should_load_from_yaml_file_using_implicit_values(self): + # Given + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [location] ? + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": None, + "entity": "location", + "slot_name": "location" + }, + { + "text": " ?" + } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + @patch("pathlib.io") + def test_should_generate_intent_from_text_file(self, mock_io): + # Given + intent_file = "intent_getWeather.txt" + get_weather_txt = """ +what is the weather in [weatherLocation:location](Paris)? +Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation](Moscow)? +How is the weather in [weatherLocation:location] [weatherDate] please? +is it raining in [weatherLocation] [weatherDate:snips/datetime] + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == intent_file: + return io.StringIO(get_weather_txt) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open # When intent_dataset = Intent.from_file(intent_file) @@ -46,7 +235,7 @@ def test_should_generate_intent_from_text_file(self): "text": " in " }, { - "entity": None, + "entity": "location", "slot_name": "weatherLocation", "text": "Moscow" }, @@ -63,10 +252,10 @@ def test_should_generate_intent_from_text_file(self): { "entity": "location", "slot_name": "weatherLocation", - "text": "San Francisco" + "text": None }, { - "entity": None, + "entity": "snips/datetime", "slot_name": "weatherDate", "text": None }, @@ -74,6 +263,23 @@ def test_should_generate_intent_from_text_file(self): "text": " please?" 
} ] + }, + { + "data": [ + { + "text": "is it raining in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": None + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": None + } + ] } ] } @@ -82,8 +288,7 @@ def test_should_generate_intent_from_text_file(self): def test_should_fail_generating_intent_with_wrong_file_name(self): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file = examples_path / "getWeather.txt" + intent_file = "getWeather.txt" # When / Then with self.assertRaises(IntentFormatError): diff --git a/snips_nlu/tests/utils.py b/snips_nlu/tests/utils.py index d057ce4e1..7ea7c219f 100644 --- a/snips_nlu/tests/utils.py +++ b/snips_nlu/tests/utils.py @@ -14,10 +14,11 @@ from snips_nlu.utils import json_string, unicode_string TEST_PATH = Path(__file__).parent -SAMPLE_DATASET_PATH = TEST_PATH / "resources" / "sample_dataset.json" -BEVERAGE_DATASET_PATH = TEST_PATH / "resources" / "beverage_dataset.json" -WEATHER_DATASET_PATH = TEST_PATH / "resources" / "weather_dataset.json" -PERFORMANCE_DATASET_PATH = TEST_PATH / "resources" / "performance_dataset.json" +TEST_RESOURCES_PATH = TEST_PATH / "resources" +SAMPLE_DATASET_PATH = TEST_RESOURCES_PATH / "sample_dataset.json" +BEVERAGE_DATASET_PATH = TEST_RESOURCES_PATH / "beverage_dataset.json" +WEATHER_DATASET_PATH = TEST_RESOURCES_PATH / "weather_dataset.json" +PERFORMANCE_DATASET_PATH = TEST_RESOURCES_PATH / "performance_dataset.json" # pylint: disable=invalid-name From 6e57058acff63430e56b075bdba3736201746036 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Mon, 19 Nov 2018 17:56:24 +0100 Subject: [PATCH 07/24] Fix issue with yaml loaders --- snips_nlu/dataset/entity.py | 1 + snips_nlu/dataset/utils.py | 13 +++++++++++++ snips_nlu/utils.py | 20 ++------------------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py index 1d4923fff..08d8ed800 100644 --- a/snips_nlu/dataset/entity.py +++ b/snips_nlu/dataset/entity.py @@ -3,6 +3,7 @@ import csv import re +from builtins import str from pathlib import Path import six diff --git a/snips_nlu/dataset/utils.py b/snips_nlu/dataset/utils.py index 2047bf716..b4fc0c33b 100644 --- a/snips_nlu/dataset/utils.py +++ b/snips_nlu/dataset/utils.py @@ -1,10 +1,23 @@ +from __future__ import unicode_literals + from future.utils import iteritems, itervalues +from yaml import Loader, SafeLoader from snips_nlu.constants import ( DATA, ENTITIES, ENTITY, INTENTS, TEXT, UTTERANCES) from snips_nlu.entity_parser.builtin_entity_parser import is_gazetteer_entity +def construct_yaml_str(self, node): + # Override the default string handling function + # to always return unicode objects + return self.construct_scalar(node) + + +Loader.add_constructor("tag:yaml.org,2002:str", construct_yaml_str) +SafeLoader.add_constructor("tag:yaml.org,2002:str", construct_yaml_str) + + def extract_utterance_entities(dataset): entities_values = {ent_name: set() for ent_name in dataset[ENTITIES]} diff --git a/snips_nlu/utils.py b/snips_nlu/utils.py index 198722b7a..b12adfdbb 100644 --- a/snips_nlu/utils.py +++ b/snips_nlu/utils.py @@ -7,7 +7,7 @@ import os import shutil from builtins import bytes, object, str -from collections import Mapping, OrderedDict, namedtuple +from collections import OrderedDict from contextlib import contextmanager from datetime import datetime from functools import wraps @@ -24,6 +24,7 @@ REGEX_PUNCT = {'\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', 
'^', '$', '#', '&', '-', '~'} + # pylint: disable=invalid-name class abstractclassmethod(classmethod): @@ -97,12 +98,6 @@ def validate_keys(obj, keys, object_label=None): validate_key(obj, key, object_label) -def validate_range(rng): - if not isinstance(rng, (list, tuple)) or len(rng) != 2 or rng[0] > rng[1]: - raise ValueError("range must be a length 2 list or tuple and must be " - "valid") - - class LimitedSizeDict(OrderedDict): def __init__(self, *args, **kwds): if "size_limit" not in kwds: @@ -138,17 +133,6 @@ def __setitem__(self, key, value): super(UnupdatableDict, self).__setitem__(key, value) -def namedtuple_with_defaults(typename, field_names, default_values=()): - T = namedtuple(typename, field_names) # pylint: disable=C0103 - T.__new__.__defaults__ = (None,) * len(T._fields) - if isinstance(default_values, Mapping): - prototype = T(**default_values) - else: - prototype = T(*default_values) - T.__new__.__defaults__ = tuple(prototype) - return T - - def mkdir_p(path): """Reproduces the 'mkdir -p shell' command From f3d81c465aacefe9bfca6fcd36eed6e69c9bbc91 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 20 Nov 2018 16:38:23 +0100 Subject: [PATCH 08/24] Deprecate dataset text files format --- setup.py | 3 ++- snips_nlu/cli/generate_dataset.py | 5 ++++- snips_nlu/dataset/dataset.py | 5 +++++ snips_nlu/dataset/entity.py | 4 ++++ snips_nlu/dataset/intent.py | 4 ++++ snips_nlu/tests/test_dataset_loading.py | 2 ++ snips_nlu/tests/test_entity_loading.py | 4 ++++ snips_nlu/tests/test_intent_loading.py | 3 +++ 8 files changed, 28 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9829a11e2..fe523edad 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,8 @@ "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", "pathlib==1.0.1; python_version < '3.4'", - "pyaml>=17,<18" + "pyaml>=17,<18", + "deprecation>=2,<3" ] extras_require = { diff --git a/snips_nlu/cli/generate_dataset.py b/snips_nlu/cli/generate_dataset.py index ac923f0dd..faca6043d 100644 --- a/snips_nlu/cli/generate_dataset.py +++ b/snips_nlu/cli/generate_dataset.py @@ -13,5 +13,8 @@ "filename")) def generate_dataset(language, *files): """Create a Snips NLU dataset from text friendly files""" - dataset = AssistantDataset.from_files(language, list(files)) + if any(f.endswith(".yml") or f.endswith(".yaml") for f in files): + dataset = AssistantDataset.from_yaml_files(language, list(files)) + else: + dataset = AssistantDataset.from_files(language, list(files)) print(json.dumps(dataset.json, indent=2, sort_keys=True)) diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index 364e67364..d9cef2318 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -6,8 +6,10 @@ from pathlib import Path import yaml +from deprecation import deprecated from snips_nlu_ontology import get_builtin_entity_examples +from snips_nlu.__about__ import __version__ from snips_nlu.dataset.entity import Entity from snips_nlu.dataset.intent import Intent @@ -55,6 +57,9 @@ def from_yaml_files(cls, language, filenames): return cls(language, intents, entities) @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, + details="Use from_yaml_files instead") def from_files(cls, language, filenames): """Creates an :class:`.AssistantDataset` from a language and a list of intent and entity files diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py index 08d8ed800..04c9743ce 100644 --- a/snips_nlu/dataset/entity.py +++ b/snips_nlu/dataset/entity.py @@ -7,8 
+7,10 @@ from pathlib import Path import six +from deprecation import deprecated from snips_nlu_ontology import get_all_builtin_entities +from snips_nlu.__about__ import __version__ from snips_nlu.constants import ( AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS, SYNONYMS, USE_SYNONYMS, VALUE) @@ -83,6 +85,8 @@ def from_yaml(cls, yaml_dict): matching_strictness=matching_strictness) @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, details="Use from_yaml instead") def from_file(cls, filepath): filepath = Path(filepath) stem = filepath.stem diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index 650bf59e0..58dbaa679 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -4,8 +4,10 @@ from builtins import object from pathlib import Path +from deprecation import deprecated from future.utils import with_metaclass +from snips_nlu.__about__ import __version__ from snips_nlu.constants import DATA, ENTITY, SLOT_NAME, TEXT, UTTERANCES @@ -65,6 +67,8 @@ def from_yaml(cls, yaml_dict): return cls(intent_name, utterances, slot_mapping) @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, details="Use from_yaml instead") def from_file(cls, filepath): filepath = Path(filepath) stem = filepath.stem diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py index 34bcbc967..458fba82f 100644 --- a/snips_nlu/tests/test_dataset_loading.py +++ b/snips_nlu/tests/test_dataset_loading.py @@ -4,6 +4,7 @@ from unittest import TestCase import mock +from deprecation import fail_if_not_removed from mock import patch from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset @@ -247,6 +248,7 @@ def mock_open(filename, **kwargs): validate_and_format_dataset(dataset_dict) self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) + @fail_if_not_removed def test_should_generate_dataset_from_files(self): # Given intent_file_1 = "intent_whoIsGame.txt" diff --git a/snips_nlu/tests/test_entity_loading.py b/snips_nlu/tests/test_entity_loading.py index 09105da00..4e49c7553 100644 --- a/snips_nlu/tests/test_entity_loading.py +++ b/snips_nlu/tests/test_entity_loading.py @@ -4,6 +4,7 @@ from unittest import TestCase import yaml +from deprecation import fail_if_not_removed from mock import patch from snips_nlu.dataset import Entity, EntityFormatError @@ -135,6 +136,7 @@ def test_fail_from_yaml_file_when_no_name(self): Entity.from_yaml(yaml_dict) @patch("pathlib.io") + @fail_if_not_removed def test_from_text_file(self, mock_io): # Given entity_file = "entity_location.txt" @@ -184,6 +186,7 @@ def mock_open(self_, *args, **kwargs): self.assertDictEqual(expected_entity_dict, entity_dict) @patch("pathlib.io") + @fail_if_not_removed def test_from_file_with_autoextensible(self, mock_io): # Given entity_file = "entity_location.txt" @@ -233,6 +236,7 @@ def mock_open(self_, *args, **kwargs): } self.assertDictEqual(expected_entity_dict, entity_dict) + @fail_if_not_removed def test_should_fail_generating_entity_with_wrong_file_name(self): # Given entity_file = "location.txt" diff --git a/snips_nlu/tests/test_intent_loading.py b/snips_nlu/tests/test_intent_loading.py index 1dfcc8282..50332d18f 100644 --- a/snips_nlu/tests/test_intent_loading.py +++ b/snips_nlu/tests/test_intent_loading.py @@ -4,6 +4,7 @@ from unittest import TestCase import yaml +from deprecation import fail_if_not_removed from mock import patch from snips_nlu.dataset import 
Intent, IntentFormatError
@@ -179,6 +180,7 @@ def test_should_load_from_yaml_file_using_implicit_values(self):
         self.assertDictEqual(expected_intent_dict, intent_dict)
 
     @patch("pathlib.io")
+    @fail_if_not_removed
     def test_should_generate_intent_from_text_file(self, mock_io):
         # Given
         intent_file = "intent_getWeather.txt"
@@ -286,6 +288,7 @@ def mock_open(self_, *args, **kwargs):
 
         self.assertDictEqual(expected_intent_dict, intent_dict)
 
+    @fail_if_not_removed
     def test_should_fail_generating_intent_with_wrong_file_name(self):
         # Given
         intent_file = "getWeather.txt"

From a53a173f5c58312755aedd141123d7405452f582 Mon Sep 17 00:00:00 2001
From: Adrien Ball 
Date: Tue, 20 Nov 2018 19:27:33 +0100
Subject: [PATCH 09/24] Add dedicated documentation section on dataset format

---
 docs/source/dataset.rst    | 249 +++++++++++++++++++++++++++++++++++++
 docs/source/index.rst      |   3 +-
 docs/source/quickstart.rst |   4 +-
 docs/source/tutorial.rst   |  14 +--
 4 files changed, 260 insertions(+), 10 deletions(-)
 create mode 100644 docs/source/dataset.rst

diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst
new file mode 100644
index 000000000..826c1f638
--- /dev/null
+++ b/docs/source/dataset.rst
@@ -0,0 +1,249 @@
+.. _dataset:
+
+Training Dataset Format
+=======================
+
+The Snips NLU library leverages machine learning algorithms and some training
+data in order to produce a powerful intent recognition engine.
+
+The better your training data is, the more accurate your NLU engine will
+be. Thus, it is worth spending a bit of time to create a dataset that
+corresponds well to your use case.
+
+Snips NLU accepts two different dataset formats. The first one, which relies
+on YAML, is the preferred option if you want to create or edit a dataset
+manually.
+The other dataset format uses JSON and is better suited if you plan to
+create or edit datasets programmatically.
+
+YAML format
+-----------
+
+The YAML dataset format allows you to define intents and entities using the
+`YAML `_ syntax.
+
+------
+Entity
+------
+
+Here is what an entity file looks like:
+
+.. code-block:: yaml
+
+    # City Entity
+    ---
+    type: entity # allows entity files to be distinguished from intent files
+    name: city # name of the entity
+    values:
+      - london # single entity value
+      - [new york, big apple] # entity value with a synonym
+      - [paris, city of lights]
+
+You can specify entity values either using single YAML scalars (e.g.
+``london``) or using lists if you want to define some synonyms (e.g.
+``[paris, city of lights]``).
+
+Here is a more comprehensive example, which contains additional optional
+attributes:
+
+.. code-block:: yaml
+
+    # City Entity
+    ---
+    type: entity
+    name: city
+    automatically_extensible: false # default value is true
+    use_synonyms: false # default value is true
+    matching_strictness: 0.8 # default value is 1.0
+    values:
+      - london
+      - [new york, big apple]
+      - [paris, city of lights]
+
+------
+Intent
+------
+
+Here is the format used to describe an intent:
+
+.. code-block:: yaml
+
+    # searchFlight Intent
+    ---
+    type: intent
+    name: searchFlight # name of the intent
+    utterances:
+      - find me a flight from [origin:city](Paris) to [destination:city](New York)
+      - I need a flight leaving [date:snips/datetime](this weekend) to [destination:city](Berlin)
+      - show me flights to go to [destination:city](new york) leaving [date:snips/datetime](this evening)
+
+We use a standard markdown-like annotation syntax to annotate slots within
+utterances.
The ``[origin:city](Paris)`` chunk describes a slot with its three
+components:
+
+    - ``origin``: the slot name
+    - ``city``: the slot type
+    - ``Paris``: the slot value
+
+Note that different slot names can share the same slot type. This is the case
+for the ``origin`` and ``destination`` slot names in the previous example, which
+have the same slot type ``city``.
+
+If you plan to write more than just a few utterances, you can specify the
+slot mapping explicitly in the intent file and remove it from the
+utterances, which results in simpler annotations:
+
+.. code-block:: yaml
+
+    # searchFlight Intent
+    ---
+    type: intent
+    name: searchFlight # name of the intent
+    slots:
+      - name: origin
+        entity: city
+      - name: destination
+        entity: city
+      - name: date
+        entity: snips/datetime
+    utterances:
+      - find me a flight from [origin](Paris) to [destination](New York)
+      - I need a flight leaving [date](this weekend) to [destination](Berlin)
+      - show me flights to go to [destination](new york) leaving [date](this evening)
+
+-------
+Dataset
+-------
+
+You are free to organize the YAML documents as you want: you can have one
+YAML file for each intent and each entity, or gather several documents
+together (e.g. all entities, or all intents) in the same YAML file.
+Here is the YAML file corresponding to the previous ``city`` entity and
+``searchFlight`` intent, merged together:
+
+.. code-block:: yaml
+
+    # City Entity
+    ---
+    type: entity # allows entity files to be distinguished from intent files
+    name: city # name of the entity
+    values:
+      - london # single entity value
+      - [new york, big apple] # entity value with a synonym
+      - [paris, city of lights]
+
+    # searchFlight Intent
+    ---
+    type: intent
+    name: searchFlight # name of the intent
+    slots:
+      - name: origin
+        entity: city
+      - name: destination
+        entity: city
+      - name: date
+        entity: snips/datetime
+    utterances:
+      - find me a flight from [origin](Paris) to [destination](New York)
+      - I need a flight leaving [date](this weekend) to [destination](Berlin)
+      - show me flights to go to [destination](new york) leaving [date](this evening)
+
+Once your intents and entities are created using the YAML format described
+previously, you can produce a dataset using the
+:ref:`Command Line Interface (CLI) `:
+
+.. code-block:: console
+
+    snips-nlu generate-dataset en city.yaml searchFlight.yaml > dataset.json
+
+Or, alternatively, if you merged the YAML documents into a single file:
+
+.. code-block:: console
+
+    snips-nlu generate-dataset en dataset.yaml > dataset.json
+
+This will generate a JSON dataset and write it to the ``dataset.json`` file.
+The generated file uses the second supported format, which is described
+in the next section.
+
+JSON format
+-----------
+
+The JSON format is the one eventually consumed by the training API. It
+was designed to be easy to parse.
+
+We created a `sample dataset`_ that you can check to better understand the
+format.
+
+There are three attributes at the root of the JSON document:
+
+    - ``"language"``: the language of the dataset in :ref:`ISO format `
+    - ``"intents"``: a dictionary mapping between intent names and intent data
+    - ``"entities"``: a dictionary mapping between entity names and entity data
+
+Here is how the entities are represented in this format:
+
+.. 
code-block:: json + + { + "entities": { + "snips/datetime": {}, + "city": { + "data": [ + { + "value": "london", + "synonyms": [] + }, + { + "value": "new york", + "synonyms": [ + "big apple" + ] + }, + { + "value": "paris", + "synonyms": [ + "city of lights" + ] + } + ], + "use_synonyms": true, + "automatically_extensible": true, + "matching_strictness": 1.0 + } + } + } + +Note that the ``"snips/datetime"`` entity data is empty as it is a +:ref:`builtin entity `. + +The intent utterances are defined using the following format: + +.. code-block:: json + + { + "data": [ + { + "text": "find me a flight from " + }, + { + "text": "Paris", + "entity": "city", + "slot_name": "origin" + }, + { + "text": " to " + }, + { + "text": "New York", + "entity": "city", + "slot_name": "destination" + } + ] + } + +Once you have created a JSON dataset, either directly or with YAML files, you +can use it to train an NLU engine. To do so, you can use the CLI as documented +:ref:`here`, or the :ref:`python API `. + +.. _sample dataset: https://github.com/snipsco/snips-nlu/blob/master/snips_nlu_samples/sample_dataset.json \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 94dbbe1ab..5b976aedc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,9 +81,10 @@ the :ref:`api` documentation or alternatively check the `github repository`_. installation quickstart tutorial - cli + dataset data_model languages + cli api diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index e36dddbb6..cbb2b4115 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -43,7 +43,7 @@ resources used to improve performance with the :func:`.load_resources` function. nlu_engine = SnipsNLUEngine() Now that we have our engine object created, we need to feed it with our sample -dataset. In general, this action will require some *machine learning* hence we +dataset. In general, this action will require some *machine learning*, so we will actually *fit* the engine: .. code-block:: python @@ -52,7 +52,7 @@ will actually *fit* the engine: Our NLU engine is now trained to recognize new utterances that extend beyond -what is strictly contained in the dataset, it is able to *generalize*. +what is strictly contained in the dataset: it is able to *generalize*. Let's try to parse something now! diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 0ae3c17da..0585b0e57 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -4,22 +4,20 @@ Tutorial ======== In this section, we will build an NLU assistant for home automation tasks. It -will be able to understand queries about lights and thermostats. More precisely -our assistant will contain three :ref:`intents `: +will be able to understand queries about lights and thermostats. More +precisely, our assistant will contain three :ref:`intents `: - ``turnLightOn`` - ``turnLightOff`` - ``setTemperature`` The first two intents will be about turning on and off the lights in a specific -room. Thus, these intents will have one :ref:`slot` which will be the ``room``. -The third intent will let you control the temperature of a specific room, thus -it will have two slots: the ``roomTemperature`` and the ``room``. +room. These intents will have one :ref:`slot` which will be the ``room``. +The third intent will let you control the temperature of a specific room. It +will have two slots: the ``roomTemperature`` and the ``room``. The first step is to create an appropriate dataset for this task. 
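For readers who prefer to stay in Python rather than shell out to the CLI, the ``generate-dataset`` command added earlier in this series is a thin wrapper around the dataset API, so the same JSON can be produced programmatically. A minimal sketch, assuming a ``dataset.yaml`` file that follows the YAML format documented above (the file name is illustrative only):

.. code-block:: python

    import json

    from snips_nlu.dataset import AssistantDataset

    # "dataset.yaml" is a hypothetical file written in the YAML dataset
    # format; from_yaml_files accepts any number of intent and entity files.
    dataset = AssistantDataset.from_yaml_files("en", ["dataset.yaml"])

    # The .json property holds the JSON-format dict consumed by the training
    # API, mirroring the output of `snips-nlu generate-dataset`.
    with open("dataset.json", "w") as f:
        f.write(json.dumps(dataset.json, indent=2, sort_keys=True))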
-.. _dataset: - Snips dataset format -------------------- @@ -251,6 +249,8 @@ That will raise a ``NotTrained`` error, as we did not train the engine with the dataset that we created. +.. _training_the_engine: + Training the engine ------------------- From da950258a4295f216276e26d6d3ef0a932e3e4bc Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 12:18:33 +0100 Subject: [PATCH 10/24] Update documentation with new YAML format --- docs/source/cli.rst | 70 +++-------- docs/source/dataset.rst | 4 +- docs/source/tutorial.rst | 251 +++++++++++++-------------------------- 3 files changed, 101 insertions(+), 224 deletions(-) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index a5334df0c..c031f7600 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -14,70 +14,30 @@ is typically used by running ``snips-nlu [args]`` or alternatively Creating a dataset ------------------ -As seen in the :ref:`tutorial` section, a command allows you to generate a -dataset from a :ref:`language ` and a list of text files describing -:ref:`intents ` and :ref:`entities `: +As seen in the :ref:`tutorial ` section, a command allows you to generate a +dataset from a :ref:`language ` and a list of YAML files containing +data for :ref:`intents ` and :ref:`entities `: .. code-block:: bash - snips-nlu generate-dataset en intent_1.txt intent_2.txt entity_1.txt + snips-nlu generate-dataset en my_first_intent.yaml my_second_intent.yaml my_entity.yaml -This will print a Json string to the standard output. If you want to store the -dataset directly in a Json file, you just have to pipe the previous command like -below: - -.. code-block:: bash - - snips-nlu generate-dataset en intent_1.txt intent_2.txt entity_1.txt > dataset.json - - -Each intent file corresponds to a single intent, and the name of the file must -start with ``intent_``. The same is true for entity files, which must start -with ``entity_``. - -An intent file is a text file in which each row corresponds to an utterance. -Slots, along with their corresponding slot type (entity), can be defined using -the following syntax: - -.. code-block:: console +.. note:: - Find me a flight from [departure:city](Paris) to [destination:city](London) - Find me a flight from [departure:city](Moscow) [departureDate:snips/datetime](tomorrow around 9pm) + You don't have to use separated files for each intent and entity. You could + for instance merge all intents together in a single ``intents.yaml`` file, + or even merge all intents and entities in a single ``dataset.yaml`` file. -In this example, there are three different slots -- ``departure``, -``destination`` and ``departureDate`` -- and two different entities -- ``city`` -and ``snips/datetime`` (which is a :ref:`builtin entity `). -Check :ref:`this section ` to have more details about the -difference between slots and entities. - -An entity file is a comma separated text file in which each row corresponds to -an entity value, optionally followed with its :ref:`synonyms `. The syntax used -is the following: - -.. code-block:: console - - bedroom - garden,yard,backyard - -Here, the entity (room) has two values which are ``"bedroom"`` and ``"garden"``. -Two synonyms, ``"yard"`` and ``"backyard"``, are defined for ``"garden"``. -If a value or a synonym contains a comma, the value must be put between -double quotes ``"``. - -If the value contains double quotes, it must be doubled -to be escaped like this: ``"A value with a "","" in it"`` which corresponds -to the actual value ``A value with a "," in it``. 
- -.. Note:: +This will print a JSON string to the standard output. If you want to store the +dataset directly in a JSON file, you just have to pipe the previous command like +below: - By default entities are generated as :ref:`automatically extensible `, - i.e. the recognition will accept additional values than the ones listed in - the entity file. This behavior can be changed by adding at the beginning of - the entity file the following: +.. code-block:: bash - .. code-block:: bash + snips-nlu generate-dataset en my_first_intent.yaml my_second_intent.yaml my_entity.yaml > dataset.json - # automatically_extensible=false +Check the :ref:`Training Dataset Format ` section for more details +about the format used to describe the training data. .. _training_cli: diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index 826c1f638..1e3628884 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -16,6 +16,8 @@ manually. The other dataset format uses JSON and should rather be used if you plan to create or edit datasets programmatically. +.. _yaml_format: + YAML format ----------- @@ -244,6 +246,6 @@ The intent utterances are defined using the following format: Once you have created a JSON dataset, either directly or with YAML files, you can use it to train an NLU engine. To do so, you can use the CLI as documented -:ref:`here`, or the :ref:`python API `. +:ref:`here `, or the :ref:`python API `. .. _sample dataset: https://github.com/snipsco/snips-nlu/blob/master/snips_nlu_samples/sample_dataset.json \ No newline at end of file diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 0585b0e57..72b6576b2 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -18,183 +18,98 @@ will have two slots: the ``roomTemperature`` and the ``room``. The first step is to create an appropriate dataset for this task. -Snips dataset format --------------------- - -The format used by Snips to describe the input data is designed to be simple to -parse as well as easy to read. - -We created a `sample dataset`_ that you can check to better understand the -format. - -You have three options to create your dataset. You can build it manually by -respecting the format used in the sample, you can also use the -:ref:`dataset creation CLI ` included in the lib, or alternatively -you can use `chatito`_ a DSL tool for dataset generation. - -We will go for the second option here and start by creating three files -corresponding to our three intents and one entity file corresponding to the -``room`` entity: - -- ``intent_turnLightOn.txt`` -- ``intent_turnLightOff.txt`` -- ``intent_setTemperature.txt`` -- ``entity_room.txt`` - -The name of each file is important as the tool will map it to the intent or -entity name. In particular, the prefixes ``intent_`` and ``entity_`` are -required in order to distinguish intents from entity files. - -Let's add training examples for the first intent by inserting the following -lines in the first file, ``intent_turnLightOn.txt``: - -.. code-block:: console - - Turn on the lights in the [room:room](kitchen) - give me some light in the [room:room](bathroom) please - Can you light up the [room:room](living room) ? - switch the [room:room](bedroom)'s lights on please - -We use a standard markdown-like annotation syntax to annotate slots within -utterances. The ``[room:room]`` chunks describe the slot with its two -components: :ref:`the slot name and the entity `. In our -case we used the same value, ``room``, to describe both. 
The parts with
-parenthesis, like ``(kitchen)``, correspond to the text value of the slot.
-
-Let's move on to the second intent, and insert this into
-``intent_turnLightOff.txt``:
-
-.. code-block:: console
-
-    Turn off the lights in the [room:room](entrance)
-    turn the [room:room](bathroom)'s light out please
-    switch off the light the [room:room](kitchen), will you?
-    Switch the [room:room](bedroom)'s lights off please
-
-And now the last file, ``intent_setTemperature.txt``:
-
-.. code-block:: console
-
-    Set the temperature to [roomTemperature:snips/temperature](19 degrees) in the [room:room](bedroom)
-    please set the [room:room](living room)'s temperature to [roomTemperature:snips/temperature](twenty two degrees celsius)
-    I want [roomTemperature:snips/temperature](75 degrees fahrenheit) in the [room:room](bathroom) please
-    Can you increase the temperature to [roomTemperature:snips/temperature](22 degrees) ?
-
-As you can see here, we used a new slot, ``[room_temperature:snips/temperature]``,
-whose name is ``roomTemperature`` and whose type is ``snips/temperature``. The slot
-type used here is a :ref:`builtin entity `. It
-allows you to resolve the temperature values properly.
-
-Let's move to the ``entity_room.txt`` entity file:
-
-.. code-block:: console
-
-    bedroom
-    living room,main room
-    garden,yard,backyard
-
-The entity file is a comma (``,``) separated file. Each line corresponds to an
-entity value followed by its potential :ref:`synonyms `.
-
-We are now ready to generate our dataset:
+Training Data
+-------------
+
+Check the :ref:`Training Dataset Format ` section for more details
+about the format used to describe the training data.
+
+In this tutorial, we will create our dataset using the
+:ref:`YAML format `, and create a ``dataset.yaml`` file with the
+following content:
+
+.. code-block:: yaml
+
+    # turnLightOn intent
+    ---
+    type: intent
+    name: turnLightOn
+    slots:
+      - name: room
+        entity: room
+    utterances:
+      - Turn on the lights in the [room](kitchen)
+      - give me some light in the [room](bathroom) please
+      - Can you light up the [room](living room) ?
+      - switch the [room](bedroom)'s lights on please
+
+    # turnLightOff intent
+    ---
+    type: intent
+    name: turnLightOff
+    slots:
+      - name: room
+        entity: room
+    utterances:
+      - Turn off the lights in the [room](entrance)
+      - turn the [room](bathroom)'s light out please
+      - switch off the light the [room](kitchen), will you?
+      - Switch the [room](bedroom)'s lights off please
+
+    # setTemperature intent
+    ---
+    type: intent
+    name: setTemperature
+    slots:
+      - name: room
+        entity: room
+      - name: roomTemperature
+        entity: snips/temperature
+    utterances:
+      - Set the temperature to [roomTemperature](19 degrees) in the [room](bedroom)
+      - please set the [room](living room)'s temperature to [roomTemperature](twenty two degrees celsius)
+      - I want [roomTemperature](75 degrees fahrenheit) in the [room](bathroom) please
+      - Can you increase the temperature to [roomTemperature](22 degrees) ?
+
+    # room entity
+    ---
+    type: entity
+    name: room
+    automatically_extensible: no
+    values:
+      - bedroom
+      - [living room, main room, lounge]
+      - [garden, yard, backyard]
+
+Here, we put all the intents and entities in the same file, but we could have
+split them into dedicated files as well.
+
+The ``setTemperature`` intent references a ``roomTemperature`` slot which
+relies on the ``snips/temperature`` entity. This entity is a
+:ref:`builtin entity `. It allows the
+temperature values to be resolved properly.
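+
+As an illustration of what this resolution provides, here is roughly the kind
+of slot that will be output for an utterance like "Set the temperature to 19
+degrees in the bedroom" once the engine is trained. This is only a sketch: the
+exact field names (``rawValue``, ``slotName``, ``kind``) and values shown here
+are indicative and may differ depending on your version of the engine:
+
+.. code-block:: json
+
+    {
+      "rawValue": "19 degrees",
+      "value": {
+        "kind": "Temperature",
+        "value": 19.0,
+        "unit": "degree"
+      },
+      "entity": "snips/temperature",
+      "slotName": "roomTemperature"
+    }
+
+The raw text chunk is kept in ``rawValue``, while ``value`` contains the
+resolved temperature, with the amount and the unit parsed out of the text.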
+ +The ``room`` entity makes use of :ref:`synonyms ` by defining lists +like ``[living room, main room, lounge]``. In this case, ``main room`` and +``lounge`` will point to ``living room``, the first item of the list, which is +the reference value. + +Besides, this entity is marked as not +:ref:`automatically extensible ` which means that the NLU +will only output values that we have defined and will not try to match other +values. + +We are now ready to generate our dataset using the :ref:`CLI `: .. code-block:: bash - snips-nlu generate-dataset en intent_turnLightOn.txt intent_turnLightOff.txt intent_setTemperature.txt entity_room.txt > dataset.json + snips-nlu generate-dataset en dataset.yaml > dataset.json .. note:: We used ``en`` as the language here but other languages are supported, please check the :ref:`languages` section to know more. -Now, the ``"entities"`` part of the generated json looks like that: - -.. code-block:: json - - { - "entities": { - "room": { - "automatically_extensible": true, - "data": [ - { - "synonyms": [], - "value": "bedroom" - }, - { - "synonyms": [ - "main room" - ], - "value": "living room" - }, - { - "synonyms": [ - "yard", - "backyard" - ], - "value": "garden" - } - ], - "matching_strictness": 1.0, - "use_synonyms": true - }, - "snips/temperature": {} - } - } - -You can see that both entities from the intent utterances and from the ``room`` -entity file were added. - -By default, the ``room`` entity is set to be -:ref:`automatically extensible ` but in our case we don't want -to handle any entity value that would not be part of the dataset, so we set -this attribute to ``false``. -Moreover, we are going to add some rooms that were not in the previous sentences -and that we want our assistant to cover. Additionally, we add some -:ref:`synonyms `. Finally, the entities part looks like that: - -.. code-block:: json - - { - "entities": { - "room": { - "automatically_extensible": false, - "data": [ - { - "synonyms": [], - "value": "bathroom" - }, - { - "synonyms": [ - "sleeping room" - ], - "value": "bedroom" - }, - { - "synonyms": [ - "main room", - "lounge" - ], - "value": "living room" - }, - { - "synonyms": [ - "yard", - "backyard" - ], - "value": "garden" - } - ], - "matching_strictness": 1.0, - "use_synonyms": true - }, - "snips/temperature": {} - } - } - - -We don't need to edit the ``snips/temperature`` entity as it is a builtin -entity. - Now that we have our dataset ready, let's move to the next step which is to create an NLU engine. From 0b898c2b7de1dcc79d101380f8b8adcf721aa9d7 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 14:35:30 +0100 Subject: [PATCH 11/24] Add documentation about implicit values in YAML format --- docs/source/dataset.rst | 105 +++++++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 22 deletions(-) diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index 1e3628884..f20a65d29 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -18,13 +18,13 @@ create or edit datasets programmatically. .. _yaml_format: +=========== YAML format ------------ +=========== The YAML dataset format allows you to define intents and entities using the `YAML `_ syntax. 
------- Entity ------ @@ -37,9 +37,9 @@ Here is what an entity file looks like: type: entity # allows to differentiate between entities and intents files name: city # name of the entity values: - - london # single entity value - - [new york, big apple] # entity value with a synonym - - [paris, city of lights] + - london # single entity value + - [new york, big apple] # entity value with a synonym + - [paris, city of lights] You can specify entity values either using single YAML scalars (e.g. ``london``), or using lists if you want to define some synonyms (e.g. @@ -58,11 +58,10 @@ are optional: use_synonyms: false # default value is true matching_strictness: 0.8 # default value is 1.0 values: - - london - - [new york, big apple] - - [paris, city of lights] + - london + - [new york, big apple] + - [paris, city of lights] ------- Intent ------ @@ -100,7 +99,7 @@ utterances. This will result in simpler annotations: # searchFlight Intent --- type: intent - name: searchFlight # name of the intent + name: searchFlight slots: - name: origin entity: city @@ -113,7 +112,6 @@ utterances. This will result in simpler annotations: - I need a flight leaving [date](this weekend) to [destination](Berlin) - show me flights to go to [arrival](new york) leaving [date](this evening) -------- Dataset ------- @@ -125,19 +123,47 @@ Here is the yaml file corresponding to the previous ``city`` entity and .. code-block:: yaml + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [arrival](new york) leaving [date](this evening) + # City Entity --- - type: entity # allows to differentiate between entities and intents files - name: city # name of the entity + type: entity + name: city values: - - london # single entity value - - [new york, big apple] # entity value with a synonym - - [paris, city of lights] + - london + - [new york, big apple] + - [paris, city of lights] + +--------------------------------------- +Implicit entity values and slot mapping +--------------------------------------- + +In order to make the annotation process even easier, there is a mechanism that +allows to populate entity values automatically based on the entity values that +are already provided. + +This results in a much simpler dataset file: + +.. code-block:: yaml # searchFlight Intent --- type: intent - name: searchFlight # name of the intent + name: searchFlight slots: - name: origin entity: city @@ -146,9 +172,41 @@ Here is the yaml file corresponding to the previous ``city`` entity and - name: date entity: snips/datetime utterances: - - find me a flight from [origin](Paris) to [destination](New York) - - I need a flight leaving [date](this weekend) to [destination](Berlin) - - show me flights to go to [arrival](new york) leaving [date](this evening) + - find me a flight from [origin] to [destination] + - I need a flight leaving [date] to [destination] + - show me flights to go to [arrival] leaving [date] + + # City Entity + --- + type: entity + name: city + values: + - london + - [new york, big apple] + - [paris, city of lights] + +For this to work, you need to provide at least one value for each +*custom entity*. 
This can be done either through an entity file, or simply by +providing an entity value in one of the annotated utterances. +Entity values are automatically generated for *builtin entities*. + +Here is a final example of a valid YAML dataset leveraging implicit entity +values as well as implicit slot mapping: + +.. code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + utterances: + - find me a flight from [origin:city](Paris) to [destination:city] + - I need a flight leaving [date:snips/datetime] to [destination] + - show me flights to go to [arrival] leaving [date] + +Note that the city entity was not provided here, but one value (``Paris``) was +provided in the first annotated utterance. The mapping between slot name and +entity is also inferred from the first two utterances. Once your intents and entities are created using the YAML format described previously, you can produce a dataset using the @@ -166,10 +224,13 @@ Or alternatively if you merged the yaml documents into a single file: This will generate a JSON dataset and write it in the ``dataset.json`` file. The format of the generated file is the second allowed format that is described -in the next section. +in the :ref:`JSON format ` section. + +.. _json_format: +=========== JSON format ------------ +=========== The JSON format is the format which is eventually used by the training API. It was designed to be easy to parse. From 32be2bfbc93f53f02d08b407d5b9d87a769dbf6f Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 14:49:14 +0100 Subject: [PATCH 12/24] Fix yaml examples --- docs/source/dataset.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index f20a65d29..16743a77c 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -76,7 +76,7 @@ Here is the format used to describe an intent: utterances: - find me a flight from [origin:city](Paris) to [destination:city](New York) - I need a flight leaving [date:snips/datetime](this weekend) to [destination:city](Berlin) - - show me flights to go to [arrival:city](new york) leaving [date:snips/datetime](this evening) + - show me flights to go to [destination:city](new york) leaving [date:snips/datetime](this evening) We use a standard markdown-like annotation syntax to annotate slots within utterances. The ``[origin:city](Paris)`` chunk describes a slot with its three @@ -110,7 +110,7 @@ utterances. 
This will result in simpler annotations: utterances: - find me a flight from [origin](Paris) to [destination](New York) - I need a flight leaving [date](this weekend) to [destination](Berlin) - - show me flights to go to [arrival](new york) leaving [date](this evening) + - show me flights to go to [destination](new york) leaving [date](this evening) Dataset ------- @@ -137,7 +137,7 @@ Here is the yaml file corresponding to the previous ``city`` entity and utterances: - find me a flight from [origin](Paris) to [destination](New York) - I need a flight leaving [date](this weekend) to [destination](Berlin) - - show me flights to go to [arrival](new york) leaving [date](this evening) + - show me flights to go to [destination](new york) leaving [date](this evening) # City Entity --- @@ -174,7 +174,7 @@ This results in a much simpler dataset file: utterances: - find me a flight from [origin] to [destination] - I need a flight leaving [date] to [destination] - - show me flights to go to [arrival] leaving [date] + - show me flights to go to [destination] leaving [date] # City Entity --- @@ -202,7 +202,7 @@ values as well as implicit slot mapping: utterances: - find me a flight from [origin:city](Paris) to [destination:city] - I need a flight leaving [date:snips/datetime] to [destination] - - show me flights to go to [arrival] leaving [date] + - show me flights to go to [destination] leaving [date] Note that the city entity was not provided here, but one value (``Paris``) was provided in the first annotated utterance. The mapping between slot name and From 04407156b484319699dbec9b8d8dea15721c7ecc Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 15:28:04 +0100 Subject: [PATCH 13/24] Rename AssistantDataset to Dataset --- snips_nlu/cli/generate_dataset.py | 6 +++--- snips_nlu/dataset/__init__.py | 2 +- snips_nlu/dataset/dataset.py | 10 +++++----- snips_nlu/dataset/entity.py | 2 +- snips_nlu/dataset/intent.py | 12 +----------- snips_nlu/tests/test_dataset_loading.py | 8 ++++---- 6 files changed, 15 insertions(+), 25 deletions(-) diff --git a/snips_nlu/cli/generate_dataset.py b/snips_nlu/cli/generate_dataset.py index faca6043d..a62d7cba0 100644 --- a/snips_nlu/cli/generate_dataset.py +++ b/snips_nlu/cli/generate_dataset.py @@ -4,7 +4,7 @@ import plac -from snips_nlu.dataset import AssistantDataset +from snips_nlu.dataset import Dataset @plac.annotations( @@ -14,7 +14,7 @@ def generate_dataset(language, *files): """Create a Snips NLU dataset from text friendly files""" if any(f.endswith(".yml") or f.endswith(".yaml") for f in files): - dataset = AssistantDataset.from_yaml_files(language, list(files)) + dataset = Dataset.from_yaml_files(language, list(files)) else: - dataset = AssistantDataset.from_files(language, list(files)) + dataset = Dataset.from_files(language, list(files)) print(json.dumps(dataset.json, indent=2, sort_keys=True)) diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py index 9dd099c85..a43f2268b 100644 --- a/snips_nlu/dataset/__init__.py +++ b/snips_nlu/dataset/__init__.py @@ -1,4 +1,4 @@ -from snips_nlu.dataset.dataset import AssistantDataset +from snips_nlu.dataset.dataset import Dataset from snips_nlu.dataset.entity import Entity, EntityFormatError from snips_nlu.dataset.intent import Intent, IntentFormatError from snips_nlu.dataset.utils import ( diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index d9cef2318..e72baad83 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -18,12 +18,12 @@ 
class DatasetFormatError(TypeError): pass -class AssistantDataset(object): +class Dataset(object): """Dataset used in the main NLU training API Consists of intents and entities data. This object can be built either from - text files (:meth:`.AssistantDataset.from_files`) or from YAML files - (:meth:`.AssistantDataset.from_yaml_files`). + text files (:meth:`.Dataset.from_files`) or from YAML files + (:meth:`.Dataset.from_yaml_files`). Attributes: language (str): language of the intents @@ -61,8 +61,8 @@ def from_yaml_files(cls, language, filenames): current_version=__version__, details="Use from_yaml_files instead") def from_files(cls, language, filenames): - """Creates an :class:`.AssistantDataset` from a language and a list of - intent and entity files + """Creates a :class:`.Dataset` from a language and a list of intent and + entity files Args: language (str): language of the assistant diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py index 04c9743ce..06fe84df8 100644 --- a/snips_nlu/dataset/entity.py +++ b/snips_nlu/dataset/entity.py @@ -23,7 +23,7 @@ class EntityFormatError(TypeError): class Entity(object): - """Entity of an :class:`.AssistantDataset` + """Entity data of a :class:`.Dataset` This class can represents both a custom or a builtin entity diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index 58dbaa679..c4abe6c19 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -20,17 +20,7 @@ class IntentFormatError(TypeError): class Intent(object): - """Dataset of an intent - - Can parse utterances from a text file or an iterator. - - An example of utterance is: - - "the [role:role](president) of [country:country](France)" - - a Tag is in this format: - - [slot:entity_name](text_to_tag) + """Intent data of a :class:`.Dataset` Attributes: intent_name (str): name of the intent diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py index 458fba82f..7280f73d5 100644 --- a/snips_nlu/tests/test_dataset_loading.py +++ b/snips_nlu/tests/test_dataset_loading.py @@ -7,7 +7,7 @@ from deprecation import fail_if_not_removed from mock import patch -from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset +from snips_nlu.dataset import Dataset, validate_and_format_dataset EXPECTED_DATASET_DICT = { "entities": { @@ -192,7 +192,7 @@ def mock_open(filename, **kwargs): dataset_files = [intent_file_1, intent_file_2, entity_file_1] # When - dataset = AssistantDataset.from_yaml_files("en", dataset_files) + dataset = Dataset.from_yaml_files("en", dataset_files) dataset_dict = dataset.json # Then @@ -241,7 +241,7 @@ def mock_open(filename, **kwargs): mock_io.open.side_effect = mock_open # When - dataset = AssistantDataset.from_yaml_files("en", [dataset_file]) + dataset = Dataset.from_yaml_files("en", [dataset_file]) dataset_dict = dataset.json # Then @@ -287,7 +287,7 @@ def mock_open(self_, *args, **kwargs): # When with patch("pathlib.io") as mock_io: mock_io.open.side_effect = mock_open - dataset = AssistantDataset.from_files("en", dataset_files) + dataset = Dataset.from_files("en", dataset_files) dataset_dict = dataset.json # When / Then From 34b90cf4bbb69c15f8aea93c54d2ccaac3b83574 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 15:28:29 +0100 Subject: [PATCH 14/24] Add API reference for Dataset, Intent and Entity classes --- docs/source/api.rst | 14 +++++++++++ docs/source/dataset.rst | 12 ++++++++++ snips_nlu/dataset/dataset.py | 46 
++++++++++++++++++++++++++++++++++++ snips_nlu/dataset/entity.py | 28 ++++++++++++++++++++-- snips_nlu/dataset/intent.py | 33 +++++++++++++++++++++++--- 5 files changed, 128 insertions(+), 5 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 365e50c8c..9832e7ec6 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -96,6 +96,20 @@ Configurations :members: +Dataset +------- + +.. module:: snips_nlu.dataset + +.. autoclass:: Dataset + :members: + +.. autoclass:: Intent + :members: + +.. autoclass:: Entity + :members: + Result and output format ------------------------ diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index 16743a77c..d414ba0fd 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -25,6 +25,8 @@ YAML format The YAML dataset format allows you to define intents and entities using the `YAML `_ syntax. +.. _yaml_entity_format: + Entity ------ @@ -62,6 +64,8 @@ are optional: - [new york, big apple] - [paris, city of lights] +.. _yaml_intent_format: + Intent ------ @@ -112,6 +116,9 @@ utterances. This will result in simpler annotations: - I need a flight leaving [date](this weekend) to [destination](Berlin) - show me flights to go to [destination](new york) leaving [date](this evening) + +.. _yaml_dataset_format: + Dataset ------- @@ -148,6 +155,11 @@ Here is the yaml file corresponding to the previous ``city`` entity and - [new york, big apple] - [paris, city of lights] +.. important:: + + If you plan to have more than one entity or intent in a YAML file, you must + separate them using the YAML document separator: ``---`` + --------------------------------------- Implicit entity values and slot mapping --------------------------------------- diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index e72baad83..a80e8ee6f 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -40,6 +40,52 @@ def __init__(self, language, intents, entities): @classmethod def from_yaml_files(cls, language, filenames): + """Creates a :class:`.Dataset` from a language and a list of YAML files + containing intents and entities data + + Each file need not correspond to a single entity nor intent. They can + consist in several entities and intents merged together in a single + file. + + A dataset can be defined with a YAML document following the schema + illustrated in the example below: + + .. 
code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [destination](new york) leaving [date](this evening) + + # City Entity + --- + type: entity + name: city + values: + - london + - [new york, big apple] + - [paris, city of lights] + + Raises: + DatasetFormatError: When one of the documents present in the YAML + files has a wrong 'type' attribute, which is not 'entity' nor + 'intent' + IntentFormatError: When the YAML document of an intent does not + correspond to the :ref:`expected intent format ` + EntityFormatError: When the YAML document of an entity does not + correspond to the :ref:`expected entity format ` + """ entities = [] intents = [] for filename in filenames: diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py index 06fe84df8..9d138b525 100644 --- a/snips_nlu/dataset/entity.py +++ b/snips_nlu/dataset/entity.py @@ -25,7 +25,8 @@ class EntityFormatError(TypeError): class Entity(object): """Entity data of a :class:`.Dataset` - This class can represents both a custom or a builtin entity + This class can represents both a custom or a builtin entity. When the + entity is a builtin one, only the `name` attribute is relevant. Attributes: name (str): name of the entity @@ -56,7 +57,29 @@ def is_builtin(self): @classmethod def from_yaml(cls, yaml_dict): - """Build an :class:`.Entity` from its YAML definition dict""" + """Build an :class:`.Entity` from its YAML definition dict + + An entity can be defined with a YAML document following the schema + illustrated in the example below: + + .. 
code-block:: yaml + + # City Entity + --- + type: entity + name: city + automatically_extensible: false # default value is true + use_synonyms: false # default value is true + matching_strictness: 0.8 # default value is 1.0 + values: + - london + - [new york, big apple] + - [paris, city of lights] + + Raises: + EntityFormatError: When the YAML dict does not correspond to the + :ref:`expected entity format ` + """ object_type = yaml_dict.get("type") if object_type and object_type != "entity": raise EntityFormatError("Wrong type: '%s'" % object_type) @@ -88,6 +111,7 @@ def from_yaml(cls, yaml_dict): @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", current_version=__version__, details="Use from_yaml instead") def from_file(cls, filepath): + """Build an :class:`.Entity` from a text file""" filepath = Path(filepath) stem = filepath.stem if not stem.startswith("entity_"): diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index c4abe6c19..c8f26dbb1 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -24,7 +24,8 @@ class Intent(object): Attributes: intent_name (str): name of the intent - utterances (list of :class:`.IntentUtterance`): intent utterances + utterances (list of :class:`.IntentUtterance`): annotated intent + utterances slot_mapping (dict): mapping between slot names and entities """ @@ -39,7 +40,33 @@ def __init__(self, intent_name, utterances, slot_mapping=None): @classmethod def from_yaml(cls, yaml_dict): - """Build an :class:`.Intent` from its YAML definition dict""" + """Build an :class:`.Intent` from its YAML definition dict + + An intent can be defined with a YAML document following the schema + illustrated in the example below: + + .. code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [destination](new york) leaving [date](this evening) + + Raises: + IntentFormatError: When the YAML dict does not correspond to the + :ref:`expected intent format ` + """ object_type = yaml_dict.get("type") if object_type and object_type != "intent": raise IntentFormatError("Wrong type: '%s'" % object_type) @@ -60,6 +87,7 @@ def from_yaml(cls, yaml_dict): @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", current_version=__version__, details="Use from_yaml instead") def from_file(cls, filepath): + """Build an :class:`.Intent` from a text file""" filepath = Path(filepath) stem = filepath.stem if not stem.startswith("intent_"): @@ -102,7 +130,6 @@ def json(self): @property def entities_names(self): - """Set of entity names present in the intent utterances""" return set(chunk.entity for u in self.utterances for chunk in u.chunks if isinstance(chunk, SlotChunk)) From 8ab67bbf2f56bd650324e4cf3e02da6d8ed170be Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 15:56:28 +0100 Subject: [PATCH 15/24] Fix linting issues --- snips_nlu/dataset/dataset.py | 3 +++ snips_nlu/dataset/intent.py | 2 ++ snips_nlu/tests/test_dataset_loading.py | 3 +-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index a80e8ee6f..d4eb996a2 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -40,6 +40,7 @@ def __init__(self, 
language, intents, entities): @classmethod def from_yaml_files(cls, language, filenames): + # pylint:disable=line-too-long """Creates a :class:`.Dataset` from a language and a list of YAML files containing intents and entities data @@ -86,6 +87,7 @@ def from_yaml_files(cls, language, filenames): EntityFormatError: When the YAML document of an entity does not correspond to the :ref:`expected entity format ` """ + # pylint:enable=line-too-long entities = [] intents = [] for filename in filenames: @@ -183,6 +185,7 @@ def _get_entity_values(self, entity): @property def json(self): + """Dataset data in json format""" intents = {intent_data.intent_name: intent_data.json for intent_data in self.intents} entities = {entity.name: entity.json for entity in self.entities} diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index c8f26dbb1..ed8ce436c 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -40,6 +40,7 @@ def __init__(self, intent_name, utterances, slot_mapping=None): @classmethod def from_yaml(cls, yaml_dict): + # pylint:disable=line-too-long """Build an :class:`.Intent` from its YAML definition dict An intent can be defined with a YAML document following the schema @@ -67,6 +68,7 @@ def from_yaml(cls, yaml_dict): IntentFormatError: When the YAML dict does not correspond to the :ref:`expected intent format ` """ + # pylint:enable=line-too-long object_type = yaml_dict.get("type") if object_type and object_type != "intent": raise IntentFormatError("Wrong type: '%s'" % object_type) diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py index 7280f73d5..9b825b049 100644 --- a/snips_nlu/tests/test_dataset_loading.py +++ b/snips_nlu/tests/test_dataset_loading.py @@ -3,7 +3,6 @@ import io from unittest import TestCase -import mock from deprecation import fail_if_not_removed from mock import patch @@ -199,7 +198,7 @@ def mock_open(filename, **kwargs): validate_and_format_dataset(dataset_dict) self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) - @mock.patch("snips_nlu.dataset.dataset.io") + @patch("snips_nlu.dataset.dataset.io") def test_should_generate_dataset_from_merged_yaml_file(self, mock_io): # Given dataset_file = "dataset.yaml" From f8c9ee995fdcc4de64255eee79e94d779e26918b Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Thu, 22 Nov 2018 18:10:48 +0100 Subject: [PATCH 16/24] Fix issues after review --- docs/source/dataset.rst | 2 +- docs/source/index.rst | 2 +- snips_nlu/dataset/intent.py | 8 ++------ 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index d414ba0fd..19d62b580 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -8,7 +8,7 @@ data in order to produce a powerful intent recognition engine. The better your training data is, and the more accurate your NLU engine will be. Thus, it is worth spending a bit of time to create a dataset that -corresponds well to your use case. +matches well your use case. Snips NLU accepts two different dataset formats. The first one, which relies on YAML, is the preferred option if you want to create or edit a dataset diff --git a/docs/source/index.rst b/docs/source/index.rst index 5b976aedc..faf75bd2d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,8 +81,8 @@ the :ref:`api` documentation or alternatively check the `github repository`_. 
installation quickstart tutorial - dataset data_model + dataset languages cli api diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index ed8ce436c..d0261925b 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -15,10 +15,6 @@ class IntentFormatError(TypeError): pass -INTENT_FORMATTING_ERROR = IntentFormatError( - "Intent file is not properly formatted") - - class Intent(object): """Intent data of a :class:`.Dataset` @@ -288,7 +284,7 @@ def capture_slot(state): next_colon_pos = state.find(':') next_square_bracket_pos = state.find(']') if next_square_bracket_pos < 0: - raise INTENT_FORMATTING_ERROR + raise IntentFormatError("Missing ending ']' in annotated utterance") if next_colon_pos < 0 or next_square_bracket_pos < next_colon_pos: slot_name = state[:next_square_bracket_pos] state.move(next_square_bracket_pos) @@ -309,7 +305,7 @@ def capture_slot(state): def capture_tagged(state): next_pos = state.find(')') if next_pos < 1: - raise INTENT_FORMATTING_ERROR + raise IntentFormatError("Missing ending ')' in annotated utterance") else: tagged_text = state[:next_pos] state.add_tagged(tagged_text) From 770445d709b6f1844ca414f40039ad81feda7c40 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 20 Nov 2018 10:33:59 +0100 Subject: [PATCH 17/24] Bump snips-nlu-ontology to 0.62 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fe523edad..f1c386f7c 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ "sklearn-crfsuite>=0.3.6,<0.4", "semantic_version>=2.6,<3.0", "snips_nlu_utils>=0.7,<0.8", - "snips_nlu_ontology>=0.61.1,<0.62", + "snips_nlu_ontology>=0.62.0,<0.63", "num2words>=0.5.6,<0.6", "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", From 211ee37a77e0447d5fcd576f2c74d3e072431fd6 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Fri, 23 Nov 2018 10:45:09 +0100 Subject: [PATCH 18/24] Bump package version and model version --- snips_nlu/__about__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py index ae1289b8a..6ea37364c 100644 --- a/snips_nlu/__about__.py +++ b/snips_nlu/__about__.py @@ -11,8 +11,8 @@ __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai" __license__ = "Apache License, Version 2.0" -__version__ = "0.17.4" -__model_version__ = "0.17.0" +__version__ = "0.18.0" +__model_version__ = "0.18.0" __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download" __compatibility__ = "https://raw.githubusercontent.com/snipsco/snips-nlu-language-resources/master/compatibility.json" From ad70ae15afd4c4ffa5b04bb4455f1df04ccc6ca9 Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Fri, 23 Nov 2018 11:02:53 +0100 Subject: [PATCH 19/24] Adapt noise generation to add more unknownwords --- .../log_reg_classifier_utils.py | 62 ++- .../pipeline/configs/intent_classifier.py | 3 +- .../tests/test_log_reg_classifier_utils.py | 527 ++++++++++++++++++ .../tests/test_log_reg_intent_classifier.py | 411 +------------- 4 files changed, 577 insertions(+), 426 deletions(-) create mode 100644 snips_nlu/tests/test_log_reg_classifier_utils.py diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index 919b8b753..dab48053f 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py +++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py @@ -9,8 +9,8 @@ import numpy as np from future.utils import iteritems, itervalues -from 
snips_nlu.constants import (
-    DATA, ENTITY, INTENTS, TEXT, UNKNOWNWORD, UTTERANCES)
+from snips_nlu.constants import (DATA, ENTITIES, ENTITY, INTENTS, TEXT,
+                                 UNKNOWNWORD, UTTERANCES)
 from snips_nlu.data_augmentation import augment_utterances
 from snips_nlu.dataset import get_text_from_chunks
 from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
@@ -50,16 +50,16 @@ def get_noise_it(noise, mean_length, std_length, random_state):
 # pylint: enable=stop-iteration-return
 
 
-def generate_smart_noise(augmented_utterances, replacement_string, language):
+def generate_smart_noise(noise, augmented_utterances, replacement_string,
+                         language):
     text_utterances = [get_text_from_chunks(u[DATA])
                        for u in augmented_utterances]
     vocab = [w for u in text_utterances for w in tokenize_light(u, language)]
     vocab = set(vocab)
-    noise = get_noise(language)
     return [w if w in vocab else replacement_string for w in noise]
 
 
-def generate_noise_utterances(augmented_utterances, num_intents,
+def generate_noise_utterances(augmented_utterances, noise, num_intents,
                               data_augmentation_config, language,
                               random_state):
     if not augmented_utterances or not num_intents:
@@ -67,11 +67,9 @@ def generate_noise_utterances(augmented_utterances, num_intents,
     avg_num_utterances = len(augmented_utterances) / float(num_intents)
     if data_augmentation_config.unknown_words_replacement_string is not None:
         noise = generate_smart_noise(
-            augmented_utterances,
+            noise, augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
             language)
-    else:
-        noise = get_noise(language)
 
     noise_size = min(
         int(data_augmentation_config.noise_factor * avg_num_utterances),
@@ -89,14 +87,39 @@
         for _ in range(noise_size)]
 
 
-def add_unknown_word_to_utterances(augmented_utterances, replacement_string,
-                                   unknown_word_prob, random_state):
-    for u in augmented_utterances:
-        for chunk in u[DATA]:
-            if ENTITY in chunk and not is_builtin_entity(chunk[ENTITY]) \
-                    and random_state.rand() < unknown_word_prob:
-                chunk[TEXT] = WORD_REGEX.sub(replacement_string, chunk[TEXT])
-    return augmented_utterances
+def add_unknown_word_to_utterances(utterances, replacement_string,
+                                   unknown_word_prob, max_unknown_words,
+                                   random_state):
+    new_utterances = deepcopy(utterances)
+    unknown_word_lengths = list(range(1, max_unknown_words + 1))
+    for u in new_utterances:
+        if random_state.rand() < unknown_word_prob:
+            num_unknown = random_state.choice(unknown_word_lengths)
+            # We choose to put the noise at the end of the sentence, rather
+            # than in the middle, so that it doesn't impact the ngrams
+            # computation too much
+            extra_chunk = {
+                TEXT: " " + " ".join(
+                    replacement_string for _ in range(num_unknown))
+            }
+            u[DATA].append(extra_chunk)
+    return new_utterances
+
+
+def get_dataset_specific_noise(dataset, language):
+    """Return a noise list that excludes the dataset entity values"""
+    entities_values = set()
+    for ent_name, ent in iteritems(dataset[ENTITIES]):
+        if is_builtin_entity(ent_name):
+            continue
+        for k, v in iteritems(ent[UTTERANCES]):
+            entities_values.add(k)
+            entities_values.add(v)
+    original_noise = get_noise(language)
+    specific_noise = [n for n in original_noise if n not in entities_values]
+    if not specific_noise:  # Avoid returning an empty noise list
+        return original_noise
+    return specific_noise
 
 
 def
build_training_data(dataset, language, data_augmentation_config, augmented_utterances, data_augmentation_config.unknown_words_replacement_string, data_augmentation_config.unknown_word_prob, + data_augmentation_config.max_unknown_words, random_state ) # Adding noise + noise = get_dataset_specific_noise(dataset, language) noisy_utterances = generate_noise_utterances( - augmented_utterances, len(intents), data_augmentation_config, language, - random_state) + augmented_utterances, noise, len(intents), data_augmentation_config, + language, random_state) augmented_utterances += noisy_utterances utterance_classes += [noise_class for _ in noisy_utterances] diff --git a/snips_nlu/pipeline/configs/intent_classifier.py b/snips_nlu/pipeline/configs/intent_classifier.py index 42abb8b6d..8ea83dcd2 100644 --- a/snips_nlu/pipeline/configs/intent_classifier.py +++ b/snips_nlu/pipeline/configs/intent_classifier.py @@ -118,13 +118,14 @@ class IntentClassifierDataAugmentationConfig(Config): def __init__(self, min_utterances=20, noise_factor=5, add_builtin_entities_examples=True, unknown_word_prob=0, - unknown_words_replacement_string=None): + unknown_words_replacement_string=None, max_unknown_words=3): self.min_utterances = min_utterances self.noise_factor = noise_factor self.add_builtin_entities_examples = add_builtin_entities_examples self.unknown_word_prob = unknown_word_prob self.unknown_words_replacement_string = \ unknown_words_replacement_string + self.max_unknown_words = max_unknown_words if unknown_word_prob > 0 and unknown_words_replacement_string is None: raise ValueError("unknown_word_prob is positive (%s) but the " "replacement string is None" % unknown_word_prob) diff --git a/snips_nlu/tests/test_log_reg_classifier_utils.py b/snips_nlu/tests/test_log_reg_classifier_utils.py new file mode 100644 index 000000000..ff616a00d --- /dev/null +++ b/snips_nlu/tests/test_log_reg_classifier_utils.py @@ -0,0 +1,527 @@ +# coding=utf-8 +from __future__ import unicode_literals + +from copy import deepcopy +from itertools import cycle + +import numpy as np +from future.utils import itervalues +from mock import MagicMock, patch + +from snips_nlu.constants import INTENTS, LANGUAGE_EN, UTTERANCES +from snips_nlu.dataset import validate_and_format_dataset +from snips_nlu.intent_classifier.log_reg_classifier_utils import ( + add_unknown_word_to_utterances, build_training_data, + generate_noise_utterances, generate_smart_noise, get_noise_it, + remove_builtin_slots, text_to_utterance, get_dataset_specific_noise) +from snips_nlu.pipeline.configs import ( + IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig) +from snips_nlu.tests.test_log_reg_intent_classifier import ( + get_mocked_augment_utterances) +from snips_nlu.tests.utils import (SAMPLE_DATASET, SnipsTest, + get_empty_dataset) + + +class TestLogRegClassifierUtils(SnipsTest): + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_no_stemming_no_noise( + self, mocked_augment_utterances): + # Given + dataset = validate_and_format_dataset(SAMPLE_DATASET) + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + random_state = np.random.RandomState(1) + + # When + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=0) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance for intent + in itervalues(dataset[INTENTS]) + for 
utterance in intent[UTTERANCES]] + expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2'] + self.assertListEqual(expected_utterances, utterances) + self.assertListEqual(expected_intent_mapping, intent_mapping) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_noise( + self, mocked_augment_utterances, mocked_get_noise): + # Given + mocked_noises = ["mocked_noise_%s" % i for i in range(100)] + mocked_get_noise.return_value = mocked_noises + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + + num_intents = 3 + utterances_length = 5 + num_queries_per_intent = 3 + fake_utterance = { + "data": [ + {"text": " ".join("1" for _ in range(utterances_length))} + ] + } + dataset = { + "intents": { + str(i): { + "utterances": [fake_utterance] * num_queries_per_intent + } for i in range(num_intents) + }, + "entities": {} + } + random_state = np.random.RandomState(1) + + # When + np.random.seed(42) + noise_factor = 2 + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor, unknown_word_prob=0, + unknown_words_replacement_string=None) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance + for intent in itervalues(dataset[INTENTS]) + for utterance in intent[UTTERANCES]] + np.random.seed(42) + noise = list(mocked_noises) + noise_size = int(min(noise_factor * num_queries_per_intent, + len(noise))) + noise_it = get_noise_it(mocked_noises, utterances_length, 0, + random_state) + noisy_utterances = [text_to_utterance(next(noise_it)) + for _ in range(noise_size)] + expected_utterances += noisy_utterances + expected_intent_mapping = sorted(dataset["intents"]) + expected_intent_mapping.append(None) + self.assertListEqual(expected_utterances, utterances) + self.assertListEqual(intent_mapping, expected_intent_mapping) + + def test_add_unknown_words_to_utterances(self): + # Given + base_utterances = { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + } + ] + } + utterances = [] + for _ in range(6): + utterances.append(deepcopy(base_utterances)) + + rand_it = cycle([0, 1]) + + def mocked_rand(): + return next(rand_it) + + max_unknown_words = 3 + rg_it = cycle([i for i in range(1, max_unknown_words + 1)]) + + def mocked_choice(a): # pylint: disable=unused-argument + return next(rg_it) + + unknownword_prob = .5 + + random_state = MagicMock() + random_state_rand = MagicMock() + random_state_rand.side_effect = mocked_rand + random_state_choice = MagicMock() + random_state_choice.side_effect = mocked_choice + + random_state.rand = random_state_rand + random_state.choice = random_state_choice + + # When + replacement_string = "unknownword" + noisy_utterances = add_unknown_word_to_utterances( + utterances, unknown_word_prob=unknownword_prob, + replacement_string=replacement_string, + max_unknown_words=max_unknown_words, + random_state=random_state + ) + + # Then + expected_utterances = [ + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword" + } + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" 
+ }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword unknownword" + } + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword unknownword unknownword" + } + + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + } + ] + self.assertEqual(expected_utterances, noisy_utterances) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_generate_noise_utterances_should_replace_unknown_words( + self, mocked_noise): + # Given + utterances = [ + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "bobby", + "entity": "you" + } + ] + } + ] + language = LANGUAGE_EN + base_noise = ["hello", "dear", "you", "fool"] + mocked_noise.return_value = base_noise + replacement_string = "unknownword" + + # When + noise = generate_smart_noise( + base_noise, utterances, replacement_string, language) + + # Then + expected_noise = ["hello", replacement_string, "you", + replacement_string] + self.assertEqual(noise, expected_noise) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_unknown_noise( + self, mocked_augment_utterances, mocked_get_noise): + # Given + mocked_noises = ["mocked_noise_%s" % i for i in range(100)] + mocked_get_noise.return_value = mocked_noises + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + + num_intents = 3 + utterances_length = 5 + num_queries_per_intent = 3 + fake_utterance = { + "data": [ + {"text": " ".join("1" for _ in range(utterances_length))} + ] + } + dataset = { + "intents": { + str(i): { + "utterances": [fake_utterance] * num_queries_per_intent + } for i in range(num_intents) + }, + "entities": {} + } + random_state = np.random.RandomState(1) + + # When + np.random.seed(42) + noise_factor = 2 + replacement_string = "unknownword" + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor, unknown_word_prob=0, + unknown_words_replacement_string=replacement_string) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance + for intent in itervalues(dataset[INTENTS]) + for utterance in intent[UTTERANCES]] + np.random.seed(42) + noise = list(mocked_noises) + noise_size = int(min(noise_factor * num_queries_per_intent, + len(noise))) + noisy_utterances = [text_to_utterance(replacement_string) + for _ in range(noise_size)] + expected_utterances += noisy_utterances + expected_intent_mapping = sorted(dataset["intents"]) + expected_intent_mapping.append(None) + self.assertListEqual(expected_utterances, utterances) + self.assertListEqual(expected_intent_mapping, intent_mapping) 
+ + def test_should_build_training_data_with_no_data(self): + # Given + language = LANGUAGE_EN + dataset = validate_and_format_dataset(get_empty_dataset(language)) + random_state = np.random.RandomState(1) + + # When + data_augmentation_config = LogRegIntentClassifierConfig() \ + .data_augmentation_config + utterances, _, intent_mapping = build_training_data( + dataset, language, data_augmentation_config, random_state) + + # Then + expected_utterances = [] + expected_intent_mapping = [] + self.assertListEqual(utterances, expected_utterances) + self.assertListEqual(intent_mapping, expected_intent_mapping) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_generate_noise_utterances(self, mocked_get_noise): + # Given + language = LANGUAGE_EN + num_intents = 2 + noise_factor = 1 + utterances_length = 5 + + noise = [str(i) for i in range(utterances_length)] + mocked_get_noise.return_value = noise + + augmented_utterances = [ + { + "data": [ + { + "text": " ".join( + "{}".format(i) for i in range(utterances_length)) + } + ] + } + ] + num_utterances = 10 + random_state = np.random.RandomState(1) + + augmented_utterances = augmented_utterances * num_utterances + config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor) + # When + noise_utterances = generate_noise_utterances( + augmented_utterances, noise, num_intents, config, language, + random_state) + + # Then + joined_noise = text_to_utterance(" ".join(noise)) + for u in noise_utterances: + self.assertEqual(u, joined_noise) + + def test_remove_builtin_slots(self): + # Given + language = LANGUAGE_EN + dataset = { + "entities": { + "snips/number": {} + }, + "intents": { + "dummy_intent_1": { + "utterances": [ + { + "data": [ + { + "text": "I want ", + }, + { + "text": "three", + "slot_name": "number_of_cups", + "entity": "snips/number" + }, + { + "text": " cups", + }, + ] + }, + { + "data": [ + { + "text": "give me ", + }, + { + "text": "twenty two", + "slot_name": "number_of_cups", + "entity": "snips/number" + }, + { + "text": " big cups please", + }, + ] + } + ] + } + }, + "language": language + } + + # When + filtered_dataset = remove_builtin_slots(dataset) + + # Then + expected_dataset = { + "entities": { + "snips/number": {} + }, + "intents": { + "dummy_intent_1": { + "utterances": [ + { + "data": [ + { + "text": "I want ", + }, + { + "text": " cups", + }, + ] + }, + { + "data": [ + { + "text": "give me ", + }, + { + "text": " big cups please", + }, + ] + } + ] + } + }, + "language": language + } + + self.assertDictEqual(expected_dataset, filtered_dataset) + + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_get_dataset_specific_noise(self, mocked_noise): + # Given + dataset = validate_and_format_dataset(SAMPLE_DATASET) + language = "en" + mocked_noise.return_value = ["dummy_a", "yo"] + + + # When + noise = get_dataset_specific_noise(dataset, language) + + # Then + self.assertEqual(["yo"], noise) diff --git a/snips_nlu/tests/test_log_reg_intent_classifier.py b/snips_nlu/tests/test_log_reg_intent_classifier.py index 64ee92a55..1c17c423b 100644 --- a/snips_nlu/tests/test_log_reg_intent_classifier.py +++ b/snips_nlu/tests/test_log_reg_intent_classifier.py @@ -1,26 +1,20 @@ # coding=utf-8 from __future__ import unicode_literals -from builtins import next, range, str - -import numpy as np -from future.utils import itervalues from mock import patch from snips_nlu.constants import ( INTENTS, LANGUAGE_EN, RES_INTENT_NAME, UTTERANCES) from snips_nlu.dataset 
import validate_and_format_dataset from snips_nlu.entity_parser import BuiltinEntityParser, CustomEntityParser -from snips_nlu.entity_parser.custom_entity_parser_usage import \ - CustomEntityParserUsage +from snips_nlu.entity_parser.custom_entity_parser_usage import ( + CustomEntityParserUsage) from snips_nlu.intent_classifier import LogRegIntentClassifier from snips_nlu.intent_classifier.featurizer import Featurizer from snips_nlu.intent_classifier.log_reg_classifier_utils import ( - add_unknown_word_to_utterances, build_training_data, - generate_noise_utterances, generate_smart_noise, get_noise_it, - remove_builtin_slots, text_to_utterance) + text_to_utterance) from snips_nlu.pipeline.configs import ( - IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig) + LogRegIntentClassifierConfig) from snips_nlu.tests.utils import ( BEVERAGE_DATASET, FixtureTest, SAMPLE_DATASET, get_empty_dataset) from snips_nlu.utils import NotTrained @@ -264,400 +258,3 @@ def test_empty_vocabulary_should_fit_and_return_none_intent( intent_classifier = LogRegIntentClassifier().fit(dataset) intent = intent_classifier.get_intent("no intent there") self.assertEqual(None, intent) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_no_stemming_no_noise( - self, mocked_augment_utterances): - # Given - dataset = SAMPLE_DATASET - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - random_state = np.random.RandomState(1) - - # When - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=0) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance for intent - in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2'] - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(expected_intent_mapping, intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_noise( - self, mocked_augment_utterances, mocked_get_noise): - # Given - mocked_noises = ["mocked_noise_%s" % i for i in range(100)] - mocked_get_noise.return_value = mocked_noises - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - - num_intents = 3 - utterances_length = 5 - num_queries_per_intent = 3 - fake_utterance = { - "data": [ - {"text": " ".join("1" for _ in range(utterances_length))} - ] - } - dataset = { - "intents": { - str(i): { - "utterances": [fake_utterance] * num_queries_per_intent - } for i in range(num_intents) - } - } - random_state = np.random.RandomState(1) - - # When - np.random.seed(42) - noise_factor = 2 - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor, unknown_word_prob=0, - unknown_words_replacement_string=None) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance - for intent in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - np.random.seed(42) - noise = list(mocked_noises) - noise_size = int(min(noise_factor * num_queries_per_intent, - len(noise))) - noise_it = get_noise_it(mocked_noises, utterances_length, 0, - random_state) - 
noisy_utterances = [text_to_utterance(next(noise_it)) - for _ in range(noise_size)] - expected_utterances += noisy_utterances - expected_intent_mapping = sorted(dataset["intents"]) - expected_intent_mapping.append(None) - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(intent_mapping, expected_intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_unknown_noise( - self, mocked_augment_utterances, mocked_get_noise): - # Given - mocked_noises = ["mocked_noise_%s" % i for i in range(100)] - mocked_get_noise.return_value = mocked_noises - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - - num_intents = 3 - utterances_length = 5 - num_queries_per_intent = 3 - fake_utterance = { - "data": [ - {"text": " ".join("1" for _ in range(utterances_length))} - ] - } - dataset = { - "intents": { - str(i): { - "utterances": [fake_utterance] * num_queries_per_intent - } for i in range(num_intents) - } - } - random_state = np.random.RandomState(1) - - # When - np.random.seed(42) - noise_factor = 2 - replacement_string = "unknownword" - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor, unknown_word_prob=0, - unknown_words_replacement_string=replacement_string) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance - for intent in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - np.random.seed(42) - noise = list(mocked_noises) - noise_size = int(min(noise_factor * num_queries_per_intent, - len(noise))) - noisy_utterances = [text_to_utterance(replacement_string) - for _ in range(noise_size)] - expected_utterances += noisy_utterances - expected_intent_mapping = sorted(dataset["intents"]) - expected_intent_mapping.append(None) - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(expected_intent_mapping, intent_mapping) - - def test_should_build_training_data_with_no_data(self): - # Given - language = LANGUAGE_EN - dataset = validate_and_format_dataset(get_empty_dataset(language)) - random_state = np.random.RandomState(1) - - # When - data_augmentation_config = LogRegIntentClassifierConfig() \ - .data_augmentation_config - utterances, _, intent_mapping = build_training_data( - dataset, language, data_augmentation_config, random_state) - - # Then - expected_utterances = [] - expected_intent_mapping = [] - self.assertListEqual(utterances, expected_utterances) - self.assertListEqual(intent_mapping, expected_intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - def test_generate_noise_utterances(self, mocked_get_noise): - # Given - language = LANGUAGE_EN - num_intents = 2 - noise_factor = 1 - utterances_length = 5 - - noise = [str(i) for i in range(utterances_length)] - mocked_get_noise.return_value = noise - - augmented_utterances = [ - { - "data": [ - { - "text": " ".join( - "{}".format(i) for i in range(utterances_length)) - } - ] - } - ] - num_utterances = 10 - random_state = np.random.RandomState(1) - - augmented_utterances = augmented_utterances * num_utterances - config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor) - # When - noise_utterances = generate_noise_utterances( - augmented_utterances, num_intents, config, language, 
random_state) - - # Then - joined_noise = text_to_utterance(" ".join(noise)) - for u in noise_utterances: - self.assertEqual(u, joined_noise) - - def test_add_unknown_words_to_utterances(self): - # Given - utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " you ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "dude", - "entity": "you" - } - ] - }, - { - "data": [ - { - "text": "hello " - }, - { - "text": "dude", - "entity": "you" - }, - { - "text": " how are you " - - }, - { - "text": " you ", - "entity": "you" - } - ] - } - ] - unknownword_prob = .5 - random_state = np.random.RandomState(1) - - # When - replacement_string = "unknownword" - noisy_utterances = add_unknown_word_to_utterances( - utterances, unknown_word_prob=unknownword_prob, - replacement_string=replacement_string, random_state=random_state - ) - - # Then - expected_utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " unknownword ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "dude", - "entity": "you" - } - ] - }, - { - "data": [ - { - "text": "hello " - }, - { - "text": "unknownword", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": " unknownword ", - "entity": "you" - } - ] - } - ] - self.assertEqual(expected_utterances, noisy_utterances) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - def test_generate_noise_utterances_should_replace_unknown_words( - self, mocked_noise): - # Given - utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " you ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "bobby", - "entity": "you" - } - ] - } - ] - language = LANGUAGE_EN - mocked_noise.return_value = ["hello", "dear", "you", "fool"] - replacement_string = "unknownword" - - # When - noise = generate_smart_noise(utterances, replacement_string, language) - - # Then - expected_noise = ["hello", replacement_string, "you", - replacement_string] - self.assertEqual(noise, expected_noise) - - def test_remove_builtin_slots(self): - # Given - language = LANGUAGE_EN - dataset = { - "entities": { - "snips/number": {} - }, - "intents": { - "dummy_intent_1": { - "utterances": [ - { - "data": [ - { - "text": "I want ", - }, - { - "text": "three", - "slot_name": "number_of_cups", - "entity": "snips/number" - }, - { - "text": " cups", - }, - ] - }, - { - "data": [ - { - "text": "give me ", - }, - { - "text": "twenty two", - "slot_name": "number_of_cups", - "entity": "snips/number" - }, - { - "text": " big cups please", - }, - ] - } - ] - } - }, - "language": language - } - - # When - filtered_dataset = remove_builtin_slots(dataset) - - # Then - expected_dataset = { - "entities": { - "snips/number": {} - }, - "intents": { - "dummy_intent_1": { - "utterances": [ - { - "data": [ - { - "text": "I want ", - }, - { - "text": " cups", - }, - ] - }, - { - "data": [ - { - "text": "give me ", - }, - { - "text": " big cups please", - }, - ] - } - ] - } - }, - "language": language - } - - self.assertDictEqual(expected_dataset, filtered_dataset) From 830f426c2bf1e40b18a327b904fae734396ec3bf Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Thu, 8 Nov 2018 16:41:55 +0100 Subject: [PATCH 20/24] Update default configs --- snips_nlu/default_configs/config_de.py | 1 + snips_nlu/default_configs/config_en.py | 1 + snips_nlu/default_configs/config_es.py | 1 + snips_nlu/default_configs/config_fr.py | 1 + snips_nlu/default_configs/config_it.py | 1 + 
snips_nlu/default_configs/config_ja.py | 1 + snips_nlu/default_configs/config_ko.py | 1 + 7 files changed, 7 insertions(+) diff --git a/snips_nlu/default_configs/config_de.py b/snips_nlu/default_configs/config_de.py index 99cd61ead..34b6eabd6 100644 --- a/snips_nlu/default_configs/config_de.py +++ b/snips_nlu/default_configs/config_de.py @@ -175,6 +175,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_en.py b/snips_nlu/default_configs/config_en.py index 5c12803f3..a7bbbfa5c 100644 --- a/snips_nlu/default_configs/config_en.py +++ b/snips_nlu/default_configs/config_en.py @@ -152,6 +152,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_es.py b/snips_nlu/default_configs/config_es.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_es.py +++ b/snips_nlu/default_configs/config_es.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_fr.py b/snips_nlu/default_configs/config_fr.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_fr.py +++ b/snips_nlu/default_configs/config_fr.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_it.py b/snips_nlu/default_configs/config_it.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_it.py +++ b/snips_nlu/default_configs/config_it.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_ja.py b/snips_nlu/default_configs/config_ja.py index 46849b8b3..cfe6fac4a 100644 --- a/snips_nlu/default_configs/config_ja.py +++ b/snips_nlu/default_configs/config_ja.py @@ -195,6 +195,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_ko.py b/snips_nlu/default_configs/config_ko.py index 4da2fd365..0b8c61245 100644 --- a/snips_nlu/default_configs/config_ko.py +++ b/snips_nlu/default_configs/config_ko.py @@ -173,6 +173,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, From 70439d234c29c9f6d3824096163b0097781bd43a Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Mon, 26 Nov 2018 14:25:06 +0100 Subject: [PATCH 21/24] Fixes for review --- snips_nlu/intent_classifier/log_reg_classifier_utils.py | 4 +--- snips_nlu/tests/test_log_reg_classifier_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index dab48053f..83ceeb278 100644 --- 
a/snips_nlu/intent_classifier/log_reg_classifier_utils.py
+++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py
@@ -91,11 +91,9 @@ def add_unknown_word_to_utterances(utterances, replacement_string,
                                    unknown_word_prob, max_unknown_words,
                                    random_state):
     new_utterances = deepcopy(utterances)
-    unknown_word_lengths = [i for i in range(1, max_unknown_words + 1)]
     for u in new_utterances:
         if random_state.rand() < unknown_word_prob:
-            # num_unknown = random_state.choice(unknown_word_lengths, p=p)
-            num_unknown = random_state.choice(unknown_word_lengths)
+            num_unknown = random_state.randint(1, max_unknown_words + 1)
             # We choose to put the noise at the end of the sentence and not
             # in the middle so that it doesn't impact the ngrams computation
             # too much
diff --git a/snips_nlu/tests/test_log_reg_classifier_utils.py b/snips_nlu/tests/test_log_reg_classifier_utils.py
index ff616a00d..589b79c30 100644
--- a/snips_nlu/tests/test_log_reg_classifier_utils.py
+++ b/snips_nlu/tests/test_log_reg_classifier_utils.py
@@ -133,7 +133,7 @@ def mocked_rand():
     max_unknown_words = 3
     rg_it = cycle([i for i in range(1, max_unknown_words + 1)])
 
-    def mocked_choice(a):  # pylint: disable=unused-argument
+    def mocked_randint(a, b):  # pylint: disable=unused-argument
         return next(rg_it)
 
     unknownword_prob = .5
@@ -142,10 +142,10 @@ def mocked_choice(a):  # pylint: disable=unused-argument
     random_state_rand = MagicMock()
     random_state_rand.side_effect = mocked_rand
     random_state_choice = MagicMock()
-    random_state_choice.side_effect = mocked_choice
+    random_state_choice.side_effect = mocked_randint
     random_state.rand = random_state_rand
-    random_state.choice = random_state_choice
+    random_state.randint = random_state_choice
 
     # When
     replacement_string = "unknownword"
From 3e19c2af8eda16e53224e4339f33dec039a1518c Mon Sep 17 00:00:00 2001
From: ClemDoum
Date: Mon, 26 Nov 2018 14:21:42 +0100
Subject: [PATCH 22/24] Set default verbosity to False

---
 snips_nlu/cli/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snips_nlu/cli/metrics.py b/snips_nlu/cli/metrics.py
index e76ea3766..63b1de0aa 100644
--- a/snips_nlu/cli/metrics.py
+++ b/snips_nlu/cli/metrics.py
@@ -97,7 +97,7 @@ def progression_handler(progress):
 )
 def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
                        config_path=None, exclude_slot_metrics=False,
-                       include_errors=False, verbose=True):
+                       include_errors=False, verbose=False):
     if verbose:
         set_nlu_logger(logging.DEBUG)
 
From de7e947257caaf57508a46cb4e064d8e86d5a065 Mon Sep 17 00:00:00 2001
From: ClemDoum
Date: Mon, 26 Nov 2018 17:12:11 +0100
Subject: [PATCH 23/24] Add max_unknown_words parameter in serialization

---
 .../intent_classifier/log_reg_classifier_utils.py | 15 ++++++++-------
 snips_nlu/pipeline/configs/intent_classifier.py   |  4 +++-
 snips_nlu/tests/test_config.py                    |  1 +
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py
index 83ceeb278..73f06493f 100644
--- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py
+++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py
@@ -151,13 +151,14 @@ def build_training_data(dataset, language, data_augmentation_config,
         augmented_utterances += utterances
         utterance_classes += [classes_mapping[intent_name] for _
                               in range(len(utterances))]
-    augmented_utterances = add_unknown_word_to_utterances(
-        augmented_utterances,
-        data_augmentation_config.unknown_words_replacement_string,
-        data_augmentation_config.unknown_word_prob,
-        data_augmentation_config.max_unknown_words,
-        random_state
-    )
+    if data_augmentation_config.unknown_words_replacement_string is not None:
+        augmented_utterances = add_unknown_word_to_utterances(
+            augmented_utterances,
+            data_augmentation_config.unknown_words_replacement_string,
+            data_augmentation_config.unknown_word_prob,
+            data_augmentation_config.max_unknown_words,
+            random_state
+        )
 
     # Adding noise
     noise = get_dataset_specific_noise(dataset, language)
diff --git a/snips_nlu/pipeline/configs/intent_classifier.py b/snips_nlu/pipeline/configs/intent_classifier.py
index 8ea83dcd2..4ac330991 100644
--- a/snips_nlu/pipeline/configs/intent_classifier.py
+++ b/snips_nlu/pipeline/configs/intent_classifier.py
@@ -118,7 +118,8 @@ class IntentClassifierDataAugmentationConfig(Config):
 
     def __init__(self, min_utterances=20, noise_factor=5,
                  add_builtin_entities_examples=True, unknown_word_prob=0,
-                 unknown_words_replacement_string=None, max_unknown_words=3):
+                 unknown_words_replacement_string=None,
+                 max_unknown_words=None):
         self.min_utterances = min_utterances
         self.noise_factor = noise_factor
         self.add_builtin_entities_examples = add_builtin_entities_examples
@@ -146,6 +147,7 @@ def to_dict(self):
             "unknown_word_prob": self.unknown_word_prob,
             "unknown_words_replacement_string":
                 self.unknown_words_replacement_string,
+            "max_unknown_words": self.max_unknown_words
         }
 
     @classmethod
diff --git a/snips_nlu/tests/test_config.py b/snips_nlu/tests/test_config.py
index 16fd8c375..3077ae149 100644
--- a/snips_nlu/tests/test_config.py
+++ b/snips_nlu/tests/test_config.py
@@ -27,6 +27,7 @@ def test_intent_classifier_data_augmentation_config(self):
             "add_builtin_entities_examples": False,
             "unknown_word_prob": 0.1,
             "unknown_words_replacement_string": "foobar",
+            "max_unknown_words": None,
         }
 
         # When
From 4f338c665771f73cf0d77cf7909b448800950de4 Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Mon, 26 Nov 2018 17:49:11 +0100
Subject: [PATCH 24/24] Update Changelog

---
 CHANGELOG.md                             | 10 ++++++++++
 snips_nlu/intent_classifier/modifiers.py | 24 ++++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 snips_nlu/intent_classifier/modifiers.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52fdfde1e..0dfe8542b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,15 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 
+## [0.18.0] - 2018-11-26
+### Added
+- New YAML format to create datasets
+- Verbose mode in CLI
+
+### Changed
+- Bump `snips-nlu-ontology` to `0.62.0` to improve memory usage
+
+
 ## [0.17.4] - 2018-11-20
 ### Added
 - Add a `--config` argument in the metrics CLI
@@ -175,6 +184,7 @@ several commands.
 - Fix compiling issue with `bindgen` dependency when installing from source
 - Fix issue in `CRFSlotFiller` when handling builtin entities
 
+[0.18.0]: https://github.com/snipsco/snips-nlu/compare/0.17.4...0.18.0
 [0.17.4]: https://github.com/snipsco/snips-nlu/compare/0.17.3...0.17.4
 [0.17.3]: https://github.com/snipsco/snips-nlu/compare/0.17.2...0.17.3
 [0.17.2]: https://github.com/snipsco/snips-nlu/compare/0.17.1...0.17.2
diff --git a/snips_nlu/intent_classifier/modifiers.py b/snips_nlu/intent_classifier/modifiers.py
new file mode 100644
index 000000000..50d2bd937
--- /dev/null
+++ b/snips_nlu/intent_classifier/modifiers.py
@@ -0,0 +1,24 @@
+MODIFIERS = {
+    "it": {
+        "più",
+        "piu",
+        "meno",
+        "molto",
+        "non",
+        "troppo",
+        "troppa",
+        "ancora",
+        "senza",
+        "con",
+        "forte",
+        "forti",
+        "alto",
+        "alta",
+        "alti",
+        "alte",
+        "bassa",
+        "basso",
+        "bassi",
+        "basse"
+    }
+}
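
Note on patches 21 and 23 above: with the new defaults, max_unknown_words is None, so the "unknown_words_replacement_string is not None" guard in build_training_data is what keeps randint from ever being called with None. The snippet below is a minimal, self-contained sketch of the resulting augmentation behaviour; the helper name add_unknown_words_sketch and the example utterance are illustrative assumptions, not library code:

    import numpy as np
    from copy import deepcopy

    def add_unknown_words_sketch(utterances, replacement_string,
                                 unknown_word_prob, max_unknown_words,
                                 random_state):
        # With probability unknown_word_prob, append between 1 and
        # max_unknown_words replacement tokens to the END of the utterance,
        # mirroring the patched helper: trailing noise disturbs the ngram
        # statistics of the real text less than noise in the middle would.
        new_utterances = deepcopy(utterances)
        for u in new_utterances:
            if random_state.rand() < unknown_word_prob:
                # randint's upper bound is exclusive, so this draws a value
                # in 1..max_unknown_words uniformly
                num_unknown = random_state.randint(1, max_unknown_words + 1)
                u["data"].append({
                    "text": " " + " ".join([replacement_string] * num_unknown)
                })
        return new_utterances

    utterances = [{"data": [{"text": "turn on the lights"}]}]
    noisy = add_unknown_words_sketch(
        utterances, "unknownword", unknown_word_prob=1.0,
        max_unknown_words=3, random_state=np.random.RandomState(42))
    print(noisy)  # original text plus 1 to 3 trailing "unknownword" tokens

Compared with the previous random_state.choice over a precomputed list of lengths, randint(1, max_unknown_words + 1) draws from the same uniform distribution without building the intermediate list.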