From c4199571742756d7c4295401fa63753dcd1d9de1 Mon Sep 17 00:00:00 2001
From: ClemDoum
Date: Wed, 21 Nov 2018 11:23:13 +0100
Subject: [PATCH 01/24] Improve logging in CLI

---
 snips_nlu/cli/inference.py | 12 ++++++++++--
 snips_nlu/cli/metrics.py   | 21 +++++++++++++++++----
 snips_nlu/cli/training.py  |  2 +-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/snips_nlu/cli/inference.py b/snips_nlu/cli/inference.py
index cfcfe2819..345fb2a5d 100644
--- a/snips_nlu/cli/inference.py
+++ b/snips_nlu/cli/inference.py
@@ -1,19 +1,27 @@
 from __future__ import unicode_literals, print_function

 import json
+import logging
+
 from builtins import input

 import plac

 from snips_nlu import SnipsNLUEngine
+from snips_nlu.cli.utils import set_nlu_logger


 @plac.annotations(
     training_path=("Path to a trained engine", "positional", None, str),
     query=("Query to parse. If provided, it disables the interactive "
-           "behavior.", "option", "q", str))
-def parse(training_path, query):
+           "behavior.", "option", "q", str),
+    verbose=("Print logs", "flag", "v"),
+)
+def parse(training_path, query, verbose=False):
     """Load a trained NLU engine and play with its parsing API
     interactively"""
+    if verbose:
+        set_nlu_logger(logging.DEBUG)
+
     engine = SnipsNLUEngine.from_path(training_path)

     if query:

diff --git a/snips_nlu/cli/metrics.py b/snips_nlu/cli/metrics.py
index a383970ab..e76ea3766 100644
--- a/snips_nlu/cli/metrics.py
+++ b/snips_nlu/cli/metrics.py
@@ -1,11 +1,14 @@
 from __future__ import print_function, unicode_literals

 import json
+import logging
+
 from pathlib import Path

 import plac

 from snips_nlu import SnipsNLUEngine, load_resources
+from snips_nlu.cli.utils import set_nlu_logger
 from snips_nlu.utils import json_string


@@ -38,10 +41,15 @@ def parse(self, text):
                       "(between 0 and 1)", "option", "t", float),
     exclude_slot_metrics=("Exclude slot metrics and slot errors in the output",
                           "flag", "s", bool),
-    include_errors=("Include parsing errors in the output", "flag", "i", bool))
+    include_errors=("Include parsing errors in the output", "flag", "i", bool),
+    verbose=("Print logs", "flag", "v"),
+)
 def cross_val_metrics(dataset_path, output_path, config_path=None, nb_folds=5,
                       train_size_ratio=1.0, exclude_slot_metrics=False,
-                      include_errors=False):
+                      include_errors=False, verbose=False):
+    if verbose:
+        set_nlu_logger(logging.DEBUG)
+
     def progression_handler(progress):
         print("%d%%" % int(progress * 100))

@@ -84,10 +92,15 @@ def progression_handler(progress):
     config_path=("Path to a NLU engine config file", "option", "c", str),
     exclude_slot_metrics=("Exclude slot metrics and slot errors in the output",
                           "flag", "s", bool),
-    include_errors=("Include parsing errors in the output", "flag", "i", bool))
+    include_errors=("Include parsing errors in the output", "flag", "i", bool),
+    verbose=("Print logs", "flag", "v"),
+)
 def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
                        config_path=None, exclude_slot_metrics=False,
-                       include_errors=False):
+                       include_errors=False, verbose=False):
+    if verbose:
+        set_nlu_logger(logging.DEBUG)
+
     if config_path is not None:
         with Path(config_path).open("r", encoding="utf-8") as f:
             config = json.load(f)

diff --git a/snips_nlu/cli/training.py b/snips_nlu/cli/training.py
index 682309220..72a12d106 100644
--- a/snips_nlu/cli/training.py
+++ b/snips_nlu/cli/training.py
@@ -20,7 +20,7 @@ def train(dataset_path, output_path, config_path, verbose):
     """Train an NLU engine on the provided dataset"""
     if verbose:
-        set_nlu_logger(logging.INFO)
+        set_nlu_logger(logging.DEBUG)

     with Path(dataset_path).open("r", encoding="utf8") as f:
         dataset = json.load(f)
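Usage sketch for the new `verbose` flags above — a minimal standalone plac
program showing how a ("...", "flag", "v") annotation maps to a boolean
parameter. This illustrates the plac API only; the script name is
hypothetical and it is not part of the patch series:

    import plac

    @plac.annotations(verbose=("Print logs", "flag", "v"))
    def main(verbose=False):
        # plac turns the annotation into a -v switch; the parameter
        # stays False unless the flag is passed on the command line.
        print("verbose:", verbose)

    if __name__ == "__main__":
        plac.call(main)  # e.g. `python main.py -v` prints "verbose: True"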
From ba36802c3d732d1f40067ab274ae2ba9b79f7d75 Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Wed, 21 Nov 2018 17:51:25 +0100
Subject: [PATCH 02/24] Stream logs to stdout instead of stderr

---
 snips_nlu/cli/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snips_nlu/cli/utils.py b/snips_nlu/cli/utils.py
index a31f19e2a..e04106029 100644
--- a/snips_nlu/cli/utils.py
+++ b/snips_nlu/cli/utils.py
@@ -105,6 +105,6 @@ def check_resources_alias(resource_name, shortcuts):
 def set_nlu_logger(level=logging.INFO):
     logger = logging.getLogger(snips_nlu.__name__)
     logger.setLevel(level)
-    handler = logging.StreamHandler()
+    handler = logging.StreamHandler(sys.stdout)
     handler.setLevel(level)
     logger.addHandler(handler)
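The change above only affects where log records go. A minimal sketch of the
resulting behavior, using only the standard logging module (the
"snips_nlu" logger name comes from snips_nlu.__name__ in the code above):

    import logging
    import sys

    logger = logging.getLogger("snips_nlu")
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler(sys.stdout)  # stdout, so logs can be piped
    handler.setLevel(logging.DEBUG)
    logger.addHandler(handler)

    logger.debug("this record is streamed to stdout, not stderr")

Without the sys.stdout argument, logging.StreamHandler defaults to
sys.stderr, which is why the patch passes the stream explicitly.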
From 72f67e0e0fc99e0fd68ae22b616ac1aa2f5836fe Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Wed, 14 Nov 2018 16:05:01 +0100
Subject: [PATCH 03/24] Move dataset utils into dedicated package

---
 snips_nlu/dataset/__init__.py                 |  4 ++
 snips_nlu/dataset/utils.py                    | 38 ++++++++++++
 .../{dataset.py => dataset/validation.py}     | 61 +++++--------------
 snips_nlu/tests/test_dataset.py               | 17 +++---
 4 files changed, 65 insertions(+), 55 deletions(-)
 create mode 100644 snips_nlu/dataset/__init__.py
 create mode 100644 snips_nlu/dataset/utils.py
 rename snips_nlu/{dataset.py => dataset/validation.py} (76%)

diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py
new file mode 100644
index 000000000..20b840aa2
--- /dev/null
+++ b/snips_nlu/dataset/__init__.py
@@ -0,0 +1,4 @@
+from snips_nlu.dataset.utils import (
+    extract_intent_entities, extract_utterance_entities,
+    get_dataset_gazetteer_entities, get_text_from_chunks)
+from snips_nlu.dataset.validation import validate_and_format_dataset

diff --git a/snips_nlu/dataset/utils.py b/snips_nlu/dataset/utils.py
new file mode 100644
index 000000000..2047bf716
--- /dev/null
+++ b/snips_nlu/dataset/utils.py
@@ -0,0 +1,38 @@
+from future.utils import iteritems, itervalues
+
+from snips_nlu.constants import (
+    DATA, ENTITIES, ENTITY, INTENTS, TEXT, UTTERANCES)
+from snips_nlu.entity_parser.builtin_entity_parser import is_gazetteer_entity
+
+
+def extract_utterance_entities(dataset):
+    entities_values = {ent_name: set() for ent_name in dataset[ENTITIES]}
+
+    for intent in itervalues(dataset[INTENTS]):
+        for utterance in intent[UTTERANCES]:
+            for chunk in utterance[DATA]:
+                if ENTITY in chunk:
+                    entities_values[chunk[ENTITY]].add(chunk[TEXT].strip())
+    return {k: list(v) for k, v in iteritems(entities_values)}
+
+
+def extract_intent_entities(dataset, entity_filter=None):
+    intent_entities = {intent: set() for intent in dataset[INTENTS]}
+    for intent_name, intent_data in iteritems(dataset[INTENTS]):
+        for utterance in intent_data[UTTERANCES]:
+            for chunk in utterance[DATA]:
+                if ENTITY in chunk:
+                    if entity_filter and not entity_filter(chunk[ENTITY]):
+                        continue
+                    intent_entities[intent_name].add(chunk[ENTITY])
+    return intent_entities
+
+
+def get_text_from_chunks(chunks):
+    return "".join(chunk[TEXT] for chunk in chunks)
+
+
+def get_dataset_gazetteer_entities(dataset, intent=None):
+    if intent is not None:
+        return extract_intent_entities(dataset, is_gazetteer_entity)[intent]
+    return {e for e in dataset[ENTITIES] if is_gazetteer_entity(e)}
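A usage sketch for the relocated helpers — the dataset dict below is a
hypothetical minimal example, assuming the usual snips-nlu JSON layout with
lowercase "intents"/"utterances"/"data"/"entity" keys (matching the
constants imported above):

    from snips_nlu.dataset import extract_intent_entities

    dataset = {
        "entities": {"location": {}},
        "intents": {
            "getWeather": {
                "utterances": [
                    {"data": [
                        {"text": "weather in "},
                        {"text": "Paris", "entity": "location",
                         "slot_name": "weatherLocation"},
                    ]}
                ]
            }
        },
    }
    # Maps each intent to the set of entities used in its utterances
    print(extract_intent_entities(dataset))  # {'getWeather': {'location'}}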
diff --git a/snips_nlu/dataset.py b/snips_nlu/dataset/validation.py
similarity index 76%
rename from snips_nlu/dataset.py
rename to snips_nlu/dataset/validation.py
index 22fba7c30..ec2c135c2 100644
--- a/snips_nlu/dataset.py
+++ b/snips_nlu/dataset/validation.py
@@ -12,36 +12,14 @@
     AUTOMATICALLY_EXTENSIBLE, CAPITALIZE, DATA, ENTITIES, ENTITY, INTENTS,
     LANGUAGE, MATCHING_STRICTNESS, SLOT_NAME, SYNONYMS, TEXT, USE_SYNONYMS,
     UTTERANCES, VALIDATED, VALUE)
+from snips_nlu.dataset import extract_utterance_entities
 from snips_nlu.entity_parser.builtin_entity_parser import (
-    BuiltinEntityParser, is_builtin_entity, is_gazetteer_entity)
+    BuiltinEntityParser, is_builtin_entity)
 from snips_nlu.preprocessing import tokenize_light
 from snips_nlu.string_variations import get_string_variations
 from snips_nlu.utils import validate_key, validate_keys, validate_type


-def extract_utterance_entities(dataset):
-    entities_values = {ent_name: set() for ent_name in dataset[ENTITIES]}
-
-    for intent in itervalues(dataset[INTENTS]):
-        for utterance in intent[UTTERANCES]:
-            for chunk in utterance[DATA]:
-                if ENTITY in chunk:
-                    entities_values[chunk[ENTITY]].add(chunk[TEXT].strip())
-    return {k: list(v) for k, v in iteritems(entities_values)}
-
-
-def extract_intent_entities(dataset, entity_filter=None):
-    intent_entities = {intent: set() for intent in dataset[INTENTS]}
-    for intent_name, intent_data in iteritems(dataset[INTENTS]):
-        for utterance in intent_data[UTTERANCES]:
-            for chunk in utterance[DATA]:
-                if ENTITY in chunk:
-                    if entity_filter and not entity_filter(chunk[ENTITY]):
-                        continue
-                    intent_entities[intent_name].add(chunk[ENTITY])
-    return intent_entities
-
-
 def validate_and_format_dataset(dataset):
     """Checks that the dataset is valid and formats it"""
     # Make this function idempotent
@@ -61,7 +39,7 @@ def validate_and_format_dataset(dataset):
         raise ValueError("Unknown language: '%s'" % language)

     for intent in itervalues(dataset[INTENTS]):
-        validate_and_format_intent(intent, dataset[ENTITIES])
+        _validate_and_format_intent(intent, dataset[ENTITIES])

     utterance_entities_values = extract_utterance_entities(dataset)
     builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)
@@ -70,15 +48,16 @@
         utterance_entities = utterance_entities_values[entity_name]
         if is_builtin_entity(entity_name):
             dataset[ENTITIES][entity_name] = \
-                validate_and_format_builtin_entity(entity, utterance_entities)
+                _validate_and_format_builtin_entity(entity, utterance_entities)
         else:
-            dataset[ENTITIES][entity_name] = validate_and_format_custom_entity(
+            dataset[ENTITIES][
+                entity_name] = _validate_and_format_custom_entity(
                 entity, utterance_entities, language, builtin_entity_parser)
     dataset[VALIDATED] = True
     return dataset


-def validate_and_format_intent(intent, entities):
+def _validate_and_format_intent(intent, entities):
     validate_type(intent, dict)
     validate_key(intent, UTTERANCES, object_label="intent dict")
     validate_type(intent[UTTERANCES], list)
@@ -100,11 +79,7 @@ def _validate_and_format_intent(intent, entities):
     return intent


-def get_text_from_chunks(chunks):
-    return "".join(chunk[TEXT] for chunk in chunks)
-
-
-def has_any_capitalization(entity_utterances, language):
+def _has_any_capitalization(entity_utterances, language):
     for utterance in entity_utterances:
         tokens = tokenize_light(utterance, language)
         if any(t.isupper() or t.istitle() for t in tokens):
@@ -112,7 +87,7 @@
     return False


-def add_entity_variations(utterances, entity_variations, entity_value):
+def _add_entity_variations(utterances, entity_variations, entity_value):
     utterances[entity_value] = entity_value
     for variation in entity_variations[entity_value]:
         if variation:
@@ -129,8 +104,8 @@ def _extract_entity_values(entity):
     return values


-def validate_and_format_custom_entity(entity, queries_entities, language,
-                                      builtin_entity_parser):
+def _validate_and_format_custom_entity(entity, queries_entities, language,
+                                       builtin_entity_parser):
     validate_type(entity, dict)

     # TODO: this is here temporarily, only to allow backward compatibility
@@ -169,8 +144,8 @@
     # Compute capitalization before normalizing
     # Normalization lowercases and hence leads to bad capitalization
     # calculation
-    formatted_entity[CAPITALIZE] = has_any_capitalization(queries_entities,
-                                                          language)
+    formatted_entity[CAPITALIZE] = _has_any_capitalization(queries_entities,
+                                                           language)

     validated_utterances = dict()
     # Map original values and synonyms
@@ -208,7 +183,7 @@
     for entry in entity[DATA]:
         entry_value = entry[VALUE]
-        validated_utterances = add_entity_variations(
+        validated_utterances = _add_entity_variations(
             validated_utterances, non_colliding_variations, entry_value)

     # Merge queries entities
@@ -227,12 +202,6 @@
     return formatted_entity


-def validate_and_format_builtin_entity(entity, queries_entities):
+def _validate_and_format_builtin_entity(entity, queries_entities):
     validate_type(entity, dict)
     return {UTTERANCES: set(queries_entities)}
-
-
-def get_dataset_gazetteer_entities(dataset, intent=None):
-    if intent is not None:
-        return extract_intent_entities(dataset, is_gazetteer_entity)[intent]
-    return {e for e in dataset[ENTITIES] if is_gazetteer_entity(e)}
@mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_remove_empty_entities_value_and_empty_synonyms( self, mocked_get_string_variations): # Given @@ -576,7 +575,7 @@ def mock_get_string_variations(variation, language, # Then self.assertEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_add_capitalize_field( self, mocked_get_string_variations): # Given @@ -752,7 +751,7 @@ def mock_get_string_variations(variation, language, # Then self.assertDictEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_normalize_synonyms( self, mocked_get_string_variations): # Given @@ -827,7 +826,7 @@ def mock_get_string_variations(variation, language, # Then self.assertDictEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_dataset_should_handle_synonyms( self, mocked_get_string_variations): # Given From 48eeffebf80a2d478aadcc20ba278c755df45f21 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Thu, 15 Nov 2018 18:31:02 +0100 Subject: [PATCH 04/24] Simplify dataset parsing --- snips_nlu/cli/dataset/assistant_dataset.py | 11 +- snips_nlu/dataset/__init__.py | 2 + .../dataset/entities.py => dataset/entity.py} | 104 ++++++++------ .../intent_dataset.py => dataset/intent.py} | 132 ++++++------------ snips_nlu/tests/test_cli.py | 20 +-- 5 files changed, 120 insertions(+), 149 deletions(-) rename snips_nlu/{cli/dataset/entities.py => dataset/entity.py} (50%) rename snips_nlu/{cli/dataset/intent_dataset.py => dataset/intent.py} (61%) diff --git a/snips_nlu/cli/dataset/assistant_dataset.py b/snips_nlu/cli/dataset/assistant_dataset.py index 4a1982115..7961c34bc 100644 --- a/snips_nlu/cli/dataset/assistant_dataset.py +++ b/snips_nlu/cli/dataset/assistant_dataset.py @@ -1,10 +1,9 @@ # coding=utf-8 -from __future__ import unicode_literals, print_function +from __future__ import print_function, unicode_literals from pathlib import Path -from snips_nlu.cli.dataset.entities import CustomEntity, create_entity -from snips_nlu.cli.dataset.intent_dataset import IntentDataset +from snips_nlu.dataset import Entity, Intent class AssistantDataset(object): @@ -53,10 +52,10 @@ def from_files(cls, language, filenames): "'intent_' or 'entity_' but found: %s" % stem) - intents_datasets = [IntentDataset.from_file(f) + intents_datasets = [Intent.from_file(f) for f in intent_filepaths] - entities = [CustomEntity.from_file(f) for f in entity_filepaths] + entities = [Entity.from_file(f) for f in entity_filepaths] entity_names = set(e.name for e in entities) # Add entities appearing only in the intents data @@ -64,7 +63,7 @@ def from_files(cls, language, filenames): for entity_name in intent_data.entities_names: if entity_name not in entity_names: entity_names.add(entity_name) - entities.append(create_entity(entity_name)) + entities.append(Entity(name=entity_name)) return cls(language, intents_datasets, entities) @property diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py index 20b840aa2..89c0c9744 100644 --- a/snips_nlu/dataset/__init__.py +++ b/snips_nlu/dataset/__init__.py @@ -1,3 +1,5 @@ +from snips_nlu.dataset.entity import Entity, EntityFormatError +from snips_nlu.dataset.intent 
From 48eeffebf80a2d478aadcc20ba278c755df45f21 Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Thu, 15 Nov 2018 18:31:02 +0100
Subject: [PATCH 04/24] Simplify dataset parsing

---
 snips_nlu/cli/dataset/assistant_dataset.py    |  11 +-
 snips_nlu/dataset/__init__.py                 |   2 +
 .../dataset/entities.py => dataset/entity.py} | 104 ++++++++------
 .../intent_dataset.py => dataset/intent.py}   | 132 ++++++------------
 snips_nlu/tests/test_cli.py                   |  20 +--
 5 files changed, 120 insertions(+), 149 deletions(-)
 rename snips_nlu/{cli/dataset/entities.py => dataset/entity.py} (50%)
 rename snips_nlu/{cli/dataset/intent_dataset.py => dataset/intent.py} (61%)

diff --git a/snips_nlu/cli/dataset/assistant_dataset.py b/snips_nlu/cli/dataset/assistant_dataset.py
index 4a1982115..7961c34bc 100644
--- a/snips_nlu/cli/dataset/assistant_dataset.py
+++ b/snips_nlu/cli/dataset/assistant_dataset.py
@@ -1,10 +1,9 @@
 # coding=utf-8
-from __future__ import unicode_literals, print_function
+from __future__ import print_function, unicode_literals

 from pathlib import Path

-from snips_nlu.cli.dataset.entities import CustomEntity, create_entity
-from snips_nlu.cli.dataset.intent_dataset import IntentDataset
+from snips_nlu.dataset import Entity, Intent


 class AssistantDataset(object):
@@ -53,10 +52,10 @@ def from_files(cls, language, filenames):
                 "'intent_' or 'entity_' but found: %s"
                 % stem)

-        intents_datasets = [IntentDataset.from_file(f)
+        intents_datasets = [Intent.from_file(f)
                             for f in intent_filepaths]

-        entities = [CustomEntity.from_file(f) for f in entity_filepaths]
+        entities = [Entity.from_file(f) for f in entity_filepaths]
         entity_names = set(e.name for e in entities)

         # Add entities appearing only in the intents data
@@ -64,7 +63,7 @@
             for entity_name in intent_data.entities_names:
                 if entity_name not in entity_names:
                     entity_names.add(entity_name)
-                    entities.append(create_entity(entity_name))
+                    entities.append(Entity(name=entity_name))
         return cls(language, intents_datasets, entities)

     @property

diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py
index 20b840aa2..89c0c9744 100644
--- a/snips_nlu/dataset/__init__.py
+++ b/snips_nlu/dataset/__init__.py
@@ -1,3 +1,5 @@
+from snips_nlu.dataset.entity import Entity, EntityFormatError
+from snips_nlu.dataset.intent import Intent, IntentFormatError
 from snips_nlu.dataset.utils import (
     extract_intent_entities, extract_utterance_entities,
     get_dataset_gazetteer_entities, get_text_from_chunks)
diff --git a/snips_nlu/cli/dataset/entities.py b/snips_nlu/dataset/entity.py
similarity index 50%
rename from snips_nlu/cli/dataset/entities.py
rename to snips_nlu/dataset/entity.py
index bb854f300..3668fc132 100644
--- a/snips_nlu/cli/dataset/entities.py
+++ b/snips_nlu/dataset/entity.py
@@ -3,58 +3,95 @@
 import csv
 import re
-from abc import ABCMeta, abstractmethod
 from pathlib import Path

 import six
-from future.utils import with_metaclass
+from snips_nlu_ontology import get_all_builtin_entities

 from snips_nlu.constants import (
     AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS, SYNONYMS,
     USE_SYNONYMS, VALUE)
-from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity

 AUTO_EXT_REGEX = re.compile(r'^#\sautomatically_extensible=(true|false)\s*$')


-class Entity(with_metaclass(ABCMeta, object)):
-    def __init__(self, name):
-        self.name = name
+class EntityFormatError(TypeError):
+    pass

-    @abstractmethod
-    def json(self):
-        pass

-
-class CustomEntity(Entity):
-    """Custom entity of an :class:`.AssistantDataset`
+class Entity(object):
+    """Entity of an :class:`.AssistantDataset`
+
+    This class can represent both a custom entity and a builtin entity

     Attributes:
+        name (str): name of the entity
         utterances (list of :class:`.EntityUtterance`): entity utterances
+            (only for custom entities)
         automatically_extensible (bool): whether or not the entity can be
-            extended to values not present in the dataset
+            extended to values not present in the dataset (only for custom
+            entities)
         use_synonyms (bool): whether or not to map entity values using
-            synonyms
+            synonyms (only for custom entities)
+        matching_strictness (float): controls the matching strictness of the
+            entity (only for custom entities)
     """

-    def __init__(self, name, utterances, automatically_extensible,
-                 use_synonyms, matching_strictness=1.0):
-        super(CustomEntity, self).__init__(name)
+    def __init__(self, name, utterances=None, automatically_extensible=True,
+                 use_synonyms=True, matching_strictness=1.0):
+        if utterances is None:
+            utterances = []
+        self.name = name
         self.utterances = utterances
         self.automatically_extensible = automatically_extensible
         self.use_synonyms = use_synonyms
         self.matching_strictness = matching_strictness

+    @property
+    def is_builtin(self):
+        return self.name in get_all_builtin_entities()
+
+    @classmethod
+    def from_yaml(cls, yaml_dict):
+        """Build an :class:`.Entity` from its YAML definition dict"""
+        object_type = yaml_dict.get("type")
+        if object_type and object_type != "entity":
+            raise EntityFormatError("Wrong type: '%s'" % object_type)
+        entity_name = yaml_dict.get("name")
+        if not entity_name:
+            raise EntityFormatError("No 'name' attribute found")
+        auto_extensible = yaml_dict.get(AUTOMATICALLY_EXTENSIBLE, True)
+        use_synonyms = yaml_dict.get(USE_SYNONYMS, True)
+        matching_strictness = yaml_dict.get("matching_strictness", 1.0)
+        utterances = []
+        for entity_value in yaml_dict.get("values", []):
+            if isinstance(entity_value, list):
+                utterance = EntityUtterance(entity_value[0], entity_value[1:])
+            elif isinstance(entity_value, str):
+                utterance = EntityUtterance(entity_value)
+            else:
+                raise EntityFormatError(
+                    "YAML entity values must be either strings or lists, but "
+                    "found: %s" % type(entity_value))
+            utterances.append(utterance)
+
+        cls(name=entity_name,
+            utterances=utterances,
+            automatically_extensible=auto_extensible,
+            use_synonyms=use_synonyms,
+            matching_strictness=matching_strictness)
+
     @classmethod
     def from_file(cls, filepath):
         filepath = Path(filepath)
         stem = filepath.stem
         if not stem.startswith("entity_"):
-            raise AssertionError("Entity filename should start with 'entity_' "
-                                 "but found: %s" % stem)
+            raise EntityFormatError(
+                "Entity filename should start with 'entity_' but found: %s"
+                % stem)
         entity_name = stem[7:]
         if not entity_name:
-            raise AssertionError("Entity name must not be empty")
+            raise EntityFormatError("Entity name must not be empty")
         utterances = []
         with filepath.open(encoding="utf-8") as f:
             it = f
@@ -82,6 +119,8 @@ def from_file(cls, filepath):
     @property
     def json(self):
         """Returns the entity in json format"""
+        if self.is_builtin:
+            return dict()
         return {
             AUTOMATICALLY_EXTENSIBLE: self.automatically_extensible,
             USE_SYNONYMS: self.use_synonyms,
@@ -109,25 +148,6 @@ def json(self):
         return {VALUE: self.value, SYNONYMS: self.synonyms}


-class BuiltinEntity(Entity):
-    """Builtin entity of an :class:`.AssistantDataset`"""
-
-    @property
-    def json(self):
-        return dict()
-
-
 def utf_8_encoder(f):
     for line in f:
         yield line.encode("utf-8")
-
-
-def create_entity(entity_name, utterances=None, automatically_extensible=True,
-                  use_synonyms=True):
-    if is_builtin_entity(entity_name):
-        return BuiltinEntity(entity_name)
-    else:
-        if utterances is None:
-            utterances = []
-        return CustomEntity(entity_name, utterances, automatically_extensible,
-                            use_synonyms)
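For reference, the text format read by Entity.from_file above is CSV-like:
one value per row, synonyms in the remaining columns, plus an optional
header matched by AUTO_EXT_REGEX. A hypothetical entity_city.txt:

    # automatically_extensible=false
    new york,big apple
    paris,city of lights
    london

which could then be loaded with (a sketch, assuming the file exists on disk):

    from snips_nlu.dataset import Entity

    entity = Entity.from_file("entity_city.txt")
    print(entity.name)                      # "city" (filename minus "entity_")
    print(entity.automatically_extensible)  # False, from the header line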
diff --git a/snips_nlu/cli/dataset/intent_dataset.py b/snips_nlu/dataset/intent.py
similarity index 61%
rename from snips_nlu/cli/dataset/intent_dataset.py
rename to snips_nlu/dataset/intent.py
index bebb87575..83cdfdf89 100644
--- a/snips_nlu/cli/dataset/intent_dataset.py
+++ b/snips_nlu/dataset/intent.py
@@ -1,4 +1,4 @@
-from __future__ import print_function, absolute_import
+from __future__ import absolute_import, print_function

 from abc import ABCMeta, abstractmethod
 from builtins import object
@@ -6,13 +6,18 @@

 from future.utils import with_metaclass

-from snips_nlu.constants import UTTERANCES, SLOT_NAME, ENTITY, TEXT, DATA
+from snips_nlu.constants import DATA, ENTITY, SLOT_NAME, TEXT, UTTERANCES

-INTENT_FORMATTING_ERROR = AssertionError(
+
+class IntentFormatError(TypeError):
+    pass
+
+
+INTENT_FORMATTING_ERROR = IntentFormatError(
     "Intent file is not properly formatted")


-class IntentDataset(object):
+class Intent(object):
     """Dataset of an intent

     Can parse utterances from a text file or an iterator.
@@ -33,17 +38,19 @@ class Intent(object):
     def __init__(self, intent_name):
         self.intent_name = intent_name
         self.utterances = []
+        self.slot_mapping = dict()

     @classmethod
     def from_file(cls, filepath):
         filepath = Path(filepath)
         stem = filepath.stem
         if not stem.startswith("intent_"):
-            raise AssertionError("Intent filename should start with 'intent_' "
-                                 "but found: %s" % stem)
+            raise IntentFormatError(
+                "Intent filename should start with 'intent_' but found: %s"
+                % stem)
         intent_name = stem[7:]
         if not intent_name:
-            raise AssertionError("Intent name must not be empty")
+            raise IntentFormatError("Intent name must not be empty")
         with filepath.open(encoding="utf-8") as f:
             lines = iter(l.strip() for l in f if l.strip())
             return cls.from_iter(intent_name, lines)
@@ -59,6 +66,9 @@ def from_iter(cls, intent_name, samples_iter):

     def add(self, utterance):
         """Adds an :class:`.IntentUtterance` to the dataset"""
+        for chunk in utterance.slot_chunks:
+            if chunk.name not in self.slot_mapping:
+                self.slot_mapping[chunk.name] = chunk.entity
         self.utterances.append(utterance)

     @property
@@ -79,58 +89,19 @@ def entities_names(self):

 class IntentUtterance(object):
-    def __init__(self, input, chunks):
-        self.input = input
+    def __init__(self, chunks):
         self.chunks = chunks

     @property
-    def annotated(self):
-        """Annotates the sentence with stars
+    def text(self):
+        return "".join((chunk.text for chunk in self.chunks))

-        Returns: The sentence annotated just with stars
-
-        Examples:
+    @property
+    def slot_chunks(self):
+        return (chunk for chunk in self.chunks if isinstance(chunk, SlotChunk))

-            >>> from snips_nlu.cli.dataset.intent_dataset import \
-                IntentUtterance
-            >>> p = "the [role:role](president) of [country:country](France)"
-            >>> u = IntentUtterance.parse(p)
-            >>> u.annotated
-            'the *president* of *France*'
-        """
-        binput = bytearray(self.input, 'utf-8')
-        acc = 0
-        star = ord('*')
-        for chunk in self.chunks:
-            if isinstance(chunk, SlotChunk):
-                binput.insert(chunk.range.start + acc, star)
-                binput.insert(chunk.range.end + acc + 1, star)
-                acc += 2
-        return binput.decode('utf-8')
-
-    @staticmethod
-    def stripped(input, chunks):
-        acc = 0
-        s = ''
-        new_chunks = []
-        for chunk in chunks:
-            start = chunk.range.start
-            end = chunk.range.end
-            s += input[start:end]
-            if isinstance(chunk, SlotChunk):
-                acc += chunk.tag_range.size
-                rng = Range(start - acc, end - acc)
-                new_chunk = SlotChunk(chunk.name, chunk.entity, rng,
-                                      chunk.text, chunk.tag_range)
-                new_chunks.append(new_chunk)
-                acc += 1
-            else:
-                rng = Range(start - acc, end - acc)
-                new_chunks.append(TextChunk(chunk.text, rng))
-        return s, new_chunks
-
-    @staticmethod
-    def parse(string):
+    @classmethod
+    def parse(cls, string):
         """Parses an utterance

         Args:
@@ -138,29 +109,28 @@ def parse(string):

         Examples:

-            >>> from snips_nlu.cli.dataset.intent_dataset import \
-                IntentUtterance
+            >>> from snips_nlu.dataset.intent import IntentUtterance
             >>> u = IntentUtterance.\
                parse("president of [country:default](France)")
+            >>> u.text
+            'president of France'
             >>> len(u.chunks)
             2
             >>> u.chunks[0].text
             'president of '
-            >>> u.chunks[0].range.start
-            0
-            >>> u.chunks[0].range.end
-            13
+            >>> u.chunks[1].name
+            'country'
+            >>> u.chunks[1].entity
+            'default'
         """
         sm = SM(string)
         capture_text(sm)
-        string, chunks = IntentUtterance.stripped(string, sm.chunks)
-        return IntentUtterance(string, chunks)
+        return cls(sm.chunks)


 class Chunk(with_metaclass(ABCMeta, object)):
-    def __init__(self, text, range):
+    def __init__(self, text):
         self.text = text
-        self.range = range

     @abstractmethod
     def json(self):
         pass
class SlotChunk(Chunk):
-    def __init__(self, slot_name, entity, range, text, tag_range):
-        super(SlotChunk, self).__init__(text, range)
+    def __init__(self, slot_name, entity, text):
+        super(SlotChunk, self).__init__(text)
         self.name = slot_name
         self.entity = entity
-        self.tag_range = tag_range

     @property
     def json(self):
@@ -191,16 +160,6 @@ def json(self):
         }


-class Range(object):
-    def __init__(self, start, end=None):
-        self.start = start
-        self.end = end
-
-    @property
-    def size(self):
-        return self.end - self.start + 1
-
-
 class SM(object):
     """State Machine for parsing"""

@@ -209,24 +168,19 @@ def __init__(self, input):
         self.chunks = []
         self.current = 0

-    def add_slot(self, slot_start, name, entity):
+    def add_slot(self, name, entity):
         """Adds a named slot

         Args:
-            slot_start (int): position where the slot tag started
             name (str): slot name
             entity (str): entity name
         """
-        tag_range = Range(slot_start - 1)
-        chunk = SlotChunk(slot_name=name, entity=entity, range=None, text=None,
-                          tag_range=tag_range)
+        chunk = SlotChunk(slot_name=name, entity=entity, text=None)
         self.chunks.append(chunk)

     def add_text(self, text):
         """Adds a simple text chunk using the current position"""
-        start = self.current
-        end = start + len(text)
-        chunk = TextChunk(text=text, range=Range(start=start, end=end))
+        chunk = TextChunk(text=text)
         self.chunks.append(chunk)

     def add_tagged(self, text):
@@ -234,10 +188,7 @@ def add_tagged(self, text):
         if not self.chunks:
             raise AssertionError("Cannot add tagged text because chunks list "
                                  "is empty")
-        chunk = self.chunks[-1]
-        chunk.text = text
-        chunk.tag_range.end = self.current - 1
-        chunk.range = Range(start=self.current, end=self.current + len(text))
+        self.chunks[-1].text = text

     def find(self, s):
         return self.input.find(s, self.current)
@@ -280,7 +231,6 @@ def capture_text(state):


 def capture_slot(state):
-    slot_start = state.current
     next_pos = state.find(':')
     if next_pos < 0:
         raise INTENT_FORMATTING_ERROR
@@ -292,7 +242,7 @@ def capture_slot(state):
         raise INTENT_FORMATTING_ERROR
     entity = state[:next_pos]
     state.move(next_pos)
-    state.add_slot(slot_start, slot_name, entity)
+    state.add_slot(slot_name, entity)
     if state.read() != '(':
         raise INTENT_FORMATTING_ERROR
     capture_tagged(state)
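A quick interactive sketch of the simplified chunk model above, mirroring
the doctest in parse() (chunk attributes per the new range-free SlotChunk):

    >>> from snips_nlu.dataset.intent import IntentUtterance
    >>> u = IntentUtterance.parse(
    ...     "what is the weather in [weatherLocation:location](Paris)?")
    >>> u.text
    'what is the weather in Paris?'
    >>> [(c.name, c.entity, c.text) for c in u.slot_chunks]
    [('weatherLocation', 'location', 'Paris')]

Dropping the Range bookkeeping is possible because chunks now carry only
their text, and the full sentence is rebuilt by joining chunk texts.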
diff --git a/snips_nlu/tests/test_cli.py b/snips_nlu/tests/test_cli.py
index 57542d64b..d640ecb1c 100644
--- a/snips_nlu/tests/test_cli.py
+++ b/snips_nlu/tests/test_cli.py
@@ -7,10 +7,10 @@
 from snips_nlu import SnipsNLUEngine
 from snips_nlu.cli import cross_val_metrics, parse, train, train_test_metrics
 from snips_nlu.cli.dataset import AssistantDataset
-from snips_nlu.cli.dataset.entities import CustomEntity
-from snips_nlu.cli.dataset.intent_dataset import IntentDataset
 from snips_nlu.constants import PACKAGE_PATH
-from snips_nlu.dataset import validate_and_format_dataset
+from snips_nlu.dataset import (
+    Entity, EntityFormatError, Intent, IntentFormatError,
+    validate_and_format_dataset)
 from snips_nlu.tests.utils import BEVERAGE_DATASET_PATH, SnipsTest, TEST_PATH

@@ -81,7 +81,7 @@ def test_should_generate_intent_from_file(self):
         intent_file = examples_path / "intent_getWeather.txt"

         # When
-        intent_dataset = IntentDataset.from_file(intent_file)
+        intent_dataset = Intent.from_file(intent_file)
         intent_dict = intent_dataset.json

         # Then
@@ -156,7 +156,7 @@ def test_should_generate_entity_from_file(self):
         entity_file = examples_path / "entity_location.txt"

         # When
-        entity_dataset = CustomEntity.from_file(entity_file)
+        entity_dataset = Entity.from_file(entity_file)
         entity_dict = entity_dataset.json

         # Then
@@ -191,7 +191,7 @@ def test_should_generate_entity_from_file_with_autoextensible(self):
         entity_file = examples_path / "entity_location_autoextent_false.txt"

         # When
-        entity_dataset = CustomEntity.from_file(entity_file)
+        entity_dataset = Entity.from_file(entity_file)
         entity_dict = entity_dataset.json

         # Then
@@ -419,8 +419,8 @@ def test_should_fail_generating_intent_with_wrong_file_name(self):
         intent_file = examples_path / "getWeather.txt"

         # When / Then
-        with self.assertRaises(AssertionError):
-            IntentDataset.from_file(intent_file)
+        with self.assertRaises(IntentFormatError):
+            Intent.from_file(intent_file)

     def test_should_fail_generating_entity_with_wrong_file_name(self):
         # Given
@@ -428,5 +428,5 @@ def test_should_fail_generating_entity_with_wrong_file_name(self):
         entity_file = examples_path / "location.txt"

         # When / Then
-        with self.assertRaises(AssertionError):
-            CustomEntity.from_file(entity_file)
+        with self.assertRaises(EntityFormatError):
+            Entity.from_file(entity_file)
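With AssertionError replaced by dedicated exception types, callers can now
catch dataset format problems explicitly rather than trapping a generic
assertion. A small sketch of the resulting calling pattern:

    from snips_nlu.dataset import Intent, IntentFormatError

    try:
        # the filename lacks the required "intent_" prefix
        Intent.from_file("getWeather.txt")
    except IntentFormatError as e:
        print("invalid intent file:", e)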
From 6442404ff3f3545a5f52cb42564809ed4dbb1bef Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Thu, 15 Nov 2018 19:32:54 +0100
Subject: [PATCH 05/24] Add entity and intent loading from yaml files

---
 snips_nlu/__init__.py                         |   2 -
 snips_nlu/cli/dataset/__init__.py             |   1 -
 .../dataset/examples/intent_getWeather.txt    |   4 +-
 snips_nlu/cli/generate_dataset.py             |   2 +-
 snips_nlu/dataset/__init__.py                 |   1 +
 .../dataset.py}                               |   3 +-
 snips_nlu/dataset/entity.py                   |  10 +-
 snips_nlu/dataset/intent.py                   |  53 ++-
 snips_nlu/nlu_engine/__init__.py              |   2 +-
 snips_nlu/tests/test_cli.py                   | 361 ------------------
 snips_nlu/tests/test_dataset_loading.py       | 199 ++++++++++
 ..._dataset.py => test_dataset_validation.py} |   2 +-
 snips_nlu/tests/test_entity_loading.py        | 212 ++++++++++
 snips_nlu/tests/test_intent_loading.py        |  90 +++++
 14 files changed, 554 insertions(+), 388 deletions(-)
 delete mode 100644 snips_nlu/cli/dataset/__init__.py
 rename snips_nlu/{cli/dataset/assistant_dataset.py => dataset/dataset.py} (97%)
 create mode 100644 snips_nlu/tests/test_dataset_loading.py
 rename snips_nlu/tests/{test_dataset.py => test_dataset_validation.py} (99%)
 create mode 100644 snips_nlu/tests/test_entity_loading.py
 create mode 100644 snips_nlu/tests/test_intent_loading.py

diff --git a/snips_nlu/__init__.py b/snips_nlu/__init__.py
index e782bbfba..6414de42d 100644
--- a/snips_nlu/__init__.py
+++ b/snips_nlu/__init__.py
@@ -1,5 +1,3 @@
-import logging
-
 from snips_nlu_ontology import get_ontology_version

 from snips_nlu.__about__ import __model_version__, __version__

diff --git a/snips_nlu/cli/dataset/__init__.py b/snips_nlu/cli/dataset/__init__.py
deleted file mode 100644
index 3a8808233..000000000
--- a/snips_nlu/cli/dataset/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from snips_nlu.cli.dataset.assistant_dataset import AssistantDataset

diff --git a/snips_nlu/cli/dataset/examples/intent_getWeather.txt b/snips_nlu/cli/dataset/examples/intent_getWeather.txt
index bc611e565..960ce52bb 100644
--- a/snips_nlu/cli/dataset/examples/intent_getWeather.txt
+++ b/snips_nlu/cli/dataset/examples/intent_getWeather.txt
@@ -1,3 +1,3 @@
 what is the weather in [weatherLocation:location](Paris)?
-Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation:location](Moscow)?
-How is the weather in [weatherLocation:location](San Francisco) [weatherDate:snips/datetime](today)?
\ No newline at end of file
+Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation](Moscow)?
+How is the weather in [weatherLocation:location](San Francisco) [weatherDate] please?
\ No newline at end of file

diff --git a/snips_nlu/cli/generate_dataset.py b/snips_nlu/cli/generate_dataset.py
index ffb0cea89..ac923f0dd 100644
--- a/snips_nlu/cli/generate_dataset.py
+++ b/snips_nlu/cli/generate_dataset.py
@@ -4,7 +4,7 @@

 import plac

-from snips_nlu.cli.dataset.assistant_dataset import AssistantDataset
+from snips_nlu.dataset import AssistantDataset


 @plac.annotations(

diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py
index 89c0c9744..9dd099c85 100644
--- a/snips_nlu/dataset/__init__.py
+++ b/snips_nlu/dataset/__init__.py
@@ -1,3 +1,4 @@
+from snips_nlu.dataset.dataset import AssistantDataset
 from snips_nlu.dataset.entity import Entity, EntityFormatError
 from snips_nlu.dataset.intent import Intent, IntentFormatError
 from snips_nlu.dataset.utils import (

diff --git a/snips_nlu/cli/dataset/assistant_dataset.py b/snips_nlu/dataset/dataset.py
similarity index 97%
rename from snips_nlu/cli/dataset/assistant_dataset.py
rename to snips_nlu/dataset/dataset.py
index 7961c34bc..4630e53a2 100644
--- a/snips_nlu/cli/dataset/assistant_dataset.py
+++ b/snips_nlu/dataset/dataset.py
@@ -3,7 +3,8 @@

 from pathlib import Path

-from snips_nlu.dataset import Entity, Intent
+from snips_nlu.dataset.entity import Entity
+from snips_nlu.dataset.intent import Intent


 class AssistantDataset(object):

diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py
index 3668fc132..c3b60b87e 100644
--- a/snips_nlu/dataset/entity.py
+++ b/snips_nlu/dataset/entity.py
@@ -75,11 +75,11 @@ def from_yaml(cls, yaml_dict):
                     "found: %s" % type(entity_value))
             utterances.append(utterance)

-        cls(name=entity_name,
-            utterances=utterances,
-            automatically_extensible=auto_extensible,
-            use_synonyms=use_synonyms,
-            matching_strictness=matching_strictness)
+        return cls(name=entity_name,
+                   utterances=utterances,
+                   automatically_extensible=auto_extensible,
+                   use_synonyms=use_synonyms,
+                   matching_strictness=matching_strictness)
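A usage sketch of the YAML entity loading introduced here, with a YAML
document mirroring the fixtures in test_entity_loading.py below (the list
join is just a way to build the YAML source without indentation issues):

    import yaml

    from snips_nlu.dataset import Entity

    yaml_source = "\n".join([
        "type: entity",
        "name: location",
        "automatically_extensible: no",
        "values:",
        "- [new york, big apple]",
        "- london",
    ])
    entity = Entity.from_yaml(yaml.safe_load(yaml_source))
    print(entity.json["automatically_extensible"])  # False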
must not be empty") + dataset = cls(intent_name) with filepath.open(encoding="utf-8") as f: lines = iter(l.strip() for l in f if l.strip()) - return cls.from_iter(intent_name, lines) + dataset.add_utterances(lines) + return dataset - @classmethod - def from_iter(cls, intent_name, samples_iter): - """Generates a dataset from an iterator of samples""" - dataset = cls(intent_name) + def add_utterances(self, samples_iter): for sample in samples_iter: utterance = IntentUtterance.parse(sample) - dataset.add(utterance) - return dataset + self.add(utterance) def add(self, utterance): """Adds an :class:`.IntentUtterance` to the dataset""" @@ -168,7 +188,7 @@ def __init__(self, input): self.chunks = [] self.current = 0 - def add_slot(self, name, entity): + def add_slot(self, name, entity=None): """Adds a named slot Args: @@ -233,7 +253,12 @@ def capture_text(state): def capture_slot(state): next_pos = state.find(':') if next_pos < 0: - raise INTENT_FORMATTING_ERROR + next_pos = state.find(']') + if next_pos < 0: + raise INTENT_FORMATTING_ERROR + slot_name = state[:next_pos] + state.move(next_pos) + state.add_slot(slot_name) else: slot_name = state[:next_pos] state.move(next_pos) @@ -243,9 +268,11 @@ def capture_slot(state): entity = state[:next_pos] state.move(next_pos) state.add_slot(slot_name, entity) - if state.read() != '(': - raise INTENT_FORMATTING_ERROR + if state.peek() == '(': + state.read() capture_tagged(state) + else: + capture_text(state) def capture_tagged(state): diff --git a/snips_nlu/nlu_engine/__init__.py b/snips_nlu/nlu_engine/__init__.py index a4ec8db24..07c8ebc73 100644 --- a/snips_nlu/nlu_engine/__init__.py +++ b/snips_nlu/nlu_engine/__init__.py @@ -1 +1 @@ -from .nlu_engine import SnipsNLUEngine +from snips_nlu.nlu_engine.nlu_engine import SnipsNLUEngine diff --git a/snips_nlu/tests/test_cli.py b/snips_nlu/tests/test_cli.py index d640ecb1c..d55d2f87b 100644 --- a/snips_nlu/tests/test_cli.py +++ b/snips_nlu/tests/test_cli.py @@ -6,11 +6,6 @@ from snips_nlu import SnipsNLUEngine from snips_nlu.cli import cross_val_metrics, parse, train, train_test_metrics -from snips_nlu.cli.dataset import AssistantDataset -from snips_nlu.constants import PACKAGE_PATH -from snips_nlu.dataset import ( - Entity, EntityFormatError, Intent, IntentFormatError, - validate_and_format_dataset) from snips_nlu.tests.utils import BEVERAGE_DATASET_PATH, SnipsTest, TEST_PATH @@ -74,359 +69,3 @@ def test_train_test_metrics(self): # Then if not self.tmp_file_path.exists(): self.fail("No metrics found") - - def test_should_generate_intent_from_file(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file = examples_path / "intent_getWeather.txt" - - # When - intent_dataset = Intent.from_file(intent_file) - intent_dict = intent_dataset.json - - # Then - expected_intent_dict = { - "utterances": [ - { - "data": [ - { - "text": "what is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Paris" - }, - { - "text": "?" - } - ] - }, - { - "data": [ - { - "text": "Will it rain " - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "tomorrow" - }, - { - "text": " in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Moscow" - }, - { - "text": "?" 
diff --git a/snips_nlu/nlu_engine/__init__.py b/snips_nlu/nlu_engine/__init__.py
index a4ec8db24..07c8ebc73 100644
--- a/snips_nlu/nlu_engine/__init__.py
+++ b/snips_nlu/nlu_engine/__init__.py
@@ -1 +1 @@
-from .nlu_engine import SnipsNLUEngine
+from snips_nlu.nlu_engine.nlu_engine import SnipsNLUEngine

diff --git a/snips_nlu/tests/test_cli.py b/snips_nlu/tests/test_cli.py
index d640ecb1c..d55d2f87b 100644
--- a/snips_nlu/tests/test_cli.py
+++ b/snips_nlu/tests/test_cli.py
@@ -6,11 +6,6 @@
 from snips_nlu import SnipsNLUEngine
 from snips_nlu.cli import cross_val_metrics, parse, train, train_test_metrics
-from snips_nlu.cli.dataset import AssistantDataset
-from snips_nlu.constants import PACKAGE_PATH
-from snips_nlu.dataset import (
-    Entity, EntityFormatError, Intent, IntentFormatError,
-    validate_and_format_dataset)
 from snips_nlu.tests.utils import BEVERAGE_DATASET_PATH, SnipsTest, TEST_PATH

@@ -74,359 +69,3 @@ def test_train_test_metrics(self):
         # Then
         if not self.tmp_file_path.exists():
             self.fail("No metrics found")
-
-    def test_should_generate_intent_from_file(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        intent_file = examples_path / "intent_getWeather.txt"
-
-        # When
-        intent_dataset = Intent.from_file(intent_file)
-        intent_dict = intent_dataset.json
-
-        # Then
-        expected_intent_dict = {
-            "utterances": [
-                {"data": [
-                    {"text": "what is the weather in "},
-                    {"entity": "location", "slot_name": "weatherLocation",
-                     "text": "Paris"},
-                    {"text": "?"}]},
-                {"data": [
-                    {"text": "Will it rain "},
-                    {"entity": "snips/datetime", "slot_name": "weatherDate",
-                     "text": "tomorrow"},
-                    {"text": " in "},
-                    {"entity": "location", "slot_name": "weatherLocation",
-                     "text": "Moscow"},
-                    {"text": "?"}]},
-                {"data": [
-                    {"text": "How is the weather in "},
-                    {"entity": "location", "slot_name": "weatherLocation",
-                     "text": "San Francisco"},
-                    {"entity": "snips/datetime", "slot_name": "weatherDate",
-                     "text": "today"},
-                    {"text": "?"}]}
-            ]
-        }
-        self.assertDictEqual(expected_intent_dict, intent_dict)
-
-    def test_should_generate_entity_from_file(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        entity_file = examples_path / "entity_location.txt"
-
-        # When
-        entity_dataset = Entity.from_file(entity_file)
-        entity_dict = entity_dataset.json
-
-        # Then
-        expected_entity_dict = {
-            "automatically_extensible": True,
-            "data": [
-                {"synonyms": ["big apple"], "value": "new york"},
-                {"synonyms": ["city of lights"], "value": "paris"},
-                {"synonyms": [], "value": "london"}
-            ],
-            "use_synonyms": True,
-            "matching_strictness": 1.0
-        }
-        self.assertDictEqual(expected_entity_dict, entity_dict)
-
-    def test_should_generate_entity_from_file_with_autoextensible(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        entity_file = examples_path / "entity_location_autoextent_false.txt"
-
-        # When
-        entity_dataset = Entity.from_file(entity_file)
-        entity_dict = entity_dataset.json
-
-        # Then
-        expected_entity_dict = {
-            "automatically_extensible": False,
-            "data": [
-                {"synonyms": ["big apple"], "value": "new york"},
-                {"synonyms": ["city of lights"], "value": "paris"},
-                {"synonyms": [], "value": "london"}
-            ],
-            "use_synonyms": True,
-            "matching_strictness": 1.0
-        }
-        self.assertDictEqual(expected_entity_dict, entity_dict)
-
-    def test_should_generate_dataset_from_files(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        intent_file_1 = examples_path / "intent_whoIsGame.txt"
-        intent_file_2 = examples_path / "intent_getWeather.txt"
-        entity_file_1 = examples_path / "entity_location.txt"
-
-        dataset = AssistantDataset.from_files(
-            "en", [intent_file_1, intent_file_2, entity_file_1])
-        dataset_dict = dataset.json
-
-        # When / Then
-        expected_dataset_dict = {
-            "entities": {
-                "company": {"automatically_extensible": True, "data": [],
-                            "use_synonyms": True, "matching_strictness": 1.0},
-                "country": {"automatically_extensible": True, "data": [],
-                            "use_synonyms": True, "matching_strictness": 1.0},
-                "location": {
-                    "automatically_extensible": True,
-                    "data": [
-                        {"synonyms": ["big apple"], "value": "new york"},
-                        {"synonyms": ["city of lights"], "value": "paris"},
-                        {"synonyms": [], "value": "london"}
-                    ],
-                    "use_synonyms": True,
-                    "matching_strictness": 1.0
-                },
-                "role": {"automatically_extensible": True, "data": [],
-                         "use_synonyms": True, "matching_strictness": 1.0},
-                "snips/datetime": {}
-            },
-            "intents": {
-                "getWeather": {
-                    "utterances": [
-                        {"data": [
-                            {"text": "what is the weather in "},
-                            {"entity": "location",
-                             "slot_name": "weatherLocation", "text": "Paris"},
-                            {"text": "?"}]},
-                        {"data": [
-                            {"text": "Will it rain "},
-                            {"entity": "snips/datetime",
-                             "slot_name": "weatherDate", "text": "tomorrow"},
-                            {"text": " in "},
-                            {"entity": "location",
-                             "slot_name": "weatherLocation", "text": "Moscow"},
-                            {"text": "?"}]},
-                        {"data": [
-                            {"text": "How is the weather in "},
-                            {"entity": "location",
-                             "slot_name": "weatherLocation",
-                             "text": "San Francisco"},
-                            {"entity": "snips/datetime",
-                             "slot_name": "weatherDate", "text": "today"},
-                            {"text": "?"}]}
-                    ]
-                },
-                "whoIsGame": {
-                    "utterances": [
-                        {"data": [
-                            {"text": "who is the "},
-                            {"entity": "role", "slot_name": "role",
-                             "text": "president"},
-                            {"text": " of "},
-                            {"entity": "country", "slot_name": "country",
-                             "text": "France"}]},
-                        {"data": [
-                            {"text": "who is the "},
-                            {"entity": "role", "slot_name": "role",
-                             "text": "prime minister"},
-                            {"text": " of "},
-                            {"entity": "country", "slot_name": "country",
-                             "text": "UK"}]},
-                        {"data": [
-                            {"text": "who is the "},
-                            {"entity": "role", "slot_name": "role",
-                             "text": "CEO"},
-                            {"text": " of "},
-                            {"entity": "company", "slot_name": "company",
-                             "text": "Google"},
-                            {"text": " please"}]}
-                    ]
-                }
-            },
-            "language": "en"
-        }
-        validate_and_format_dataset(dataset_dict)
-        self.assertDictEqual(expected_dataset_dict, dataset_dict)
-
-    def test_should_fail_generating_intent_with_wrong_file_name(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        intent_file = examples_path / "getWeather.txt"
-
-        # When / Then
-        with self.assertRaises(IntentFormatError):
-            Intent.from_file(intent_file)
-
-    def test_should_fail_generating_entity_with_wrong_file_name(self):
-        # Given
-        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
-        entity_file = examples_path / "location.txt"
-
-        # When / Then
-        with self.assertRaises(EntityFormatError):
-            Entity.from_file(entity_file)
diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py
new file mode 100644
index 000000000..8bde8550d
--- /dev/null
+++ b/snips_nlu/tests/test_dataset_loading.py
@@ -0,0 +1,199 @@
+from unittest import TestCase
+
+from snips_nlu.constants import PACKAGE_PATH
+from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset
+
+
+class TestDatasetLoading(TestCase):
+    def test_should_generate_dataset_from_files(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        intent_file_1 = examples_path / "intent_whoIsGame.txt"
+        intent_file_2 = examples_path / "intent_getWeather.txt"
+        entity_file_1 = examples_path / "entity_location.txt"
+
+        dataset = AssistantDataset.from_files(
+            "en", [intent_file_1, intent_file_2, entity_file_1])
+        dataset_dict = dataset.json
+
+        # When / Then
+        expected_dataset_dict = {
+            "entities": {
+                "company": {"automatically_extensible": True, "data": [],
+                            "use_synonyms": True, "matching_strictness": 1.0},
+                "country": {"automatically_extensible": True, "data": [],
+                            "use_synonyms": True, "matching_strictness": 1.0},
+                "location": {
+                    "automatically_extensible": True,
+                    "data": [
+                        {"synonyms": ["big apple"], "value": "new york"},
+                        {"synonyms": ["city of lights"], "value": "paris"},
+                        {"synonyms": [], "value": "london"}
+                    ],
+                    "use_synonyms": True,
+                    "matching_strictness": 1.0
+                },
+                "role": {"automatically_extensible": True, "data": [],
+                         "use_synonyms": True, "matching_strictness": 1.0},
+                "snips/datetime": {}
+            },
+            "intents": {
+                "getWeather": {
+                    "utterances": [
+                        {"data": [
+                            {"text": "what is the weather in "},
+                            {"entity": "location",
+                             "slot_name": "weatherLocation", "text": "Paris"},
+                            {"text": "?"}]},
+                        {"data": [
+                            {"text": "Will it rain "},
+                            {"entity": "snips/datetime",
+                             "slot_name": "weatherDate", "text": "tomorrow"},
+                            {"text": " in "},
+                            {"entity": "location",
+                             "slot_name": "weatherLocation", "text": "Moscow"},
+                            {"text": "?"}]},
+                        {"data": [
+                            {"text": "How is the weather in "},
+                            {"entity": "location",
+                             "slot_name": "weatherLocation",
+                             "text": "San Francisco"},
+                            {"entity": "snips/datetime",
+                             "slot_name": "weatherDate", "text": "today"},
+                            {"text": "?"}]}
+                    ]
+                },
+                "whoIsGame": {
+                    "utterances": [
+                        {"data": [
+                            {"text": "who is the "},
+                            {"entity": "role", "slot_name": "role",
+                             "text": "president"},
+                            {"text": " of "},
+                            {"entity": "country", "slot_name": "country",
+                             "text": "France"}]},
+                        {"data": [
+                            {"text": "who is the "},
+                            {"entity": "role", "slot_name": "role",
+                             "text": "prime minister"},
+                            {"text": " of "},
+                            {"entity": "country", "slot_name": "country",
+                             "text": "UK"}]},
+                        {"data": [
+                            {"text": "who is the "},
+                            {"entity": "role", "slot_name": "role",
+                             "text": "CEO"},
+                            {"text": " of "},
+                            {"entity": "company", "slot_name": "company",
+                             "text": "Google"},
+                            {"text": " please"}]}
+                    ]
+                }
+            },
+            "language": "en"
+        }
+        validate_and_format_dataset(dataset_dict)
+        self.assertDictEqual(expected_dataset_dict, dataset_dict)
"text": "Paris" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "tomorrow" + }, + { + "text": " in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "Moscow" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "How is the weather in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "San Francisco" + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "today" + }, + { + "text": "?" + } + ] + } + ] + }, + "whoIsGame": { + "utterances": [ + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "president" + }, + { + "text": " of " + }, + { + "entity": "country", + "slot_name": "country", + "text": "France" + } + ] + }, + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "prime minister" + }, + { + "text": " of " + }, + { + "entity": "country", + "slot_name": "country", + "text": "UK" + } + ] + }, + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "CEO" + }, + { + "text": " of " + }, + { + "entity": "company", + "slot_name": "company", + "text": "Google" + }, + { + "text": " please" + } + ] + } + ] + } + }, + "language": "en" + } + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(expected_dataset_dict, dataset_dict) diff --git a/snips_nlu/tests/test_dataset.py b/snips_nlu/tests/test_dataset_validation.py similarity index 99% rename from snips_nlu/tests/test_dataset.py rename to snips_nlu/tests/test_dataset_validation.py index 6f9e0aa10..1aa9f457d 100644 --- a/snips_nlu/tests/test_dataset.py +++ b/snips_nlu/tests/test_dataset_validation.py @@ -10,7 +10,7 @@ from snips_nlu.tests.utils import SnipsTest -class TestDataset(SnipsTest): +class TestDatasetValidation(SnipsTest): def test_missing_intent_key_should_raise_exception(self): # Given dataset = { diff --git a/snips_nlu/tests/test_entity_loading.py b/snips_nlu/tests/test_entity_loading.py new file mode 100644 index 000000000..d49b1a4c3 --- /dev/null +++ b/snips_nlu/tests/test_entity_loading.py @@ -0,0 +1,212 @@ +import io +from unittest import TestCase + +import yaml + +from snips_nlu.constants import PACKAGE_PATH +from snips_nlu.dataset import Entity, EntityFormatError + + +class TestEntityLoading(TestCase): + def test_from_yaml_file(self): + # Given + yaml_stream = io.StringIO(""" +# Location Entity +--- +type: entity +name: location +automatically_extensible: no +use_synonyms: yes +matching_strictness: 0.5 +values: +- [new york, big apple] +- [paris, city of lights] +- london + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + entity = Entity.from_yaml(yaml_dict) + entity_dict = entity.json + + # Then + expected_entity_dict = { + "automatically_extensible": False, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" + }, + { + "synonyms": [ + "city of lights" + ], + "value": "paris" + }, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True, + "matching_strictness": 0.5 + } + self.assertDictEqual(expected_entity_dict, entity_dict) + + def test_from_yaml_file_with_defaults(self): + # Given + yaml_stream = io.StringIO(""" +# Location Entity +--- +name: location +values: +- [new york, big apple] +- [paris, city of lights] +- london + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + entity = 
+        entity_dict = entity.json
+
+        # Then
+        expected_entity_dict = {
+            "automatically_extensible": True,
+            "data": [
+                {"synonyms": ["big apple"], "value": "new york"},
+                {"synonyms": ["city of lights"], "value": "paris"},
+                {"synonyms": [], "value": "london"}
+            ],
+            "use_synonyms": True,
+            "matching_strictness": 1.0
+        }
+        self.assertDictEqual(expected_entity_dict, entity_dict)
+
+    def test_fail_from_yaml_file_when_wrong_type(self):
+        # Given
+        yaml_stream = io.StringIO("""
+# Location Entity
+---
+type: intent
+name: location
+values:
+- [new york, big apple]
+- [paris, city of lights]
+- london
+        """)
+        yaml_dict = yaml.safe_load(yaml_stream)
+
+        # When / Then
+        with self.assertRaises(EntityFormatError):
+            Entity.from_yaml(yaml_dict)
+
+    def test_fail_from_yaml_file_when_no_name(self):
+        # Given
+        yaml_stream = io.StringIO("""
+# Location Entity
+---
+values:
+- [new york, big apple]
+- [paris, city of lights]
+- london
+        """)
+        yaml_dict = yaml.safe_load(yaml_stream)
+
+        # When / Then
+        with self.assertRaises(EntityFormatError):
+            Entity.from_yaml(yaml_dict)
+
+    def test_from_text_file(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        entity_file = examples_path / "entity_location.txt"
+
+        # When
+        entity = Entity.from_file(entity_file)
+        entity_dict = entity.json
+
+        # Then
+        expected_entity_dict = {
+            "automatically_extensible": True,
+            "data": [
+                {"synonyms": ["big apple"], "value": "new york"},
+                {"synonyms": ["city of lights"], "value": "paris"},
+                {"synonyms": [], "value": "london"}
+            ],
+            "use_synonyms": True,
+            "matching_strictness": 1.0
+        }
+        self.assertDictEqual(expected_entity_dict, entity_dict)
+
+    def test_from_file_with_autoextensible(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        entity_file = examples_path / "entity_location_autoextent_false.txt"
+
+        # When
+        entity_dataset = Entity.from_file(entity_file)
+        entity_dict = entity_dataset.json
+
+        # Then
+        expected_entity_dict = {
+            "automatically_extensible": False,
+            "data": [
+                {"synonyms": ["big apple"], "value": "new york"},
+                {"synonyms": ["city of lights"], "value": "paris"},
+                {"synonyms": [], "value": "london"}
+            ],
+            "use_synonyms": True,
+            "matching_strictness": 1.0
+        }
+        self.assertDictEqual(expected_entity_dict, entity_dict)
+
+    def test_should_fail_generating_entity_with_wrong_file_name(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        entity_file = examples_path / "location.txt"
+
+        # When / Then
+        with self.assertRaises(EntityFormatError):
+            Entity.from_file(entity_file)

diff --git a/snips_nlu/tests/test_intent_loading.py b/snips_nlu/tests/test_intent_loading.py
new file mode 100644
index 000000000..3e732edbf
--- /dev/null
+++ b/snips_nlu/tests/test_intent_loading.py
@@ -0,0 +1,90 @@
+from unittest import TestCase
+
+from snips_nlu.constants import PACKAGE_PATH
+from snips_nlu.dataset import Intent, IntentFormatError
+
+
+class TestIntentLoading(TestCase):
+    def test_should_generate_intent_from_text_file(self):
+        # Given
+        examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples"
+        intent_file = examples_path / "intent_getWeather.txt"
+
+        # When
+        intent_dataset = Intent.from_file(intent_file)
+        intent_dict = intent_dataset.json
"slot_name": "weatherLocation", + "text": "Paris" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "tomorrow" + }, + { + "text": " in " + }, + { + "entity": None, + "slot_name": "weatherLocation", + "text": "Moscow" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "How is the weather in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "San Francisco" + }, + { + "entity": None, + "slot_name": "weatherDate", + "text": None + }, + { + "text": " please?" + } + ] + } + ] + } + + self.assertDictEqual(expected_intent_dict, intent_dict) + + def test_should_fail_generating_intent_with_wrong_file_name(self): + # Given + examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" + intent_file = examples_path / "getWeather.txt" + + # When / Then + with self.assertRaises(IntentFormatError): + Intent.from_file(intent_file) From e97af48c033d53b18ce578b0448bc1f565717ceb Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Mon, 19 Nov 2018 17:29:39 +0100 Subject: [PATCH 06/24] Add dataset loading from yaml files --- setup.py | 1 + .../cli/dataset/examples/entity_location.txt | 3 - .../entity_location_autoextent_false.txt | 4 - .../dataset/examples/intent_getWeather.txt | 3 - .../cli/dataset/examples/intent_whoIsGame.txt | 3 - snips_nlu/dataset/dataset.py | 99 +++- snips_nlu/dataset/entity.py | 14 +- snips_nlu/dataset/intent.py | 90 ++-- snips_nlu/dataset/validation.py | 7 +- snips_nlu/tests/test_dataset_loading.py | 430 +++++++++++------- snips_nlu/tests/test_entity_loading.py | 48 +- snips_nlu/tests/test_intent_loading.py | 223 ++++++++- snips_nlu/tests/utils.py | 9 +- 13 files changed, 665 insertions(+), 269 deletions(-) delete mode 100644 snips_nlu/cli/dataset/examples/entity_location.txt delete mode 100644 snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt delete mode 100644 snips_nlu/cli/dataset/examples/intent_getWeather.txt delete mode 100644 snips_nlu/cli/dataset/examples/intent_whoIsGame.txt diff --git a/setup.py b/setup.py index 6459693c4..9829a11e2 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,7 @@ "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", "pathlib==1.0.1; python_version < '3.4'", + "pyaml>=17,<18" ] extras_require = { diff --git a/snips_nlu/cli/dataset/examples/entity_location.txt b/snips_nlu/cli/dataset/examples/entity_location.txt deleted file mode 100644 index c6453a733..000000000 --- a/snips_nlu/cli/dataset/examples/entity_location.txt +++ /dev/null @@ -1,3 +0,0 @@ -new york,big apple -paris,city of lights -london \ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt b/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt deleted file mode 100644 index 243c4d290..000000000 --- a/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt +++ /dev/null @@ -1,4 +0,0 @@ -# automatically_extensible=false -new york,big apple -paris,city of lights -london \ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/intent_getWeather.txt b/snips_nlu/cli/dataset/examples/intent_getWeather.txt deleted file mode 100644 index 960ce52bb..000000000 --- a/snips_nlu/cli/dataset/examples/intent_getWeather.txt +++ /dev/null @@ -1,3 +0,0 @@ -what is the weather in [weatherLocation:location](Paris)? -Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation](Moscow)? 
-How is the weather in [weatherLocation:location](San Francisco) [weatherDate] please? \ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt b/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt deleted file mode 100644 index 03f369d50..000000000 --- a/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt +++ /dev/null @@ -1,3 +0,0 @@ -who is the [role:role](president) of [country:country](France) -who is the [role:role](prime minister) of [country:country](UK) -who is the [role:role](CEO) of [company:company](Google) please diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index 4630e53a2..364e67364 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -1,29 +1,58 @@ # coding=utf-8 from __future__ import print_function, unicode_literals +import io +from itertools import cycle from pathlib import Path +import yaml +from snips_nlu_ontology import get_builtin_entity_examples + from snips_nlu.dataset.entity import Entity from snips_nlu.dataset.intent import Intent +class DatasetFormatError(TypeError): + pass + + class AssistantDataset(object): - """Dataset of an assistant + """Dataset used in the main NLU training API - Merges a list of :class:`.AssistantDataset` into a single dataset ready to - be used by Snips NLU + Consists of intents and entities data. This object can be built either from + text files (:meth:`.AssistantDataset.from_files`) or from YAML files + (:meth:`.AssistantDataset.from_yaml_files`). Attributes: - language (str): language of the assistant - intents_datasets (list of :class:`.IntentDataset`): data of the - assistant intents - entities (list of :class:`.Entity`): data of the assistant entities + language (str): language of the intents + intents (list of :class:`.Intent`): intents data + entities (list of :class:`.Entity`): entities data """ - def __init__(self, language, intent_datasets, entities): + def __init__(self, language, intents, entities): self.language = language - self.intents_datasets = intent_datasets + self.intents = intents self.entities = entities + self._add_missing_entities() + self._ensure_entity_values() + + @classmethod + def from_yaml_files(cls, language, filenames): + entities = [] + intents = [] + for filename in filenames: + with io.open(filename, encoding="utf8") as f: + for doc in yaml.safe_load_all(f): + doc_type = doc.get("type") + if doc_type == "entity": + entities.append(Entity.from_yaml(doc)) + elif doc_type == "intent": + intents.append(Intent.from_yaml(doc)) + else: + raise DatasetFormatError( + "Invalid 'type' value in YAML file '%s': '%s'" + % (filename, doc_type)) + return cls(language, intents, entities) @classmethod def from_files(cls, language, filenames): @@ -53,23 +82,57 @@ def from_files(cls, language, filenames): "'intent_' or 'entity_' but found: %s" % stem) - intents_datasets = [Intent.from_file(f) - for f in intent_filepaths] + intents = [Intent.from_file(f) for f in intent_filepaths] entities = [Entity.from_file(f) for f in entity_filepaths] - entity_names = set(e.name for e in entities) + return cls(language, intents, entities) + + def _add_missing_entities(self): + entity_names = set(e.name for e in self.entities) - # Add entities appearing only in the intents data - for intent_data in intents_datasets: - for entity_name in intent_data.entities_names: + # Add entities appearing only in the intents utterances + for intent in self.intents: + for entity_name in intent.entities_names: if entity_name not in entity_names: 
entity_names.add(entity_name)
-                    entities.append(Entity(name=entity_name))
-        return cls(language, intents_datasets, entities)
+                    self.entities.append(Entity(name=entity_name))
+
+    def _ensure_entity_values(self):
+        entities_values = {entity.name: self._get_entity_values(entity)
+                           for entity in self.entities}
+        for intent in self.intents:
+            for utterance in intent.utterances:
+                for chunk in utterance.slot_chunks:
+                    if chunk.text is not None:
+                        continue
+                    try:
+                        chunk.text = next(entities_values[chunk.entity])
+                    except StopIteration:
+                        raise DatasetFormatError(
+                            "At least one entity value must be provided for "
+                            "entity '%s'" % chunk.entity)
+        return self
+
+    def _get_entity_values(self, entity):
+        if entity.is_builtin:
+            return cycle(get_builtin_entity_examples(
+                entity.name, self.language))
+        values = [v for utterance in entity.utterances
+                  for v in utterance.variations]
+        values_set = set(values)
+        for intent in self.intents:
+            for utterance in intent.utterances:
+                for chunk in utterance.slot_chunks:
+                    if not chunk.text or chunk.entity != entity.name:
+                        continue
+                    if chunk.text not in values_set:
+                        values_set.add(chunk.text)
+                        values.append(chunk.text)
+        return cycle(values)
 
     @property
     def json(self):
         intents = {intent_data.intent_name: intent_data.json
-                   for intent_data in self.intents_datasets}
+                   for intent_data in self.intents}
         entities = {entity.name: entity.json for entity in self.entities}
         return dict(language=self.language, intents=intents,
                     entities=entities)
diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py
index c3b60b87e..1d4923fff 100644
--- a/snips_nlu/dataset/entity.py
+++ b/snips_nlu/dataset/entity.py
@@ -22,19 +22,19 @@ class EntityFormatError(TypeError):
 class Entity(object):
     """Entity of an :class:`.AssistantDataset`
 
-    This class can represents both a custom entity and a builtin entity
+    This class can represent both a custom and a builtin entity
 
     Attributes:
         name (str): name of the entity
         utterances (list of :class:`.EntityUtterance`): entity utterances
             (only for custom entities)
         automatically_extensible (bool): whether or not the entity can be
-            extended to values not present in the dataset (only for custom
+            extended to values not present in the data (only for custom
             entities)
         use_synonyms (bool): whether or not to map entity values using
            synonyms (only for custom entities)
         matching_strictness (float): controls the matching strictness of the
-            entity (only for custom entities)
+            entity (only for custom entities). Must be between 0.0 and 1.0.
""" def __init__(self, name, utterances=None, automatically_extensible=True, @@ -59,7 +59,7 @@ def from_yaml(cls, yaml_dict): raise EntityFormatError("Wrong type: '%s'" % object_type) entity_name = yaml_dict.get("name") if not entity_name: - raise EntityFormatError("No 'name' attribute found") + raise EntityFormatError("Missing 'name' attribute") auto_extensible = yaml_dict.get(AUTOMATICALLY_EXTENSIBLE, True) use_synonyms = yaml_dict.get(USE_SYNONYMS, True) matching_strictness = yaml_dict.get("matching_strictness", 1.0) @@ -100,6 +100,8 @@ def from_file(cls, filepath): reader = csv.reader(list(it)) autoextent = True for row in reader: + if not row or not row[0].strip(): + continue if six.PY2: row = [cell.decode("utf-8") for cell in row] value = row[0] @@ -143,6 +145,10 @@ def __init__(self, value, synonyms=None): synonyms = [] self.synonyms = synonyms + @property + def variations(self): + return [self.value] + self.synonyms + @property def json(self): return {VALUE: self.value, SYNONYMS: self.synonyms} diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index 2ed7f3957..650bf59e0 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -33,14 +33,17 @@ class Intent(object): Attributes: intent_name (str): name of the intent utterances (list of :class:`.IntentUtterance`): intent utterances + slot_mapping (dict): mapping between slot names and entities """ - def __init__(self, intent_name, slot_mapping=None): + def __init__(self, intent_name, utterances, slot_mapping=None): if slot_mapping is None: slot_mapping = dict() self.intent_name = intent_name - self.utterances = [] + self.utterances = utterances self.slot_mapping = slot_mapping + self._complete_slot_name_mapping() + self._ensure_entity_names() @classmethod def from_yaml(cls, yaml_dict): @@ -50,17 +53,16 @@ def from_yaml(cls, yaml_dict): raise IntentFormatError("Wrong type: '%s'" % object_type) intent_name = yaml_dict.get("name") if not intent_name: - raise IntentFormatError("No 'name' attribute found") + raise IntentFormatError("Missing 'name' attribute") slot_mapping = dict() for slot in yaml_dict.get("slots", []): slot_mapping[slot["name"]] = slot["entity"] - dataset = cls(intent_name, slot_mapping) - utterances = (u.strip() for u in yaml_dict["utterances"] if u.strip()) + utterances = [IntentUtterance.parse(u.strip()) + for u in yaml_dict["utterances"] if u.strip()] if not utterances: raise IntentFormatError( "Intent must contain at least one utterance") - dataset.add_utterances(utterances) - return dataset + return cls(intent_name, utterances, slot_mapping) @classmethod def from_file(cls, filepath): @@ -73,27 +75,30 @@ def from_file(cls, filepath): intent_name = stem[7:] if not intent_name: raise IntentFormatError("Intent name must not be empty") - dataset = cls(intent_name) with filepath.open(encoding="utf-8") as f: lines = iter(l.strip() for l in f if l.strip()) - dataset.add_utterances(lines) - return dataset - - def add_utterances(self, samples_iter): - for sample in samples_iter: - utterance = IntentUtterance.parse(sample) - self.add(utterance) - - def add(self, utterance): - """Adds an :class:`.IntentUtterance` to the dataset""" - for chunk in utterance.slot_chunks: - if chunk.name not in self.slot_mapping: - self.slot_mapping[chunk.name] = chunk.entity - self.utterances.append(utterance) + utterances = [IntentUtterance.parse(sample) for sample in lines] + return cls(intent_name, utterances) + + def _complete_slot_name_mapping(self): + for utterance in self.utterances: + for chunk in 
utterance.slot_chunks: + if chunk.entity and chunk.slot_name not in self.slot_mapping: + self.slot_mapping[chunk.slot_name] = chunk.entity + return self + + def _ensure_entity_names(self): + for utterance in self.utterances: + for chunk in utterance.slot_chunks: + if chunk.entity: + continue + chunk.entity = self.slot_mapping.get( + chunk.slot_name, chunk.slot_name) + return self @property def json(self): - """Intent dataset in json format""" + """Intent data in json format""" return { UTTERANCES: [ {DATA: [chunk.json for chunk in utterance.chunks]} @@ -103,7 +108,7 @@ def json(self): @property def entities_names(self): - """Set of entity names present in the intent dataset""" + """Set of entity names present in the intent utterances""" return set(chunk.entity for u in self.utterances for chunk in u.chunks if isinstance(chunk, SlotChunk)) @@ -138,7 +143,7 @@ def parse(cls, string): 2 >>> u.chunks[0].text 'president of ' - >>> u.chunks[1].name + >>> u.chunks[1].slot_name 'country' >>> u.chunks[1].entity 'default' @@ -160,14 +165,14 @@ def json(self): class SlotChunk(Chunk): def __init__(self, slot_name, entity, text): super(SlotChunk, self).__init__(text) - self.name = slot_name + self.slot_name = slot_name self.entity = entity @property def json(self): return { TEXT: self.text, - SLOT_NAME: self.name, + SLOT_NAME: self.slot_name, ENTITY: self.entity, } @@ -188,6 +193,10 @@ def __init__(self, input): self.chunks = [] self.current = 0 + @property + def end_of_input(self): + return self.current >= len(self.input) + def add_slot(self, name, entity=None): """Adds a named slot @@ -222,6 +231,8 @@ def move(self, pos): self.current = pos + 1 def peek(self): + if self.end_of_input: + return None return self[0] def read(self): @@ -251,22 +262,19 @@ def capture_text(state): def capture_slot(state): - next_pos = state.find(':') - if next_pos < 0: - next_pos = state.find(']') - if next_pos < 0: - raise INTENT_FORMATTING_ERROR - slot_name = state[:next_pos] - state.move(next_pos) + next_colon_pos = state.find(':') + next_square_bracket_pos = state.find(']') + if next_square_bracket_pos < 0: + raise INTENT_FORMATTING_ERROR + if next_colon_pos < 0 or next_square_bracket_pos < next_colon_pos: + slot_name = state[:next_square_bracket_pos] + state.move(next_square_bracket_pos) state.add_slot(slot_name) else: - slot_name = state[:next_pos] - state.move(next_pos) - next_pos = state.find(']') - if next_pos < 0: - raise INTENT_FORMATTING_ERROR - entity = state[:next_pos] - state.move(next_pos) + slot_name = state[:next_colon_pos] + state.move(next_colon_pos) + entity = state[:next_square_bracket_pos] + state.move(next_square_bracket_pos) state.add_slot(slot_name, entity) if state.peek() == '(': state.read() diff --git a/snips_nlu/dataset/validation.py b/snips_nlu/dataset/validation.py index ec2c135c2..5f5dc7e16 100644 --- a/snips_nlu/dataset/validation.py +++ b/snips_nlu/dataset/validation.py @@ -50,9 +50,10 @@ def validate_and_format_dataset(dataset): dataset[ENTITIES][entity_name] = \ _validate_and_format_builtin_entity(entity, uterrance_entities) else: - dataset[ENTITIES][ - entity_name] = _validate_and_format_custom_entity( - entity, uterrance_entities, language, builtin_entity_parser) + dataset[ENTITIES][entity_name] = \ + _validate_and_format_custom_entity( + entity, uterrance_entities, language, + builtin_entity_parser) dataset[VALIDATED] = True return dataset diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py index 8bde8550d..34bcbc967 100644 --- 
a/snips_nlu/tests/test_dataset_loading.py +++ b/snips_nlu/tests/test_dataset_loading.py @@ -1,199 +1,293 @@ -from unittest import TestCase +from __future__ import unicode_literals -from snips_nlu.constants import PACKAGE_PATH -from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset +import io +from unittest import TestCase +import mock +from mock import patch -class TestDatasetLoading(TestCase): - def test_should_generate_dataset_from_files(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file_1 = examples_path / "intent_whoIsGame.txt" - intent_file_2 = examples_path / "intent_getWeather.txt" - entity_file_1 = examples_path / "entity_location.txt" - - dataset = AssistantDataset.from_files( - "en", [intent_file_1, intent_file_2, entity_file_1]) - dataset_dict = dataset.json +from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset - # When / Then - expected_dataset_dict = { - "entities": { - "company": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, - }, - "country": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, +EXPECTED_DATASET_DICT = { + "entities": { + "company": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "country": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "location": { + "automatically_extensible": True, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" }, - "location": { - "automatically_extensible": True, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "role": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "snips/datetime": {} + }, + "intents": { + "getWeather": { + "utterances": [ + { "data": [ { - "synonyms": [ - "big apple" - ], - "value": "new york" + "text": "what is the weather in " }, { - "synonyms": [ - "city of lights" - ], - "value": "paris" + "entity": "location", + "slot_name": "weatherLocation", + "text": "Paris" }, { - "synonyms": [], - "value": "london" + "text": "?" } - ], - "use_synonyms": True, - "matching_strictness": 1.0, - }, - "role": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, + ] }, - "snips/datetime": {} - }, - "intents": { - "getWeather": { - "utterances": [ + { + "data": [ { - "data": [ - { - "text": "what is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Paris" - }, - { - "text": "?" - } - ] + "text": "is it raining in " }, { - "data": [ - { - "text": "Will it rain " - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "tomorrow" - }, - { - "text": " in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Moscow" - }, - { - "text": "?" - } - ] + "entity": "location", + "slot_name": "weatherLocation", + "text": "new york" }, { - "data": [ - { - "text": "How is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "San Francisco" - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "today" - }, - { - "text": "?" 
- } - ] + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "Today" + } + ] + } + ] + }, + "whoIsGame": { + "utterances": [ + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "president" + }, + { + "text": " of " + }, + { + "entity": "country", + "slot_name": "country", + "text": "France" } ] }, - "whoIsGame": { - "utterances": [ + { + "data": [ + { + "text": "who is the " + }, { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "president" - }, - { - "text": " of " - }, - { - "entity": "country", - "slot_name": "country", - "text": "France" - } - ] + "entity": "role", + "slot_name": "role", + "text": "CEO" }, { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "prime minister" - }, - { - "text": " of " - }, - { - "entity": "country", - "slot_name": "country", - "text": "UK" - } - ] + "text": " of " }, { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "CEO" - }, - { - "text": " of " - }, - { - "entity": "company", - "slot_name": "company", - "text": "Google" - }, - { - "text": " please" - } - ] + "entity": "company", + "slot_name": "company", + "text": "Google" + }, + { + "text": " please" } ] } - }, - "language": "en" + ] } + }, + "language": "en" +} + + +class TestDatasetLoading(TestCase): + @patch("snips_nlu.dataset.dataset.io") + def test_should_generate_dataset_from_yaml_files(self, mock_io): + # Given + intent_file_1 = "whoIsGame.yaml" + intent_file_2 = "getWeather.yaml" + entity_file_1 = "location.yaml" + + who_is_game_yaml = """ +# whoIsGame Intent +--- +type: intent +name: whoIsGame +utterances: + - who is the [role](president) of [country](France) + - who is the [role](CEO) of [company](Google) please + """ + + get_weather_yaml = """ +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [weatherLocation:location](Paris)? + - is it raining in [weatherLocation] [weatherDate:snips/datetime] + """ + + location_yaml = """ +# Location Entity +--- +type: entity +name: location +automatically_extensible: true +values: +- [new york, big apple] +- london + """ + + # pylint:disable=unused-argument + def mock_open(filename, **kwargs): + if filename == intent_file_1: + return io.StringIO(who_is_game_yaml) + if filename == intent_file_2: + return io.StringIO(get_weather_yaml) + if filename == entity_file_1: + return io.StringIO(location_yaml) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open + dataset_files = [intent_file_1, intent_file_2, entity_file_1] + + # When + dataset = AssistantDataset.from_yaml_files("en", dataset_files) + dataset_dict = dataset.json + + # Then + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) + + @mock.patch("snips_nlu.dataset.dataset.io") + def test_should_generate_dataset_from_merged_yaml_file(self, mock_io): + # Given + dataset_file = "dataset.yaml" + dataset_yaml = """ +# whoIsGame Intent +--- +type: intent +name: whoIsGame +utterances: + - who is the [role](president) of [country](France) + - who is the [role](CEO) of [company](Google) please + +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [weatherLocation:location](Paris)? 
+ - is it raining in [weatherLocation] [weatherDate:snips/datetime] + +# Location Entity +--- +type: entity +name: location +automatically_extensible: true +values: +- [new york, big apple] +- london + """ + + # pylint:disable=unused-argument + def mock_open(filename, **kwargs): + if filename == dataset_file: + return io.StringIO(dataset_yaml) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open + + # When + dataset = AssistantDataset.from_yaml_files("en", [dataset_file]) + dataset_dict = dataset.json + + # Then + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) + + def test_should_generate_dataset_from_files(self): + # Given + intent_file_1 = "intent_whoIsGame.txt" + intent_file_2 = "intent_getWeather.txt" + entity_file_1 = "entity_location.txt" + + who_is_game_txt = """ +who is the [role:role](president) of [country:country](France) +who is the [role:role](CEO) of [company:company](Google) please +""" + + get_weather_txt = """ +what is the weather in [weatherLocation:location](Paris)? +is it raining in [weatherLocation] [weatherDate:snips/datetime] +""" + + location_txt = """ +new york,big apple +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == intent_file_1: + return io.StringIO(who_is_game_txt) + if str(self_) == intent_file_2: + return io.StringIO(get_weather_txt) + if str(self_) == entity_file_1: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + + dataset_files = [intent_file_1, intent_file_2, entity_file_1] + + # When + with patch("pathlib.io") as mock_io: + mock_io.open.side_effect = mock_open + dataset = AssistantDataset.from_files("en", dataset_files) + dataset_dict = dataset.json + + # When / Then validate_and_format_dataset(dataset_dict) - self.assertDictEqual(expected_dataset_dict, dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) diff --git a/snips_nlu/tests/test_entity_loading.py b/snips_nlu/tests/test_entity_loading.py index d49b1a4c3..09105da00 100644 --- a/snips_nlu/tests/test_entity_loading.py +++ b/snips_nlu/tests/test_entity_loading.py @@ -1,9 +1,11 @@ +from __future__ import unicode_literals + import io from unittest import TestCase import yaml +from mock import patch -from snips_nlu.constants import PACKAGE_PATH from snips_nlu.dataset import Entity, EntityFormatError @@ -132,10 +134,24 @@ def test_fail_from_yaml_file_when_no_name(self): with self.assertRaises(EntityFormatError): Entity.from_yaml(yaml_dict) - def test_from_text_file(self): + @patch("pathlib.io") + def test_from_text_file(self, mock_io): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / "entity_location.txt" + entity_file = "entity_location.txt" + location_txt = """ +new york,big apple +paris,city of lights +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == entity_file: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + mock_io.open.side_effect = mock_open # When entity = Entity.from_file(entity_file) @@ -167,10 +183,25 @@ def test_from_text_file(self): } self.assertDictEqual(expected_entity_dict, entity_dict) - def test_from_file_with_autoextensible(self): + @patch("pathlib.io") + def test_from_file_with_autoextensible(self, mock_io): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / 
"entity_location_autoextent_false.txt" + entity_file = "entity_location.txt" + location_txt = """# automatically_extensible=false +new york,big apple +paris,city of lights +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == entity_file: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open # When entity_dataset = Entity.from_file(entity_file) @@ -204,8 +235,7 @@ def test_from_file_with_autoextensible(self): def test_should_fail_generating_entity_with_wrong_file_name(self): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / "location.txt" + entity_file = "location.txt" # When / Then with self.assertRaises(EntityFormatError): diff --git a/snips_nlu/tests/test_intent_loading.py b/snips_nlu/tests/test_intent_loading.py index 3e732edbf..1dfcc8282 100644 --- a/snips_nlu/tests/test_intent_loading.py +++ b/snips_nlu/tests/test_intent_loading.py @@ -1,14 +1,203 @@ +from __future__ import unicode_literals + +import io from unittest import TestCase -from snips_nlu.constants import PACKAGE_PATH +import yaml +from mock import patch + from snips_nlu.dataset import Intent, IntentFormatError class TestIntentLoading(TestCase): - def test_should_generate_intent_from_text_file(self): + def test_should_load_from_yaml_file(self): + # Given + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [weatherLocation:location](paris) ? + - "Will it rain [date:snips/datetime](tomorrow) in + [weatherLocation:location](london)?" + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": "paris", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": " ?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "text": "tomorrow", + "entity": "snips/datetime", + "slot_name": "date" + }, + { + "text": " in " + }, + { + "text": "london", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": "?" + } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + def test_should_load_from_yaml_file_using_slot_mapping(self): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file = examples_path / "intent_getWeather.txt" + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent +name: getWeather +slots: + - name: date + entity: snips/datetime + - name: weatherLocation + entity: location +utterances: + - what is the weather in [weatherLocation](paris) ? + - Will it rain [date] in [weatherLocation](london)? + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": "paris", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": " ?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "text": None, + "entity": "snips/datetime", + "slot_name": "date" + }, + { + "text": " in " + }, + { + "text": "london", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": "?" 
+ } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + def test_should_load_from_yaml_file_using_implicit_values(self): + # Given + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [location] ? + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": None, + "entity": "location", + "slot_name": "location" + }, + { + "text": " ?" + } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + @patch("pathlib.io") + def test_should_generate_intent_from_text_file(self, mock_io): + # Given + intent_file = "intent_getWeather.txt" + get_weather_txt = """ +what is the weather in [weatherLocation:location](Paris)? +Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation](Moscow)? +How is the weather in [weatherLocation:location] [weatherDate] please? +is it raining in [weatherLocation] [weatherDate:snips/datetime] + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == intent_file: + return io.StringIO(get_weather_txt) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open # When intent_dataset = Intent.from_file(intent_file) @@ -46,7 +235,7 @@ def test_should_generate_intent_from_text_file(self): "text": " in " }, { - "entity": None, + "entity": "location", "slot_name": "weatherLocation", "text": "Moscow" }, @@ -63,10 +252,10 @@ def test_should_generate_intent_from_text_file(self): { "entity": "location", "slot_name": "weatherLocation", - "text": "San Francisco" + "text": None }, { - "entity": None, + "entity": "snips/datetime", "slot_name": "weatherDate", "text": None }, @@ -74,6 +263,23 @@ def test_should_generate_intent_from_text_file(self): "text": " please?" 
} ] + }, + { + "data": [ + { + "text": "is it raining in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": None + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": None + } + ] } ] } @@ -82,8 +288,7 @@ def test_should_generate_intent_from_text_file(self): def test_should_fail_generating_intent_with_wrong_file_name(self): # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file = examples_path / "getWeather.txt" + intent_file = "getWeather.txt" # When / Then with self.assertRaises(IntentFormatError): diff --git a/snips_nlu/tests/utils.py b/snips_nlu/tests/utils.py index d057ce4e1..7ea7c219f 100644 --- a/snips_nlu/tests/utils.py +++ b/snips_nlu/tests/utils.py @@ -14,10 +14,11 @@ from snips_nlu.utils import json_string, unicode_string TEST_PATH = Path(__file__).parent -SAMPLE_DATASET_PATH = TEST_PATH / "resources" / "sample_dataset.json" -BEVERAGE_DATASET_PATH = TEST_PATH / "resources" / "beverage_dataset.json" -WEATHER_DATASET_PATH = TEST_PATH / "resources" / "weather_dataset.json" -PERFORMANCE_DATASET_PATH = TEST_PATH / "resources" / "performance_dataset.json" +TEST_RESOURCES_PATH = TEST_PATH / "resources" +SAMPLE_DATASET_PATH = TEST_RESOURCES_PATH / "sample_dataset.json" +BEVERAGE_DATASET_PATH = TEST_RESOURCES_PATH / "beverage_dataset.json" +WEATHER_DATASET_PATH = TEST_RESOURCES_PATH / "weather_dataset.json" +PERFORMANCE_DATASET_PATH = TEST_RESOURCES_PATH / "performance_dataset.json" # pylint: disable=invalid-name From 6e57058acff63430e56b075bdba3736201746036 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Mon, 19 Nov 2018 17:56:24 +0100 Subject: [PATCH 07/24] Fix issue with yaml loaders --- snips_nlu/dataset/entity.py | 1 + snips_nlu/dataset/utils.py | 13 +++++++++++++ snips_nlu/utils.py | 20 ++------------------ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py index 1d4923fff..08d8ed800 100644 --- a/snips_nlu/dataset/entity.py +++ b/snips_nlu/dataset/entity.py @@ -3,6 +3,7 @@ import csv import re +from builtins import str from pathlib import Path import six diff --git a/snips_nlu/dataset/utils.py b/snips_nlu/dataset/utils.py index 2047bf716..b4fc0c33b 100644 --- a/snips_nlu/dataset/utils.py +++ b/snips_nlu/dataset/utils.py @@ -1,10 +1,23 @@ +from __future__ import unicode_literals + from future.utils import iteritems, itervalues +from yaml import Loader, SafeLoader from snips_nlu.constants import ( DATA, ENTITIES, ENTITY, INTENTS, TEXT, UTTERANCES) from snips_nlu.entity_parser.builtin_entity_parser import is_gazetteer_entity +def construct_yaml_str(self, node): + # Override the default string handling function + # to always return unicode objects + return self.construct_scalar(node) + + +Loader.add_constructor("tag:yaml.org,2002:str", construct_yaml_str) +SafeLoader.add_constructor("tag:yaml.org,2002:str", construct_yaml_str) + + def extract_utterance_entities(dataset): entities_values = {ent_name: set() for ent_name in dataset[ENTITIES]} diff --git a/snips_nlu/utils.py b/snips_nlu/utils.py index 198722b7a..b12adfdbb 100644 --- a/snips_nlu/utils.py +++ b/snips_nlu/utils.py @@ -7,7 +7,7 @@ import os import shutil from builtins import bytes, object, str -from collections import Mapping, OrderedDict, namedtuple +from collections import OrderedDict from contextlib import contextmanager from datetime import datetime from functools import wraps @@ -24,6 +24,7 @@ REGEX_PUNCT = {'\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', 
'^', '$', '#', '&', '-', '~'} + # pylint: disable=invalid-name class abstractclassmethod(classmethod): @@ -97,12 +98,6 @@ def validate_keys(obj, keys, object_label=None): validate_key(obj, key, object_label) -def validate_range(rng): - if not isinstance(rng, (list, tuple)) or len(rng) != 2 or rng[0] > rng[1]: - raise ValueError("range must be a length 2 list or tuple and must be " - "valid") - - class LimitedSizeDict(OrderedDict): def __init__(self, *args, **kwds): if "size_limit" not in kwds: @@ -138,17 +133,6 @@ def __setitem__(self, key, value): super(UnupdatableDict, self).__setitem__(key, value) -def namedtuple_with_defaults(typename, field_names, default_values=()): - T = namedtuple(typename, field_names) # pylint: disable=C0103 - T.__new__.__defaults__ = (None,) * len(T._fields) - if isinstance(default_values, Mapping): - prototype = T(**default_values) - else: - prototype = T(*default_values) - T.__new__.__defaults__ = tuple(prototype) - return T - - def mkdir_p(path): """Reproduces the 'mkdir -p shell' command From f3d81c465aacefe9bfca6fcd36eed6e69c9bbc91 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 20 Nov 2018 16:38:23 +0100 Subject: [PATCH 08/24] Deprecate dataset text files format --- setup.py | 3 ++- snips_nlu/cli/generate_dataset.py | 5 ++++- snips_nlu/dataset/dataset.py | 5 +++++ snips_nlu/dataset/entity.py | 4 ++++ snips_nlu/dataset/intent.py | 4 ++++ snips_nlu/tests/test_dataset_loading.py | 2 ++ snips_nlu/tests/test_entity_loading.py | 4 ++++ snips_nlu/tests/test_intent_loading.py | 3 +++ 8 files changed, 28 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9829a11e2..fe523edad 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,8 @@ "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", "pathlib==1.0.1; python_version < '3.4'", - "pyaml>=17,<18" + "pyaml>=17,<18", + "deprecation>=2,<3" ] extras_require = { diff --git a/snips_nlu/cli/generate_dataset.py b/snips_nlu/cli/generate_dataset.py index ac923f0dd..faca6043d 100644 --- a/snips_nlu/cli/generate_dataset.py +++ b/snips_nlu/cli/generate_dataset.py @@ -13,5 +13,8 @@ "filename")) def generate_dataset(language, *files): """Create a Snips NLU dataset from text friendly files""" - dataset = AssistantDataset.from_files(language, list(files)) + if any(f.endswith(".yml") or f.endswith(".yaml") for f in files): + dataset = AssistantDataset.from_yaml_files(language, list(files)) + else: + dataset = AssistantDataset.from_files(language, list(files)) print(json.dumps(dataset.json, indent=2, sort_keys=True)) diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index 364e67364..d9cef2318 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -6,8 +6,10 @@ from pathlib import Path import yaml +from deprecation import deprecated from snips_nlu_ontology import get_builtin_entity_examples +from snips_nlu.__about__ import __version__ from snips_nlu.dataset.entity import Entity from snips_nlu.dataset.intent import Intent @@ -55,6 +57,9 @@ def from_yaml_files(cls, language, filenames): return cls(language, intents, entities) @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, + details="Use from_yaml_files instead") def from_files(cls, language, filenames): """Creates an :class:`.AssistantDataset` from a language and a list of intent and entity files diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py index 08d8ed800..04c9743ce 100644 --- a/snips_nlu/dataset/entity.py +++ b/snips_nlu/dataset/entity.py @@ -7,8 
+7,10 @@ from pathlib import Path import six +from deprecation import deprecated from snips_nlu_ontology import get_all_builtin_entities +from snips_nlu.__about__ import __version__ from snips_nlu.constants import ( AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS, SYNONYMS, USE_SYNONYMS, VALUE) @@ -83,6 +85,8 @@ def from_yaml(cls, yaml_dict): matching_strictness=matching_strictness) @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, details="Use from_yaml instead") def from_file(cls, filepath): filepath = Path(filepath) stem = filepath.stem diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index 650bf59e0..58dbaa679 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -4,8 +4,10 @@ from builtins import object from pathlib import Path +from deprecation import deprecated from future.utils import with_metaclass +from snips_nlu.__about__ import __version__ from snips_nlu.constants import DATA, ENTITY, SLOT_NAME, TEXT, UTTERANCES @@ -65,6 +67,8 @@ def from_yaml(cls, yaml_dict): return cls(intent_name, utterances, slot_mapping) @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, details="Use from_yaml instead") def from_file(cls, filepath): filepath = Path(filepath) stem = filepath.stem diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py index 34bcbc967..458fba82f 100644 --- a/snips_nlu/tests/test_dataset_loading.py +++ b/snips_nlu/tests/test_dataset_loading.py @@ -4,6 +4,7 @@ from unittest import TestCase import mock +from deprecation import fail_if_not_removed from mock import patch from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset @@ -247,6 +248,7 @@ def mock_open(filename, **kwargs): validate_and_format_dataset(dataset_dict) self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) + @fail_if_not_removed def test_should_generate_dataset_from_files(self): # Given intent_file_1 = "intent_whoIsGame.txt" diff --git a/snips_nlu/tests/test_entity_loading.py b/snips_nlu/tests/test_entity_loading.py index 09105da00..4e49c7553 100644 --- a/snips_nlu/tests/test_entity_loading.py +++ b/snips_nlu/tests/test_entity_loading.py @@ -4,6 +4,7 @@ from unittest import TestCase import yaml +from deprecation import fail_if_not_removed from mock import patch from snips_nlu.dataset import Entity, EntityFormatError @@ -135,6 +136,7 @@ def test_fail_from_yaml_file_when_no_name(self): Entity.from_yaml(yaml_dict) @patch("pathlib.io") + @fail_if_not_removed def test_from_text_file(self, mock_io): # Given entity_file = "entity_location.txt" @@ -184,6 +186,7 @@ def mock_open(self_, *args, **kwargs): self.assertDictEqual(expected_entity_dict, entity_dict) @patch("pathlib.io") + @fail_if_not_removed def test_from_file_with_autoextensible(self, mock_io): # Given entity_file = "entity_location.txt" @@ -233,6 +236,7 @@ def mock_open(self_, *args, **kwargs): } self.assertDictEqual(expected_entity_dict, entity_dict) + @fail_if_not_removed def test_should_fail_generating_entity_with_wrong_file_name(self): # Given entity_file = "location.txt" diff --git a/snips_nlu/tests/test_intent_loading.py b/snips_nlu/tests/test_intent_loading.py index 1dfcc8282..50332d18f 100644 --- a/snips_nlu/tests/test_intent_loading.py +++ b/snips_nlu/tests/test_intent_loading.py @@ -4,6 +4,7 @@ from unittest import TestCase import yaml +from deprecation import fail_if_not_removed from mock import patch from snips_nlu.dataset import 
Intent, IntentFormatError
@@ -179,6 +180,7 @@ def test_should_load_from_yaml_file_using_implicit_values(self):
         self.assertDictEqual(expected_intent_dict, intent_dict)
 
     @patch("pathlib.io")
+    @fail_if_not_removed
     def test_should_generate_intent_from_text_file(self, mock_io):
         # Given
         intent_file = "intent_getWeather.txt"
@@ -286,6 +288,7 @@ def mock_open(self_, *args, **kwargs):
 
         self.assertDictEqual(expected_intent_dict, intent_dict)
 
+    @fail_if_not_removed
     def test_should_fail_generating_intent_with_wrong_file_name(self):
         # Given
         intent_file = "getWeather.txt"

From a53a173f5c58312755aedd141123d7405452f582 Mon Sep 17 00:00:00 2001
From: Adrien Ball 
Date: Tue, 20 Nov 2018 19:27:33 +0100
Subject: [PATCH 09/24] Add dedicated documentation section on dataset format

---
 docs/source/dataset.rst    | 249 +++++++++++++++++++++++++++++++++++++
 docs/source/index.rst      |   3 +-
 docs/source/quickstart.rst |   4 +-
 docs/source/tutorial.rst   |  14 +--
 4 files changed, 260 insertions(+), 10 deletions(-)
 create mode 100644 docs/source/dataset.rst

diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst
new file mode 100644
index 000000000..826c1f638
--- /dev/null
+++ b/docs/source/dataset.rst
@@ -0,0 +1,249 @@
+.. _dataset:
+
+Training Dataset Format
+=======================
+
+The Snips NLU library leverages machine learning algorithms and some training
+data in order to produce a powerful intent recognition engine.
+
+The better your training data is, the more accurate your NLU engine will
+be. Thus, it is worth spending a bit of time to create a dataset that
+corresponds well to your use case.
+
+Snips NLU accepts two different dataset formats. The first one, which relies
+on YAML, is the preferred option if you want to create or edit a dataset
+manually.
+The other dataset format uses JSON and is better suited if you plan to
+create or edit datasets programmatically.
+
+YAML format
+-----------
+
+The YAML dataset format allows you to define intents and entities using the
+`YAML `_ syntax.
+
+------
+Entity
+------
+
+Here is what an entity file looks like:
+
+.. code-block:: yaml
+
+    # City Entity
+    ---
+    type: entity # allows entity files to be distinguished from intent files
+    name: city # name of the entity
+    values:
+      - london # single entity value
+      - [new york, big apple] # entity value with a synonym
+      - [paris, city of lights]
+
+You can specify entity values either using single YAML scalars (e.g.
+``london``) or using lists if you want to define some synonyms (e.g.
+``[paris, city of lights]``).
+
+Here is a more comprehensive example, which contains additional optional
+attributes:
+
+.. code-block:: yaml
+
+    # City Entity
+    ---
+    type: entity
+    name: city
+    automatically_extensible: false # default value is true
+    use_synonyms: false # default value is true
+    matching_strictness: 0.8 # default value is 1.0
+    values:
+      - london
+      - [new york, big apple]
+      - [paris, city of lights]
+
+------
+Intent
+------
+
+Here is the format used to describe an intent:
+
+.. code-block:: yaml
+
+    # searchFlight Intent
+    ---
+    type: intent
+    name: searchFlight # name of the intent
+    utterances:
+      - find me a flight from [origin:city](Paris) to [destination:city](New York)
+      - I need a flight leaving [date:snips/datetime](this weekend) to [destination:city](Berlin)
+      - show me flights to go to [destination:city](new york) leaving [date:snips/datetime](this evening)
+
+We use a standard markdown-like annotation syntax to annotate slots within
+utterances.
The ``[origin:city](Paris)`` chunk describes a slot with its three
+components:
+
+    - ``origin``: the slot name
+    - ``city``: the slot type
+    - ``Paris``: the slot value
+
+Note that different slot names can share the same slot type. This is the case
+for the ``origin`` and ``destination`` slot names in the previous example, which
+have the same slot type ``city``.
+
+If you plan to write more than just a few utterances, you can specify the
+slot mapping explicitly in the intent file and remove it from the
+utterances, which results in simpler annotations:
+
+.. code-block:: yaml
+
+    # searchFlight Intent
+    ---
+    type: intent
+    name: searchFlight # name of the intent
+    slots:
+      - name: origin
+        entity: city
+      - name: destination
+        entity: city
+      - name: date
+        entity: snips/datetime
+    utterances:
+      - find me a flight from [origin](Paris) to [destination](New York)
+      - I need a flight leaving [date](this weekend) to [destination](Berlin)
+      - show me flights to go to [destination](new york) leaving [date](this evening)
+
+-------
+Dataset
+-------
+
+You are free to organize the YAML documents as you want: you can have one
+YAML file for each intent and each entity, or gather several documents
+together (e.g. all entities, or all intents) in the same YAML file.
+Here is the YAML file corresponding to the previous ``city`` entity and
+``searchFlight`` intent, merged together:
+
+.. code-block:: yaml
+
+    # City Entity
+    ---
+    type: entity # allows entity files to be distinguished from intent files
+    name: city # name of the entity
+    values:
+      - london # single entity value
+      - [new york, big apple] # entity value with a synonym
+      - [paris, city of lights]
+
+    # searchFlight Intent
+    ---
+    type: intent
+    name: searchFlight # name of the intent
+    slots:
+      - name: origin
+        entity: city
+      - name: destination
+        entity: city
+      - name: date
+        entity: snips/datetime
+    utterances:
+      - find me a flight from [origin](Paris) to [destination](New York)
+      - I need a flight leaving [date](this weekend) to [destination](Berlin)
+      - show me flights to go to [destination](new york) leaving [date](this evening)
+
+Once your intents and entities are created using the YAML format described
+previously, you can produce a dataset using the
+:ref:`Command Line Interface (CLI) `:
+
+.. code-block:: console
+
+    snips-nlu generate-dataset en city.yaml searchFlight.yaml > dataset.json
+
+Or, alternatively, if you merged the YAML documents into a single file:
+
+.. code-block:: console
+
+    snips-nlu generate-dataset en dataset.yaml > dataset.json
+
+This will generate a JSON dataset and write it to the ``dataset.json`` file.
+The generated file uses the second supported format, which is described
+in the next section.
+
+JSON format
+-----------
+
+The JSON format is the one eventually consumed by the training API. It
+was designed to be easy to parse.
+
+We created a `sample dataset`_ that you can check to better understand the
+format.
+
+There are three attributes at the root of the JSON document:
+
+    - ``"language"``: the language of the dataset in :ref:`ISO format `
+    - ``"intents"``: a dictionary mapping between intent names and intent data
+    - ``"entities"``: a dictionary mapping between entity names and entity data
+
+Here is how the entities are represented in this format:
+
+.. 
code-block:: json + + { + "entities": { + "snips/datetime": {}, + "city": { + "data": [ + { + "value": "london", + "synonyms": [] + }, + { + "value": "new york", + "synonyms": [ + "big apple" + ] + }, + { + "value": "paris", + "synonyms": [ + "city of lights" + ] + } + ], + "use_synonyms": true, + "automatically_extensible": true, + "matching_strictness": 1.0 + } + } + } + +Note that the ``"snips/datetime"`` entity data is empty as it is a +:ref:`builtin entity `. + +The intent utterances are defined using the following format: + +.. code-block:: json + + { + "data": [ + { + "text": "find me a flight from " + }, + { + "text": "Paris", + "entity": "city", + "slot_name": "origin" + }, + { + "text": " to " + }, + { + "text": "New York", + "entity": "city", + "slot_name": "destination" + } + ] + } + +Once you have created a JSON dataset, either directly or with YAML files, you +can use it to train an NLU engine. To do so, you can use the CLI as documented +:ref:`here`, or the :ref:`python API `. + +.. _sample dataset: https://github.com/snipsco/snips-nlu/blob/master/snips_nlu_samples/sample_dataset.json \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 94dbbe1ab..5b976aedc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,9 +81,10 @@ the :ref:`api` documentation or alternatively check the `github repository`_. installation quickstart tutorial - cli + dataset data_model languages + cli api diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index e36dddbb6..cbb2b4115 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -43,7 +43,7 @@ resources used to improve performance with the :func:`.load_resources` function. nlu_engine = SnipsNLUEngine() Now that we have our engine object created, we need to feed it with our sample -dataset. In general, this action will require some *machine learning* hence we +dataset. In general, this action will require some *machine learning*, so we will actually *fit* the engine: .. code-block:: python @@ -52,7 +52,7 @@ will actually *fit* the engine: Our NLU engine is now trained to recognize new utterances that extend beyond -what is strictly contained in the dataset, it is able to *generalize*. +what is strictly contained in the dataset: it is able to *generalize*. Let's try to parse something now! diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 0ae3c17da..0585b0e57 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -4,22 +4,20 @@ Tutorial ======== In this section, we will build an NLU assistant for home automation tasks. It -will be able to understand queries about lights and thermostats. More precisely -our assistant will contain three :ref:`intents `: +will be able to understand queries about lights and thermostats. More +precisely, our assistant will contain three :ref:`intents `: - ``turnLightOn`` - ``turnLightOff`` - ``setTemperature`` The first two intents will be about turning on and off the lights in a specific -room. Thus, these intents will have one :ref:`slot` which will be the ``room``. -The third intent will let you control the temperature of a specific room, thus -it will have two slots: the ``roomTemperature`` and the ``room``. +room. These intents will have one :ref:`slot` which will be the ``room``. +The third intent will let you control the temperature of a specific room. It +will have two slots: the ``roomTemperature`` and the ``room``. The first step is to create an appropriate dataset for this task. 
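For readers who prefer to stay in Python rather than shell out to the CLI, the ``generate-dataset`` command added earlier in this series is a thin wrapper around the dataset API, so the same JSON can be produced programmatically. A minimal sketch, assuming a ``dataset.yaml`` file that follows the YAML format documented above (the file name is illustrative only):

.. code-block:: python

    import json

    from snips_nlu.dataset import AssistantDataset

    # "dataset.yaml" is a hypothetical file written in the YAML dataset
    # format; from_yaml_files accepts any number of intent and entity files.
    dataset = AssistantDataset.from_yaml_files("en", ["dataset.yaml"])

    # The .json property holds the JSON-format dict consumed by the training
    # API, mirroring the output of `snips-nlu generate-dataset`.
    with open("dataset.json", "w") as f:
        f.write(json.dumps(dataset.json, indent=2, sort_keys=True))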
-.. _dataset: - Snips dataset format -------------------- @@ -251,6 +249,8 @@ That will raise a ``NotTrained`` error, as we did not train the engine with the dataset that we created. +.. _training_the_engine: + Training the engine ------------------- From da950258a4295f216276e26d6d3ef0a932e3e4bc Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 12:18:33 +0100 Subject: [PATCH 10/24] Update documentation with new YAML format --- docs/source/cli.rst | 70 +++-------- docs/source/dataset.rst | 4 +- docs/source/tutorial.rst | 251 +++++++++++++-------------------------- 3 files changed, 101 insertions(+), 224 deletions(-) diff --git a/docs/source/cli.rst b/docs/source/cli.rst index a5334df0c..c031f7600 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -14,70 +14,30 @@ is typically used by running ``snips-nlu [args]`` or alternatively Creating a dataset ------------------ -As seen in the :ref:`tutorial` section, a command allows you to generate a -dataset from a :ref:`language ` and a list of text files describing -:ref:`intents ` and :ref:`entities `: +As seen in the :ref:`tutorial ` section, a command allows you to generate a +dataset from a :ref:`language ` and a list of YAML files containing +data for :ref:`intents ` and :ref:`entities `: .. code-block:: bash - snips-nlu generate-dataset en intent_1.txt intent_2.txt entity_1.txt + snips-nlu generate-dataset en my_first_intent.yaml my_second_intent.yaml my_entity.yaml -This will print a Json string to the standard output. If you want to store the -dataset directly in a Json file, you just have to pipe the previous command like -below: - -.. code-block:: bash - - snips-nlu generate-dataset en intent_1.txt intent_2.txt entity_1.txt > dataset.json - - -Each intent file corresponds to a single intent, and the name of the file must -start with ``intent_``. The same is true for entity files, which must start -with ``entity_``. - -An intent file is a text file in which each row corresponds to an utterance. -Slots, along with their corresponding slot type (entity), can be defined using -the following syntax: - -.. code-block:: console +.. note:: - Find me a flight from [departure:city](Paris) to [destination:city](London) - Find me a flight from [departure:city](Moscow) [departureDate:snips/datetime](tomorrow around 9pm) + You don't have to use separated files for each intent and entity. You could + for instance merge all intents together in a single ``intents.yaml`` file, + or even merge all intents and entities in a single ``dataset.yaml`` file. -In this example, there are three different slots -- ``departure``, -``destination`` and ``departureDate`` -- and two different entities -- ``city`` -and ``snips/datetime`` (which is a :ref:`builtin entity `). -Check :ref:`this section ` to have more details about the -difference between slots and entities. - -An entity file is a comma separated text file in which each row corresponds to -an entity value, optionally followed with its :ref:`synonyms `. The syntax used -is the following: - -.. code-block:: console - - bedroom - garden,yard,backyard - -Here, the entity (room) has two values which are ``"bedroom"`` and ``"garden"``. -Two synonyms, ``"yard"`` and ``"backyard"``, are defined for ``"garden"``. -If a value or a synonym contains a comma, the value must be put between -double quotes ``"``. - -If the value contains double quotes, it must be doubled -to be escaped like this: ``"A value with a "","" in it"`` which corresponds -to the actual value ``A value with a "," in it``. 
- -.. Note:: +This will print a JSON string to the standard output. If you want to store the +dataset directly in a JSON file, you just have to pipe the previous command like +below: - By default entities are generated as :ref:`automatically extensible `, - i.e. the recognition will accept additional values than the ones listed in - the entity file. This behavior can be changed by adding at the beginning of - the entity file the following: +.. code-block:: bash - .. code-block:: bash + snips-nlu generate-dataset en my_first_intent.yaml my_second_intent.yaml my_entity.yaml > dataset.json - # automatically_extensible=false +Check the :ref:`Training Dataset Format ` section for more details +about the format used to describe the training data. .. _training_cli: diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index 826c1f638..1e3628884 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -16,6 +16,8 @@ manually. The other dataset format uses JSON and should rather be used if you plan to create or edit datasets programmatically. +.. _yaml_format: + YAML format ----------- @@ -244,6 +246,6 @@ The intent utterances are defined using the following format: Once you have created a JSON dataset, either directly or with YAML files, you can use it to train an NLU engine. To do so, you can use the CLI as documented -:ref:`here`, or the :ref:`python API `. +:ref:`here `, or the :ref:`python API `. .. _sample dataset: https://github.com/snipsco/snips-nlu/blob/master/snips_nlu_samples/sample_dataset.json \ No newline at end of file diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 0585b0e57..72b6576b2 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -18,183 +18,98 @@ will have two slots: the ``roomTemperature`` and the ``room``. The first step is to create an appropriate dataset for this task. -Snips dataset format --------------------- - -The format used by Snips to describe the input data is designed to be simple to -parse as well as easy to read. - -We created a `sample dataset`_ that you can check to better understand the -format. - -You have three options to create your dataset. You can build it manually by -respecting the format used in the sample, you can also use the -:ref:`dataset creation CLI ` included in the lib, or alternatively -you can use `chatito`_ a DSL tool for dataset generation. - -We will go for the second option here and start by creating three files -corresponding to our three intents and one entity file corresponding to the -``room`` entity: - -- ``intent_turnLightOn.txt`` -- ``intent_turnLightOff.txt`` -- ``intent_setTemperature.txt`` -- ``entity_room.txt`` - -The name of each file is important as the tool will map it to the intent or -entity name. In particular, the prefixes ``intent_`` and ``entity_`` are -required in order to distinguish intents from entity files. - -Let's add training examples for the first intent by inserting the following -lines in the first file, ``intent_turnLightOn.txt``: - -.. code-block:: console - - Turn on the lights in the [room:room](kitchen) - give me some light in the [room:room](bathroom) please - Can you light up the [room:room](living room) ? - switch the [room:room](bedroom)'s lights on please - -We use a standard markdown-like annotation syntax to annotate slots within -utterances. The ``[room:room]`` chunks describe the slot with its two -components: :ref:`the slot name and the entity `. In our -case we used the same value, ``room``, to describe both. 
The parts with
-parenthesis, like ``(kitchen)``, correspond to the text value of the slot.
-
-Let's move on to the second intent, and insert this into
-``intent_turnLightOff.txt``:
-
-.. code-block:: console
-
-    Turn off the lights in the [room:room](entrance)
-    turn the [room:room](bathroom)'s light out please
-    switch off the light the [room:room](kitchen), will you?
-    Switch the [room:room](bedroom)'s lights off please
-
-And now the last file, ``intent_setTemperature.txt``:
-
-.. code-block:: console
-
-    Set the temperature to [roomTemperature:snips/temperature](19 degrees) in the [room:room](bedroom)
-    please set the [room:room](living room)'s temperature to [roomTemperature:snips/temperature](twenty two degrees celsius)
-    I want [roomTemperature:snips/temperature](75 degrees fahrenheit) in the [room:room](bathroom) please
-    Can you increase the temperature to [roomTemperature:snips/temperature](22 degrees) ?
-
-As you can see here, we used a new slot, ``[room_temperature:snips/temperature]``,
-whose name is ``roomTemperature`` and whose type is ``snips/temperature``. The slot
-type used here is a :ref:`builtin entity `. It
-allows you to resolve the temperature values properly.
-
-Let's move to the ``entity_room.txt`` entity file:
-
-.. code-block:: console
-
-    bedroom
-    living room,main room
-    garden,yard,backyard
-
-The entity file is a comma (``,``) separated file. Each line corresponds to an
-entity value followed by its potential :ref:`synonyms `.
-
-We are now ready to generate our dataset:
+Training Data
+-------------
+
+Check the :ref:`Training Dataset Format ` section for more details
+about the format used to describe the training data.
+
+In this tutorial, we will create our dataset using the
+:ref:`YAML format `, and create a ``dataset.yaml`` file with the
+following content:
+
+.. code-block:: yaml
+
+    # turnLightOn intent
+    ---
+    type: intent
+    name: turnLightOn
+    slots:
+      - name: room
+        entity: room
+    utterances:
+      - Turn on the lights in the [room](kitchen)
+      - give me some light in the [room](bathroom) please
+      - Can you light up the [room](living room) ?
+      - switch the [room](bedroom)'s lights on please
+
+    # turnLightOff intent
+    ---
+    type: intent
+    name: turnLightOff
+    slots:
+      - name: room
+        entity: room
+    utterances:
+      - Turn off the lights in the [room](entrance)
+      - turn the [room](bathroom)'s light out please
+      - switch off the light the [room](kitchen), will you?
+      - Switch the [room](bedroom)'s lights off please
+
+    # setTemperature intent
+    ---
+    type: intent
+    name: setTemperature
+    slots:
+      - name: room
+        entity: room
+      - name: roomTemperature
+        entity: snips/temperature
+    utterances:
+      - Set the temperature to [roomTemperature](19 degrees) in the [room](bedroom)
+      - please set the [room](living room)'s temperature to [roomTemperature](twenty two degrees celsius)
+      - I want [roomTemperature](75 degrees fahrenheit) in the [room](bathroom) please
+      - Can you increase the temperature to [roomTemperature](22 degrees) ?
+
+    # room entity
+    ---
+    type: entity
+    name: room
+    automatically_extensible: no
+    values:
+      - bedroom
+      - [living room, main room, lounge]
+      - [garden, yard, backyard]
+
+Here, we put all the intents and entities in the same file, but we could have
+split them into dedicated files as well.
+
+The ``setTemperature`` intent references a ``roomTemperature`` slot which
+relies on the ``snips/temperature`` entity. This entity is a
+:ref:`builtin entity `. It allows the
+temperature values to be resolved properly.
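+
+As an illustration of what this resolution provides, here is roughly the kind
+of slot that will be output for an utterance like "Set the temperature to 19
+degrees in the bedroom" once the engine is trained. This is only a sketch: the
+exact field names (``rawValue``, ``slotName``, ``kind``) and values shown here
+are indicative and may differ depending on your version of the engine:
+
+.. code-block:: json
+
+    {
+      "rawValue": "19 degrees",
+      "value": {
+        "kind": "Temperature",
+        "value": 19.0,
+        "unit": "degree"
+      },
+      "entity": "snips/temperature",
+      "slotName": "roomTemperature"
+    }
+
+The raw text chunk is kept in ``rawValue``, while ``value`` contains the
+resolved temperature, with the amount and the unit parsed out of the text.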
+ +The ``room`` entity makes use of :ref:`synonyms ` by defining lists +like ``[living room, main room, lounge]``. In this case, ``main room`` and +``lounge`` will point to ``living room``, the first item of the list, which is +the reference value. + +Besides, this entity is marked as not +:ref:`automatically extensible ` which means that the NLU +will only output values that we have defined and will not try to match other +values. + +We are now ready to generate our dataset using the :ref:`CLI `: .. code-block:: bash - snips-nlu generate-dataset en intent_turnLightOn.txt intent_turnLightOff.txt intent_setTemperature.txt entity_room.txt > dataset.json + snips-nlu generate-dataset en dataset.yaml > dataset.json .. note:: We used ``en`` as the language here but other languages are supported, please check the :ref:`languages` section to know more. -Now, the ``"entities"`` part of the generated json looks like that: - -.. code-block:: json - - { - "entities": { - "room": { - "automatically_extensible": true, - "data": [ - { - "synonyms": [], - "value": "bedroom" - }, - { - "synonyms": [ - "main room" - ], - "value": "living room" - }, - { - "synonyms": [ - "yard", - "backyard" - ], - "value": "garden" - } - ], - "matching_strictness": 1.0, - "use_synonyms": true - }, - "snips/temperature": {} - } - } - -You can see that both entities from the intent utterances and from the ``room`` -entity file were added. - -By default, the ``room`` entity is set to be -:ref:`automatically extensible ` but in our case we don't want -to handle any entity value that would not be part of the dataset, so we set -this attribute to ``false``. -Moreover, we are going to add some rooms that were not in the previous sentences -and that we want our assistant to cover. Additionally, we add some -:ref:`synonyms `. Finally, the entities part looks like that: - -.. code-block:: json - - { - "entities": { - "room": { - "automatically_extensible": false, - "data": [ - { - "synonyms": [], - "value": "bathroom" - }, - { - "synonyms": [ - "sleeping room" - ], - "value": "bedroom" - }, - { - "synonyms": [ - "main room", - "lounge" - ], - "value": "living room" - }, - { - "synonyms": [ - "yard", - "backyard" - ], - "value": "garden" - } - ], - "matching_strictness": 1.0, - "use_synonyms": true - }, - "snips/temperature": {} - } - } - - -We don't need to edit the ``snips/temperature`` entity as it is a builtin -entity. - Now that we have our dataset ready, let's move to the next step which is to create an NLU engine. From 0b898c2b7de1dcc79d101380f8b8adcf721aa9d7 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 14:35:30 +0100 Subject: [PATCH 11/24] Add documentation about implicit values in YAML format --- docs/source/dataset.rst | 105 +++++++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 22 deletions(-) diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index 1e3628884..f20a65d29 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -18,13 +18,13 @@ create or edit datasets programmatically. .. _yaml_format: +=========== YAML format ------------ +=========== The YAML dataset format allows you to define intents and entities using the `YAML `_ syntax. 
------- Entity ------ @@ -37,9 +37,9 @@ Here is what an entity file looks like: type: entity # allows to differentiate between entities and intents files name: city # name of the entity values: - - london # single entity value - - [new york, big apple] # entity value with a synonym - - [paris, city of lights] + - london # single entity value + - [new york, big apple] # entity value with a synonym + - [paris, city of lights] You can specify entity values either using single YAML scalars (e.g. ``london``), or using lists if you want to define some synonyms (e.g. @@ -58,11 +58,10 @@ are optional: use_synonyms: false # default value is true matching_strictness: 0.8 # default value is 1.0 values: - - london - - [new york, big apple] - - [paris, city of lights] + - london + - [new york, big apple] + - [paris, city of lights] ------- Intent ------ @@ -100,7 +99,7 @@ utterances. This will result in simpler annotations: # searchFlight Intent --- type: intent - name: searchFlight # name of the intent + name: searchFlight slots: - name: origin entity: city @@ -113,7 +112,6 @@ utterances. This will result in simpler annotations: - I need a flight leaving [date](this weekend) to [destination](Berlin) - show me flights to go to [arrival](new york) leaving [date](this evening) -------- Dataset ------- @@ -125,19 +123,47 @@ Here is the yaml file corresponding to the previous ``city`` entity and .. code-block:: yaml + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [arrival](new york) leaving [date](this evening) + # City Entity --- - type: entity # allows to differentiate between entities and intents files - name: city # name of the entity + type: entity + name: city values: - - london # single entity value - - [new york, big apple] # entity value with a synonym - - [paris, city of lights] + - london + - [new york, big apple] + - [paris, city of lights] + +--------------------------------------- +Implicit entity values and slot mapping +--------------------------------------- + +In order to make the annotation process even easier, there is a mechanism that +allows to populate entity values automatically based on the entity values that +are already provided. + +This results in a much simpler dataset file: + +.. code-block:: yaml # searchFlight Intent --- type: intent - name: searchFlight # name of the intent + name: searchFlight slots: - name: origin entity: city @@ -146,9 +172,41 @@ Here is the yaml file corresponding to the previous ``city`` entity and - name: date entity: snips/datetime utterances: - - find me a flight from [origin](Paris) to [destination](New York) - - I need a flight leaving [date](this weekend) to [destination](Berlin) - - show me flights to go to [arrival](new york) leaving [date](this evening) + - find me a flight from [origin] to [destination] + - I need a flight leaving [date] to [destination] + - show me flights to go to [arrival] leaving [date] + + # City Entity + --- + type: entity + name: city + values: + - london + - [new york, big apple] + - [paris, city of lights] + +For this to work, you need to provide at least one value for each +*custom entity*. 
This can be done either through an entity file, or simply by +providing an entity value in one of the annotated utterances. +Entity values are automatically generated for *builtin entities*. + +Here is a final example of a valid YAML dataset leveraging implicit entity +values as well as implicit slot mapping: + +.. code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + utterances: + - find me a flight from [origin:city](Paris) to [destination:city] + - I need a flight leaving [date:snips/datetime] to [destination] + - show me flights to go to [arrival] leaving [date] + +Note that the city entity was not provided here, but one value (``Paris``) was +provided in the first annotated utterance. The mapping between slot name and +entity is also inferred from the first two utterances. Once your intents and entities are created using the YAML format described previously, you can produce a dataset using the @@ -166,10 +224,13 @@ Or alternatively if you merged the yaml documents into a single file: This will generate a JSON dataset and write it in the ``dataset.json`` file. The format of the generated file is the second allowed format that is described -in the next section. +in the :ref:`JSON format ` section. + +.. _json_format: +=========== JSON format ------------ +=========== The JSON format is the format which is eventually used by the training API. It was designed to be easy to parse. From 32be2bfbc93f53f02d08b407d5b9d87a769dbf6f Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 14:49:14 +0100 Subject: [PATCH 12/24] Fix yaml examples --- docs/source/dataset.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index f20a65d29..16743a77c 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -76,7 +76,7 @@ Here is the format used to describe an intent: utterances: - find me a flight from [origin:city](Paris) to [destination:city](New York) - I need a flight leaving [date:snips/datetime](this weekend) to [destination:city](Berlin) - - show me flights to go to [arrival:city](new york) leaving [date:snips/datetime](this evening) + - show me flights to go to [destination:city](new york) leaving [date:snips/datetime](this evening) We use a standard markdown-like annotation syntax to annotate slots within utterances. The ``[origin:city](Paris)`` chunk describes a slot with its three @@ -110,7 +110,7 @@ utterances. 
This will result in simpler annotations: utterances: - find me a flight from [origin](Paris) to [destination](New York) - I need a flight leaving [date](this weekend) to [destination](Berlin) - - show me flights to go to [arrival](new york) leaving [date](this evening) + - show me flights to go to [destination](new york) leaving [date](this evening) Dataset ------- @@ -137,7 +137,7 @@ Here is the yaml file corresponding to the previous ``city`` entity and utterances: - find me a flight from [origin](Paris) to [destination](New York) - I need a flight leaving [date](this weekend) to [destination](Berlin) - - show me flights to go to [arrival](new york) leaving [date](this evening) + - show me flights to go to [destination](new york) leaving [date](this evening) # City Entity --- @@ -174,7 +174,7 @@ This results in a much simpler dataset file: utterances: - find me a flight from [origin] to [destination] - I need a flight leaving [date] to [destination] - - show me flights to go to [arrival] leaving [date] + - show me flights to go to [destination] leaving [date] # City Entity --- @@ -202,7 +202,7 @@ values as well as implicit slot mapping: utterances: - find me a flight from [origin:city](Paris) to [destination:city] - I need a flight leaving [date:snips/datetime] to [destination] - - show me flights to go to [arrival] leaving [date] + - show me flights to go to [destination] leaving [date] Note that the city entity was not provided here, but one value (``Paris``) was provided in the first annotated utterance. The mapping between slot name and From 04407156b484319699dbec9b8d8dea15721c7ecc Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 15:28:04 +0100 Subject: [PATCH 13/24] Rename AssistantDataset to Dataset --- snips_nlu/cli/generate_dataset.py | 6 +++--- snips_nlu/dataset/__init__.py | 2 +- snips_nlu/dataset/dataset.py | 10 +++++----- snips_nlu/dataset/entity.py | 2 +- snips_nlu/dataset/intent.py | 12 +----------- snips_nlu/tests/test_dataset_loading.py | 8 ++++---- 6 files changed, 15 insertions(+), 25 deletions(-) diff --git a/snips_nlu/cli/generate_dataset.py b/snips_nlu/cli/generate_dataset.py index faca6043d..a62d7cba0 100644 --- a/snips_nlu/cli/generate_dataset.py +++ b/snips_nlu/cli/generate_dataset.py @@ -4,7 +4,7 @@ import plac -from snips_nlu.dataset import AssistantDataset +from snips_nlu.dataset import Dataset @plac.annotations( @@ -14,7 +14,7 @@ def generate_dataset(language, *files): """Create a Snips NLU dataset from text friendly files""" if any(f.endswith(".yml") or f.endswith(".yaml") for f in files): - dataset = AssistantDataset.from_yaml_files(language, list(files)) + dataset = Dataset.from_yaml_files(language, list(files)) else: - dataset = AssistantDataset.from_files(language, list(files)) + dataset = Dataset.from_files(language, list(files)) print(json.dumps(dataset.json, indent=2, sort_keys=True)) diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py index 9dd099c85..a43f2268b 100644 --- a/snips_nlu/dataset/__init__.py +++ b/snips_nlu/dataset/__init__.py @@ -1,4 +1,4 @@ -from snips_nlu.dataset.dataset import AssistantDataset +from snips_nlu.dataset.dataset import Dataset from snips_nlu.dataset.entity import Entity, EntityFormatError from snips_nlu.dataset.intent import Intent, IntentFormatError from snips_nlu.dataset.utils import ( diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index d9cef2318..e72baad83 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -18,12 +18,12 @@ 
class DatasetFormatError(TypeError): pass -class AssistantDataset(object): +class Dataset(object): """Dataset used in the main NLU training API Consists of intents and entities data. This object can be built either from - text files (:meth:`.AssistantDataset.from_files`) or from YAML files - (:meth:`.AssistantDataset.from_yaml_files`). + text files (:meth:`.Dataset.from_files`) or from YAML files + (:meth:`.Dataset.from_yaml_files`). Attributes: language (str): language of the intents @@ -61,8 +61,8 @@ def from_yaml_files(cls, language, filenames): current_version=__version__, details="Use from_yaml_files instead") def from_files(cls, language, filenames): - """Creates an :class:`.AssistantDataset` from a language and a list of - intent and entity files + """Creates a :class:`.Dataset` from a language and a list of intent and + entity files Args: language (str): language of the assistant diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py index 04c9743ce..06fe84df8 100644 --- a/snips_nlu/dataset/entity.py +++ b/snips_nlu/dataset/entity.py @@ -23,7 +23,7 @@ class EntityFormatError(TypeError): class Entity(object): - """Entity of an :class:`.AssistantDataset` + """Entity data of a :class:`.Dataset` This class can represents both a custom or a builtin entity diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index 58dbaa679..c4abe6c19 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -20,17 +20,7 @@ class IntentFormatError(TypeError): class Intent(object): - """Dataset of an intent - - Can parse utterances from a text file or an iterator. - - An example of utterance is: - - "the [role:role](president) of [country:country](France)" - - a Tag is in this format: - - [slot:entity_name](text_to_tag) + """Intent data of a :class:`.Dataset` Attributes: intent_name (str): name of the intent diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py index 458fba82f..7280f73d5 100644 --- a/snips_nlu/tests/test_dataset_loading.py +++ b/snips_nlu/tests/test_dataset_loading.py @@ -7,7 +7,7 @@ from deprecation import fail_if_not_removed from mock import patch -from snips_nlu.dataset import AssistantDataset, validate_and_format_dataset +from snips_nlu.dataset import Dataset, validate_and_format_dataset EXPECTED_DATASET_DICT = { "entities": { @@ -192,7 +192,7 @@ def mock_open(filename, **kwargs): dataset_files = [intent_file_1, intent_file_2, entity_file_1] # When - dataset = AssistantDataset.from_yaml_files("en", dataset_files) + dataset = Dataset.from_yaml_files("en", dataset_files) dataset_dict = dataset.json # Then @@ -241,7 +241,7 @@ def mock_open(filename, **kwargs): mock_io.open.side_effect = mock_open # When - dataset = AssistantDataset.from_yaml_files("en", [dataset_file]) + dataset = Dataset.from_yaml_files("en", [dataset_file]) dataset_dict = dataset.json # Then @@ -287,7 +287,7 @@ def mock_open(self_, *args, **kwargs): # When with patch("pathlib.io") as mock_io: mock_io.open.side_effect = mock_open - dataset = AssistantDataset.from_files("en", dataset_files) + dataset = Dataset.from_files("en", dataset_files) dataset_dict = dataset.json # When / Then From 34b90cf4bbb69c15f8aea93c54d2ccaac3b83574 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 15:28:29 +0100 Subject: [PATCH 14/24] Add API reference for Dataset, Intent and Entity classes --- docs/source/api.rst | 14 +++++++++++ docs/source/dataset.rst | 12 ++++++++++ snips_nlu/dataset/dataset.py | 46 
++++++++++++++++++++++++++++++++++++ snips_nlu/dataset/entity.py | 28 ++++++++++++++++++++-- snips_nlu/dataset/intent.py | 33 +++++++++++++++++++++++--- 5 files changed, 128 insertions(+), 5 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 365e50c8c..9832e7ec6 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -96,6 +96,20 @@ Configurations :members: +Dataset +------- + +.. module:: snips_nlu.dataset + +.. autoclass:: Dataset + :members: + +.. autoclass:: Intent + :members: + +.. autoclass:: Entity + :members: + Result and output format ------------------------ diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index 16743a77c..d414ba0fd 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -25,6 +25,8 @@ YAML format The YAML dataset format allows you to define intents and entities using the `YAML `_ syntax. +.. _yaml_entity_format: + Entity ------ @@ -62,6 +64,8 @@ are optional: - [new york, big apple] - [paris, city of lights] +.. _yaml_intent_format: + Intent ------ @@ -112,6 +116,9 @@ utterances. This will result in simpler annotations: - I need a flight leaving [date](this weekend) to [destination](Berlin) - show me flights to go to [destination](new york) leaving [date](this evening) + +.. _yaml_dataset_format: + Dataset ------- @@ -148,6 +155,11 @@ Here is the yaml file corresponding to the previous ``city`` entity and - [new york, big apple] - [paris, city of lights] +.. important:: + + If you plan to have more than one entity or intent in a YAML file, you must + separate them using the YAML document separator: ``---`` + --------------------------------------- Implicit entity values and slot mapping --------------------------------------- diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index e72baad83..a80e8ee6f 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -40,6 +40,52 @@ def __init__(self, language, intents, entities): @classmethod def from_yaml_files(cls, language, filenames): + """Creates a :class:`.Dataset` from a language and a list of YAML files + containing intents and entities data + + Each file need not correspond to a single entity nor intent. They can + consist in several entities and intents merged together in a single + file. + + A dataset can be defined with a YAML document following the schema + illustrated in the example below: + + .. 
code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [destination](new york) leaving [date](this evening) + + # City Entity + --- + type: entity + name: city + values: + - london + - [new york, big apple] + - [paris, city of lights] + + Raises: + DatasetFormatError: When one of the documents present in the YAML + files has a wrong 'type' attribute, which is not 'entity' nor + 'intent' + IntentFormatError: When the YAML document of an intent does not + correspond to the :ref:`expected intent format ` + EntityFormatError: When the YAML document of an entity does not + correspond to the :ref:`expected entity format ` + """ entities = [] intents = [] for filename in filenames: diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py index 06fe84df8..9d138b525 100644 --- a/snips_nlu/dataset/entity.py +++ b/snips_nlu/dataset/entity.py @@ -25,7 +25,8 @@ class EntityFormatError(TypeError): class Entity(object): """Entity data of a :class:`.Dataset` - This class can represents both a custom or a builtin entity + This class can represents both a custom or a builtin entity. When the + entity is a builtin one, only the `name` attribute is relevant. Attributes: name (str): name of the entity @@ -56,7 +57,29 @@ def is_builtin(self): @classmethod def from_yaml(cls, yaml_dict): - """Build an :class:`.Entity` from its YAML definition dict""" + """Build an :class:`.Entity` from its YAML definition dict + + An entity can be defined with a YAML document following the schema + illustrated in the example below: + + .. 
code-block:: yaml + + # City Entity + --- + type: entity + name: city + automatically_extensible: false # default value is true + use_synonyms: false # default value is true + matching_strictness: 0.8 # default value is 1.0 + values: + - london + - [new york, big apple] + - [paris, city of lights] + + Raises: + EntityFormatError: When the YAML dict does not correspond to the + :ref:`expected entity format ` + """ object_type = yaml_dict.get("type") if object_type and object_type != "entity": raise EntityFormatError("Wrong type: '%s'" % object_type) @@ -88,6 +111,7 @@ def from_yaml(cls, yaml_dict): @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", current_version=__version__, details="Use from_yaml instead") def from_file(cls, filepath): + """Build an :class:`.Entity` from a text file""" filepath = Path(filepath) stem = filepath.stem if not stem.startswith("entity_"): diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index c4abe6c19..c8f26dbb1 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -24,7 +24,8 @@ class Intent(object): Attributes: intent_name (str): name of the intent - utterances (list of :class:`.IntentUtterance`): intent utterances + utterances (list of :class:`.IntentUtterance`): annotated intent + utterances slot_mapping (dict): mapping between slot names and entities """ @@ -39,7 +40,33 @@ def __init__(self, intent_name, utterances, slot_mapping=None): @classmethod def from_yaml(cls, yaml_dict): - """Build an :class:`.Intent` from its YAML definition dict""" + """Build an :class:`.Intent` from its YAML definition dict + + An intent can be defined with a YAML document following the schema + illustrated in the example below: + + .. code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [destination](new york) leaving [date](this evening) + + Raises: + IntentFormatError: When the YAML dict does not correspond to the + :ref:`expected intent format ` + """ object_type = yaml_dict.get("type") if object_type and object_type != "intent": raise IntentFormatError("Wrong type: '%s'" % object_type) @@ -60,6 +87,7 @@ def from_yaml(cls, yaml_dict): @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", current_version=__version__, details="Use from_yaml instead") def from_file(cls, filepath): + """Build an :class:`.Intent` from a text file""" filepath = Path(filepath) stem = filepath.stem if not stem.startswith("intent_"): @@ -102,7 +130,6 @@ def json(self): @property def entities_names(self): - """Set of entity names present in the intent utterances""" return set(chunk.entity for u in self.utterances for chunk in u.chunks if isinstance(chunk, SlotChunk)) From 8ab67bbf2f56bd650324e4cf3e02da6d8ed170be Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 21 Nov 2018 15:56:28 +0100 Subject: [PATCH 15/24] Fix linting issues --- snips_nlu/dataset/dataset.py | 3 +++ snips_nlu/dataset/intent.py | 2 ++ snips_nlu/tests/test_dataset_loading.py | 3 +-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py index a80e8ee6f..d4eb996a2 100644 --- a/snips_nlu/dataset/dataset.py +++ b/snips_nlu/dataset/dataset.py @@ -40,6 +40,7 @@ def __init__(self, 
language, intents, entities): @classmethod def from_yaml_files(cls, language, filenames): + # pylint:disable=line-too-long """Creates a :class:`.Dataset` from a language and a list of YAML files containing intents and entities data @@ -86,6 +87,7 @@ def from_yaml_files(cls, language, filenames): EntityFormatError: When the YAML document of an entity does not correspond to the :ref:`expected entity format ` """ + # pylint:enable=line-too-long entities = [] intents = [] for filename in filenames: @@ -183,6 +185,7 @@ def _get_entity_values(self, entity): @property def json(self): + """Dataset data in json format""" intents = {intent_data.intent_name: intent_data.json for intent_data in self.intents} entities = {entity.name: entity.json for entity in self.entities} diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index c8f26dbb1..ed8ce436c 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -40,6 +40,7 @@ def __init__(self, intent_name, utterances, slot_mapping=None): @classmethod def from_yaml(cls, yaml_dict): + # pylint:disable=line-too-long """Build an :class:`.Intent` from its YAML definition dict An intent can be defined with a YAML document following the schema @@ -67,6 +68,7 @@ def from_yaml(cls, yaml_dict): IntentFormatError: When the YAML dict does not correspond to the :ref:`expected intent format ` """ + # pylint:enable=line-too-long object_type = yaml_dict.get("type") if object_type and object_type != "intent": raise IntentFormatError("Wrong type: '%s'" % object_type) diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py index 7280f73d5..9b825b049 100644 --- a/snips_nlu/tests/test_dataset_loading.py +++ b/snips_nlu/tests/test_dataset_loading.py @@ -3,7 +3,6 @@ import io from unittest import TestCase -import mock from deprecation import fail_if_not_removed from mock import patch @@ -199,7 +198,7 @@ def mock_open(filename, **kwargs): validate_and_format_dataset(dataset_dict) self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) - @mock.patch("snips_nlu.dataset.dataset.io") + @patch("snips_nlu.dataset.dataset.io") def test_should_generate_dataset_from_merged_yaml_file(self, mock_io): # Given dataset_file = "dataset.yaml" From f8c9ee995fdcc4de64255eee79e94d779e26918b Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Thu, 22 Nov 2018 18:10:48 +0100 Subject: [PATCH 16/24] Fix issues after review --- docs/source/dataset.rst | 2 +- docs/source/index.rst | 2 +- snips_nlu/dataset/intent.py | 8 ++------ 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst index d414ba0fd..19d62b580 100644 --- a/docs/source/dataset.rst +++ b/docs/source/dataset.rst @@ -8,7 +8,7 @@ data in order to produce a powerful intent recognition engine. The better your training data is, and the more accurate your NLU engine will be. Thus, it is worth spending a bit of time to create a dataset that -corresponds well to your use case. +matches well your use case. Snips NLU accepts two different dataset formats. The first one, which relies on YAML, is the preferred option if you want to create or edit a dataset diff --git a/docs/source/index.rst b/docs/source/index.rst index 5b976aedc..faf75bd2d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,8 +81,8 @@ the :ref:`api` documentation or alternatively check the `github repository`_. 
installation quickstart tutorial - dataset data_model + dataset languages cli api diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py index ed8ce436c..d0261925b 100644 --- a/snips_nlu/dataset/intent.py +++ b/snips_nlu/dataset/intent.py @@ -15,10 +15,6 @@ class IntentFormatError(TypeError): pass -INTENT_FORMATTING_ERROR = IntentFormatError( - "Intent file is not properly formatted") - - class Intent(object): """Intent data of a :class:`.Dataset` @@ -288,7 +284,7 @@ def capture_slot(state): next_colon_pos = state.find(':') next_square_bracket_pos = state.find(']') if next_square_bracket_pos < 0: - raise INTENT_FORMATTING_ERROR + raise IntentFormatError("Missing ending ']' in annotated utterance") if next_colon_pos < 0 or next_square_bracket_pos < next_colon_pos: slot_name = state[:next_square_bracket_pos] state.move(next_square_bracket_pos) @@ -309,7 +305,7 @@ def capture_slot(state): def capture_tagged(state): next_pos = state.find(')') if next_pos < 1: - raise INTENT_FORMATTING_ERROR + raise IntentFormatError("Missing ending ')' in annotated utterance") else: tagged_text = state[:next_pos] state.add_tagged(tagged_text) From 770445d709b6f1844ca414f40039ad81feda7c40 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 20 Nov 2018 10:33:59 +0100 Subject: [PATCH 17/24] Bump snips-nlu-ontology to 0.62 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fe523edad..f1c386f7c 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ "sklearn-crfsuite>=0.3.6,<0.4", "semantic_version>=2.6,<3.0", "snips_nlu_utils>=0.7,<0.8", - "snips_nlu_ontology>=0.61.1,<0.62", + "snips_nlu_ontology>=0.62.0,<0.63", "num2words>=0.5.6,<0.6", "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", From 211ee37a77e0447d5fcd576f2c74d3e072431fd6 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Fri, 23 Nov 2018 10:45:09 +0100 Subject: [PATCH 18/24] Bump package version and model version --- snips_nlu/__about__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py index ae1289b8a..6ea37364c 100644 --- a/snips_nlu/__about__.py +++ b/snips_nlu/__about__.py @@ -11,8 +11,8 @@ __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai" __license__ = "Apache License, Version 2.0" -__version__ = "0.17.4" -__model_version__ = "0.17.0" +__version__ = "0.18.0" +__model_version__ = "0.18.0" __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download" __compatibility__ = "https://raw.githubusercontent.com/snipsco/snips-nlu-language-resources/master/compatibility.json" From ad70ae15afd4c4ffa5b04bb4455f1df04ccc6ca9 Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Fri, 23 Nov 2018 11:02:53 +0100 Subject: [PATCH 19/24] Adapt noise generation to add more unknownwords --- .../log_reg_classifier_utils.py | 62 ++- .../pipeline/configs/intent_classifier.py | 3 +- .../tests/test_log_reg_classifier_utils.py | 527 ++++++++++++++++++ .../tests/test_log_reg_intent_classifier.py | 411 +------------- 4 files changed, 577 insertions(+), 426 deletions(-) create mode 100644 snips_nlu/tests/test_log_reg_classifier_utils.py diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index 919b8b753..dab48053f 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py +++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py @@ -9,8 +9,8 @@ import numpy as np from future.utils import iteritems, itervalues -from 
snips_nlu.constants import (
-    DATA, ENTITY, INTENTS, TEXT, UNKNOWNWORD, UTTERANCES)
+from snips_nlu.constants import (DATA, ENTITIES, ENTITY, INTENTS, TEXT,
+                                 UNKNOWNWORD, UTTERANCES)
 from snips_nlu.data_augmentation import augment_utterances
 from snips_nlu.dataset import get_text_from_chunks
 from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
@@ -50,16 +50,16 @@ def get_noise_it(noise, mean_length, std_length, random_state):
 # pylint: enable=stop-iteration-return
 
 
-def generate_smart_noise(augmented_utterances, replacement_string, language):
+def generate_smart_noise(noise, augmented_utterances, replacement_string,
+                         language):
     text_utterances = [get_text_from_chunks(u[DATA])
                        for u in augmented_utterances]
     vocab = [w for u in text_utterances for w in tokenize_light(u, language)]
     vocab = set(vocab)
-    noise = get_noise(language)
     return [w if w in vocab else replacement_string for w in noise]
 
 
-def generate_noise_utterances(augmented_utterances, num_intents,
+def generate_noise_utterances(augmented_utterances, noise, num_intents,
                               data_augmentation_config, language,
                               random_state):
     if not augmented_utterances or not num_intents:
@@ -67,11 +67,9 @@ def generate_noise_utterances(augmented_utterances, num_intents,
     avg_num_utterances = len(augmented_utterances) / float(num_intents)
     if data_augmentation_config.unknown_words_replacement_string is not None:
         noise = generate_smart_noise(
-            augmented_utterances,
+            noise, augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
             language)
-    else:
-        noise = get_noise(language)
 
     noise_size = min(
         int(data_augmentation_config.noise_factor * avg_num_utterances),
@@ -89,14 +87,39 @@
         for _ in range(noise_size)]
 
 
-def add_unknown_word_to_utterances(augmented_utterances, replacement_string,
-                                   unknown_word_prob, random_state):
-    for u in augmented_utterances:
-        for chunk in u[DATA]:
-            if ENTITY in chunk and not is_builtin_entity(chunk[ENTITY]) \
-                    and random_state.rand() < unknown_word_prob:
-                chunk[TEXT] = WORD_REGEX.sub(replacement_string, chunk[TEXT])
-    return augmented_utterances
+def add_unknown_word_to_utterances(utterances, replacement_string,
+                                   unknown_word_prob, max_unknown_words,
+                                   random_state):
+    new_utterances = deepcopy(utterances)
+    unknown_word_lengths = list(range(1, max_unknown_words + 1))
+    for u in new_utterances:
+        if random_state.rand() < unknown_word_prob:
+            num_unknown = random_state.choice(unknown_word_lengths)
+            # We choose to put the noise at the end of the sentence, rather
+            # than in the middle, so that it doesn't impact the ngrams
+            # computation too much
+            extra_chunk = {
+                TEXT: " " + " ".join(
+                    replacement_string for _ in range(num_unknown))
+            }
+            u[DATA].append(extra_chunk)
+    return new_utterances
+
+
+def get_dataset_specific_noise(dataset, language):
+    """Return a noise list that excludes the dataset entity values"""
+    entities_values = set()
+    for ent_name, ent in iteritems(dataset[ENTITIES]):
+        if is_builtin_entity(ent_name):
+            continue
+        for k, v in iteritems(ent[UTTERANCES]):
+            entities_values.add(k)
+            entities_values.add(v)
+    original_noise = get_noise(language)
+    specific_noise = [n for n in original_noise if n not in entities_values]
+    if not specific_noise:  # Avoid returning an empty noise list
+        return original_noise
+    return specific_noise
 
 
 def
build_training_data(dataset, language, data_augmentation_config, augmented_utterances, data_augmentation_config.unknown_words_replacement_string, data_augmentation_config.unknown_word_prob, + data_augmentation_config.max_unknown_words, random_state ) # Adding noise + noise = get_dataset_specific_noise(dataset, language) noisy_utterances = generate_noise_utterances( - augmented_utterances, len(intents), data_augmentation_config, language, - random_state) + augmented_utterances, noise, len(intents), data_augmentation_config, + language, random_state) augmented_utterances += noisy_utterances utterance_classes += [noise_class for _ in noisy_utterances] diff --git a/snips_nlu/pipeline/configs/intent_classifier.py b/snips_nlu/pipeline/configs/intent_classifier.py index 42abb8b6d..8ea83dcd2 100644 --- a/snips_nlu/pipeline/configs/intent_classifier.py +++ b/snips_nlu/pipeline/configs/intent_classifier.py @@ -118,13 +118,14 @@ class IntentClassifierDataAugmentationConfig(Config): def __init__(self, min_utterances=20, noise_factor=5, add_builtin_entities_examples=True, unknown_word_prob=0, - unknown_words_replacement_string=None): + unknown_words_replacement_string=None, max_unknown_words=3): self.min_utterances = min_utterances self.noise_factor = noise_factor self.add_builtin_entities_examples = add_builtin_entities_examples self.unknown_word_prob = unknown_word_prob self.unknown_words_replacement_string = \ unknown_words_replacement_string + self.max_unknown_words = max_unknown_words if unknown_word_prob > 0 and unknown_words_replacement_string is None: raise ValueError("unknown_word_prob is positive (%s) but the " "replacement string is None" % unknown_word_prob) diff --git a/snips_nlu/tests/test_log_reg_classifier_utils.py b/snips_nlu/tests/test_log_reg_classifier_utils.py new file mode 100644 index 000000000..ff616a00d --- /dev/null +++ b/snips_nlu/tests/test_log_reg_classifier_utils.py @@ -0,0 +1,527 @@ +# coding=utf-8 +from __future__ import unicode_literals + +from copy import deepcopy +from itertools import cycle + +import numpy as np +from future.utils import itervalues +from mock import MagicMock, patch + +from snips_nlu.constants import INTENTS, LANGUAGE_EN, UTTERANCES +from snips_nlu.dataset import validate_and_format_dataset +from snips_nlu.intent_classifier.log_reg_classifier_utils import ( + add_unknown_word_to_utterances, build_training_data, + generate_noise_utterances, generate_smart_noise, get_noise_it, + remove_builtin_slots, text_to_utterance, get_dataset_specific_noise) +from snips_nlu.pipeline.configs import ( + IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig) +from snips_nlu.tests.test_log_reg_intent_classifier import ( + get_mocked_augment_utterances) +from snips_nlu.tests.utils import (SAMPLE_DATASET, SnipsTest, + get_empty_dataset) + + +class TestLogRegClassifierUtils(SnipsTest): + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_no_stemming_no_noise( + self, mocked_augment_utterances): + # Given + dataset = validate_and_format_dataset(SAMPLE_DATASET) + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + random_state = np.random.RandomState(1) + + # When + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=0) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance for intent + in itervalues(dataset[INTENTS]) + for 
utterance in intent[UTTERANCES]] + expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2'] + self.assertListEqual(expected_utterances, utterances) + self.assertListEqual(expected_intent_mapping, intent_mapping) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_noise( + self, mocked_augment_utterances, mocked_get_noise): + # Given + mocked_noises = ["mocked_noise_%s" % i for i in range(100)] + mocked_get_noise.return_value = mocked_noises + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + + num_intents = 3 + utterances_length = 5 + num_queries_per_intent = 3 + fake_utterance = { + "data": [ + {"text": " ".join("1" for _ in range(utterances_length))} + ] + } + dataset = { + "intents": { + str(i): { + "utterances": [fake_utterance] * num_queries_per_intent + } for i in range(num_intents) + }, + "entities": {} + } + random_state = np.random.RandomState(1) + + # When + np.random.seed(42) + noise_factor = 2 + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor, unknown_word_prob=0, + unknown_words_replacement_string=None) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance + for intent in itervalues(dataset[INTENTS]) + for utterance in intent[UTTERANCES]] + np.random.seed(42) + noise = list(mocked_noises) + noise_size = int(min(noise_factor * num_queries_per_intent, + len(noise))) + noise_it = get_noise_it(mocked_noises, utterances_length, 0, + random_state) + noisy_utterances = [text_to_utterance(next(noise_it)) + for _ in range(noise_size)] + expected_utterances += noisy_utterances + expected_intent_mapping = sorted(dataset["intents"]) + expected_intent_mapping.append(None) + self.assertListEqual(expected_utterances, utterances) + self.assertListEqual(intent_mapping, expected_intent_mapping) + + def test_add_unknown_words_to_utterances(self): + # Given + base_utterances = { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + } + ] + } + utterances = [] + for _ in range(6): + utterances.append(deepcopy(base_utterances)) + + rand_it = cycle([0, 1]) + + def mocked_rand(): + return next(rand_it) + + max_unknown_words = 3 + rg_it = cycle([i for i in range(1, max_unknown_words + 1)]) + + def mocked_choice(a): # pylint: disable=unused-argument + return next(rg_it) + + unknownword_prob = .5 + + random_state = MagicMock() + random_state_rand = MagicMock() + random_state_rand.side_effect = mocked_rand + random_state_choice = MagicMock() + random_state_choice.side_effect = mocked_choice + + random_state.rand = random_state_rand + random_state.choice = random_state_choice + + # When + replacement_string = "unknownword" + noisy_utterances = add_unknown_word_to_utterances( + utterances, unknown_word_prob=unknownword_prob, + replacement_string=replacement_string, + max_unknown_words=max_unknown_words, + random_state=random_state + ) + + # Then + expected_utterances = [ + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword" + } + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" 
+ }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword unknownword" + } + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword unknownword unknownword" + } + + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + } + ] + self.assertEqual(expected_utterances, noisy_utterances) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_generate_noise_utterances_should_replace_unknown_words( + self, mocked_noise): + # Given + utterances = [ + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "bobby", + "entity": "you" + } + ] + } + ] + language = LANGUAGE_EN + base_noise = ["hello", "dear", "you", "fool"] + mocked_noise.return_value = base_noise + replacement_string = "unknownword" + + # When + noise = generate_smart_noise( + base_noise, utterances, replacement_string, language) + + # Then + expected_noise = ["hello", replacement_string, "you", + replacement_string] + self.assertEqual(noise, expected_noise) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_unknown_noise( + self, mocked_augment_utterances, mocked_get_noise): + # Given + mocked_noises = ["mocked_noise_%s" % i for i in range(100)] + mocked_get_noise.return_value = mocked_noises + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + + num_intents = 3 + utterances_length = 5 + num_queries_per_intent = 3 + fake_utterance = { + "data": [ + {"text": " ".join("1" for _ in range(utterances_length))} + ] + } + dataset = { + "intents": { + str(i): { + "utterances": [fake_utterance] * num_queries_per_intent + } for i in range(num_intents) + }, + "entities": {} + } + random_state = np.random.RandomState(1) + + # When + np.random.seed(42) + noise_factor = 2 + replacement_string = "unknownword" + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor, unknown_word_prob=0, + unknown_words_replacement_string=replacement_string) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance + for intent in itervalues(dataset[INTENTS]) + for utterance in intent[UTTERANCES]] + np.random.seed(42) + noise = list(mocked_noises) + noise_size = int(min(noise_factor * num_queries_per_intent, + len(noise))) + noisy_utterances = [text_to_utterance(replacement_string) + for _ in range(noise_size)] + expected_utterances += noisy_utterances + expected_intent_mapping = sorted(dataset["intents"]) + expected_intent_mapping.append(None) + self.assertListEqual(expected_utterances, utterances) + self.assertListEqual(expected_intent_mapping, intent_mapping) 
+ + def test_should_build_training_data_with_no_data(self): + # Given + language = LANGUAGE_EN + dataset = validate_and_format_dataset(get_empty_dataset(language)) + random_state = np.random.RandomState(1) + + # When + data_augmentation_config = LogRegIntentClassifierConfig() \ + .data_augmentation_config + utterances, _, intent_mapping = build_training_data( + dataset, language, data_augmentation_config, random_state) + + # Then + expected_utterances = [] + expected_intent_mapping = [] + self.assertListEqual(utterances, expected_utterances) + self.assertListEqual(intent_mapping, expected_intent_mapping) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_generate_noise_utterances(self, mocked_get_noise): + # Given + language = LANGUAGE_EN + num_intents = 2 + noise_factor = 1 + utterances_length = 5 + + noise = [str(i) for i in range(utterances_length)] + mocked_get_noise.return_value = noise + + augmented_utterances = [ + { + "data": [ + { + "text": " ".join( + "{}".format(i) for i in range(utterances_length)) + } + ] + } + ] + num_utterances = 10 + random_state = np.random.RandomState(1) + + augmented_utterances = augmented_utterances * num_utterances + config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor) + # When + noise_utterances = generate_noise_utterances( + augmented_utterances, noise, num_intents, config, language, + random_state) + + # Then + joined_noise = text_to_utterance(" ".join(noise)) + for u in noise_utterances: + self.assertEqual(u, joined_noise) + + def test_remove_builtin_slots(self): + # Given + language = LANGUAGE_EN + dataset = { + "entities": { + "snips/number": {} + }, + "intents": { + "dummy_intent_1": { + "utterances": [ + { + "data": [ + { + "text": "I want ", + }, + { + "text": "three", + "slot_name": "number_of_cups", + "entity": "snips/number" + }, + { + "text": " cups", + }, + ] + }, + { + "data": [ + { + "text": "give me ", + }, + { + "text": "twenty two", + "slot_name": "number_of_cups", + "entity": "snips/number" + }, + { + "text": " big cups please", + }, + ] + } + ] + } + }, + "language": language + } + + # When + filtered_dataset = remove_builtin_slots(dataset) + + # Then + expected_dataset = { + "entities": { + "snips/number": {} + }, + "intents": { + "dummy_intent_1": { + "utterances": [ + { + "data": [ + { + "text": "I want ", + }, + { + "text": " cups", + }, + ] + }, + { + "data": [ + { + "text": "give me ", + }, + { + "text": " big cups please", + }, + ] + } + ] + } + }, + "language": language + } + + self.assertDictEqual(expected_dataset, filtered_dataset) + + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_get_dataset_specific_noise(self, mocked_noise): + # Given + dataset = validate_and_format_dataset(SAMPLE_DATASET) + language = "en" + mocked_noise.return_value = ["dummy_a", "yo"] + + + # When + noise = get_dataset_specific_noise(dataset, language) + + # Then + self.assertEqual(["yo"], noise) diff --git a/snips_nlu/tests/test_log_reg_intent_classifier.py b/snips_nlu/tests/test_log_reg_intent_classifier.py index 64ee92a55..1c17c423b 100644 --- a/snips_nlu/tests/test_log_reg_intent_classifier.py +++ b/snips_nlu/tests/test_log_reg_intent_classifier.py @@ -1,26 +1,20 @@ # coding=utf-8 from __future__ import unicode_literals -from builtins import next, range, str - -import numpy as np -from future.utils import itervalues from mock import patch from snips_nlu.constants import ( INTENTS, LANGUAGE_EN, RES_INTENT_NAME, UTTERANCES) from snips_nlu.dataset 
import validate_and_format_dataset from snips_nlu.entity_parser import BuiltinEntityParser, CustomEntityParser -from snips_nlu.entity_parser.custom_entity_parser_usage import \ - CustomEntityParserUsage +from snips_nlu.entity_parser.custom_entity_parser_usage import ( + CustomEntityParserUsage) from snips_nlu.intent_classifier import LogRegIntentClassifier from snips_nlu.intent_classifier.featurizer import Featurizer from snips_nlu.intent_classifier.log_reg_classifier_utils import ( - add_unknown_word_to_utterances, build_training_data, - generate_noise_utterances, generate_smart_noise, get_noise_it, - remove_builtin_slots, text_to_utterance) + text_to_utterance) from snips_nlu.pipeline.configs import ( - IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig) + LogRegIntentClassifierConfig) from snips_nlu.tests.utils import ( BEVERAGE_DATASET, FixtureTest, SAMPLE_DATASET, get_empty_dataset) from snips_nlu.utils import NotTrained @@ -264,400 +258,3 @@ def test_empty_vocabulary_should_fit_and_return_none_intent( intent_classifier = LogRegIntentClassifier().fit(dataset) intent = intent_classifier.get_intent("no intent there") self.assertEqual(None, intent) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_no_stemming_no_noise( - self, mocked_augment_utterances): - # Given - dataset = SAMPLE_DATASET - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - random_state = np.random.RandomState(1) - - # When - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=0) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance for intent - in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2'] - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(expected_intent_mapping, intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_noise( - self, mocked_augment_utterances, mocked_get_noise): - # Given - mocked_noises = ["mocked_noise_%s" % i for i in range(100)] - mocked_get_noise.return_value = mocked_noises - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - - num_intents = 3 - utterances_length = 5 - num_queries_per_intent = 3 - fake_utterance = { - "data": [ - {"text": " ".join("1" for _ in range(utterances_length))} - ] - } - dataset = { - "intents": { - str(i): { - "utterances": [fake_utterance] * num_queries_per_intent - } for i in range(num_intents) - } - } - random_state = np.random.RandomState(1) - - # When - np.random.seed(42) - noise_factor = 2 - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor, unknown_word_prob=0, - unknown_words_replacement_string=None) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance - for intent in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - np.random.seed(42) - noise = list(mocked_noises) - noise_size = int(min(noise_factor * num_queries_per_intent, - len(noise))) - noise_it = get_noise_it(mocked_noises, utterances_length, 0, - random_state) - 
noisy_utterances = [text_to_utterance(next(noise_it)) - for _ in range(noise_size)] - expected_utterances += noisy_utterances - expected_intent_mapping = sorted(dataset["intents"]) - expected_intent_mapping.append(None) - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(intent_mapping, expected_intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_unknown_noise( - self, mocked_augment_utterances, mocked_get_noise): - # Given - mocked_noises = ["mocked_noise_%s" % i for i in range(100)] - mocked_get_noise.return_value = mocked_noises - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - - num_intents = 3 - utterances_length = 5 - num_queries_per_intent = 3 - fake_utterance = { - "data": [ - {"text": " ".join("1" for _ in range(utterances_length))} - ] - } - dataset = { - "intents": { - str(i): { - "utterances": [fake_utterance] * num_queries_per_intent - } for i in range(num_intents) - } - } - random_state = np.random.RandomState(1) - - # When - np.random.seed(42) - noise_factor = 2 - replacement_string = "unknownword" - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor, unknown_word_prob=0, - unknown_words_replacement_string=replacement_string) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance - for intent in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - np.random.seed(42) - noise = list(mocked_noises) - noise_size = int(min(noise_factor * num_queries_per_intent, - len(noise))) - noisy_utterances = [text_to_utterance(replacement_string) - for _ in range(noise_size)] - expected_utterances += noisy_utterances - expected_intent_mapping = sorted(dataset["intents"]) - expected_intent_mapping.append(None) - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(expected_intent_mapping, intent_mapping) - - def test_should_build_training_data_with_no_data(self): - # Given - language = LANGUAGE_EN - dataset = validate_and_format_dataset(get_empty_dataset(language)) - random_state = np.random.RandomState(1) - - # When - data_augmentation_config = LogRegIntentClassifierConfig() \ - .data_augmentation_config - utterances, _, intent_mapping = build_training_data( - dataset, language, data_augmentation_config, random_state) - - # Then - expected_utterances = [] - expected_intent_mapping = [] - self.assertListEqual(utterances, expected_utterances) - self.assertListEqual(intent_mapping, expected_intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - def test_generate_noise_utterances(self, mocked_get_noise): - # Given - language = LANGUAGE_EN - num_intents = 2 - noise_factor = 1 - utterances_length = 5 - - noise = [str(i) for i in range(utterances_length)] - mocked_get_noise.return_value = noise - - augmented_utterances = [ - { - "data": [ - { - "text": " ".join( - "{}".format(i) for i in range(utterances_length)) - } - ] - } - ] - num_utterances = 10 - random_state = np.random.RandomState(1) - - augmented_utterances = augmented_utterances * num_utterances - config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor) - # When - noise_utterances = generate_noise_utterances( - augmented_utterances, num_intents, config, language, 
random_state) - - # Then - joined_noise = text_to_utterance(" ".join(noise)) - for u in noise_utterances: - self.assertEqual(u, joined_noise) - - def test_add_unknown_words_to_utterances(self): - # Given - utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " you ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "dude", - "entity": "you" - } - ] - }, - { - "data": [ - { - "text": "hello " - }, - { - "text": "dude", - "entity": "you" - }, - { - "text": " how are you " - - }, - { - "text": " you ", - "entity": "you" - } - ] - } - ] - unknownword_prob = .5 - random_state = np.random.RandomState(1) - - # When - replacement_string = "unknownword" - noisy_utterances = add_unknown_word_to_utterances( - utterances, unknown_word_prob=unknownword_prob, - replacement_string=replacement_string, random_state=random_state - ) - - # Then - expected_utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " unknownword ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "dude", - "entity": "you" - } - ] - }, - { - "data": [ - { - "text": "hello " - }, - { - "text": "unknownword", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": " unknownword ", - "entity": "you" - } - ] - } - ] - self.assertEqual(expected_utterances, noisy_utterances) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - def test_generate_noise_utterances_should_replace_unknown_words( - self, mocked_noise): - # Given - utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " you ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "bobby", - "entity": "you" - } - ] - } - ] - language = LANGUAGE_EN - mocked_noise.return_value = ["hello", "dear", "you", "fool"] - replacement_string = "unknownword" - - # When - noise = generate_smart_noise(utterances, replacement_string, language) - - # Then - expected_noise = ["hello", replacement_string, "you", - replacement_string] - self.assertEqual(noise, expected_noise) - - def test_remove_builtin_slots(self): - # Given - language = LANGUAGE_EN - dataset = { - "entities": { - "snips/number": {} - }, - "intents": { - "dummy_intent_1": { - "utterances": [ - { - "data": [ - { - "text": "I want ", - }, - { - "text": "three", - "slot_name": "number_of_cups", - "entity": "snips/number" - }, - { - "text": " cups", - }, - ] - }, - { - "data": [ - { - "text": "give me ", - }, - { - "text": "twenty two", - "slot_name": "number_of_cups", - "entity": "snips/number" - }, - { - "text": " big cups please", - }, - ] - } - ] - } - }, - "language": language - } - - # When - filtered_dataset = remove_builtin_slots(dataset) - - # Then - expected_dataset = { - "entities": { - "snips/number": {} - }, - "intents": { - "dummy_intent_1": { - "utterances": [ - { - "data": [ - { - "text": "I want ", - }, - { - "text": " cups", - }, - ] - }, - { - "data": [ - { - "text": "give me ", - }, - { - "text": " big cups please", - }, - ] - } - ] - } - }, - "language": language - } - - self.assertDictEqual(expected_dataset, filtered_dataset) From 830f426c2bf1e40b18a327b904fae734396ec3bf Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Thu, 8 Nov 2018 16:41:55 +0100 Subject: [PATCH 20/24] Update default configs --- snips_nlu/default_configs/config_de.py | 1 + snips_nlu/default_configs/config_en.py | 1 + snips_nlu/default_configs/config_es.py | 1 + snips_nlu/default_configs/config_fr.py | 1 + snips_nlu/default_configs/config_it.py | 1 + 
snips_nlu/default_configs/config_ja.py | 1 + snips_nlu/default_configs/config_ko.py | 1 + 7 files changed, 7 insertions(+) diff --git a/snips_nlu/default_configs/config_de.py b/snips_nlu/default_configs/config_de.py index 99cd61ead..34b6eabd6 100644 --- a/snips_nlu/default_configs/config_de.py +++ b/snips_nlu/default_configs/config_de.py @@ -175,6 +175,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_en.py b/snips_nlu/default_configs/config_en.py index 5c12803f3..a7bbbfa5c 100644 --- a/snips_nlu/default_configs/config_en.py +++ b/snips_nlu/default_configs/config_en.py @@ -152,6 +152,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_es.py b/snips_nlu/default_configs/config_es.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_es.py +++ b/snips_nlu/default_configs/config_es.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_fr.py b/snips_nlu/default_configs/config_fr.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_fr.py +++ b/snips_nlu/default_configs/config_fr.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_it.py b/snips_nlu/default_configs/config_it.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_it.py +++ b/snips_nlu/default_configs/config_it.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_ja.py b/snips_nlu/default_configs/config_ja.py index 46849b8b3..cfe6fac4a 100644 --- a/snips_nlu/default_configs/config_ja.py +++ b/snips_nlu/default_configs/config_ja.py @@ -195,6 +195,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_ko.py b/snips_nlu/default_configs/config_ko.py index 4da2fd365..0b8c61245 100644 --- a/snips_nlu/default_configs/config_ko.py +++ b/snips_nlu/default_configs/config_ko.py @@ -173,6 +173,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, From 70439d234c29c9f6d3824096163b0097781bd43a Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Mon, 26 Nov 2018 14:25:06 +0100 Subject: [PATCH 21/24] Fixes for review --- snips_nlu/intent_classifier/log_reg_classifier_utils.py | 4 +--- snips_nlu/tests/test_log_reg_classifier_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index dab48053f..83ceeb278 100644 --- 
a/snips_nlu/intent_classifier/log_reg_classifier_utils.py
+++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py
@@ -91,11 +91,9 @@ def add_unknown_word_to_utterances(utterances, replacement_string,
                                    unknown_word_prob, max_unknown_words,
                                    random_state):
     new_utterances = deepcopy(utterances)
-    unknown_word_lengths = [i for i in range(1, max_unknown_words + 1)]
     for u in new_utterances:
         if random_state.rand() < unknown_word_prob:
-            # num_unknown = random_state.choice(unknown_word_lengths, p=p)
-            num_unknown = random_state.choice(unknown_word_lengths)
+            num_unknown = random_state.randint(1, max_unknown_words + 1)
             # We choose to put the noise at the end of the sentence and not
             # in the middle so that it doesn't impact the ngrams computation
             # too much
diff --git a/snips_nlu/tests/test_log_reg_classifier_utils.py b/snips_nlu/tests/test_log_reg_classifier_utils.py
index ff616a00d..589b79c30 100644
--- a/snips_nlu/tests/test_log_reg_classifier_utils.py
+++ b/snips_nlu/tests/test_log_reg_classifier_utils.py
@@ -133,7 +133,7 @@ def mocked_rand():
     max_unknown_words = 3
     rg_it = cycle([i for i in range(1, max_unknown_words + 1)])
 
-    def mocked_choice(a):  # pylint: disable=unused-argument
+    def mocked_randint(a, b):  # pylint: disable=unused-argument
         return next(rg_it)
 
     unknownword_prob = .5
@@ -142,10 +142,10 @@ def mocked_choice(a):  # pylint: disable=unused-argument
     random_state_rand = MagicMock()
     random_state_rand.side_effect = mocked_rand
     random_state_choice = MagicMock()
-    random_state_choice.side_effect = mocked_choice
+    random_state_choice.side_effect = mocked_randint
     random_state.rand = random_state_rand
-    random_state.choice = random_state_choice
+    random_state.randint = random_state_choice
 
     # When
     replacement_string = "unknownword"
From 3e19c2af8eda16e53224e4339f33dec039a1518c Mon Sep 17 00:00:00 2001
From: ClemDoum
Date: Mon, 26 Nov 2018 14:21:42 +0100
Subject: [PATCH 22/24] Set default verbosity to False

---
 snips_nlu/cli/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/snips_nlu/cli/metrics.py b/snips_nlu/cli/metrics.py
index e76ea3766..63b1de0aa 100644
--- a/snips_nlu/cli/metrics.py
+++ b/snips_nlu/cli/metrics.py
@@ -97,7 +97,7 @@ def progression_handler(progress):
 )
 def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
                        config_path=None, exclude_slot_metrics=False,
-                       include_errors=False, verbose=True):
+                       include_errors=False, verbose=False):
     if verbose:
         set_nlu_logger(logging.DEBUG)
 
From de7e947257caaf57508a46cb4e064d8e86d5a065 Mon Sep 17 00:00:00 2001
From: ClemDoum
Date: Mon, 26 Nov 2018 17:12:11 +0100
Subject: [PATCH 23/24] Add max_unknown_words parameter in serialization

---
 .../intent_classifier/log_reg_classifier_utils.py | 15 ++++++++-------
 snips_nlu/pipeline/configs/intent_classifier.py   |  4 +++-
 snips_nlu/tests/test_config.py                    |  1 +
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py
index 83ceeb278..73f06493f 100644
--- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py
+++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py
@@ -151,13 +151,14 @@ def build_training_data(dataset, language, data_augmentation_config,
         augmented_utterances += utterances
         utterance_classes += [classes_mapping[intent_name] for _
                               in range(len(utterances))]
-    augmented_utterances = add_unknown_word_to_utterances(
-        augmented_utterances,
-        data_augmentation_config.unknown_words_replacement_string,
-        data_augmentation_config.unknown_word_prob,
-        data_augmentation_config.max_unknown_words,
-        random_state
-    )
+    if data_augmentation_config.unknown_words_replacement_string is not None:
+        augmented_utterances = add_unknown_word_to_utterances(
+            augmented_utterances,
+            data_augmentation_config.unknown_words_replacement_string,
+            data_augmentation_config.unknown_word_prob,
+            data_augmentation_config.max_unknown_words,
+            random_state
+        )
 
     # Adding noise
     noise = get_dataset_specific_noise(dataset, language)
diff --git a/snips_nlu/pipeline/configs/intent_classifier.py b/snips_nlu/pipeline/configs/intent_classifier.py
index 8ea83dcd2..4ac330991 100644
--- a/snips_nlu/pipeline/configs/intent_classifier.py
+++ b/snips_nlu/pipeline/configs/intent_classifier.py
@@ -118,7 +118,8 @@ class IntentClassifierDataAugmentationConfig(Config):
 
     def __init__(self, min_utterances=20, noise_factor=5,
                  add_builtin_entities_examples=True, unknown_word_prob=0,
-                 unknown_words_replacement_string=None, max_unknown_words=3):
+                 unknown_words_replacement_string=None,
+                 max_unknown_words=None):
         self.min_utterances = min_utterances
         self.noise_factor = noise_factor
         self.add_builtin_entities_examples = add_builtin_entities_examples
@@ -146,6 +147,7 @@ def to_dict(self):
             "unknown_word_prob": self.unknown_word_prob,
             "unknown_words_replacement_string":
                 self.unknown_words_replacement_string,
+            "max_unknown_words": self.max_unknown_words
         }
 
     @classmethod
diff --git a/snips_nlu/tests/test_config.py b/snips_nlu/tests/test_config.py
index 16fd8c375..3077ae149 100644
--- a/snips_nlu/tests/test_config.py
+++ b/snips_nlu/tests/test_config.py
@@ -27,6 +27,7 @@ def test_intent_classifier_data_augmentation_config(self):
             "add_builtin_entities_examples": False,
             "unknown_word_prob": 0.1,
             "unknown_words_replacement_string": "foobar",
+            "max_unknown_words": None,
         }
 
         # When
From 4f338c665771f73cf0d77cf7909b448800950de4 Mon Sep 17 00:00:00 2001
From: Adrien Ball
Date: Mon, 26 Nov 2018 17:49:11 +0100
Subject: [PATCH 24/24] Update Changelog

---
 CHANGELOG.md                             | 10 ++++++++++
 snips_nlu/intent_classifier/modifiers.py | 24 ++++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 snips_nlu/intent_classifier/modifiers.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 52fdfde1e..0dfe8542b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,15 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 
+## [0.18.0] - 2018-11-26
+### Added
+- New YAML format to create datasets
+- Verbose mode in CLI
+
+### Changed
+- Bump `snips-nlu-ontology` to `0.62.0` to improve memory usage
+
+
 ## [0.17.4] - 2018-11-20
 ### Added
 - Add a `--config` argument in the metrics CLI
@@ -175,6 +184,7 @@ several commands.
 - Fix compiling issue with `bindgen` dependency when installing from source
 - Fix issue in `CRFSlotFiller` when handling builtin entities
 
+[0.18.0]: https://github.com/snipsco/snips-nlu/compare/0.17.4...0.18.0
 [0.17.4]: https://github.com/snipsco/snips-nlu/compare/0.17.3...0.17.4
 [0.17.3]: https://github.com/snipsco/snips-nlu/compare/0.17.2...0.17.3
 [0.17.2]: https://github.com/snipsco/snips-nlu/compare/0.17.1...0.17.2
diff --git a/snips_nlu/intent_classifier/modifiers.py b/snips_nlu/intent_classifier/modifiers.py
new file mode 100644
index 000000000..50d2bd937
--- /dev/null
+++ b/snips_nlu/intent_classifier/modifiers.py
@@ -0,0 +1,24 @@
+MODIFIERS = {
+    "it": {
+        "più",
+        "piu",
+        "meno",
+        "molto",
+        "non",
+        "troppo",
+        "troppa",
+        "ancora",
+        "senza",
+        "con",
+        "forte",
+        "forti",
+        "alto",
+        "alta",
+        "alti",
+        "alte",
+        "bassa",
+        "basso",
+        "bassi",
+        "basse"
+    }
+}
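
Note on patches 21 and 23 above: with the new defaults, max_unknown_words is None, so the "unknown_words_replacement_string is not None" guard in build_training_data is what keeps randint from ever being called with None. The snippet below is a minimal, self-contained sketch of the resulting augmentation behaviour; the helper name add_unknown_words_sketch and the example utterance are illustrative assumptions, not library code:

    import numpy as np
    from copy import deepcopy

    def add_unknown_words_sketch(utterances, replacement_string,
                                 unknown_word_prob, max_unknown_words,
                                 random_state):
        # With probability unknown_word_prob, append between 1 and
        # max_unknown_words replacement tokens to the END of the utterance,
        # mirroring the patched helper: trailing noise disturbs the ngram
        # statistics of the real text less than noise in the middle would.
        new_utterances = deepcopy(utterances)
        for u in new_utterances:
            if random_state.rand() < unknown_word_prob:
                # randint's upper bound is exclusive, so this draws a value
                # in 1..max_unknown_words uniformly
                num_unknown = random_state.randint(1, max_unknown_words + 1)
                u["data"].append({
                    "text": " " + " ".join([replacement_string] * num_unknown)
                })
        return new_utterances

    utterances = [{"data": [{"text": "turn on the lights"}]}]
    noisy = add_unknown_words_sketch(
        utterances, "unknownword", unknown_word_prob=1.0,
        max_unknown_words=3, random_state=np.random.RandomState(42))
    print(noisy)  # original text plus 1 to 3 trailing "unknownword" tokens

Compared with the previous random_state.choice over a precomputed list of lengths, randint(1, max_unknown_words + 1) draws from the same uniform distribution without building the intermediate list.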