diff --git a/CHANGELOG.md b/CHANGELOG.md index 52fdfde1e..0dfe8542b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,15 @@ # Changelog All notable changes to this project will be documented in this file. +## [0.18.0] - 2018-11-26 +### Added +- New YAML format to create dataset +- Verbose mode in CLI + +### Changed +- Bump `snips-nlu-ontology` to `0.62.0` to improve memory usage + + ## [0.17.4] - 2018-11-20 ### Added - Add a `--config` argument in the metrics CLI @@ -175,6 +184,7 @@ several commands. - Fix compiling issue with `bindgen` dependency when installing from source - Fix issue in `CRFSlotFiller` when handling builtin entities +[0.18.0]: https://github.com/snipsco/snips-nlu/compare/0.17.4...0.18.0 [0.17.4]: https://github.com/snipsco/snips-nlu/compare/0.17.3...0.17.4 [0.17.3]: https://github.com/snipsco/snips-nlu/compare/0.17.2...0.17.3 [0.17.2]: https://github.com/snipsco/snips-nlu/compare/0.17.1...0.17.2 diff --git a/docs/source/api.rst b/docs/source/api.rst index 365e50c8c..9832e7ec6 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -96,6 +96,20 @@ Configurations :members: +Dataset +------- + +.. module:: snips_nlu.dataset + +.. autoclass:: Dataset + :members: + +.. autoclass:: Intent + :members: + +.. autoclass:: Entity + :members: + Result and output format ------------------------ diff --git a/docs/source/cli.rst b/docs/source/cli.rst index a5334df0c..c031f7600 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -14,70 +14,30 @@ is typically used by running ``snips-nlu [args]`` or alternatively Creating a dataset ------------------ -As seen in the :ref:`tutorial` section, a command allows you to generate a -dataset from a :ref:`language ` and a list of text files describing -:ref:`intents ` and :ref:`entities `: +As seen in the :ref:`tutorial ` section, a command allows you to generate a +dataset from a :ref:`language ` and a list of YAML files containing +data for :ref:`intents ` and :ref:`entities `: .. code-block:: bash - snips-nlu generate-dataset en intent_1.txt intent_2.txt entity_1.txt + snips-nlu generate-dataset en my_first_intent.yaml my_second_intent.yaml my_entity.yaml -This will print a Json string to the standard output. If you want to store the -dataset directly in a Json file, you just have to pipe the previous command like -below: - -.. code-block:: bash - - snips-nlu generate-dataset en intent_1.txt intent_2.txt entity_1.txt > dataset.json - - -Each intent file corresponds to a single intent, and the name of the file must -start with ``intent_``. The same is true for entity files, which must start -with ``entity_``. - -An intent file is a text file in which each row corresponds to an utterance. -Slots, along with their corresponding slot type (entity), can be defined using -the following syntax: - -.. code-block:: console +.. note:: - Find me a flight from [departure:city](Paris) to [destination:city](London) - Find me a flight from [departure:city](Moscow) [departureDate:snips/datetime](tomorrow around 9pm) + You don't have to use separated files for each intent and entity. You could + for instance merge all intents together in a single ``intents.yaml`` file, + or even merge all intents and entities in a single ``dataset.yaml`` file. -In this example, there are three different slots -- ``departure``, -``destination`` and ``departureDate`` -- and two different entities -- ``city`` -and ``snips/datetime`` (which is a :ref:`builtin entity `). 
-Check :ref:`this section ` to have more details about the -difference between slots and entities. - -An entity file is a comma separated text file in which each row corresponds to -an entity value, optionally followed with its :ref:`synonyms `. The syntax used -is the following: - -.. code-block:: console - - bedroom - garden,yard,backyard - -Here, the entity (room) has two values which are ``"bedroom"`` and ``"garden"``. -Two synonyms, ``"yard"`` and ``"backyard"``, are defined for ``"garden"``. -If a value or a synonym contains a comma, the value must be put between -double quotes ``"``. - -If the value contains double quotes, it must be doubled -to be escaped like this: ``"A value with a "","" in it"`` which corresponds -to the actual value ``A value with a "," in it``. - -.. Note:: +This will print a JSON string to the standard output. If you want to store the +dataset directly in a JSON file, you just have to pipe the previous command like +below: - By default entities are generated as :ref:`automatically extensible `, - i.e. the recognition will accept additional values than the ones listed in - the entity file. This behavior can be changed by adding at the beginning of - the entity file the following: +.. code-block:: bash - .. code-block:: bash + snips-nlu generate-dataset en my_first_intent.yaml my_second_intent.yaml my_entity.yaml > dataset.json - # automatically_extensible=false +Check the :ref:`Training Dataset Format ` section for more details +about the format used to describe the training data. .. _training_cli: diff --git a/docs/source/dataset.rst b/docs/source/dataset.rst new file mode 100644 index 000000000..19d62b580 --- /dev/null +++ b/docs/source/dataset.rst @@ -0,0 +1,324 @@ +.. _dataset: + +Training Dataset Format +======================= + +The Snips NLU library leverages machine learning algorithms and some training +data in order to produce a powerful intent recognition engine. + +The better your training data is, and the more accurate your NLU engine will +be. Thus, it is worth spending a bit of time to create a dataset that +matches well your use case. + +Snips NLU accepts two different dataset formats. The first one, which relies +on YAML, is the preferred option if you want to create or edit a dataset +manually. +The other dataset format uses JSON and should rather be used if you plan to +create or edit datasets programmatically. + +.. _yaml_format: + +=========== +YAML format +=========== + +The YAML dataset format allows you to define intents and entities using the +`YAML `_ syntax. + +.. _yaml_entity_format: + +Entity +------ + +Here is what an entity file looks like: + +.. code-block:: yaml + + # City Entity + --- + type: entity # allows to differentiate between entities and intents files + name: city # name of the entity + values: + - london # single entity value + - [new york, big apple] # entity value with a synonym + - [paris, city of lights] + +You can specify entity values either using single YAML scalars (e.g. ``london``), +or using lists if you want to define some synonyms (e.g. +``[paris, city of lights]``) + +Here is a more comprehensive example which contains additional attributes that +are optional: + +.. code-block:: yaml + + # City Entity + --- + type: entity + name: city + automatically_extensible: false # default value is true + use_synonyms: false # default value is true + matching_strictness: 0.8 # default value is 1.0 + values: + - london + - [new york, big apple] + - [paris, city of lights] + +.. 
_yaml_intent_format: + +Intent +------ + +Here is the format used to describe an intent: + +.. code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight # name of the intent + utterances: + - find me a flight from [origin:city](Paris) to [destination:city](New York) + - I need a flight leaving [date:snips/datetime](this weekend) to [destination:city](Berlin) + - show me flights to go to [destination:city](new york) leaving [date:snips/datetime](this evening) + +We use a standard markdown-like annotation syntax to annotate slots within +utterances. The ``[origin:city](Paris)`` chunk describes a slot with its three +components: + + - ``origin``: the slot name + - ``city``: the slot type + - ``Paris``: the slot value + +Note that different slot names can share the same slot type. This is the case +for the ``origin`` and ``destination`` slot names in the previous example, which +have the same slot type ``city``. + +If you are to write more than just three utterances, you can actually specify +the slot mapping explicitly in the intent file and remove it from the +utterances. This will result in simpler annotations: + +.. code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [destination](new york) leaving [date](this evening) + + +.. _yaml_dataset_format: + +Dataset +------- + +You are free to organize the yaml documents as you want. Either having one yaml +file for each intent and each entity, or gathering some documents together +(e.g. all entities together, or all intents together) in the same yaml file. +Here is the yaml file corresponding to the previous ``city`` entity and +``searchFlight`` intent merged together: + +.. code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [destination](new york) leaving [date](this evening) + + # City Entity + --- + type: entity + name: city + values: + - london + - [new york, big apple] + - [paris, city of lights] + +.. important:: + + If you plan to have more than one entity or intent in a YAML file, you must + separate them using the YAML document separator: ``---`` + +--------------------------------------- +Implicit entity values and slot mapping +--------------------------------------- + +In order to make the annotation process even easier, there is a mechanism that +allows to populate entity values automatically based on the entity values that +are already provided. + +This results in a much simpler dataset file: + +.. 
code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin] to [destination] + - I need a flight leaving [date] to [destination] + - show me flights to go to [destination] leaving [date] + + # City Entity + --- + type: entity + name: city + values: + - london + - [new york, big apple] + - [paris, city of lights] + +For this to work, you need to provide at least one value for each +*custom entity*. This can be done either through an entity file, or simply by +providing an entity value in one of the annotated utterances. +Entity values are automatically generated for *builtin entities*. + +Here is a final example of a valid YAML dataset leveraging implicit entity +values as well as implicit slot mapping: + +.. code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + utterances: + - find me a flight from [origin:city](Paris) to [destination:city] + - I need a flight leaving [date:snips/datetime] to [destination] + - show me flights to go to [destination] leaving [date] + +Note that the city entity was not provided here, but one value (``Paris``) was +provided in the first annotated utterance. The mapping between slot name and +entity is also inferred from the first two utterances. + +Once your intents and entities are created using the YAML format described +previously, you can produce a dataset using the +:ref:`Command Line Interface (CLI) `: + +.. code-block:: console + + snips-nlu generate-dataset en city.yaml searchFlight.yaml > dataset.json + +Or alternatively if you merged the yaml documents into a single file: + +.. code-block:: console + + snips-nlu generate-dataset en dataset.yaml > dataset.json + +This will generate a JSON dataset and write it in the ``dataset.json`` file. +The format of the generated file is the second allowed format that is described +in the :ref:`JSON format ` section. + +.. _json_format: + +=========== +JSON format +=========== + +The JSON format is the format which is eventually used by the training API. It +was designed to be easy to parse. + +We created a `sample dataset`_ that you can check to better understand the +format. + +There are three attributes at the root of the JSON document: + + - ``"language"``: the language of the dataset in :ref:`ISO format ` + - ``"intents"``: a dictionary mapping between intents names and intents data + - ``"entities"``: a dictionary mapping between entities names and entities data + +Here is how the entities are represented in this format: + +.. code-block:: json + + { + "entities": { + "snips/datetime": {}, + "city": { + "data": [ + { + "value": "london", + "synonyms": [] + }, + { + "value": "new york", + "synonyms": [ + "big apple" + ] + }, + { + "value": "paris", + "synonyms": [ + "city of lights" + ] + } + ], + "use_synonyms": true, + "automatically_extensible": true, + "matching_strictness": 1.0 + } + } + } + +Note that the ``"snips/datetime"`` entity data is empty as it is a +:ref:`builtin entity `. + +The intent utterances are defined using the following format: + +.. 
code-block:: json + + { + "data": [ + { + "text": "find me a flight from " + }, + { + "text": "Paris", + "entity": "city", + "slot_name": "origin" + }, + { + "text": " to " + }, + { + "text": "New York", + "entity": "city", + "slot_name": "destination" + } + ] + } + +Once you have created a JSON dataset, either directly or with YAML files, you +can use it to train an NLU engine. To do so, you can use the CLI as documented +:ref:`here `, or the :ref:`python API `. + +.. _sample dataset: https://github.com/snipsco/snips-nlu/blob/master/snips_nlu_samples/sample_dataset.json \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 94dbbe1ab..faf75bd2d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,9 +81,10 @@ the :ref:`api` documentation or alternatively check the `github repository`_. installation quickstart tutorial - cli data_model + dataset languages + cli api diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index e36dddbb6..cbb2b4115 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -43,7 +43,7 @@ resources used to improve performance with the :func:`.load_resources` function. nlu_engine = SnipsNLUEngine() Now that we have our engine object created, we need to feed it with our sample -dataset. In general, this action will require some *machine learning* hence we +dataset. In general, this action will require some *machine learning*, so we will actually *fit* the engine: .. code-block:: python @@ -52,7 +52,7 @@ will actually *fit* the engine: Our NLU engine is now trained to recognize new utterances that extend beyond -what is strictly contained in the dataset, it is able to *generalize*. +what is strictly contained in the dataset: it is able to *generalize*. Let's try to parse something now! diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 0ae3c17da..72b6576b2 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -4,199 +4,112 @@ Tutorial ======== In this section, we will build an NLU assistant for home automation tasks. It -will be able to understand queries about lights and thermostats. More precisely -our assistant will contain three :ref:`intents `: +will be able to understand queries about lights and thermostats. More +precisely, our assistant will contain three :ref:`intents `: - ``turnLightOn`` - ``turnLightOff`` - ``setTemperature`` The first two intents will be about turning on and off the lights in a specific -room. Thus, these intents will have one :ref:`slot` which will be the ``room``. -The third intent will let you control the temperature of a specific room, thus -it will have two slots: the ``roomTemperature`` and the ``room``. +room. These intents will have one :ref:`slot` which will be the ``room``. +The third intent will let you control the temperature of a specific room. It +will have two slots: the ``roomTemperature`` and the ``room``. The first step is to create an appropriate dataset for this task. -.. _dataset: - -Snips dataset format --------------------- - -The format used by Snips to describe the input data is designed to be simple to -parse as well as easy to read. - -We created a `sample dataset`_ that you can check to better understand the -format. - -You have three options to create your dataset. You can build it manually by -respecting the format used in the sample, you can also use the -:ref:`dataset creation CLI ` included in the lib, or alternatively -you can use `chatito`_ a DSL tool for dataset generation. 
- -We will go for the second option here and start by creating three files -corresponding to our three intents and one entity file corresponding to the -``room`` entity: - -- ``intent_turnLightOn.txt`` -- ``intent_turnLightOff.txt`` -- ``intent_setTemperature.txt`` -- ``entity_room.txt`` - -The name of each file is important as the tool will map it to the intent or -entity name. In particular, the prefixes ``intent_`` and ``entity_`` are -required in order to distinguish intents from entity files. - -Let's add training examples for the first intent by inserting the following -lines in the first file, ``intent_turnLightOn.txt``: - -.. code-block:: console - - Turn on the lights in the [room:room](kitchen) - give me some light in the [room:room](bathroom) please - Can you light up the [room:room](living room) ? - switch the [room:room](bedroom)'s lights on please - -We use a standard markdown-like annotation syntax to annotate slots within -utterances. The ``[room:room]`` chunks describe the slot with its two -components: :ref:`the slot name and the entity `. In our -case we used the same value, ``room``, to describe both. The parts with -parenthesis, like ``(kitchen)``, correspond to the text value of the slot. - -Let's move on to the second intent, and insert this into -``intent_turnLightOff.txt``: - -.. code-block:: console - - Turn off the lights in the [room:room](entrance) - turn the [room:room](bathroom)'s light out please - switch off the light the [room:room](kitchen), will you? - Switch the [room:room](bedroom)'s lights off please - -And now the last file, ``intent_setTemperature.txt``: - -.. code-block:: console - - Set the temperature to [roomTemperature:snips/temperature](19 degrees) in the [room:room](bedroom) - please set the [room:room](living room)'s temperature to [roomTemperature:snips/temperature](twenty two degrees celsius) - I want [roomTemperature:snips/temperature](75 degrees fahrenheit) in the [room:room](bathroom) please - Can you increase the temperature to [roomTemperature:snips/temperature](22 degrees) ? - -As you can see here, we used a new slot, ``[room_temperature:snips/temperature]``, -whose name is ``roomTemperature`` and whose type is ``snips/temperature``. The slot -type used here is a :ref:`builtin entity `. It -allows you to resolve the temperature values properly. - -Let's move to the ``entity_room.txt`` entity file: - -.. code-block:: console - - bedroom - living room,main room - garden,yard,backyard - -The entity file is a comma (``,``) separated file. Each line corresponds to an -entity value followed by its potential :ref:`synonyms `. - -We are now ready to generate our dataset: +Training Data +------------- + +Check the :ref:`Training Dataset Format ` section for more details +about the format used to describe the training data. + +In this tutorial, we will create our dataset using the +:ref:`YAML format `, and create a ``dataset.yaml`` file with the +following content: + +.. code-block:: yaml + + # turnLightOn intent + --- + type: intent + name: turnLightOn + slots: + - name: room + entity: room + utterances: + - Turn on the lights in the [room](kitchen) + - give me some light in the [room](bathroom) please + - Can you light up the [room](living room) ? 
+ - switch the [room](bedroom)'s lights on please + + # turnLightOff intent + --- + type: intent + name: turnLightOff + slots: + - name: room + entity: room + utterances: + - Turn off the lights in the [room](entrance) + - turn the [room](bathroom)'s light out please + - switch off the light the [room](kitchen), will you? + - Switch the [room](bedroom)'s lights off please + + # setTemperature intent + --- + type: intent + name: setTemperature + slots: + - name: room + entity: room + - name: roomTemperature + entity: snips/temperature + utterances: + - Set the temperature to [roomTemperature](19 degrees) in the [room](bedroom) + - please set the [room](living room)'s temperature to [roomTemperature](twenty two degrees celsius) + - I want [roomTemperature](75 degrees fahrenheit) in the [room](bathroom) please + - Can you increase the temperature to [roomTemperature](22 degrees) ? + + # room entity + --- + type: entity + name: room + automatically_extensible: no + values: + - bedroom + - [living room, main room, lounge] + - [garden, yard, backyard] + +Here, we put all the intents and entities in the same file but we could have +split them in dedicated files as well. + +The ``setTemperature`` intent references a ``roomTemperature`` slot which +relies on the ``snips/temperature`` entity. This entity is a +:ref:`builtin entity `. It allows to resolve the +temperature values properly. + +The ``room`` entity makes use of :ref:`synonyms ` by defining lists +like ``[living room, main room, lounge]``. In this case, ``main room`` and +``lounge`` will point to ``living room``, the first item of the list, which is +the reference value. + +Besides, this entity is marked as not +:ref:`automatically extensible ` which means that the NLU +will only output values that we have defined and will not try to match other +values. + +We are now ready to generate our dataset using the :ref:`CLI `: .. code-block:: bash - snips-nlu generate-dataset en intent_turnLightOn.txt intent_turnLightOff.txt intent_setTemperature.txt entity_room.txt > dataset.json + snips-nlu generate-dataset en dataset.yaml > dataset.json .. note:: We used ``en`` as the language here but other languages are supported, please check the :ref:`languages` section to know more. -Now, the ``"entities"`` part of the generated json looks like that: - -.. code-block:: json - - { - "entities": { - "room": { - "automatically_extensible": true, - "data": [ - { - "synonyms": [], - "value": "bedroom" - }, - { - "synonyms": [ - "main room" - ], - "value": "living room" - }, - { - "synonyms": [ - "yard", - "backyard" - ], - "value": "garden" - } - ], - "matching_strictness": 1.0, - "use_synonyms": true - }, - "snips/temperature": {} - } - } - -You can see that both entities from the intent utterances and from the ``room`` -entity file were added. - -By default, the ``room`` entity is set to be -:ref:`automatically extensible ` but in our case we don't want -to handle any entity value that would not be part of the dataset, so we set -this attribute to ``false``. -Moreover, we are going to add some rooms that were not in the previous sentences -and that we want our assistant to cover. Additionally, we add some -:ref:`synonyms `. Finally, the entities part looks like that: - -.. 
code-block:: json - - { - "entities": { - "room": { - "automatically_extensible": false, - "data": [ - { - "synonyms": [], - "value": "bathroom" - }, - { - "synonyms": [ - "sleeping room" - ], - "value": "bedroom" - }, - { - "synonyms": [ - "main room", - "lounge" - ], - "value": "living room" - }, - { - "synonyms": [ - "yard", - "backyard" - ], - "value": "garden" - } - ], - "matching_strictness": 1.0, - "use_synonyms": true - }, - "snips/temperature": {} - } - } - - -We don't need to edit the ``snips/temperature`` entity as it is a builtin -entity. - Now that we have our dataset ready, let's move to the next step which is to create an NLU engine. @@ -251,6 +164,8 @@ That will raise a ``NotTrained`` error, as we did not train the engine with the dataset that we created. +.. _training_the_engine: + Training the engine ------------------- diff --git a/setup.py b/setup.py index 6459693c4..f1c386f7c 100644 --- a/setup.py +++ b/setup.py @@ -25,11 +25,13 @@ "sklearn-crfsuite>=0.3.6,<0.4", "semantic_version>=2.6,<3.0", "snips_nlu_utils>=0.7,<0.8", - "snips_nlu_ontology>=0.61.1,<0.62", + "snips_nlu_ontology>=0.62.0,<0.63", "num2words>=0.5.6,<0.6", "plac>=0.9.6,<1.0", "requests>=2.0,<3.0", "pathlib==1.0.1; python_version < '3.4'", + "pyaml>=17,<18", + "deprecation>=2,<3" ] extras_require = { diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py index ae1289b8a..6ea37364c 100644 --- a/snips_nlu/__about__.py +++ b/snips_nlu/__about__.py @@ -11,8 +11,8 @@ __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai" __license__ = "Apache License, Version 2.0" -__version__ = "0.17.4" -__model_version__ = "0.17.0" +__version__ = "0.18.0" +__model_version__ = "0.18.0" __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download" __compatibility__ = "https://raw.githubusercontent.com/snipsco/snips-nlu-language-resources/master/compatibility.json" diff --git a/snips_nlu/__init__.py b/snips_nlu/__init__.py index e782bbfba..6414de42d 100644 --- a/snips_nlu/__init__.py +++ b/snips_nlu/__init__.py @@ -1,5 +1,3 @@ -import logging - from snips_nlu_ontology import get_ontology_version from snips_nlu.__about__ import __model_version__, __version__ diff --git a/snips_nlu/cli/dataset/__init__.py b/snips_nlu/cli/dataset/__init__.py deleted file mode 100644 index 3a8808233..000000000 --- a/snips_nlu/cli/dataset/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from snips_nlu.cli.dataset.assistant_dataset import AssistantDataset diff --git a/snips_nlu/cli/dataset/assistant_dataset.py b/snips_nlu/cli/dataset/assistant_dataset.py deleted file mode 100644 index 4a1982115..000000000 --- a/snips_nlu/cli/dataset/assistant_dataset.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding=utf-8 -from __future__ import unicode_literals, print_function - -from pathlib import Path - -from snips_nlu.cli.dataset.entities import CustomEntity, create_entity -from snips_nlu.cli.dataset.intent_dataset import IntentDataset - - -class AssistantDataset(object): - """Dataset of an assistant - - Merges a list of :class:`.AssistantDataset` into a single dataset ready to - be used by Snips NLU - - Attributes: - language (str): language of the assistant - intents_datasets (list of :class:`.IntentDataset`): data of the - assistant intents - entities (list of :class:`.Entity`): data of the assistant entities - """ - - def __init__(self, language, intent_datasets, entities): - self.language = language - self.intents_datasets = intent_datasets - self.entities = entities - - @classmethod - def from_files(cls, language, filenames): 
- """Creates an :class:`.AssistantDataset` from a language and a list of - intent and entity files - - Args: - language (str): language of the assistant - filenames (list of str): Intent and entity files. - The assistant will associate each intent file to an intent, - and each entity file to an entity. For instance, the intent - file 'intent_setTemperature.txt' will correspond to the intent - 'setTemperature', and the entity file 'entity_room.txt' will - correspond to the entity 'room'. - """ - intent_filepaths = set() - entity_filepaths = set() - for filename in filenames: - filepath = Path(filename) - stem = filepath.stem - if stem.startswith("intent_"): - intent_filepaths.add(filepath) - elif stem.startswith("entity_"): - entity_filepaths.add(filepath) - else: - raise AssertionError("Filename should start either with " - "'intent_' or 'entity_' but found: %s" - % stem) - - intents_datasets = [IntentDataset.from_file(f) - for f in intent_filepaths] - - entities = [CustomEntity.from_file(f) for f in entity_filepaths] - entity_names = set(e.name for e in entities) - - # Add entities appearing only in the intents data - for intent_data in intents_datasets: - for entity_name in intent_data.entities_names: - if entity_name not in entity_names: - entity_names.add(entity_name) - entities.append(create_entity(entity_name)) - return cls(language, intents_datasets, entities) - - @property - def json(self): - intents = {intent_data.intent_name: intent_data.json - for intent_data in self.intents_datasets} - entities = {entity.name: entity.json for entity in self.entities} - return dict(language=self.language, intents=intents, entities=entities) diff --git a/snips_nlu/cli/dataset/entities.py b/snips_nlu/cli/dataset/entities.py deleted file mode 100644 index bb854f300..000000000 --- a/snips_nlu/cli/dataset/entities.py +++ /dev/null @@ -1,133 +0,0 @@ -# coding=utf-8 -from __future__ import unicode_literals - -import csv -import re -from abc import ABCMeta, abstractmethod -from pathlib import Path - -import six -from future.utils import with_metaclass - -from snips_nlu.constants import ( - AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS, SYNONYMS, - USE_SYNONYMS, VALUE) -from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity - -AUTO_EXT_REGEX = re.compile(r'^#\sautomatically_extensible=(true|false)\s*$') - - -class Entity(with_metaclass(ABCMeta, object)): - def __init__(self, name): - self.name = name - - @abstractmethod - def json(self): - pass - - -class CustomEntity(Entity): - """Custom entity of an :class:`.AssistantDataset` - - Attributes: - utterances (list of :class:`.EntityUtterance`): entity utterances - automatically_extensible (bool): whether or not the entity can be - extended to values not present in the dataset - use_synonyms (bool): whether or not to map entity values using - synonyms - """ - - def __init__(self, name, utterances, automatically_extensible, - use_synonyms, matching_strictness=1.0): - super(CustomEntity, self).__init__(name) - self.utterances = utterances - self.automatically_extensible = automatically_extensible - self.use_synonyms = use_synonyms - self.matching_strictness = matching_strictness - - @classmethod - def from_file(cls, filepath): - filepath = Path(filepath) - stem = filepath.stem - if not stem.startswith("entity_"): - raise AssertionError("Entity filename should start with 'entity_' " - "but found: %s" % stem) - entity_name = stem[7:] - if not entity_name: - raise AssertionError("Entity name must not be empty") - utterances = [] - with 
filepath.open(encoding="utf-8") as f: - it = f - if six.PY2: - it = list(utf_8_encoder(it)) - reader = csv.reader(list(it)) - autoextent = True - for row in reader: - if six.PY2: - row = [cell.decode("utf-8") for cell in row] - value = row[0] - if reader.line_num == 1: - m = AUTO_EXT_REGEX.match(row[0]) - if m: - autoextent = not m.group(1).lower() == 'false' - continue - if len(row) > 1: - synonyms = row[1:] - else: - synonyms = [] - utterances.append(EntityUtterance(value, synonyms)) - return cls(entity_name, utterances, - automatically_extensible=autoextent, use_synonyms=True) - - @property - def json(self): - """Returns the entity in json format""" - return { - AUTOMATICALLY_EXTENSIBLE: self.automatically_extensible, - USE_SYNONYMS: self.use_synonyms, - DATA: [u.json for u in self.utterances], - MATCHING_STRICTNESS: self.matching_strictness - } - - -class EntityUtterance(object): - """Represents a value of a :class:`.CustomEntity` with potential synonyms - - Attributes: - value (str): entity value - synonyms (list of str): The values to remap to the utterance value - """ - - def __init__(self, value, synonyms=None): - self.value = value - if synonyms is None: - synonyms = [] - self.synonyms = synonyms - - @property - def json(self): - return {VALUE: self.value, SYNONYMS: self.synonyms} - - -class BuiltinEntity(Entity): - """Builtin entity of an :class:`.AssistantDataset`""" - - @property - def json(self): - return dict() - - -def utf_8_encoder(f): - for line in f: - yield line.encode("utf-8") - - -def create_entity(entity_name, utterances=None, automatically_extensible=True, - use_synonyms=True): - if is_builtin_entity(entity_name): - return BuiltinEntity(entity_name) - else: - if utterances is None: - utterances = [] - return CustomEntity(entity_name, utterances, automatically_extensible, - use_synonyms) diff --git a/snips_nlu/cli/dataset/examples/entity_location.txt b/snips_nlu/cli/dataset/examples/entity_location.txt deleted file mode 100644 index c6453a733..000000000 --- a/snips_nlu/cli/dataset/examples/entity_location.txt +++ /dev/null @@ -1,3 +0,0 @@ -new york,big apple -paris,city of lights -london \ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt b/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt deleted file mode 100644 index 243c4d290..000000000 --- a/snips_nlu/cli/dataset/examples/entity_location_autoextent_false.txt +++ /dev/null @@ -1,4 +0,0 @@ -# automatically_extensible=false -new york,big apple -paris,city of lights -london \ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/intent_getWeather.txt b/snips_nlu/cli/dataset/examples/intent_getWeather.txt deleted file mode 100644 index bc611e565..000000000 --- a/snips_nlu/cli/dataset/examples/intent_getWeather.txt +++ /dev/null @@ -1,3 +0,0 @@ -what is the weather in [weatherLocation:location](Paris)? -Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation:location](Moscow)? -How is the weather in [weatherLocation:location](San Francisco) [weatherDate:snips/datetime](today)? 
\ No newline at end of file diff --git a/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt b/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt deleted file mode 100644 index 03f369d50..000000000 --- a/snips_nlu/cli/dataset/examples/intent_whoIsGame.txt +++ /dev/null @@ -1,3 +0,0 @@ -who is the [role:role](president) of [country:country](France) -who is the [role:role](prime minister) of [country:country](UK) -who is the [role:role](CEO) of [company:company](Google) please diff --git a/snips_nlu/cli/dataset/intent_dataset.py b/snips_nlu/cli/dataset/intent_dataset.py deleted file mode 100644 index bebb87575..000000000 --- a/snips_nlu/cli/dataset/intent_dataset.py +++ /dev/null @@ -1,309 +0,0 @@ -from __future__ import print_function, absolute_import - -from abc import ABCMeta, abstractmethod -from builtins import object -from pathlib import Path - -from future.utils import with_metaclass - -from snips_nlu.constants import UTTERANCES, SLOT_NAME, ENTITY, TEXT, DATA - -INTENT_FORMATTING_ERROR = AssertionError( - "Intent file is not properly formatted") - - -class IntentDataset(object): - """Dataset of an intent - - Can parse utterances from a text file or an iterator. - - An example of utterance is: - - "the [role:role](president) of [country:country](France)" - - a Tag is in this format: - - [slot:entity_name](text_to_tag) - - Attributes: - intent_name (str): name of the intent - utterances (list of :class:`.IntentUtterance`): intent utterances - """ - - def __init__(self, intent_name): - self.intent_name = intent_name - self.utterances = [] - - @classmethod - def from_file(cls, filepath): - filepath = Path(filepath) - stem = filepath.stem - if not stem.startswith("intent_"): - raise AssertionError("Intent filename should start with 'intent_' " - "but found: %s" % stem) - intent_name = stem[7:] - if not intent_name: - raise AssertionError("Intent name must not be empty") - with filepath.open(encoding="utf-8") as f: - lines = iter(l.strip() for l in f if l.strip()) - return cls.from_iter(intent_name, lines) - - @classmethod - def from_iter(cls, intent_name, samples_iter): - """Generates a dataset from an iterator of samples""" - dataset = cls(intent_name) - for sample in samples_iter: - utterance = IntentUtterance.parse(sample) - dataset.add(utterance) - return dataset - - def add(self, utterance): - """Adds an :class:`.IntentUtterance` to the dataset""" - self.utterances.append(utterance) - - @property - def json(self): - """Intent dataset in json format""" - return { - UTTERANCES: [ - {DATA: [chunk.json for chunk in utterance.chunks]} - for utterance in self.utterances - ] - } - - @property - def entities_names(self): - """Set of entity names present in the intent dataset""" - return set(chunk.entity for u in self.utterances - for chunk in u.chunks if isinstance(chunk, SlotChunk)) - - -class IntentUtterance(object): - def __init__(self, input, chunks): - self.input = input - self.chunks = chunks - - @property - def annotated(self): - """Annotates with * - - Returns: The sentence annotated just with stars - - Examples: - - >>> from snips_nlu.cli.dataset.intent_dataset import \ - IntentUtterance - >>> p = "the [role:role](president) of [country:country](France)" - >>> u = IntentUtterance.parse(p) - >>> u.annotated - 'the *president* of *France*' - """ - binput = bytearray(self.input, 'utf-8') - acc = 0 - star = ord('*') - for chunk in self.chunks: - if isinstance(chunk, SlotChunk): - binput.insert(chunk.range.start + acc, star) - binput.insert(chunk.range.end + acc + 1, star) - acc += 2 - 
return binput.decode('utf-8') - - @staticmethod - def stripped(input, chunks): - acc = 0 - s = '' - new_chunks = [] - for chunk in chunks: - start = chunk.range.start - end = chunk.range.end - s += input[start:end] - if isinstance(chunk, SlotChunk): - acc += chunk.tag_range.size - rng = Range(start - acc, end - acc) - new_chunk = SlotChunk(chunk.name, chunk.entity, rng, - chunk.text, chunk.tag_range) - new_chunks.append(new_chunk) - acc += 1 - else: - rng = Range(start - acc, end - acc) - new_chunks.append(TextChunk(chunk.text, rng)) - return s, new_chunks - - @staticmethod - def parse(string): - """Parses an utterance - - Args: - string (str): an utterance in the class:`.Utterance` format - - Examples: - - >>> from snips_nlu.cli.dataset.intent_dataset import \ - IntentUtterance - >>> u = IntentUtterance.\ - parse("president of [country:default](France)") - >>> len(u.chunks) - 2 - >>> u.chunks[0].text - 'president of ' - >>> u.chunks[0].range.start - 0 - >>> u.chunks[0].range.end - 13 - """ - sm = SM(string) - capture_text(sm) - string, chunks = IntentUtterance.stripped(string, sm.chunks) - return IntentUtterance(string, chunks) - - -class Chunk(with_metaclass(ABCMeta, object)): - def __init__(self, text, range): - self.text = text - self.range = range - - @abstractmethod - def json(self): - pass - - -class SlotChunk(Chunk): - def __init__(self, slot_name, entity, range, text, tag_range): - super(SlotChunk, self).__init__(text, range) - self.name = slot_name - self.entity = entity - self.tag_range = tag_range - - @property - def json(self): - return { - TEXT: self.text, - SLOT_NAME: self.name, - ENTITY: self.entity, - } - - -class TextChunk(Chunk): - @property - def json(self): - return { - TEXT: self.text - } - - -class Range(object): - def __init__(self, start, end=None): - self.start = start - self.end = end - - @property - def size(self): - return self.end - self.start + 1 - - -class SM(object): - """State Machine for parsing""" - - def __init__(self, input): - self.input = input - self.chunks = [] - self.current = 0 - - def add_slot(self, slot_start, name, entity): - """Adds a named slot - - Args: - slot_start (int): position where the slot tag started - name (str): slot name - entity (str): entity name - """ - tag_range = Range(slot_start - 1) - chunk = SlotChunk(slot_name=name, entity=entity, range=None, text=None, - tag_range=tag_range) - self.chunks.append(chunk) - - def add_text(self, text): - """Adds a simple text chunk using the current position""" - start = self.current - end = start + len(text) - chunk = TextChunk(text=text, range=Range(start=start, end=end)) - self.chunks.append(chunk) - - def add_tagged(self, text): - """Adds text to the last slot""" - if not self.chunks: - raise AssertionError("Cannot add tagged text because chunks list " - "is empty") - chunk = self.chunks[-1] - chunk.text = text - chunk.tag_range.end = self.current - 1 - chunk.range = Range(start=self.current, end=self.current + len(text)) - - def find(self, s): - return self.input.find(s, self.current) - - def move(self, pos): - """Moves the cursor of the state to position after given - - Args: - pos (int): position to place the cursor just after - """ - self.current = pos + 1 - - def peek(self): - return self[0] - - def read(self): - c = self[0] - self.current += 1 - return c - - def __getitem__(self, key): - current = self.current - if isinstance(key, int): - return self.input[current + key] - elif isinstance(key, slice): - start = current + key.start if key.start else current - return 
self.input[slice(start, key.stop, key.step)] - else: - raise TypeError("Bad key type: %s" % type(key)) - - -def capture_text(state): - next_pos = state.find('[') - sub = state[:] if next_pos < 0 else state[:next_pos] - if sub.strip(): - state.add_text(sub) - if next_pos >= 0: - state.move(next_pos) - capture_slot(state) - - -def capture_slot(state): - slot_start = state.current - next_pos = state.find(':') - if next_pos < 0: - raise INTENT_FORMATTING_ERROR - else: - slot_name = state[:next_pos] - state.move(next_pos) - next_pos = state.find(']') - if next_pos < 0: - raise INTENT_FORMATTING_ERROR - entity = state[:next_pos] - state.move(next_pos) - state.add_slot(slot_start, slot_name, entity) - if state.read() != '(': - raise INTENT_FORMATTING_ERROR - capture_tagged(state) - - -def capture_tagged(state): - next_pos = state.find(')') - if next_pos < 1: - raise INTENT_FORMATTING_ERROR - else: - tagged_text = state[:next_pos] - state.add_tagged(tagged_text) - state.move(next_pos) - capture_text(state) diff --git a/snips_nlu/cli/generate_dataset.py b/snips_nlu/cli/generate_dataset.py index ffb0cea89..a62d7cba0 100644 --- a/snips_nlu/cli/generate_dataset.py +++ b/snips_nlu/cli/generate_dataset.py @@ -4,7 +4,7 @@ import plac -from snips_nlu.cli.dataset.assistant_dataset import AssistantDataset +from snips_nlu.dataset import Dataset @plac.annotations( @@ -13,5 +13,8 @@ "filename")) def generate_dataset(language, *files): """Create a Snips NLU dataset from text friendly files""" - dataset = AssistantDataset.from_files(language, list(files)) + if any(f.endswith(".yml") or f.endswith(".yaml") for f in files): + dataset = Dataset.from_yaml_files(language, list(files)) + else: + dataset = Dataset.from_files(language, list(files)) print(json.dumps(dataset.json, indent=2, sort_keys=True)) diff --git a/snips_nlu/cli/inference.py b/snips_nlu/cli/inference.py index cfcfe2819..345fb2a5d 100644 --- a/snips_nlu/cli/inference.py +++ b/snips_nlu/cli/inference.py @@ -1,19 +1,27 @@ from __future__ import unicode_literals, print_function import json +import logging + from builtins import input import plac from snips_nlu import SnipsNLUEngine +from snips_nlu.cli.utils import set_nlu_logger @plac.annotations( training_path=("Path to a trained engine", "positional", None, str), query=("Query to parse. 
If provided, it disables the interactive " - "behavior.", "option", "q", str)) -def parse(training_path, query): + "behavior.", "option", "q", str), + verbose=("Print logs", "flag", "v"), +) +def parse(training_path, query, verbose=False): """Load a trained NLU engine and play with its parsing API interactively""" + if verbose: + set_nlu_logger(logging.DEBUG) + engine = SnipsNLUEngine.from_path(training_path) if query: diff --git a/snips_nlu/cli/metrics.py b/snips_nlu/cli/metrics.py index a383970ab..63b1de0aa 100644 --- a/snips_nlu/cli/metrics.py +++ b/snips_nlu/cli/metrics.py @@ -1,11 +1,14 @@ from __future__ import print_function, unicode_literals import json +import logging + from pathlib import Path import plac from snips_nlu import SnipsNLUEngine, load_resources +from snips_nlu.cli.utils import set_nlu_logger from snips_nlu.utils import json_string @@ -38,10 +41,15 @@ def parse(self, text): "(between 0 and 1)", "option", "t", float), exclude_slot_metrics=("Exclude slot metrics and slot errors in the output", "flag", "s", bool), - include_errors=("Include parsing errors in the output", "flag", "i", bool)) + include_errors=("Include parsing errors in the output", "flag", "i", bool), + verbose=("Print logs", "flag", "v"), +) def cross_val_metrics(dataset_path, output_path, config_path=None, nb_folds=5, train_size_ratio=1.0, exclude_slot_metrics=False, - include_errors=False): + include_errors=False, verbose=False): + if verbose: + set_nlu_logger(logging.DEBUG) + def progression_handler(progress): print("%d%%" % int(progress * 100)) @@ -84,10 +92,15 @@ def progression_handler(progress): config_path=("Path to a NLU engine config file", "option", "c", str), exclude_slot_metrics=("Exclude slot metrics and slot errors in the output", "flag", "s", bool), - include_errors=("Include parsing errors in the output", "flag", "i", bool)) + include_errors=("Include parsing errors in the output", "flag", "i", bool), + verbose=("Print logs", "flag", "v"), +) def train_test_metrics(train_dataset_path, test_dataset_path, output_path, config_path=None, exclude_slot_metrics=False, - include_errors=False): + include_errors=False, verbose=False): + if verbose: + set_nlu_logger(logging.DEBUG) + if config_path is not None: with Path(config_path).open("r", encoding="utf-8") as f: config = json.load(f) diff --git a/snips_nlu/cli/training.py b/snips_nlu/cli/training.py index 682309220..72a12d106 100644 --- a/snips_nlu/cli/training.py +++ b/snips_nlu/cli/training.py @@ -20,7 +20,7 @@ def train(dataset_path, output_path, config_path, verbose): """Train an NLU engine on the provided dataset""" if verbose: - set_nlu_logger(logging.INFO) + set_nlu_logger(logging.DEBUG) with Path(dataset_path).open("r", encoding="utf8") as f: dataset = json.load(f) diff --git a/snips_nlu/cli/utils.py b/snips_nlu/cli/utils.py index a31f19e2a..e04106029 100644 --- a/snips_nlu/cli/utils.py +++ b/snips_nlu/cli/utils.py @@ -105,6 +105,6 @@ def check_resources_alias(resource_name, shortcuts): def set_nlu_logger(level=logging.INFO): logger = logging.getLogger(snips_nlu.__name__) logger.setLevel(level) - handler = logging.StreamHandler() + handler = logging.StreamHandler(sys.stdout) handler.setLevel(level) logger.addHandler(handler) diff --git a/snips_nlu/dataset/__init__.py b/snips_nlu/dataset/__init__.py new file mode 100644 index 000000000..a43f2268b --- /dev/null +++ b/snips_nlu/dataset/__init__.py @@ -0,0 +1,7 @@ +from snips_nlu.dataset.dataset import Dataset +from snips_nlu.dataset.entity import Entity, EntityFormatError +from 
snips_nlu.dataset.intent import Intent, IntentFormatError +from snips_nlu.dataset.utils import ( + extract_intent_entities, extract_utterance_entities, + get_dataset_gazetteer_entities, get_text_from_chunks) +from snips_nlu.dataset.validation import validate_and_format_dataset diff --git a/snips_nlu/dataset/dataset.py b/snips_nlu/dataset/dataset.py new file mode 100644 index 000000000..d4eb996a2 --- /dev/null +++ b/snips_nlu/dataset/dataset.py @@ -0,0 +1,192 @@ +# coding=utf-8 +from __future__ import print_function, unicode_literals + +import io +from itertools import cycle +from pathlib import Path + +import yaml +from deprecation import deprecated +from snips_nlu_ontology import get_builtin_entity_examples + +from snips_nlu.__about__ import __version__ +from snips_nlu.dataset.entity import Entity +from snips_nlu.dataset.intent import Intent + + +class DatasetFormatError(TypeError): + pass + + +class Dataset(object): + """Dataset used in the main NLU training API + + Consists of intents and entities data. This object can be built either from + text files (:meth:`.Dataset.from_files`) or from YAML files + (:meth:`.Dataset.from_yaml_files`). + + Attributes: + language (str): language of the intents + intents (list of :class:`.Intent`): intents data + entities (list of :class:`.Entity`): entities data + """ + + def __init__(self, language, intents, entities): + self.language = language + self.intents = intents + self.entities = entities + self._add_missing_entities() + self._ensure_entity_values() + + @classmethod + def from_yaml_files(cls, language, filenames): + # pylint:disable=line-too-long + """Creates a :class:`.Dataset` from a language and a list of YAML files + containing intents and entities data + + Each file need not correspond to a single entity nor intent. They can + consist in several entities and intents merged together in a single + file. + + A dataset can be defined with a YAML document following the schema + illustrated in the example below: + + .. 
code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [destination](new york) leaving [date](this evening) + + # City Entity + --- + type: entity + name: city + values: + - london + - [new york, big apple] + - [paris, city of lights] + + Raises: + DatasetFormatError: When one of the documents present in the YAML + files has a wrong 'type' attribute, which is not 'entity' nor + 'intent' + IntentFormatError: When the YAML document of an intent does not + correspond to the :ref:`expected intent format ` + EntityFormatError: When the YAML document of an entity does not + correspond to the :ref:`expected entity format ` + """ + # pylint:enable=line-too-long + entities = [] + intents = [] + for filename in filenames: + with io.open(filename, encoding="utf8") as f: + for doc in yaml.safe_load_all(f): + doc_type = doc.get("type") + if doc_type == "entity": + entities.append(Entity.from_yaml(doc)) + elif doc_type == "intent": + intents.append(Intent.from_yaml(doc)) + else: + raise DatasetFormatError( + "Invalid 'type' value in YAML file '%s': '%s'" + % (filename, doc_type)) + return cls(language, intents, entities) + + @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, + details="Use from_yaml_files instead") + def from_files(cls, language, filenames): + """Creates a :class:`.Dataset` from a language and a list of intent and + entity files + + Args: + language (str): language of the assistant + filenames (list of str): Intent and entity files. + The assistant will associate each intent file to an intent, + and each entity file to an entity. For instance, the intent + file 'intent_setTemperature.txt' will correspond to the intent + 'setTemperature', and the entity file 'entity_room.txt' will + correspond to the entity 'room'. 
+ """ + intent_filepaths = set() + entity_filepaths = set() + for filename in filenames: + filepath = Path(filename) + stem = filepath.stem + if stem.startswith("intent_"): + intent_filepaths.add(filepath) + elif stem.startswith("entity_"): + entity_filepaths.add(filepath) + else: + raise AssertionError("Filename should start either with " + "'intent_' or 'entity_' but found: %s" + % stem) + + intents = [Intent.from_file(f) for f in intent_filepaths] + + entities = [Entity.from_file(f) for f in entity_filepaths] + return cls(language, intents, entities) + + def _add_missing_entities(self): + entity_names = set(e.name for e in self.entities) + + # Add entities appearing only in the intents utterances + for intent in self.intents: + for entity_name in intent.entities_names: + if entity_name not in entity_names: + entity_names.add(entity_name) + self.entities.append(Entity(name=entity_name)) + + def _ensure_entity_values(self): + entities_values = {entity.name: self._get_entity_values(entity) + for entity in self.entities} + for intent in self.intents: + for utterance in intent.utterances: + for chunk in utterance.slot_chunks: + if chunk.text is not None: + continue + try: + chunk.text = next(entities_values[chunk.entity]) + except StopIteration: + raise DatasetFormatError( + "At least one entity value must be provided for " + "entity '%s'" % chunk.entity) + return self + + def _get_entity_values(self, entity): + if entity.is_builtin: + return cycle(get_builtin_entity_examples( + entity.name, self.language)) + values = [v for utterance in entity.utterances + for v in utterance.variations] + values_set = set(values) + for intent in self.intents: + for utterance in intent.utterances: + for chunk in utterance.slot_chunks: + if not chunk.text or chunk.entity != entity.name: + continue + if chunk.text not in values_set: + values_set.add(chunk.text) + values.append(chunk.text) + return cycle(values) + + @property + def json(self): + """Dataset data in json format""" + intents = {intent_data.intent_name: intent_data.json + for intent_data in self.intents} + entities = {entity.name: entity.json for entity in self.entities} + return dict(language=self.language, intents=intents, entities=entities) diff --git a/snips_nlu/dataset/entity.py b/snips_nlu/dataset/entity.py new file mode 100644 index 000000000..9d138b525 --- /dev/null +++ b/snips_nlu/dataset/entity.py @@ -0,0 +1,188 @@ +# coding=utf-8 +from __future__ import unicode_literals + +import csv +import re +from builtins import str +from pathlib import Path + +import six +from deprecation import deprecated +from snips_nlu_ontology import get_all_builtin_entities + +from snips_nlu.__about__ import __version__ +from snips_nlu.constants import ( + AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS, SYNONYMS, + USE_SYNONYMS, VALUE) + +AUTO_EXT_REGEX = re.compile(r'^#\sautomatically_extensible=(true|false)\s*$') + + +class EntityFormatError(TypeError): + pass + + +class Entity(object): + """Entity data of a :class:`.Dataset` + + This class can represents both a custom or a builtin entity. When the + entity is a builtin one, only the `name` attribute is relevant. 
+ + Attributes: + name (str): name of the entity + utterances (list of :class:`.EntityUtterance`): entity utterances + (only for custom entities) + automatically_extensible (bool): whether or not the entity can be + extended to values not present in the data (only for custom + entities) + use_synonyms (bool): whether or not to map entity values using + synonyms (only for custom entities) + matching_strictness (float): controls the matching strictness of the + entity (only for custom entities). Must be between 0.0 and 1.0. + """ + + def __init__(self, name, utterances=None, automatically_extensible=True, + use_synonyms=True, matching_strictness=1.0): + if utterances is None: + utterances = [] + self.name = name + self.utterances = utterances + self.automatically_extensible = automatically_extensible + self.use_synonyms = use_synonyms + self.matching_strictness = matching_strictness + + @property + def is_builtin(self): + return self.name in get_all_builtin_entities() + + @classmethod + def from_yaml(cls, yaml_dict): + """Build an :class:`.Entity` from its YAML definition dict + + An entity can be defined with a YAML document following the schema + illustrated in the example below: + + .. code-block:: yaml + + # City Entity + --- + type: entity + name: city + automatically_extensible: false # default value is true + use_synonyms: false # default value is true + matching_strictness: 0.8 # default value is 1.0 + values: + - london + - [new york, big apple] + - [paris, city of lights] + + Raises: + EntityFormatError: When the YAML dict does not correspond to the + :ref:`expected entity format ` + """ + object_type = yaml_dict.get("type") + if object_type and object_type != "entity": + raise EntityFormatError("Wrong type: '%s'" % object_type) + entity_name = yaml_dict.get("name") + if not entity_name: + raise EntityFormatError("Missing 'name' attribute") + auto_extensible = yaml_dict.get(AUTOMATICALLY_EXTENSIBLE, True) + use_synonyms = yaml_dict.get(USE_SYNONYMS, True) + matching_strictness = yaml_dict.get("matching_strictness", 1.0) + utterances = [] + for entity_value in yaml_dict.get("values", []): + if isinstance(entity_value, list): + utterance = EntityUtterance(entity_value[0], entity_value[1:]) + elif isinstance(entity_value, str): + utterance = EntityUtterance(entity_value) + else: + raise EntityFormatError( + "YAML entity values must be either strings or lists, but " + "found: %s" % type(entity_value)) + utterances.append(utterance) + + return cls(name=entity_name, + utterances=utterances, + automatically_extensible=auto_extensible, + use_synonyms=use_synonyms, + matching_strictness=matching_strictness) + + @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, details="Use from_yaml instead") + def from_file(cls, filepath): + """Build an :class:`.Entity` from a text file""" + filepath = Path(filepath) + stem = filepath.stem + if not stem.startswith("entity_"): + raise EntityFormatError( + "Entity filename should start with 'entity_' but found: %s" + % stem) + entity_name = stem[7:] + if not entity_name: + raise EntityFormatError("Entity name must not be empty") + utterances = [] + with filepath.open(encoding="utf-8") as f: + it = f + if six.PY2: + it = list(utf_8_encoder(it)) + reader = csv.reader(list(it)) + autoextent = True + for row in reader: + if not row or not row[0].strip(): + continue + if six.PY2: + row = [cell.decode("utf-8") for cell in row] + value = row[0] + if reader.line_num == 1: + m = AUTO_EXT_REGEX.match(row[0]) + if m: 
+ autoextent = not m.group(1).lower() == 'false' + continue + if len(row) > 1: + synonyms = row[1:] + else: + synonyms = [] + utterances.append(EntityUtterance(value, synonyms)) + return cls(entity_name, utterances, + automatically_extensible=autoextent, use_synonyms=True) + + @property + def json(self): + """Returns the entity in json format""" + if self.is_builtin: + return dict() + return { + AUTOMATICALLY_EXTENSIBLE: self.automatically_extensible, + USE_SYNONYMS: self.use_synonyms, + DATA: [u.json for u in self.utterances], + MATCHING_STRICTNESS: self.matching_strictness + } + + +class EntityUtterance(object): + """Represents a value of a :class:`.CustomEntity` with potential synonyms + + Attributes: + value (str): entity value + synonyms (list of str): The values to remap to the utterance value + """ + + def __init__(self, value, synonyms=None): + self.value = value + if synonyms is None: + synonyms = [] + self.synonyms = synonyms + + @property + def variations(self): + return [self.value] + self.synonyms + + @property + def json(self): + return {VALUE: self.value, SYNONYMS: self.synonyms} + + +def utf_8_encoder(f): + for line in f: + yield line.encode("utf-8") diff --git a/snips_nlu/dataset/intent.py b/snips_nlu/dataset/intent.py new file mode 100644 index 000000000..d0261925b --- /dev/null +++ b/snips_nlu/dataset/intent.py @@ -0,0 +1,313 @@ +from __future__ import absolute_import, print_function + +from abc import ABCMeta, abstractmethod +from builtins import object +from pathlib import Path + +from deprecation import deprecated +from future.utils import with_metaclass + +from snips_nlu.__about__ import __version__ +from snips_nlu.constants import DATA, ENTITY, SLOT_NAME, TEXT, UTTERANCES + + +class IntentFormatError(TypeError): + pass + + +class Intent(object): + """Intent data of a :class:`.Dataset` + + Attributes: + intent_name (str): name of the intent + utterances (list of :class:`.IntentUtterance`): annotated intent + utterances + slot_mapping (dict): mapping between slot names and entities + """ + + def __init__(self, intent_name, utterances, slot_mapping=None): + if slot_mapping is None: + slot_mapping = dict() + self.intent_name = intent_name + self.utterances = utterances + self.slot_mapping = slot_mapping + self._complete_slot_name_mapping() + self._ensure_entity_names() + + @classmethod + def from_yaml(cls, yaml_dict): + # pylint:disable=line-too-long + """Build an :class:`.Intent` from its YAML definition dict + + An intent can be defined with a YAML document following the schema + illustrated in the example below: + + .. 
code-block:: yaml + + # searchFlight Intent + --- + type: intent + name: searchFlight + slots: + - name: origin + entity: city + - name: destination + entity: city + - name: date + entity: snips/datetime + utterances: + - find me a flight from [origin](Paris) to [destination](New York) + - I need a flight leaving [date](this weekend) to [destination](Berlin) + - show me flights to go to [destination](new york) leaving [date](this evening) + + Raises: + IntentFormatError: When the YAML dict does not correspond to the + :ref:`expected intent format ` + """ + # pylint:enable=line-too-long + object_type = yaml_dict.get("type") + if object_type and object_type != "intent": + raise IntentFormatError("Wrong type: '%s'" % object_type) + intent_name = yaml_dict.get("name") + if not intent_name: + raise IntentFormatError("Missing 'name' attribute") + slot_mapping = dict() + for slot in yaml_dict.get("slots", []): + slot_mapping[slot["name"]] = slot["entity"] + utterances = [IntentUtterance.parse(u.strip()) + for u in yaml_dict["utterances"] if u.strip()] + if not utterances: + raise IntentFormatError( + "Intent must contain at least one utterance") + return cls(intent_name, utterances, slot_mapping) + + @classmethod + @deprecated(deprecated_in="0.18.0", removed_in="0.19.0", + current_version=__version__, details="Use from_yaml instead") + def from_file(cls, filepath): + """Build an :class:`.Intent` from a text file""" + filepath = Path(filepath) + stem = filepath.stem + if not stem.startswith("intent_"): + raise IntentFormatError( + "Intent filename should start with 'intent_' but found: %s" + % stem) + intent_name = stem[7:] + if not intent_name: + raise IntentFormatError("Intent name must not be empty") + with filepath.open(encoding="utf-8") as f: + lines = iter(l.strip() for l in f if l.strip()) + utterances = [IntentUtterance.parse(sample) for sample in lines] + return cls(intent_name, utterances) + + def _complete_slot_name_mapping(self): + for utterance in self.utterances: + for chunk in utterance.slot_chunks: + if chunk.entity and chunk.slot_name not in self.slot_mapping: + self.slot_mapping[chunk.slot_name] = chunk.entity + return self + + def _ensure_entity_names(self): + for utterance in self.utterances: + for chunk in utterance.slot_chunks: + if chunk.entity: + continue + chunk.entity = self.slot_mapping.get( + chunk.slot_name, chunk.slot_name) + return self + + @property + def json(self): + """Intent data in json format""" + return { + UTTERANCES: [ + {DATA: [chunk.json for chunk in utterance.chunks]} + for utterance in self.utterances + ] + } + + @property + def entities_names(self): + return set(chunk.entity for u in self.utterances + for chunk in u.chunks if isinstance(chunk, SlotChunk)) + + +class IntentUtterance(object): + def __init__(self, chunks): + self.chunks = chunks + + @property + def text(self): + return "".join((chunk.text for chunk in self.chunks)) + + @property + def slot_chunks(self): + return (chunk for chunk in self.chunks if isinstance(chunk, SlotChunk)) + + @classmethod + def parse(cls, string): + """Parses an utterance + + Args: + string (str): an utterance in the class:`.Utterance` format + + Examples: + + >>> from snips_nlu.dataset.intent import IntentUtterance + >>> u = IntentUtterance.\ + parse("president of [country:default](France)") + >>> u.text + 'president of France' + >>> len(u.chunks) + 2 + >>> u.chunks[0].text + 'president of ' + >>> u.chunks[1].slot_name + 'country' + >>> u.chunks[1].entity + 'default' + """ + sm = SM(string) + capture_text(sm) + 
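+        # capture_text, capture_slot and capture_tagged recursively consume
+        # the input, appending TextChunk and SlotChunk objects to sm.chunks;
+        # the accumulated chunks form the parsed utterance returned below.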
return cls(sm.chunks) + + +class Chunk(with_metaclass(ABCMeta, object)): + def __init__(self, text): + self.text = text + + @abstractmethod + def json(self): + pass + + +class SlotChunk(Chunk): + def __init__(self, slot_name, entity, text): + super(SlotChunk, self).__init__(text) + self.slot_name = slot_name + self.entity = entity + + @property + def json(self): + return { + TEXT: self.text, + SLOT_NAME: self.slot_name, + ENTITY: self.entity, + } + + +class TextChunk(Chunk): + @property + def json(self): + return { + TEXT: self.text + } + + +class SM(object): + """State Machine for parsing""" + + def __init__(self, input): + self.input = input + self.chunks = [] + self.current = 0 + + @property + def end_of_input(self): + return self.current >= len(self.input) + + def add_slot(self, name, entity=None): + """Adds a named slot + + Args: + name (str): slot name + entity (str): entity name + """ + chunk = SlotChunk(slot_name=name, entity=entity, text=None) + self.chunks.append(chunk) + + def add_text(self, text): + """Adds a simple text chunk using the current position""" + chunk = TextChunk(text=text) + self.chunks.append(chunk) + + def add_tagged(self, text): + """Adds text to the last slot""" + if not self.chunks: + raise AssertionError("Cannot add tagged text because chunks list " + "is empty") + self.chunks[-1].text = text + + def find(self, s): + return self.input.find(s, self.current) + + def move(self, pos): + """Moves the cursor of the state to position after given + + Args: + pos (int): position to place the cursor just after + """ + self.current = pos + 1 + + def peek(self): + if self.end_of_input: + return None + return self[0] + + def read(self): + c = self[0] + self.current += 1 + return c + + def __getitem__(self, key): + current = self.current + if isinstance(key, int): + return self.input[current + key] + elif isinstance(key, slice): + start = current + key.start if key.start else current + return self.input[slice(start, key.stop, key.step)] + else: + raise TypeError("Bad key type: %s" % type(key)) + + +def capture_text(state): + next_pos = state.find('[') + sub = state[:] if next_pos < 0 else state[:next_pos] + if sub.strip(): + state.add_text(sub) + if next_pos >= 0: + state.move(next_pos) + capture_slot(state) + + +def capture_slot(state): + next_colon_pos = state.find(':') + next_square_bracket_pos = state.find(']') + if next_square_bracket_pos < 0: + raise IntentFormatError("Missing ending ']' in annotated utterance") + if next_colon_pos < 0 or next_square_bracket_pos < next_colon_pos: + slot_name = state[:next_square_bracket_pos] + state.move(next_square_bracket_pos) + state.add_slot(slot_name) + else: + slot_name = state[:next_colon_pos] + state.move(next_colon_pos) + entity = state[:next_square_bracket_pos] + state.move(next_square_bracket_pos) + state.add_slot(slot_name, entity) + if state.peek() == '(': + state.read() + capture_tagged(state) + else: + capture_text(state) + + +def capture_tagged(state): + next_pos = state.find(')') + if next_pos < 1: + raise IntentFormatError("Missing ending ')' in annotated utterance") + else: + tagged_text = state[:next_pos] + state.add_tagged(tagged_text) + state.move(next_pos) + capture_text(state) diff --git a/snips_nlu/dataset/utils.py b/snips_nlu/dataset/utils.py new file mode 100644 index 000000000..b4fc0c33b --- /dev/null +++ b/snips_nlu/dataset/utils.py @@ -0,0 +1,51 @@ +from __future__ import unicode_literals + +from future.utils import iteritems, itervalues +from yaml import Loader, SafeLoader + +from snips_nlu.constants 
import ( + DATA, ENTITIES, ENTITY, INTENTS, TEXT, UTTERANCES) +from snips_nlu.entity_parser.builtin_entity_parser import is_gazetteer_entity + + +def construct_yaml_str(self, node): + # Override the default string handling function + # to always return unicode objects + return self.construct_scalar(node) + + +Loader.add_constructor("tag:yaml.org,2002:str", construct_yaml_str) +SafeLoader.add_constructor("tag:yaml.org,2002:str", construct_yaml_str) + + +def extract_utterance_entities(dataset): + entities_values = {ent_name: set() for ent_name in dataset[ENTITIES]} + + for intent in itervalues(dataset[INTENTS]): + for utterance in intent[UTTERANCES]: + for chunk in utterance[DATA]: + if ENTITY in chunk: + entities_values[chunk[ENTITY]].add(chunk[TEXT].strip()) + return {k: list(v) for k, v in iteritems(entities_values)} + + +def extract_intent_entities(dataset, entity_filter=None): + intent_entities = {intent: set() for intent in dataset[INTENTS]} + for intent_name, intent_data in iteritems(dataset[INTENTS]): + for utterance in intent_data[UTTERANCES]: + for chunk in utterance[DATA]: + if ENTITY in chunk: + if entity_filter and not entity_filter(chunk[ENTITY]): + continue + intent_entities[intent_name].add(chunk[ENTITY]) + return intent_entities + + +def get_text_from_chunks(chunks): + return "".join(chunk[TEXT] for chunk in chunks) + + +def get_dataset_gazetteer_entities(dataset, intent=None): + if intent is not None: + return extract_intent_entities(dataset, is_gazetteer_entity)[intent] + return {e for e in dataset[ENTITIES] if is_gazetteer_entity(e)} diff --git a/snips_nlu/dataset.py b/snips_nlu/dataset/validation.py similarity index 76% rename from snips_nlu/dataset.py rename to snips_nlu/dataset/validation.py index 22fba7c30..5f5dc7e16 100644 --- a/snips_nlu/dataset.py +++ b/snips_nlu/dataset/validation.py @@ -12,36 +12,14 @@ AUTOMATICALLY_EXTENSIBLE, CAPITALIZE, DATA, ENTITIES, ENTITY, INTENTS, LANGUAGE, MATCHING_STRICTNESS, SLOT_NAME, SYNONYMS, TEXT, USE_SYNONYMS, UTTERANCES, VALIDATED, VALUE) +from snips_nlu.dataset import extract_utterance_entities from snips_nlu.entity_parser.builtin_entity_parser import ( - BuiltinEntityParser, is_builtin_entity, is_gazetteer_entity) + BuiltinEntityParser, is_builtin_entity) from snips_nlu.preprocessing import tokenize_light from snips_nlu.string_variations import get_string_variations from snips_nlu.utils import validate_key, validate_keys, validate_type -def extract_utterance_entities(dataset): - entities_values = {ent_name: set() for ent_name in dataset[ENTITIES]} - - for intent in itervalues(dataset[INTENTS]): - for utterance in intent[UTTERANCES]: - for chunk in utterance[DATA]: - if ENTITY in chunk: - entities_values[chunk[ENTITY]].add(chunk[TEXT].strip()) - return {k: list(v) for k, v in iteritems(entities_values)} - - -def extract_intent_entities(dataset, entity_filter=None): - intent_entities = {intent: set() for intent in dataset[INTENTS]} - for intent_name, intent_data in iteritems(dataset[INTENTS]): - for utterance in intent_data[UTTERANCES]: - for chunk in utterance[DATA]: - if ENTITY in chunk: - if entity_filter and not entity_filter(chunk[ENTITY]): - continue - intent_entities[intent_name].add(chunk[ENTITY]) - return intent_entities - - def validate_and_format_dataset(dataset): """Checks that the dataset is valid and format it""" # Make this function idempotent @@ -61,7 +39,7 @@ def validate_and_format_dataset(dataset): raise ValueError("Unknown language: '%s'" % language) for intent in itervalues(dataset[INTENTS]): - 
validate_and_format_intent(intent, dataset[ENTITIES]) + _validate_and_format_intent(intent, dataset[ENTITIES]) utterance_entities_values = extract_utterance_entities(dataset) builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset) @@ -70,15 +48,17 @@ def validate_and_format_dataset(dataset): uterrance_entities = utterance_entities_values[entity_name] if is_builtin_entity(entity_name): dataset[ENTITIES][entity_name] = \ - validate_and_format_builtin_entity(entity, uterrance_entities) + _validate_and_format_builtin_entity(entity, uterrance_entities) else: - dataset[ENTITIES][entity_name] = validate_and_format_custom_entity( - entity, uterrance_entities, language, builtin_entity_parser) + dataset[ENTITIES][entity_name] = \ + _validate_and_format_custom_entity( + entity, uterrance_entities, language, + builtin_entity_parser) dataset[VALIDATED] = True return dataset -def validate_and_format_intent(intent, entities): +def _validate_and_format_intent(intent, entities): validate_type(intent, dict) validate_key(intent, UTTERANCES, object_label="intent dict") validate_type(intent[UTTERANCES], list) @@ -100,11 +80,7 @@ def validate_and_format_intent(intent, entities): return intent -def get_text_from_chunks(chunks): - return "".join(chunk[TEXT] for chunk in chunks) - - -def has_any_capitalization(entity_utterances, language): +def _has_any_capitalization(entity_utterances, language): for utterance in entity_utterances: tokens = tokenize_light(utterance, language) if any(t.isupper() or t.istitle() for t in tokens): @@ -112,7 +88,7 @@ def has_any_capitalization(entity_utterances, language): return False -def add_entity_variations(utterances, entity_variations, entity_value): +def _add_entity_variations(utterances, entity_variations, entity_value): utterances[entity_value] = entity_value for variation in entity_variations[entity_value]: if variation: @@ -129,8 +105,8 @@ def _extract_entity_values(entity): return values -def validate_and_format_custom_entity(entity, queries_entities, language, - builtin_entity_parser): +def _validate_and_format_custom_entity(entity, queries_entities, language, + builtin_entity_parser): validate_type(entity, dict) # TODO: this is here temporarily, only to allow backward compatibility @@ -169,8 +145,8 @@ def validate_and_format_custom_entity(entity, queries_entities, language, # Compute capitalization before normalizing # Normalization lowercase and hence lead to bad capitalization calculation - formatted_entity[CAPITALIZE] = has_any_capitalization(queries_entities, - language) + formatted_entity[CAPITALIZE] = _has_any_capitalization(queries_entities, + language) validated_utterances = dict() # Map original values an synonyms @@ -208,7 +184,7 @@ def validate_and_format_custom_entity(entity, queries_entities, language, for entry in entity[DATA]: entry_value = entry[VALUE] - validated_utterances = add_entity_variations( + validated_utterances = _add_entity_variations( validated_utterances, non_colliding_variations, entry_value) # Merge queries entities @@ -227,12 +203,6 @@ def validate_and_format_custom_entity(entity, queries_entities, language, return formatted_entity -def validate_and_format_builtin_entity(entity, queries_entities): +def _validate_and_format_builtin_entity(entity, queries_entities): validate_type(entity, dict) return {UTTERANCES: set(queries_entities)} - - -def get_dataset_gazetteer_entities(dataset, intent=None): - if intent is not None: - return extract_intent_entities(dataset, is_gazetteer_entity)[intent] - return {e for e in 
dataset[ENTITIES] if is_gazetteer_entity(e)} diff --git a/snips_nlu/default_configs/config_de.py b/snips_nlu/default_configs/config_de.py index 99cd61ead..34b6eabd6 100644 --- a/snips_nlu/default_configs/config_de.py +++ b/snips_nlu/default_configs/config_de.py @@ -175,6 +175,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_en.py b/snips_nlu/default_configs/config_en.py index 5c12803f3..a7bbbfa5c 100644 --- a/snips_nlu/default_configs/config_en.py +++ b/snips_nlu/default_configs/config_en.py @@ -152,6 +152,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_es.py b/snips_nlu/default_configs/config_es.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_es.py +++ b/snips_nlu/default_configs/config_es.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_fr.py b/snips_nlu/default_configs/config_fr.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_fr.py +++ b/snips_nlu/default_configs/config_fr.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_it.py b/snips_nlu/default_configs/config_it.py index dd6e6b8cc..3356b1395 100644 --- a/snips_nlu/default_configs/config_it.py +++ b/snips_nlu/default_configs/config_it.py @@ -139,6 +139,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_ja.py b/snips_nlu/default_configs/config_ja.py index 46849b8b3..cfe6fac4a 100644 --- a/snips_nlu/default_configs/config_ja.py +++ b/snips_nlu/default_configs/config_ja.py @@ -195,6 +195,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/default_configs/config_ko.py b/snips_nlu/default_configs/config_ko.py index 4da2fd365..0b8c61245 100644 --- a/snips_nlu/default_configs/config_ko.py +++ b/snips_nlu/default_configs/config_ko.py @@ -173,6 +173,7 @@ "min_utterances": 20, "noise_factor": 5, "add_builtin_entities_examples": True, + "max_unknown_words": 0, "unknown_word_prob": 0.0, "unknown_words_replacement_string": None }, diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index 919b8b753..73f06493f 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py +++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py @@ -9,8 +9,8 @@ import numpy as np from future.utils import iteritems, itervalues -from snips_nlu.constants import ( - DATA, ENTITY, INTENTS, TEXT, UNKNOWNWORD, UTTERANCES) +from snips_nlu.constants import (DATA, ENTITIES, ENTITY, INTENTS, TEXT, + UNKNOWNWORD, UTTERANCES) from snips_nlu.data_augmentation import augment_utterances 
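The default configuration files above now expose a ``max_unknown_words`` field next to ``unknown_word_prob`` and ``unknown_words_replacement_string``. Below is a minimal sketch, with purely illustrative values, of how this data augmentation configuration could be built programmatically; a replacement string is required whenever ``unknown_word_prob`` is positive.

.. code-block:: python

    from snips_nlu.pipeline.configs import (
        IntentClassifierDataAugmentationConfig)

    # Illustrative values: with probability 0.5 per utterance, append between
    # 1 and 3 "unknownword" tokens at the end of the augmented training
    # utterance used by the intent classifier.
    augmentation_config = IntentClassifierDataAugmentationConfig(
        min_utterances=20,
        noise_factor=5,
        unknown_word_prob=0.5,
        unknown_words_replacement_string="unknownword",
        max_unknown_words=3)

    print(augmentation_config.to_dict())

In the default configurations, ``unknown_words_replacement_string`` is ``None``, so this augmentation step is skipped entirely when training with the stock settings.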
from snips_nlu.dataset import get_text_from_chunks from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity @@ -50,16 +50,16 @@ def get_noise_it(noise, mean_length, std_length, random_state): # pylint: enable=stop-iteration-return -def generate_smart_noise(augmented_utterances, replacement_string, language): +def generate_smart_noise(noise, augmented_utterances, replacement_string, + language): text_utterances = [get_text_from_chunks(u[DATA]) for u in augmented_utterances] vocab = [w for u in text_utterances for w in tokenize_light(u, language)] vocab = set(vocab) - noise = get_noise(language) return [w if w in vocab else replacement_string for w in noise] -def generate_noise_utterances(augmented_utterances, num_intents, +def generate_noise_utterances(augmented_utterances, noise, num_intents, data_augmentation_config, language, random_state): if not augmented_utterances or not num_intents: @@ -67,11 +67,9 @@ def generate_noise_utterances(augmented_utterances, num_intents, avg_num_utterances = len(augmented_utterances) / float(num_intents) if data_augmentation_config.unknown_words_replacement_string is not None: noise = generate_smart_noise( - augmented_utterances, + noise, augmented_utterances, data_augmentation_config.unknown_words_replacement_string, language) - else: - noise = get_noise(language) noise_size = min( int(data_augmentation_config.noise_factor * avg_num_utterances), @@ -89,14 +87,38 @@ def generate_noise_utterances(augmented_utterances, num_intents, for _ in range(noise_size)] -def add_unknown_word_to_utterances(augmented_utterances, replacement_string, - unknown_word_prob, random_state): - for u in augmented_utterances: - for chunk in u[DATA]: - if ENTITY in chunk and not is_builtin_entity(chunk[ENTITY]) \ - and random_state.rand() < unknown_word_prob: - chunk[TEXT] = WORD_REGEX.sub(replacement_string, chunk[TEXT]) - return augmented_utterances +def add_unknown_word_to_utterances(utterances, replacement_string, + unknown_word_prob, max_unknown_words, + random_state): + new_utterances = deepcopy(utterances) + for u in new_utterances: + if random_state.rand() < unknown_word_prob: + num_unknown = random_state.randint(1, max_unknown_words + 1) + # We choose to put the noise at the end of the sentence and not + # in the middle so that it doesn't impact to much ngrams + # computation + extra_chunk = { + TEXT: " " + " ".join( + replacement_string for _ in range(num_unknown)) + } + u[DATA].append(extra_chunk) + return new_utterances + + +def get_dataset_specific_noise(dataset, language): + """Return a noise list that excludes the dataset entity values""" + entities_values = set() + for ent_name, ent in iteritems(dataset[ENTITIES]): + if is_builtin_entity(ent_name): + continue + for k, v in iteritems(ent[UTTERANCES]): + entities_values.add(k) + entities_values.add(v) + original_noise = get_noise(language) + specific_noise = [n for n in original_noise if n not in entities_values] + if not specific_noise: # Avoid returning an empty noise + return original_noise + return specific_noise def build_training_data(dataset, language, data_augmentation_config, @@ -129,17 +151,20 @@ def build_training_data(dataset, language, data_augmentation_config, augmented_utterances += utterances utterance_classes += [classes_mapping[intent_name] for _ in range(len(utterances))] - augmented_utterances = add_unknown_word_to_utterances( - augmented_utterances, - data_augmentation_config.unknown_words_replacement_string, - data_augmentation_config.unknown_word_prob, - random_state - ) + if 
data_augmentation_config.unknown_words_replacement_string is not None: + augmented_utterances = add_unknown_word_to_utterances( + augmented_utterances, + data_augmentation_config.unknown_words_replacement_string, + data_augmentation_config.unknown_word_prob, + data_augmentation_config.max_unknown_words, + random_state + ) # Adding noise + noise = get_dataset_specific_noise(dataset, language) noisy_utterances = generate_noise_utterances( - augmented_utterances, len(intents), data_augmentation_config, language, - random_state) + augmented_utterances, noise, len(intents), data_augmentation_config, + language, random_state) augmented_utterances += noisy_utterances utterance_classes += [noise_class for _ in noisy_utterances] diff --git a/snips_nlu/intent_classifier/modifiers.py b/snips_nlu/intent_classifier/modifiers.py new file mode 100644 index 000000000..50d2bd937 --- /dev/null +++ b/snips_nlu/intent_classifier/modifiers.py @@ -0,0 +1,24 @@ +MODIFIERS = { + "it": { + "più", + "piu", + "meno", + "molto", + "non", + "troppo", + "troppa", + "ancora", + "senza", + "con", + "forte", + "forti", + "alto", + "alta", + "alti", + "alte" + "bassa", + "basso", + "bassi", + "basse" + } +} diff --git a/snips_nlu/nlu_engine/__init__.py b/snips_nlu/nlu_engine/__init__.py index a4ec8db24..07c8ebc73 100644 --- a/snips_nlu/nlu_engine/__init__.py +++ b/snips_nlu/nlu_engine/__init__.py @@ -1 +1 @@ -from .nlu_engine import SnipsNLUEngine +from snips_nlu.nlu_engine.nlu_engine import SnipsNLUEngine diff --git a/snips_nlu/pipeline/configs/intent_classifier.py b/snips_nlu/pipeline/configs/intent_classifier.py index 42abb8b6d..4ac330991 100644 --- a/snips_nlu/pipeline/configs/intent_classifier.py +++ b/snips_nlu/pipeline/configs/intent_classifier.py @@ -118,13 +118,15 @@ class IntentClassifierDataAugmentationConfig(Config): def __init__(self, min_utterances=20, noise_factor=5, add_builtin_entities_examples=True, unknown_word_prob=0, - unknown_words_replacement_string=None): + unknown_words_replacement_string=None, + max_unknown_words=None): self.min_utterances = min_utterances self.noise_factor = noise_factor self.add_builtin_entities_examples = add_builtin_entities_examples self.unknown_word_prob = unknown_word_prob self.unknown_words_replacement_string = \ unknown_words_replacement_string + self.max_unknown_words = max_unknown_words if unknown_word_prob > 0 and unknown_words_replacement_string is None: raise ValueError("unknown_word_prob is positive (%s) but the " "replacement string is None" % unknown_word_prob) @@ -145,6 +147,7 @@ def to_dict(self): "unknown_word_prob": self.unknown_word_prob, "unknown_words_replacement_string": self.unknown_words_replacement_string, + "max_unknown_words": self.max_unknown_words } @classmethod diff --git a/snips_nlu/tests/test_cli.py b/snips_nlu/tests/test_cli.py index 57542d64b..d55d2f87b 100644 --- a/snips_nlu/tests/test_cli.py +++ b/snips_nlu/tests/test_cli.py @@ -6,11 +6,6 @@ from snips_nlu import SnipsNLUEngine from snips_nlu.cli import cross_val_metrics, parse, train, train_test_metrics -from snips_nlu.cli.dataset import AssistantDataset -from snips_nlu.cli.dataset.entities import CustomEntity -from snips_nlu.cli.dataset.intent_dataset import IntentDataset -from snips_nlu.constants import PACKAGE_PATH -from snips_nlu.dataset import validate_and_format_dataset from snips_nlu.tests.utils import BEVERAGE_DATASET_PATH, SnipsTest, TEST_PATH @@ -74,359 +69,3 @@ def test_train_test_metrics(self): # Then if not self.tmp_file_path.exists(): self.fail("No metrics found") - - def 
test_should_generate_intent_from_file(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file = examples_path / "intent_getWeather.txt" - - # When - intent_dataset = IntentDataset.from_file(intent_file) - intent_dict = intent_dataset.json - - # Then - expected_intent_dict = { - "utterances": [ - { - "data": [ - { - "text": "what is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Paris" - }, - { - "text": "?" - } - ] - }, - { - "data": [ - { - "text": "Will it rain " - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "tomorrow" - }, - { - "text": " in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Moscow" - }, - { - "text": "?" - } - ] - }, - { - "data": [ - { - "text": "How is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "San Francisco" - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "today" - }, - { - "text": "?" - } - ] - } - ] - } - - self.assertDictEqual(expected_intent_dict, intent_dict) - - def test_should_generate_entity_from_file(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / "entity_location.txt" - - # When - entity_dataset = CustomEntity.from_file(entity_file) - entity_dict = entity_dataset.json - - # Then - expected_entity_dict = { - "automatically_extensible": True, - "data": [ - { - "synonyms": [ - "big apple" - ], - "value": "new york" - }, - { - "synonyms": [ - "city of lights" - ], - "value": "paris" - }, - { - "synonyms": [], - "value": "london" - } - ], - "use_synonyms": True, - "matching_strictness": 1.0 - } - self.assertDictEqual(expected_entity_dict, entity_dict) - - def test_should_generate_entity_from_file_with_autoextensible(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / "entity_location_autoextent_false.txt" - - # When - entity_dataset = CustomEntity.from_file(entity_file) - entity_dict = entity_dataset.json - - # Then - expected_entity_dict = { - "automatically_extensible": False, - "data": [ - { - "synonyms": [ - "big apple" - ], - "value": "new york" - }, - { - "synonyms": [ - "city of lights" - ], - "value": "paris" - }, - { - "synonyms": [], - "value": "london" - } - ], - "use_synonyms": True, - "matching_strictness": 1.0 - } - self.assertDictEqual(expected_entity_dict, entity_dict) - - def test_should_generate_dataset_from_files(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file_1 = examples_path / "intent_whoIsGame.txt" - intent_file_2 = examples_path / "intent_getWeather.txt" - entity_file_1 = examples_path / "entity_location.txt" - - dataset = AssistantDataset.from_files( - "en", [intent_file_1, intent_file_2, entity_file_1]) - dataset_dict = dataset.json - - # When / Then - expected_dataset_dict = { - "entities": { - "company": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, - }, - "country": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, - }, - "location": { - "automatically_extensible": True, - "data": [ - { - "synonyms": [ - "big apple" - ], - "value": "new york" - }, - { - "synonyms": [ - "city of lights" - ], - "value": "paris" - }, - { - "synonyms": [], - "value": "london" - } - ], - "use_synonyms": True, - "matching_strictness": 1.0, - }, 
- "role": { - "automatically_extensible": True, - "data": [], - "use_synonyms": True, - "matching_strictness": 1.0, - }, - "snips/datetime": {} - }, - "intents": { - "getWeather": { - "utterances": [ - { - "data": [ - { - "text": "what is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Paris" - }, - { - "text": "?" - } - ] - }, - { - "data": [ - { - "text": "Will it rain " - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "tomorrow" - }, - { - "text": " in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "Moscow" - }, - { - "text": "?" - } - ] - }, - { - "data": [ - { - "text": "How is the weather in " - }, - { - "entity": "location", - "slot_name": "weatherLocation", - "text": "San Francisco" - }, - { - "entity": "snips/datetime", - "slot_name": "weatherDate", - "text": "today" - }, - { - "text": "?" - } - ] - } - ] - }, - "whoIsGame": { - "utterances": [ - { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "president" - }, - { - "text": " of " - }, - { - "entity": "country", - "slot_name": "country", - "text": "France" - } - ] - }, - { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "prime minister" - }, - { - "text": " of " - }, - { - "entity": "country", - "slot_name": "country", - "text": "UK" - } - ] - }, - { - "data": [ - { - "text": "who is the " - }, - { - "entity": "role", - "slot_name": "role", - "text": "CEO" - }, - { - "text": " of " - }, - { - "entity": "company", - "slot_name": "company", - "text": "Google" - }, - { - "text": " please" - } - ] - } - ] - } - }, - "language": "en" - } - validate_and_format_dataset(dataset_dict) - self.assertDictEqual(expected_dataset_dict, dataset_dict) - - def test_should_fail_generating_intent_with_wrong_file_name(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - intent_file = examples_path / "getWeather.txt" - - # When / Then - with self.assertRaises(AssertionError): - IntentDataset.from_file(intent_file) - - def test_should_fail_generating_entity_with_wrong_file_name(self): - # Given - examples_path = PACKAGE_PATH / "cli" / "dataset" / "examples" - entity_file = examples_path / "location.txt" - - # When / Then - with self.assertRaises(AssertionError): - CustomEntity.from_file(entity_file) diff --git a/snips_nlu/tests/test_config.py b/snips_nlu/tests/test_config.py index 16fd8c375..3077ae149 100644 --- a/snips_nlu/tests/test_config.py +++ b/snips_nlu/tests/test_config.py @@ -27,6 +27,7 @@ def test_intent_classifier_data_augmentation_config(self): "add_builtin_entities_examples": False, "unknown_word_prob": 0.1, "unknown_words_replacement_string": "foobar", + "max_unknown_words": None, } # When diff --git a/snips_nlu/tests/test_dataset_loading.py b/snips_nlu/tests/test_dataset_loading.py new file mode 100644 index 000000000..9b825b049 --- /dev/null +++ b/snips_nlu/tests/test_dataset_loading.py @@ -0,0 +1,294 @@ +from __future__ import unicode_literals + +import io +from unittest import TestCase + +from deprecation import fail_if_not_removed +from mock import patch + +from snips_nlu.dataset import Dataset, validate_and_format_dataset + +EXPECTED_DATASET_DICT = { + "entities": { + "company": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "country": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + 
"matching_strictness": 1.0, + }, + "location": { + "automatically_extensible": True, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" + }, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "role": { + "automatically_extensible": True, + "data": [], + "use_synonyms": True, + "matching_strictness": 1.0, + }, + "snips/datetime": {} + }, + "intents": { + "getWeather": { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "Paris" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "is it raining in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "new york" + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "Today" + } + ] + } + ] + }, + "whoIsGame": { + "utterances": [ + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "president" + }, + { + "text": " of " + }, + { + "entity": "country", + "slot_name": "country", + "text": "France" + } + ] + }, + { + "data": [ + { + "text": "who is the " + }, + { + "entity": "role", + "slot_name": "role", + "text": "CEO" + }, + { + "text": " of " + }, + { + "entity": "company", + "slot_name": "company", + "text": "Google" + }, + { + "text": " please" + } + ] + } + ] + } + }, + "language": "en" +} + + +class TestDatasetLoading(TestCase): + @patch("snips_nlu.dataset.dataset.io") + def test_should_generate_dataset_from_yaml_files(self, mock_io): + # Given + intent_file_1 = "whoIsGame.yaml" + intent_file_2 = "getWeather.yaml" + entity_file_1 = "location.yaml" + + who_is_game_yaml = """ +# whoIsGame Intent +--- +type: intent +name: whoIsGame +utterances: + - who is the [role](president) of [country](France) + - who is the [role](CEO) of [company](Google) please + """ + + get_weather_yaml = """ +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [weatherLocation:location](Paris)? + - is it raining in [weatherLocation] [weatherDate:snips/datetime] + """ + + location_yaml = """ +# Location Entity +--- +type: entity +name: location +automatically_extensible: true +values: +- [new york, big apple] +- london + """ + + # pylint:disable=unused-argument + def mock_open(filename, **kwargs): + if filename == intent_file_1: + return io.StringIO(who_is_game_yaml) + if filename == intent_file_2: + return io.StringIO(get_weather_yaml) + if filename == entity_file_1: + return io.StringIO(location_yaml) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open + dataset_files = [intent_file_1, intent_file_2, entity_file_1] + + # When + dataset = Dataset.from_yaml_files("en", dataset_files) + dataset_dict = dataset.json + + # Then + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) + + @patch("snips_nlu.dataset.dataset.io") + def test_should_generate_dataset_from_merged_yaml_file(self, mock_io): + # Given + dataset_file = "dataset.yaml" + dataset_yaml = """ +# whoIsGame Intent +--- +type: intent +name: whoIsGame +utterances: + - who is the [role](president) of [country](France) + - who is the [role](CEO) of [company](Google) please + +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [weatherLocation:location](Paris)? 
+ - is it raining in [weatherLocation] [weatherDate:snips/datetime] + +# Location Entity +--- +type: entity +name: location +automatically_extensible: true +values: +- [new york, big apple] +- london + """ + + # pylint:disable=unused-argument + def mock_open(filename, **kwargs): + if filename == dataset_file: + return io.StringIO(dataset_yaml) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open + + # When + dataset = Dataset.from_yaml_files("en", [dataset_file]) + dataset_dict = dataset.json + + # Then + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) + + @fail_if_not_removed + def test_should_generate_dataset_from_files(self): + # Given + intent_file_1 = "intent_whoIsGame.txt" + intent_file_2 = "intent_getWeather.txt" + entity_file_1 = "entity_location.txt" + + who_is_game_txt = """ +who is the [role:role](president) of [country:country](France) +who is the [role:role](CEO) of [company:company](Google) please +""" + + get_weather_txt = """ +what is the weather in [weatherLocation:location](Paris)? +is it raining in [weatherLocation] [weatherDate:snips/datetime] +""" + + location_txt = """ +new york,big apple +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == intent_file_1: + return io.StringIO(who_is_game_txt) + if str(self_) == intent_file_2: + return io.StringIO(get_weather_txt) + if str(self_) == entity_file_1: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + + dataset_files = [intent_file_1, intent_file_2, entity_file_1] + + # When + with patch("pathlib.io") as mock_io: + mock_io.open.side_effect = mock_open + dataset = Dataset.from_files("en", dataset_files) + dataset_dict = dataset.json + + # When / Then + validate_and_format_dataset(dataset_dict) + self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict) diff --git a/snips_nlu/tests/test_dataset.py b/snips_nlu/tests/test_dataset_validation.py similarity index 98% rename from snips_nlu/tests/test_dataset.py rename to snips_nlu/tests/test_dataset_validation.py index 0f0298a08..1aa9f457d 100644 --- a/snips_nlu/tests/test_dataset.py +++ b/snips_nlu/tests/test_dataset_validation.py @@ -5,13 +5,12 @@ from mock import mock -from snips_nlu.constants import ( - ENTITIES, SNIPS_DATETIME) +from snips_nlu.constants import ENTITIES, SNIPS_DATETIME from snips_nlu.dataset import validate_and_format_dataset from snips_nlu.tests.utils import SnipsTest -class TestDataset(SnipsTest): +class TestDatasetValidation(SnipsTest): def test_missing_intent_key_should_raise_exception(self): # Given dataset = { @@ -155,7 +154,7 @@ def test_invalid_language_should_raise_exception(self): validate_and_format_dataset(dataset) self.assertEqual("Unknown language: 'eng'", str(ctx.exception.args[0])) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_format_dataset_by_adding_synonyms( self, mocked_get_string_variations): # Given @@ -208,7 +207,7 @@ def mock_get_string_variations(variation, language, # Then self.assertDictEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_format_dataset_by_adding_entity_values( self, mocked_get_string_variations): # Given @@ -321,7 +320,7 @@ def mock_get_string_variations(variation, language, # Then self.assertEqual(expected_dataset, 
dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_add_missing_reference_entity_values_when_not_use_synonyms( self, mocked_get_string_variations): # Given @@ -462,7 +461,7 @@ def test_should_not_require_data_for_builtin_entities(self): with self.fail_if_exception("Could not validate dataset"): validate_and_format_dataset(dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_remove_empty_entities_value_and_empty_synonyms( self, mocked_get_string_variations): # Given @@ -576,7 +575,7 @@ def mock_get_string_variations(variation, language, # Then self.assertEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_add_capitalize_field( self, mocked_get_string_variations): # Given @@ -752,7 +751,7 @@ def mock_get_string_variations(variation, language, # Then self.assertDictEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_normalize_synonyms( self, mocked_get_string_variations): # Given @@ -827,7 +826,7 @@ def mock_get_string_variations(variation, language, # Then self.assertDictEqual(expected_dataset, dataset) - @mock.patch("snips_nlu.dataset.get_string_variations") + @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_dataset_should_handle_synonyms( self, mocked_get_string_variations): # Given diff --git a/snips_nlu/tests/test_entity_loading.py b/snips_nlu/tests/test_entity_loading.py new file mode 100644 index 000000000..4e49c7553 --- /dev/null +++ b/snips_nlu/tests/test_entity_loading.py @@ -0,0 +1,246 @@ +from __future__ import unicode_literals + +import io +from unittest import TestCase + +import yaml +from deprecation import fail_if_not_removed +from mock import patch + +from snips_nlu.dataset import Entity, EntityFormatError + + +class TestEntityLoading(TestCase): + def test_from_yaml_file(self): + # Given + yaml_stream = io.StringIO(""" +# Location Entity +--- +type: entity +name: location +automatically_extensible: no +use_synonyms: yes +matching_strictness: 0.5 +values: +- [new york, big apple] +- [paris, city of lights] +- london + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + entity = Entity.from_yaml(yaml_dict) + entity_dict = entity.json + + # Then + expected_entity_dict = { + "automatically_extensible": False, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" + }, + { + "synonyms": [ + "city of lights" + ], + "value": "paris" + }, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True, + "matching_strictness": 0.5 + } + self.assertDictEqual(expected_entity_dict, entity_dict) + + def test_from_yaml_file_with_defaults(self): + # Given + yaml_stream = io.StringIO(""" +# Location Entity +--- +name: location +values: +- [new york, big apple] +- [paris, city of lights] +- london + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + entity = Entity.from_yaml(yaml_dict) + entity_dict = entity.json + + # Then + expected_entity_dict = { + "automatically_extensible": True, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" + }, + { + "synonyms": [ + "city of lights" + ], + "value": "paris" + }, + { + "synonyms": [], + "value": "london" + } + ], + 
"use_synonyms": True, + "matching_strictness": 1.0 + } + self.assertDictEqual(expected_entity_dict, entity_dict) + + def test_fail_from_yaml_file_when_wrong_type(self): + # Given + yaml_stream = io.StringIO(""" +# Location Entity +--- +type: intent +name: location +values: +- [new york, big apple] +- [paris, city of lights] +- london + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When / Then + with self.assertRaises(EntityFormatError): + Entity.from_yaml(yaml_dict) + + def test_fail_from_yaml_file_when_no_name(self): + # Given + yaml_stream = io.StringIO(""" +# Location Entity +--- +values: +- [new york, big apple] +- [paris, city of lights] +- london + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When / Then + with self.assertRaises(EntityFormatError): + Entity.from_yaml(yaml_dict) + + @patch("pathlib.io") + @fail_if_not_removed + def test_from_text_file(self, mock_io): + # Given + entity_file = "entity_location.txt" + location_txt = """ +new york,big apple +paris,city of lights +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == entity_file: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + mock_io.open.side_effect = mock_open + + # When + entity = Entity.from_file(entity_file) + entity_dict = entity.json + + # Then + expected_entity_dict = { + "automatically_extensible": True, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" + }, + { + "synonyms": [ + "city of lights" + ], + "value": "paris" + }, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True, + "matching_strictness": 1.0 + } + self.assertDictEqual(expected_entity_dict, entity_dict) + + @patch("pathlib.io") + @fail_if_not_removed + def test_from_file_with_autoextensible(self, mock_io): + # Given + entity_file = "entity_location.txt" + location_txt = """# automatically_extensible=false +new york,big apple +paris,city of lights +london + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == entity_file: + return io.StringIO(location_txt) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open + + # When + entity_dataset = Entity.from_file(entity_file) + entity_dict = entity_dataset.json + + # Then + expected_entity_dict = { + "automatically_extensible": False, + "data": [ + { + "synonyms": [ + "big apple" + ], + "value": "new york" + }, + { + "synonyms": [ + "city of lights" + ], + "value": "paris" + }, + { + "synonyms": [], + "value": "london" + } + ], + "use_synonyms": True, + "matching_strictness": 1.0 + } + self.assertDictEqual(expected_entity_dict, entity_dict) + + @fail_if_not_removed + def test_should_fail_generating_entity_with_wrong_file_name(self): + # Given + entity_file = "location.txt" + + # When / Then + with self.assertRaises(EntityFormatError): + Entity.from_file(entity_file) diff --git a/snips_nlu/tests/test_intent_loading.py b/snips_nlu/tests/test_intent_loading.py new file mode 100644 index 000000000..50332d18f --- /dev/null +++ b/snips_nlu/tests/test_intent_loading.py @@ -0,0 +1,298 @@ +from __future__ import unicode_literals + +import io +from unittest import TestCase + +import yaml +from deprecation import fail_if_not_removed +from mock import patch + +from snips_nlu.dataset import Intent, IntentFormatError + + +class TestIntentLoading(TestCase): + def test_should_load_from_yaml_file(self): + # Given + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent 
+name: getWeather +utterances: + - what is the weather in [weatherLocation:location](paris) ? + - "Will it rain [date:snips/datetime](tomorrow) in + [weatherLocation:location](london)?" + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": "paris", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": " ?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "text": "tomorrow", + "entity": "snips/datetime", + "slot_name": "date" + }, + { + "text": " in " + }, + { + "text": "london", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": "?" + } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + def test_should_load_from_yaml_file_using_slot_mapping(self): + # Given + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent +name: getWeather +slots: + - name: date + entity: snips/datetime + - name: weatherLocation + entity: location +utterances: + - what is the weather in [weatherLocation](paris) ? + - Will it rain [date] in [weatherLocation](london)? + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": "paris", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": " ?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "text": None, + "entity": "snips/datetime", + "slot_name": "date" + }, + { + "text": " in " + }, + { + "text": "london", + "entity": "location", + "slot_name": "weatherLocation" + }, + { + "text": "?" + } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + def test_should_load_from_yaml_file_using_implicit_values(self): + # Given + yaml_stream = io.StringIO(""" +# getWeather Intent +--- +type: intent +name: getWeather +utterances: + - what is the weather in [location] ? + """) + yaml_dict = yaml.safe_load(yaml_stream) + + # When + intent = Intent.from_yaml(yaml_dict) + intent_dict = intent.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "text": None, + "entity": "location", + "slot_name": "location" + }, + { + "text": " ?" + } + ] + } + ] + } + self.assertDictEqual(expected_intent_dict, intent_dict) + + @patch("pathlib.io") + @fail_if_not_removed + def test_should_generate_intent_from_text_file(self, mock_io): + # Given + intent_file = "intent_getWeather.txt" + get_weather_txt = """ +what is the weather in [weatherLocation:location](Paris)? +Will it rain [weatherDate:snips/datetime](tomorrow) in [weatherLocation](Moscow)? +How is the weather in [weatherLocation:location] [weatherDate] please? 
+is it raining in [weatherLocation] [weatherDate:snips/datetime] + """ + + # pylint:disable=unused-argument + def mock_open(self_, *args, **kwargs): + if str(self_) == intent_file: + return io.StringIO(get_weather_txt) + return None + + # pylint:enable=unused-argument + + mock_io.open.side_effect = mock_open + + # When + intent_dataset = Intent.from_file(intent_file) + intent_dict = intent_dataset.json + + # Then + expected_intent_dict = { + "utterances": [ + { + "data": [ + { + "text": "what is the weather in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "Paris" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "Will it rain " + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": "tomorrow" + }, + { + "text": " in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": "Moscow" + }, + { + "text": "?" + } + ] + }, + { + "data": [ + { + "text": "How is the weather in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": None + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": None + }, + { + "text": " please?" + } + ] + }, + { + "data": [ + { + "text": "is it raining in " + }, + { + "entity": "location", + "slot_name": "weatherLocation", + "text": None + }, + { + "entity": "snips/datetime", + "slot_name": "weatherDate", + "text": None + } + ] + } + ] + } + + self.assertDictEqual(expected_intent_dict, intent_dict) + + @fail_if_not_removed + def test_should_fail_generating_intent_with_wrong_file_name(self): + # Given + intent_file = "getWeather.txt" + + # When / Then + with self.assertRaises(IntentFormatError): + Intent.from_file(intent_file) diff --git a/snips_nlu/tests/test_log_reg_classifier_utils.py b/snips_nlu/tests/test_log_reg_classifier_utils.py new file mode 100644 index 000000000..589b79c30 --- /dev/null +++ b/snips_nlu/tests/test_log_reg_classifier_utils.py @@ -0,0 +1,527 @@ +# coding=utf-8 +from __future__ import unicode_literals + +from copy import deepcopy +from itertools import cycle + +import numpy as np +from future.utils import itervalues +from mock import MagicMock, patch + +from snips_nlu.constants import INTENTS, LANGUAGE_EN, UTTERANCES +from snips_nlu.dataset import validate_and_format_dataset +from snips_nlu.intent_classifier.log_reg_classifier_utils import ( + add_unknown_word_to_utterances, build_training_data, + generate_noise_utterances, generate_smart_noise, get_noise_it, + remove_builtin_slots, text_to_utterance, get_dataset_specific_noise) +from snips_nlu.pipeline.configs import ( + IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig) +from snips_nlu.tests.test_log_reg_intent_classifier import ( + get_mocked_augment_utterances) +from snips_nlu.tests.utils import (SAMPLE_DATASET, SnipsTest, + get_empty_dataset) + + +class TestLogRegClassifierUtils(SnipsTest): + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_no_stemming_no_noise( + self, mocked_augment_utterances): + # Given + dataset = validate_and_format_dataset(SAMPLE_DATASET) + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + random_state = np.random.RandomState(1) + + # When + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=0) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance for intent + in 
itervalues(dataset[INTENTS]) + for utterance in intent[UTTERANCES]] + expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2'] + self.assertListEqual(expected_utterances, utterances) + self.assertListEqual(expected_intent_mapping, intent_mapping) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_noise( + self, mocked_augment_utterances, mocked_get_noise): + # Given + mocked_noises = ["mocked_noise_%s" % i for i in range(100)] + mocked_get_noise.return_value = mocked_noises + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + + num_intents = 3 + utterances_length = 5 + num_queries_per_intent = 3 + fake_utterance = { + "data": [ + {"text": " ".join("1" for _ in range(utterances_length))} + ] + } + dataset = { + "intents": { + str(i): { + "utterances": [fake_utterance] * num_queries_per_intent + } for i in range(num_intents) + }, + "entities": {} + } + random_state = np.random.RandomState(1) + + # When + np.random.seed(42) + noise_factor = 2 + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor, unknown_word_prob=0, + unknown_words_replacement_string=None) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance + for intent in itervalues(dataset[INTENTS]) + for utterance in intent[UTTERANCES]] + np.random.seed(42) + noise = list(mocked_noises) + noise_size = int(min(noise_factor * num_queries_per_intent, + len(noise))) + noise_it = get_noise_it(mocked_noises, utterances_length, 0, + random_state) + noisy_utterances = [text_to_utterance(next(noise_it)) + for _ in range(noise_size)] + expected_utterances += noisy_utterances + expected_intent_mapping = sorted(dataset["intents"]) + expected_intent_mapping.append(None) + self.assertListEqual(expected_utterances, utterances) + self.assertListEqual(intent_mapping, expected_intent_mapping) + + def test_add_unknown_words_to_utterances(self): + # Given + base_utterances = { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + } + ] + } + utterances = [] + for _ in range(6): + utterances.append(deepcopy(base_utterances)) + + rand_it = cycle([0, 1]) + + def mocked_rand(): + return next(rand_it) + + max_unknown_words = 3 + rg_it = cycle([i for i in range(1, max_unknown_words + 1)]) + + def mocked_randint(a, b): # pylint: disable=unused-argument + return next(rg_it) + + unknownword_prob = .5 + + random_state = MagicMock() + random_state_rand = MagicMock() + random_state_rand.side_effect = mocked_rand + random_state_choice = MagicMock() + random_state_choice.side_effect = mocked_randint + + random_state.rand = random_state_rand + random_state.randint = random_state_choice + + # When + replacement_string = "unknownword" + noisy_utterances = add_unknown_word_to_utterances( + utterances, unknown_word_prob=unknownword_prob, + replacement_string=replacement_string, + max_unknown_words=max_unknown_words, + random_state=random_state + ) + + # Then + expected_utterances = [ + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword" + } + ] + }, + { + "data": [ + { + "text": "hello " + }, 
+ { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword unknownword" + } + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + { + "text": " unknownword unknownword unknownword" + } + + ] + }, + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "cat", + "entity": "cat" + }, + ] + } + ] + self.assertEqual(expected_utterances, noisy_utterances) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_generate_noise_utterances_should_replace_unknown_words( + self, mocked_noise): + # Given + utterances = [ + { + "data": [ + { + "text": "hello " + }, + { + "text": " you ", + "entity": "you" + }, + { + "text": " how are you " + }, + { + "text": "bobby", + "entity": "you" + } + ] + } + ] + language = LANGUAGE_EN + base_noise = ["hello", "dear", "you", "fool"] + mocked_noise.return_value = base_noise + replacement_string = "unknownword" + + # When + noise = generate_smart_noise( + base_noise, utterances, replacement_string, language) + + # Then + expected_noise = ["hello", replacement_string, "you", + replacement_string] + self.assertEqual(noise, expected_noise) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" + ".augment_utterances") + def test_should_build_training_data_with_unknown_noise( + self, mocked_augment_utterances, mocked_get_noise): + # Given + mocked_noises = ["mocked_noise_%s" % i for i in range(100)] + mocked_get_noise.return_value = mocked_noises + mocked_augment_utterances.side_effect = get_mocked_augment_utterances + + num_intents = 3 + utterances_length = 5 + num_queries_per_intent = 3 + fake_utterance = { + "data": [ + {"text": " ".join("1" for _ in range(utterances_length))} + ] + } + dataset = { + "intents": { + str(i): { + "utterances": [fake_utterance] * num_queries_per_intent + } for i in range(num_intents) + }, + "entities": {} + } + random_state = np.random.RandomState(1) + + # When + np.random.seed(42) + noise_factor = 2 + replacement_string = "unknownword" + data_augmentation_config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor, unknown_word_prob=0, + unknown_words_replacement_string=replacement_string) + utterances, _, intent_mapping = build_training_data( + dataset, LANGUAGE_EN, data_augmentation_config, random_state) + + # Then + expected_utterances = [utterance + for intent in itervalues(dataset[INTENTS]) + for utterance in intent[UTTERANCES]] + np.random.seed(42) + noise = list(mocked_noises) + noise_size = int(min(noise_factor * num_queries_per_intent, + len(noise))) + noisy_utterances = [text_to_utterance(replacement_string) + for _ in range(noise_size)] + expected_utterances += noisy_utterances + expected_intent_mapping = sorted(dataset["intents"]) + expected_intent_mapping.append(None) + self.assertListEqual(expected_utterances, utterances) + 
self.assertListEqual(expected_intent_mapping, intent_mapping) + + def test_should_build_training_data_with_no_data(self): + # Given + language = LANGUAGE_EN + dataset = validate_and_format_dataset(get_empty_dataset(language)) + random_state = np.random.RandomState(1) + + # When + data_augmentation_config = LogRegIntentClassifierConfig() \ + .data_augmentation_config + utterances, _, intent_mapping = build_training_data( + dataset, language, data_augmentation_config, random_state) + + # Then + expected_utterances = [] + expected_intent_mapping = [] + self.assertListEqual(utterances, expected_utterances) + self.assertListEqual(intent_mapping, expected_intent_mapping) + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_generate_noise_utterances(self, mocked_get_noise): + # Given + language = LANGUAGE_EN + num_intents = 2 + noise_factor = 1 + utterances_length = 5 + + noise = [str(i) for i in range(utterances_length)] + mocked_get_noise.return_value = noise + + augmented_utterances = [ + { + "data": [ + { + "text": " ".join( + "{}".format(i) for i in range(utterances_length)) + } + ] + } + ] + num_utterances = 10 + random_state = np.random.RandomState(1) + + augmented_utterances = augmented_utterances * num_utterances + config = IntentClassifierDataAugmentationConfig( + noise_factor=noise_factor) + # When + noise_utterances = generate_noise_utterances( + augmented_utterances, noise, num_intents, config, language, + random_state) + + # Then + joined_noise = text_to_utterance(" ".join(noise)) + for u in noise_utterances: + self.assertEqual(u, joined_noise) + + def test_remove_builtin_slots(self): + # Given + language = LANGUAGE_EN + dataset = { + "entities": { + "snips/number": {} + }, + "intents": { + "dummy_intent_1": { + "utterances": [ + { + "data": [ + { + "text": "I want ", + }, + { + "text": "three", + "slot_name": "number_of_cups", + "entity": "snips/number" + }, + { + "text": " cups", + }, + ] + }, + { + "data": [ + { + "text": "give me ", + }, + { + "text": "twenty two", + "slot_name": "number_of_cups", + "entity": "snips/number" + }, + { + "text": " big cups please", + }, + ] + } + ] + } + }, + "language": language + } + + # When + filtered_dataset = remove_builtin_slots(dataset) + + # Then + expected_dataset = { + "entities": { + "snips/number": {} + }, + "intents": { + "dummy_intent_1": { + "utterances": [ + { + "data": [ + { + "text": "I want ", + }, + { + "text": " cups", + }, + ] + }, + { + "data": [ + { + "text": "give me ", + }, + { + "text": " big cups please", + }, + ] + } + ] + } + }, + "language": language + } + + self.assertDictEqual(expected_dataset, filtered_dataset) + + + @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") + def test_get_dataset_specific_noise(self, mocked_noise): + # Given + dataset = validate_and_format_dataset(SAMPLE_DATASET) + language = "en" + mocked_noise.return_value = ["dummy_a", "yo"] + + + # When + noise = get_dataset_specific_noise(dataset, language) + + # Then + self.assertEqual(["yo"], noise) diff --git a/snips_nlu/tests/test_log_reg_intent_classifier.py b/snips_nlu/tests/test_log_reg_intent_classifier.py index 64ee92a55..1c17c423b 100644 --- a/snips_nlu/tests/test_log_reg_intent_classifier.py +++ b/snips_nlu/tests/test_log_reg_intent_classifier.py @@ -1,26 +1,20 @@ # coding=utf-8 from __future__ import unicode_literals -from builtins import next, range, str - -import numpy as np -from future.utils import itervalues from mock import patch from snips_nlu.constants import ( INTENTS, 
LANGUAGE_EN, RES_INTENT_NAME, UTTERANCES) from snips_nlu.dataset import validate_and_format_dataset from snips_nlu.entity_parser import BuiltinEntityParser, CustomEntityParser -from snips_nlu.entity_parser.custom_entity_parser_usage import \ - CustomEntityParserUsage +from snips_nlu.entity_parser.custom_entity_parser_usage import ( + CustomEntityParserUsage) from snips_nlu.intent_classifier import LogRegIntentClassifier from snips_nlu.intent_classifier.featurizer import Featurizer from snips_nlu.intent_classifier.log_reg_classifier_utils import ( - add_unknown_word_to_utterances, build_training_data, - generate_noise_utterances, generate_smart_noise, get_noise_it, - remove_builtin_slots, text_to_utterance) + text_to_utterance) from snips_nlu.pipeline.configs import ( - IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig) + LogRegIntentClassifierConfig) from snips_nlu.tests.utils import ( BEVERAGE_DATASET, FixtureTest, SAMPLE_DATASET, get_empty_dataset) from snips_nlu.utils import NotTrained @@ -264,400 +258,3 @@ def test_empty_vocabulary_should_fit_and_return_none_intent( intent_classifier = LogRegIntentClassifier().fit(dataset) intent = intent_classifier.get_intent("no intent there") self.assertEqual(None, intent) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_no_stemming_no_noise( - self, mocked_augment_utterances): - # Given - dataset = SAMPLE_DATASET - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - random_state = np.random.RandomState(1) - - # When - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=0) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance for intent - in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2'] - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(expected_intent_mapping, intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_noise( - self, mocked_augment_utterances, mocked_get_noise): - # Given - mocked_noises = ["mocked_noise_%s" % i for i in range(100)] - mocked_get_noise.return_value = mocked_noises - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - - num_intents = 3 - utterances_length = 5 - num_queries_per_intent = 3 - fake_utterance = { - "data": [ - {"text": " ".join("1" for _ in range(utterances_length))} - ] - } - dataset = { - "intents": { - str(i): { - "utterances": [fake_utterance] * num_queries_per_intent - } for i in range(num_intents) - } - } - random_state = np.random.RandomState(1) - - # When - np.random.seed(42) - noise_factor = 2 - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor, unknown_word_prob=0, - unknown_words_replacement_string=None) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance - for intent in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - np.random.seed(42) - noise = list(mocked_noises) - noise_size = int(min(noise_factor * num_queries_per_intent, - len(noise))) - noise_it = 
get_noise_it(mocked_noises, utterances_length, 0, - random_state) - noisy_utterances = [text_to_utterance(next(noise_it)) - for _ in range(noise_size)] - expected_utterances += noisy_utterances - expected_intent_mapping = sorted(dataset["intents"]) - expected_intent_mapping.append(None) - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(intent_mapping, expected_intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils" - ".augment_utterances") - def test_should_build_training_data_with_unknown_noise( - self, mocked_augment_utterances, mocked_get_noise): - # Given - mocked_noises = ["mocked_noise_%s" % i for i in range(100)] - mocked_get_noise.return_value = mocked_noises - mocked_augment_utterances.side_effect = get_mocked_augment_utterances - - num_intents = 3 - utterances_length = 5 - num_queries_per_intent = 3 - fake_utterance = { - "data": [ - {"text": " ".join("1" for _ in range(utterances_length))} - ] - } - dataset = { - "intents": { - str(i): { - "utterances": [fake_utterance] * num_queries_per_intent - } for i in range(num_intents) - } - } - random_state = np.random.RandomState(1) - - # When - np.random.seed(42) - noise_factor = 2 - replacement_string = "unknownword" - data_augmentation_config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor, unknown_word_prob=0, - unknown_words_replacement_string=replacement_string) - utterances, _, intent_mapping = build_training_data( - dataset, LANGUAGE_EN, data_augmentation_config, random_state) - - # Then - expected_utterances = [utterance - for intent in itervalues(dataset[INTENTS]) - for utterance in intent[UTTERANCES]] - np.random.seed(42) - noise = list(mocked_noises) - noise_size = int(min(noise_factor * num_queries_per_intent, - len(noise))) - noisy_utterances = [text_to_utterance(replacement_string) - for _ in range(noise_size)] - expected_utterances += noisy_utterances - expected_intent_mapping = sorted(dataset["intents"]) - expected_intent_mapping.append(None) - self.assertListEqual(expected_utterances, utterances) - self.assertListEqual(expected_intent_mapping, intent_mapping) - - def test_should_build_training_data_with_no_data(self): - # Given - language = LANGUAGE_EN - dataset = validate_and_format_dataset(get_empty_dataset(language)) - random_state = np.random.RandomState(1) - - # When - data_augmentation_config = LogRegIntentClassifierConfig() \ - .data_augmentation_config - utterances, _, intent_mapping = build_training_data( - dataset, language, data_augmentation_config, random_state) - - # Then - expected_utterances = [] - expected_intent_mapping = [] - self.assertListEqual(utterances, expected_utterances) - self.assertListEqual(intent_mapping, expected_intent_mapping) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - def test_generate_noise_utterances(self, mocked_get_noise): - # Given - language = LANGUAGE_EN - num_intents = 2 - noise_factor = 1 - utterances_length = 5 - - noise = [str(i) for i in range(utterances_length)] - mocked_get_noise.return_value = noise - - augmented_utterances = [ - { - "data": [ - { - "text": " ".join( - "{}".format(i) for i in range(utterances_length)) - } - ] - } - ] - num_utterances = 10 - random_state = np.random.RandomState(1) - - augmented_utterances = augmented_utterances * num_utterances - config = IntentClassifierDataAugmentationConfig( - noise_factor=noise_factor) - # When - noise_utterances = 
generate_noise_utterances( - augmented_utterances, num_intents, config, language, random_state) - - # Then - joined_noise = text_to_utterance(" ".join(noise)) - for u in noise_utterances: - self.assertEqual(u, joined_noise) - - def test_add_unknown_words_to_utterances(self): - # Given - utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " you ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "dude", - "entity": "you" - } - ] - }, - { - "data": [ - { - "text": "hello " - }, - { - "text": "dude", - "entity": "you" - }, - { - "text": " how are you " - - }, - { - "text": " you ", - "entity": "you" - } - ] - } - ] - unknownword_prob = .5 - random_state = np.random.RandomState(1) - - # When - replacement_string = "unknownword" - noisy_utterances = add_unknown_word_to_utterances( - utterances, unknown_word_prob=unknownword_prob, - replacement_string=replacement_string, random_state=random_state - ) - - # Then - expected_utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " unknownword ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "dude", - "entity": "you" - } - ] - }, - { - "data": [ - { - "text": "hello " - }, - { - "text": "unknownword", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": " unknownword ", - "entity": "you" - } - ] - } - ] - self.assertEqual(expected_utterances, noisy_utterances) - - @patch("snips_nlu.intent_classifier.log_reg_classifier_utils.get_noise") - def test_generate_noise_utterances_should_replace_unknown_words( - self, mocked_noise): - # Given - utterances = [ - { - "data": [ - { - "text": "hello " - }, - { - "text": " you ", - "entity": "you" - }, - { - "text": " how are you " - }, - { - "text": "bobby", - "entity": "you" - } - ] - } - ] - language = LANGUAGE_EN - mocked_noise.return_value = ["hello", "dear", "you", "fool"] - replacement_string = "unknownword" - - # When - noise = generate_smart_noise(utterances, replacement_string, language) - - # Then - expected_noise = ["hello", replacement_string, "you", - replacement_string] - self.assertEqual(noise, expected_noise) - - def test_remove_builtin_slots(self): - # Given - language = LANGUAGE_EN - dataset = { - "entities": { - "snips/number": {} - }, - "intents": { - "dummy_intent_1": { - "utterances": [ - { - "data": [ - { - "text": "I want ", - }, - { - "text": "three", - "slot_name": "number_of_cups", - "entity": "snips/number" - }, - { - "text": " cups", - }, - ] - }, - { - "data": [ - { - "text": "give me ", - }, - { - "text": "twenty two", - "slot_name": "number_of_cups", - "entity": "snips/number" - }, - { - "text": " big cups please", - }, - ] - } - ] - } - }, - "language": language - } - - # When - filtered_dataset = remove_builtin_slots(dataset) - - # Then - expected_dataset = { - "entities": { - "snips/number": {} - }, - "intents": { - "dummy_intent_1": { - "utterances": [ - { - "data": [ - { - "text": "I want ", - }, - { - "text": " cups", - }, - ] - }, - { - "data": [ - { - "text": "give me ", - }, - { - "text": " big cups please", - }, - ] - } - ] - } - }, - "language": language - } - - self.assertDictEqual(expected_dataset, filtered_dataset) diff --git a/snips_nlu/tests/utils.py b/snips_nlu/tests/utils.py index d057ce4e1..7ea7c219f 100644 --- a/snips_nlu/tests/utils.py +++ b/snips_nlu/tests/utils.py @@ -14,10 +14,11 @@ from snips_nlu.utils import json_string, unicode_string TEST_PATH = Path(__file__).parent -SAMPLE_DATASET_PATH = TEST_PATH / "resources" / "sample_dataset.json" 
-BEVERAGE_DATASET_PATH = TEST_PATH / "resources" / "beverage_dataset.json" -WEATHER_DATASET_PATH = TEST_PATH / "resources" / "weather_dataset.json" -PERFORMANCE_DATASET_PATH = TEST_PATH / "resources" / "performance_dataset.json" +TEST_RESOURCES_PATH = TEST_PATH / "resources" +SAMPLE_DATASET_PATH = TEST_RESOURCES_PATH / "sample_dataset.json" +BEVERAGE_DATASET_PATH = TEST_RESOURCES_PATH / "beverage_dataset.json" +WEATHER_DATASET_PATH = TEST_RESOURCES_PATH / "weather_dataset.json" +PERFORMANCE_DATASET_PATH = TEST_RESOURCES_PATH / "performance_dataset.json" # pylint: disable=invalid-name diff --git a/snips_nlu/utils.py b/snips_nlu/utils.py index 198722b7a..b12adfdbb 100644 --- a/snips_nlu/utils.py +++ b/snips_nlu/utils.py @@ -7,7 +7,7 @@ import os import shutil from builtins import bytes, object, str -from collections import Mapping, OrderedDict, namedtuple +from collections import OrderedDict from contextlib import contextmanager from datetime import datetime from functools import wraps @@ -24,6 +24,7 @@ REGEX_PUNCT = {'\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '{', '}', '^', '$', '#', '&', '-', '~'} + # pylint: disable=invalid-name class abstractclassmethod(classmethod): @@ -97,12 +98,6 @@ def validate_keys(obj, keys, object_label=None): validate_key(obj, key, object_label) -def validate_range(rng): - if not isinstance(rng, (list, tuple)) or len(rng) != 2 or rng[0] > rng[1]: - raise ValueError("range must be a length 2 list or tuple and must be " - "valid") - - class LimitedSizeDict(OrderedDict): def __init__(self, *args, **kwds): if "size_limit" not in kwds: @@ -138,17 +133,6 @@ def __setitem__(self, key, value): super(UnupdatableDict, self).__setitem__(key, value) -def namedtuple_with_defaults(typename, field_names, default_values=()): - T = namedtuple(typename, field_names) # pylint: disable=C0103 - T.__new__.__defaults__ = (None,) * len(T._fields) - if isinstance(default_values, Mapping): - prototype = T(**default_values) - else: - prototype = T(*default_values) - T.__new__.__defaults__ = tuple(prototype) - return T - - def mkdir_p(path): """Reproduces the 'mkdir -p shell' command
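
A minimal usage sketch, placed after the patch for readability, of the data-augmentation helpers that the tests above exercise. It is not part of the diff: the toy dataset, the intent name "greet", the RandomState seed and the "unknownword" placeholder are illustrative, and only the call signatures visible in the tests (build_training_data and IntentClassifierDataAugmentationConfig) are assumed. Running it also assumes the English language resources have been downloaded and that load_resources is available at the package root, as in the library's usage examples of this era.

    import numpy as np

    from snips_nlu import load_resources  # assumed top-level helper
    from snips_nlu.constants import LANGUAGE_EN
    from snips_nlu.intent_classifier.log_reg_classifier_utils import (
        build_training_data)
    from snips_nlu.pipeline.configs import (
        IntentClassifierDataAugmentationConfig)

    # Language resources are needed for real (non-mocked) noise generation.
    load_resources("en")

    # Toy dataset in the same shape the tests build by hand: a mapping of
    # intents to utterances, plus an (empty) entities mapping.
    dataset = {
        "intents": {
            "greet": {
                "utterances": [
                    {"data": [{"text": "hello there"}]},
                    {"data": [{"text": "hi how are you"}]},
                ]
            }
        },
        "entities": {}
    }

    # noise_factor controls how many noise utterances are generated per
    # intent; when unknown_words_replacement_string is set, noise words are
    # replaced by that placeholder, as asserted in
    # test_should_build_training_data_with_unknown_noise.
    config = IntentClassifierDataAugmentationConfig(
        noise_factor=2,
        unknown_word_prob=0,
        unknown_words_replacement_string="unknownword")

    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, config, np.random.RandomState(42))

    # intent_mapping lists the intent names in sorted order followed by None
    # for the noise class, matching the expectations checked in the tests.
    print(intent_mapping)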