From 8437e3947e9f0c84034a2af679cb0217b1431704 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 12 Feb 2019 10:17:30 +0100 Subject: [PATCH 01/14] Allow to fetch resources based only on major and minor --- snips_nlu/cli/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/snips_nlu/cli/utils.py b/snips_nlu/cli/utils.py index 8a99f51d3..7c26b5933 100644 --- a/snips_nlu/cli/utils.py +++ b/snips_nlu/cli/utils.py @@ -7,6 +7,7 @@ from enum import Enum, unique import requests +from semantic_version import Version import snips_nlu from snips_nlu import __about__ @@ -71,13 +72,16 @@ def get_json(url, desc): def get_compatibility(): version = __about__.__version__ + semver_version = Version(version) + minor_version = "%d.%d" % (semver_version.major, semver_version.minor) table = get_json(__about__.__compatibility__, "Compatibility table") - compatibility = table["snips-nlu"] - if version not in compatibility: + nlu_table = table["snips-nlu"] + compatibility = nlu_table.get(version, nlu_table.get(minor_version)) + if compatibility is None: pretty_print("No compatible resources found for version %s" % version, title="Resources compatibility error", exits=1, level=PrettyPrintLevel.ERROR) - return compatibility[version] + return compatibility def get_resources_version(resource_fullname, resource_alias, compatibility): From f502431b432381e6fb3021cf3d9fb5dded88842e Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 13 Feb 2019 15:37:19 +0100 Subject: [PATCH 02/14] Fix a bug which was mutating the CRFSlotFillerConfig --- snips_nlu/slot_filler/crf_slot_filler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/snips_nlu/slot_filler/crf_slot_filler.py b/snips_nlu/slot_filler/crf_slot_filler.py index ff9b0a4c6..3b21e6821 100644 --- a/snips_nlu/slot_filler/crf_slot_filler.py +++ b/snips_nlu/slot_filler/crf_slot_filler.py @@ -7,6 +7,7 @@ import shutil import tempfile from builtins import range +from copy import deepcopy from pathlib import Path from future.utils import iteritems @@ -48,6 +49,9 @@ class CRFSlotFiller(SlotFiller): def __init__(self, config=None, **shared): """The CRF slot filler can be configured by passing a :class:`.CRFSlotFillerConfig`""" + # The CRFSlotFillerConfig must be deep-copied as it is mutated when + # fitting the feature factories + config = deepcopy(config) super(CRFSlotFiller, self).__init__(config, **shared) self.crf_model = None self.features_factories = [ From 48e97b4ca4dbc72549375fb96930507708cdb22b Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Thu, 14 Feb 2019 11:00:39 +0100 Subject: [PATCH 03/14] Allow matching_strictness property to be int --- snips_nlu/dataset/validation.py | 2 +- snips_nlu/tests/test_dataset_validation.py | 29 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/snips_nlu/dataset/validation.py b/snips_nlu/dataset/validation.py index 61babeec7..179763ce0 100644 --- a/snips_nlu/dataset/validation.py +++ b/snips_nlu/dataset/validation.py @@ -128,7 +128,7 @@ def _validate_and_format_custom_entity(entity, queries_entities, language, validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool, object_label="automatically_extensible") validate_type(entity[DATA], list, object_label="entity data") - validate_type(entity[MATCHING_STRICTNESS], float, + validate_type(entity[MATCHING_STRICTNESS], (float, int), object_label="matching_strictness") formatted_entity = dict() diff --git a/snips_nlu/tests/test_dataset_validation.py b/snips_nlu/tests/test_dataset_validation.py index 083f2f1e3..ee4981b39 
100644 --- a/snips_nlu/tests/test_dataset_validation.py +++ b/snips_nlu/tests/test_dataset_validation.py @@ -93,6 +93,35 @@ def test_missing_entity_key_should_raise_exception(self): self.assertEqual("Expected custom entity to have key: 'use_synonyms'", str(ctx.exception.args[0])) + def test_should_support_int_or_float_for_matching_strictness(self): + # Given + dataset = { + "intents": {}, + "entities": { + "entity1": { + "data": [], + "automatically_extensible": False, + "use_synonyms": True, + "matching_strictness": 0.5 + }, + "entity2": { + "data": [], + "automatically_extensible": False, + "use_synonyms": True, + "matching_strictness": 1 + } + }, + "language": "en", + } + + # When/Then + dataset = validate_and_format_dataset(dataset) + + self.assertEqual( + 0.5, dataset["entities"]["entity1"].get("matching_strictness")) + self.assertEqual( + 1, dataset["entities"]["entity2"].get("matching_strictness")) + def test_missing_matching_strictness_should_be_handled(self): # TODO: This test is temporary, and must be removed once the backward # compatibility with the previous dataset format, without From 6c4fcea4e014c7bb6c3d8b9b8602cdeb1bbbca30 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Mon, 18 Feb 2019 17:40:12 +0100 Subject: [PATCH 04/14] Fix issue with resources required by the deterministic intent parser --- snips_nlu/intent_parser/deterministic_intent_parser.py | 1 + snips_nlu/pipeline/configs/intent_parser.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/snips_nlu/intent_parser/deterministic_intent_parser.py b/snips_nlu/intent_parser/deterministic_intent_parser.py index 1f3e404a0..4c442ec58 100644 --- a/snips_nlu/intent_parser/deterministic_intent_parser.py +++ b/snips_nlu/intent_parser/deterministic_intent_parser.py @@ -127,6 +127,7 @@ def fit(self, dataset, force_retrain=True): """Fits the intent parser with a valid Snips dataset""" logger.info("Fitting deterministic parser...") dataset = validate_and_format_dataset(dataset) + self.load_resources_if_needed(dataset[LANGUAGE]) self.fit_builtin_entity_parser_if_needed(dataset) self.fit_custom_entity_parser_if_needed(dataset) self.language = dataset[LANGUAGE] diff --git a/snips_nlu/pipeline/configs/intent_parser.py b/snips_nlu/pipeline/configs/intent_parser.py index 9a4fea7d2..bdc56e083 100644 --- a/snips_nlu/pipeline/configs/intent_parser.py +++ b/snips_nlu/pipeline/configs/intent_parser.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from snips_nlu.common.from_dict import FromDict -from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE +from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE, STOP_WORDS from snips_nlu.entity_parser import CustomEntityParserUsage from snips_nlu.pipeline.configs import ProcessingUnitConfig from snips_nlu.resources import merge_required_resources @@ -84,7 +84,8 @@ def unit_name(self): def get_required_resources(self): return { - CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS + CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS, + STOP_WORDS: self.ignore_stop_words } def to_dict(self): From 812be2d19de9f1a3b54b9664b84e5da529248cba Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Fri, 1 Mar 2019 14:15:24 +0100 Subject: [PATCH 05/14] Fix issue with group names in deterministic parser --- .../deterministic_intent_parser.py | 2 +- .../tests/test_deterministic_intent_parser.py | 41 ++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/snips_nlu/intent_parser/deterministic_intent_parser.py 
b/snips_nlu/intent_parser/deterministic_intent_parser.py index 4c442ec58..63238b9e6 100644 --- a/snips_nlu/intent_parser/deterministic_intent_parser.py +++ b/snips_nlu/intent_parser/deterministic_intent_parser.py @@ -314,7 +314,7 @@ def _get_matching_result(self, text, processed_text, regex, intent, for group_name in found_result.groupdict(): ref_group_name = group_name if "_" in group_name: - ref_group_name = group_name[:(len(group_name) - 2)] + ref_group_name = group_name.split("_")[0] slot_name = self.group_names_to_slot_names[ref_group_name] entity = self.slot_names_to_entities[intent][slot_name] rng = (found_result.start(group_name), diff --git a/snips_nlu/tests/test_deterministic_intent_parser.py b/snips_nlu/tests/test_deterministic_intent_parser.py index 5af116d95..094b74e8c 100644 --- a/snips_nlu/tests/test_deterministic_intent_parser.py +++ b/snips_nlu/tests/test_deterministic_intent_parser.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import io - from builtins import range from mock import patch @@ -162,6 +161,46 @@ def test_should_parse_intent_with_stop_words(self, mock_get_stop_words): self.assertEqual(expected_intent, parsing[RES_INTENT]) + def test_should_parse_intent_with_duplicated_slot_names(self): + # Given + slots_dataset_stream = io.StringIO(""" +--- +type: intent +name: math_operation +slots: + - name: number + entity: snips/number +utterances: + - what is [number](one) plus [number](one)""") + dataset = Dataset.from_yaml_files("en", [slots_dataset_stream]).json + parser = DeterministicIntentParser().fit(dataset) + text = "what is one plus one" + + # When + parsing = parser.parse(text) + + # Then + probability = 1.0 + expected_intent = intent_classification_result( + intent_name="math_operation", probability=probability) + expected_slots = [ + { + "entity": "snips/number", + "range": {"end": 11, "start": 8}, + "slotName": "number", + "value": "one" + }, + { + "entity": "snips/number", + "range": {"end": 20, "start": 17}, + "slotName": "number", + "value": "one" + } + ] + + self.assertDictEqual(expected_intent, parsing[RES_INTENT]) + self.assertListEqual(expected_slots, parsing[RES_SLOTS]) + def test_should_ignore_ambiguous_utterances(self): # Given dataset_stream = io.StringIO(""" From 7ff270b67f0a8f0a44139ed392753e3febc6ddd7 Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Mon, 4 Mar 2019 14:00:17 +0100 Subject: [PATCH 06/14] Fix inference CLI --- snips_nlu/cli/inference.py | 3 +++ snips_nlu/common/log_utils.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/snips_nlu/cli/inference.py b/snips_nlu/cli/inference.py index 004526671..867fa9605 100644 --- a/snips_nlu/cli/inference.py +++ b/snips_nlu/cli/inference.py @@ -18,6 +18,7 @@ ) def parse(training_path, query, verbose=False): """Load a trained NLU engine and play with its parsing API interactively""" + from builtins import str if verbose: set_nlu_logger(logging.DEBUG) @@ -29,6 +30,8 @@ def parse(training_path, query, verbose=False): while True: query = input("Enter a query (type 'q' to quit): ").strip() + if not isinstance(query, str): + query = query.decode("utf-8") if query == "q": break print_parsing_result(engine, query) diff --git a/snips_nlu/common/log_utils.py b/snips_nlu/common/log_utils.py index 06d27ca18..6d29a9038 100644 --- a/snips_nlu/common/log_utils.py +++ b/snips_nlu/common/log_utils.py @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + +from builtins import str from datetime import datetime from functools import wraps From 5907ce254d4fccde5e2fb9cce39988701ccf0007 Mon Sep 17 
00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 12:24:16 +0100 Subject: [PATCH 07/14] Fix issue with intent classifier when preprocessing data --- snips_nlu/intent_classifier/featurizer.py | 106 ++--- .../test_intent_classifier_featurizer.py | 434 +----------------- 2 files changed, 55 insertions(+), 485 deletions(-) diff --git a/snips_nlu/intent_classifier/featurizer.py b/snips_nlu/intent_classifier/featurizer.py index a2c7a2e5f..b5458d0d8 100644 --- a/snips_nlu/intent_classifier/featurizer.py +++ b/snips_nlu/intent_classifier/featurizer.py @@ -17,8 +17,7 @@ json_string, fitted_required, replace_entities_with_placeholders, check_persisted_path) from snips_nlu.constants import ( - DATA, END, ENTITY, ENTITY_KIND, LANGUAGE, NGRAM, RES_MATCH_RANGE, - RES_VALUE, START, TEXT, ENTITIES) + DATA, ENTITY, ENTITY_KIND, LANGUAGE, NGRAM, TEXT, ENTITIES) from snips_nlu.dataset import get_text_from_chunks, validate_and_format_dataset from snips_nlu.entity_parser.builtin_entity_parser import ( is_builtin_entity) @@ -264,7 +263,7 @@ def fit(self, x, dataset): self._init_vectorizer(self._language) self.builtin_entity_scope = set( e for e in dataset[ENTITIES] if is_builtin_entity(e)) - preprocessed_data = self._preprocess(x, training=True) + preprocessed_data = self._preprocess(x) utterances = [ self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters) for u, builtin_ents, custom_ents, w_clusters @@ -296,7 +295,7 @@ def fit_transform(self, x, dataset): self._init_vectorizer(self._language) self.builtin_entity_scope = set( e for e in dataset[ENTITIES] if is_builtin_entity(e)) - preprocessed_data = self._preprocess(x, training=True) + preprocessed_data = self._preprocess(x) utterances = [ self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters) for u, builtin_ents, custom_ents, w_clusters @@ -330,31 +329,30 @@ def transform(self, x): for data in zip(*self._preprocess(x))] return self._tfidf_vectorizer.transform(utterances) - def _preprocess(self, utterances, training=False): + def _preprocess(self, utterances): normalized_utterances = deepcopy(utterances) for u in normalized_utterances: - for chunk in u[DATA]: + nb_chunks = len(u[DATA]) + for i, chunk in enumerate(u[DATA]): chunk[TEXT] = _normalize_stem( chunk[TEXT], self.language, self.resources, self.config.use_stemming) - - if training: - builtin_ents, custom_ents = zip( - *[_entities_from_utterance(u) for u in utterances]) - else: - # Extract builtin entities on unormalized utterances - builtin_ents = [ - self.builtin_entity_parser.parse( - get_text_from_chunks(u[DATA]), - self.builtin_entity_scope, use_cache=True) - for u in utterances - ] - # Extract builtin entities on normalized utterances - custom_ents = [ - self.custom_entity_parser.parse( - get_text_from_chunks(u[DATA]), use_cache=True) - for u in normalized_utterances - ] + if i < nb_chunks - 1: + chunk[TEXT] += " " + + # Extract builtin entities on unormalized utterances + builtin_ents = [ + self.builtin_entity_parser.parse( + get_text_from_chunks(u[DATA]), + self.builtin_entity_scope, use_cache=True) + for u in utterances + ] + # Extract builtin entities on normalized utterances + custom_ents = [ + self.custom_entity_parser.parse( + get_text_from_chunks(u[DATA]), use_cache=True) + for u in normalized_utterances + ] if self.config.word_clusters_name: # Extract world clusters on unormalized utterances original_utterances_text = [get_text_from_chunks(u[DATA]) @@ -582,7 +580,7 @@ def fit(self, x, dataset): self.builtin_entity_scope = set( e for e in dataset[ENTITIES] if 
is_builtin_entity(e)) - preprocessed = self._preprocess(list(x), training=True) + preprocessed = self._preprocess(list(x)) utterances = [ self._enrich_utterance(utterance, builtin_ents, custom_ent) for utterance, builtin_ents, custom_ent in zip(*preprocessed)] @@ -648,7 +646,7 @@ def transform(self, x): Raises: NotTrained: when the vectorizer is not fitted """ - preprocessed = self._preprocess(x, training=False) + preprocessed = self._preprocess(x) utterances = [ self._enrich_utterance(utterance, builtin_ents, custom_ent) for utterance, builtin_ents, custom_ent in zip(*preprocessed)] @@ -661,24 +659,20 @@ def transform(self, x): return x_coo.tocsr() - def _preprocess(self, x, training=False): - if training: - builtin_ents, custom_ents = zip( - *[_entities_from_utterance(u) for u in x]) - else: - # Extract all entities on unnormalized data - builtin_ents = [ - self.builtin_entity_parser.parse( - get_text_from_chunks(u[DATA]), - self.builtin_entity_scope, - use_cache=True - ) for u in x - ] - custom_ents = [ - self.custom_entity_parser.parse( - get_text_from_chunks(u[DATA]), use_cache=True) - for u in x - ] + def _preprocess(self, x): + # Extract all entities on unnormalized data + builtin_ents = [ + self.builtin_entity_parser.parse( + get_text_from_chunks(u[DATA]), + self.builtin_entity_scope, + use_cache=True + ) for u in x + ] + custom_ents = [ + self.custom_entity_parser.parse( + get_text_from_chunks(u[DATA]), use_cache=True) + for u in x + ] return x, builtin_ents, custom_ents def _extract_word_pairs(self, utterance): @@ -805,27 +799,3 @@ def _get_word_cluster_features(query_tokens, clusters_name, resources): if cluster is not None: cluster_features.append(cluster) return cluster_features - - -def _entities_from_utterance(utterance): - builtin_ents = [] - custom_ents = [] - current_ix = 0 - for chunk in utterance[DATA]: - text = chunk[TEXT] - text_length = len(text) - if ENTITY in chunk: - ent = { - ENTITY_KIND: chunk[ENTITY], - RES_VALUE: text, - RES_MATCH_RANGE: { - START: current_ix, - END: current_ix + text_length - } - } - if is_builtin_entity(ent[ENTITY_KIND]): - builtin_ents.append(ent) - else: - custom_ents.append(ent) - current_ix += text_length - return builtin_ents, custom_ents diff --git a/snips_nlu/tests/test_intent_classifier_featurizer.py b/snips_nlu/tests/test_intent_classifier_featurizer.py index 98c6160af..f2d571d44 100644 --- a/snips_nlu/tests/test_intent_classifier_featurizer.py +++ b/snips_nlu/tests/test_intent_classifier_featurizer.py @@ -538,9 +538,6 @@ def test_preprocess(self): --- type: entity name: entity_1 -automatically_extensible: false -use_synononyms: false -matching_strictness: 1.0 values: - [entity 1, alternative entity 1] - [éntity 1, alternative entity 1] @@ -548,9 +545,6 @@ def test_preprocess(self): --- type: entity name: entity_2 -automatically_extensible: false -use_synononyms: true -matching_strictness: 1.0 values: - entity 1 - [Éntity 2, Éntity_2, Alternative entity 2]""") @@ -579,8 +573,7 @@ def test_preprocess(self): vectorizer.builtin_entity_scope = {"snips/number"} # When - processed_data = vectorizer._preprocess( - utterances, training=False) + processed_data = vectorizer._preprocess(utterances) processed_data = list(zip(*processed_data)) # Then @@ -682,238 +675,6 @@ def test_preprocess(self): self.assertSequenceEqual(expected_data, processed_data) - def test_preprocess_for_training(self): - # Given - language = LANGUAGE_EN - resources = { - STEMS: { - "beautiful": "beauty", - "birdy": "bird", - "entity": "ent" - }, - WORD_CLUSTERS: { - 
"my_word_clusters": { - "beautiful": "cluster_1", - "birdy": "cluster_2", - "entity": "cluster_3" - } - }, - STOP_WORDS: set() - } - - dataset_stream = io.StringIO(""" ---- -type: intent -name: intent1 -utterances: - - dummy utterance - ---- -type: entity -name: entity_1 -automatically_extensible: false -use_synononyms: false -matching_strictness: 1.0 -values: - - [entity 1, alternative entity 1] - - [éntity 1, alternative entity 1] - ---- -type: entity -name: entity_2 -automatically_extensible: false -use_synononyms: true -matching_strictness: 1.0 -values: - - entity 1 - - [Éntity 2, Éntity_2, Alternative entity 2]""") - dataset = Dataset.from_yaml_files("en", [dataset_stream]).json - - custom_entity_parser = CustomEntityParser.build( - dataset, CustomEntityParserUsage.WITH_STEMS, resources) - - builtin_entity_parser = BuiltinEntityParser.build(dataset, language) - utterances = [ - { - "data": [ - { - "text": "hÉllo wOrld " - }, - { - "text": " yo " - }, - { - "text": " yo " - }, - { - "text": "yo " - }, - { - "text": "Éntity_2", - "entity": "entity_2" - }, - { - "text": " " - }, - { - "text": "Éntity_2", - "entity": "entity_2" - } - ] - }, - { - "data": [ - { - "text": "beauTiful World " - }, - { - "text": "entity 1", - "entity": "entity_1" - }, - { - "text": " " - }, - { - "text": "2", - "entity": "snips/number" - } - ] - }, - { - "data": [ - { - "text": "Bird bïrdy" - } - ] - }, - { - "data": [ - { - "text": "Bird birdy" - } - ] - } - ] - - config = TfidfVectorizerConfig( - use_stemming=True, word_clusters_name="my_word_clusters") - vectorizer = TfidfVectorizer( - config=config, - custom_entity_parser=custom_entity_parser, - builtin_entity_parser=builtin_entity_parser, - resources=resources - ) - vectorizer._language = language - - # When - processed_data = vectorizer._preprocess(utterances, training=True) - processed_data = list(zip(*processed_data)) - - # Then - u_0 = { - "data": [ - { - "text": "hello world" - }, - { - "text": "yo" - }, - { - "text": "yo" - }, - { - "text": "yo" - }, - { - "text": "entity_2", - "entity": "entity_2" - }, - { - "text": "" - }, - { - "text": "entity_2", - "entity": "entity_2" - } - ] - } - u_1 = { - "data": [ - { - "text": "beauty world" - }, - { - "text": "ent 1", - "entity": "entity_1" - }, - { - "text": "" - }, - { - "text": "2", - "entity": "snips/number" - } - ] - } - u_2 = { - "data": [ - { - "text": "bird bird" - } - ] - } - - ent_00 = { - "entity_kind": "entity_2", - "value": "Éntity_2", - "range": {"start": 23, "end": 31} - } - ent_01 = { - "entity_kind": "entity_2", - "value": "Éntity_2", - "range": {"start": 32, "end": 40} - } - - ent_1 = { - "entity_kind": "entity_1", - "value": "entity 1", - "range": {"start": 16, "end": 24} - } - num_1 = { - "entity_kind": "snips/number", - "value": "2", - "range": {"start": 25, "end": 26} - } - - expected_data = [ - ( - u_0, - [], - [ent_00, ent_01], - [] - ), - ( - u_1, - [num_1], - [ent_1], - ["cluster_1", "cluster_3"] - ), - ( - u_2, - [], - [], - [] - ), - ( - u_2, - [], - [], - ["cluster_2"] - ) - ] - - self.assertSequenceEqual(expected_data, processed_data) - class CooccurrenceVectorizerTest(FixtureTest): @@ -1086,8 +847,9 @@ def test_transform(self): expected = [[1, 1, 1, 0, 0, 0], [0, 1, 1, 0, 0, 0]] self.assertEqual(expected, x.todense().tolist()) - @patch("snips_nlu.intent_classifier.featurizer._entities_from_utterance") - def test_fit(self, mocked_entities_from_utterance): + @patch("snips_nlu.intent_classifier.featurizer.CooccurrenceVectorizer." 
+ "_preprocess") + def test_fit(self, mocked_preprocess): t = "a b c d e f" u = text_to_utterance(t) builtin_ents = [ @@ -1112,9 +874,8 @@ def test_fit(self, mocked_entities_from_utterance): "entity_kind": "the_c_entity" } ] - mocked_entities_from_utterance.return_value = builtin_ents, custom_ents - x = [u] + mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents] config = CooccurrenceVectorizerConfig( window_size=3, @@ -1136,13 +897,14 @@ def test_fit(self, mocked_entities_from_utterance): ("d", "THE_SNIPS_E_ENTITY"): 7, ("d", "f"): 8, } - vectorizer = CooccurrenceVectorizer(config, **shared).fit(x, dataset) + vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset) # Then self.assertDictEqual(expected_pairs, vectorizer.word_pairs) - @patch("snips_nlu.intent_classifier.featurizer._entities_from_utterance") - def test_fit_unordered(self, mocked_entities_from_utterance): + @patch("snips_nlu.intent_classifier.featurizer.CooccurrenceVectorizer." + "_preprocess") + def test_fit_unordered(self, mocked_preprocess): t = "a b c d e f" u = text_to_utterance(t) builtin_ents = [ @@ -1167,9 +929,7 @@ def test_fit_unordered(self, mocked_entities_from_utterance): "entity_kind": "the_c_entity" } ] - mocked_entities_from_utterance.return_value = builtin_ents, custom_ents - - x = [u] + mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents] config = CooccurrenceVectorizerConfig( window_size=3, @@ -1192,13 +952,14 @@ def test_fit_unordered(self, mocked_entities_from_utterance): ("a", "d"): 7, ("d", "f"): 8, } - vectorizer = CooccurrenceVectorizer(config, **shared).fit(x, dataset) + vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset) # Then self.assertDictEqual(expected_pairs, vectorizer.word_pairs) - @patch("snips_nlu.intent_classifier.featurizer._entities_from_utterance") - def test_fit_transform(self, mocked_entities_from_utterance): + @patch("snips_nlu.intent_classifier.featurizer.CooccurrenceVectorizer." 
+ "_preprocess") + def test_fit_transform(self, mocked_preprocess): t = "a b c d e f" u = text_to_utterance(t) builtin_ents = [ @@ -1223,9 +984,7 @@ def test_fit_transform(self, mocked_entities_from_utterance): "entity_kind": "the_c_entity" } ] - mocked_entities_from_utterance.return_value = builtin_ents, custom_ents - - x = [u] + mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents] config = CooccurrenceVectorizerConfig( window_size=3, @@ -1246,6 +1005,7 @@ def test_fit_transform(self, mocked_entities_from_utterance): custom_entity_parser=custom_parser, resources=resources) # When + x = [u] x_0 = vectorizer1.fit(x, dataset).transform(x).todense().tolist() x_1 = vectorizer2.fit_transform(x, dataset).todense().tolist() @@ -1368,7 +1128,7 @@ def test_preprocess(self): vectorizer._language = language # When - processed_data = vectorizer._preprocess(utterances, training=False) + processed_data = vectorizer._preprocess(utterances) processed_data = list(zip(*processed_data)) # Then @@ -1433,163 +1193,3 @@ def test_preprocess(self): ] self.assertSequenceEqual(expected_data, processed_data) - - def test_preprocess_for_training(self): - # Given - language = LANGUAGE_EN - resources = { - STEMS: { - "beautiful": "beauty", - "birdy": "bird", - "entity": "ent" - }, - WORD_CLUSTERS: { - "my_word_clusters": { - "beautiful": "cluster_1", - "birdy": "cluster_2", - "entity": "cluster_3" - } - }, - STOP_WORDS: set() - } - - dataset_stream = io.StringIO(""" ---- -type: intent -name: intent1 -utterances: - - dummy utterance - ---- -type: entity -name: entity_1 -automatically_extensible: false -use_synononyms: false -matching_strictness: 1.0 -values: - - [entity 1, alternative entity 1] - - [éntity 1, alternative entity 1] - ---- -type: entity -name: entity_2 -automatically_extensible: false -use_synononyms: true -matching_strictness: 1.0 -values: - - entity 1 - - [Éntity 2, Éntity_2, Alternative entity 2] - """) - dataset = Dataset.from_yaml_files("en", [dataset_stream]).json - - custom_entity_parser = CustomEntityParser.build( - dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources) - - builtin_entity_parser = BuiltinEntityParser.build(dataset, language) - utterances = [ - { - "data": [ - { - "text": "hÉllo wOrld " - }, - { - "text": " yo " - }, - { - "text": " yo " - }, - { - "text": "yo " - }, - { - "text": "Éntity_2", - "entity": "entity_2" - }, - { - "text": " " - }, - { - "text": "Éntity_2", - "entity": "entity_2" - } - ] - }, - { - "data": [ - { - "text": "beauTiful World " - }, - { - "text": "entity 1", - "entity": "entity_1" - } - ] - }, - { - "data": [ - { - "text": "Bird bïrdy" - } - ] - }, - { - "data": [ - { - "text": "Bird birdy" - } - ] - } - ] - - vectorizer = CooccurrenceVectorizer( - custom_entity_parser=custom_entity_parser, - builtin_entity_parser=builtin_entity_parser, - resources=resources - ) - vectorizer._language = language - - # When - processed_data = vectorizer._preprocess(utterances, training=True) - processed_data = list(zip(*processed_data)) - - # Then - ent_00 = { - "entity_kind": "entity_2", - "value": "Éntity_2", - "range": {"start": 23, "end": 31} - } - ent_01 = { - "entity_kind": "entity_2", - "value": "Éntity_2", - "range": {"start": 32, "end": 40} - } - ent_1 = { - "entity_kind": "entity_1", - "value": "entity 1", - "range": {"start": 16, "end": 24} - } - - expected_data = [ - ( - utterances[0], - [], - [ent_00, ent_01] - ), - ( - utterances[1], - [], - [ent_1] - ), - ( - utterances[2], - [], - [] - ), - ( - utterances[3], - [], - [] - ) - ] - - 
self.assertSequenceEqual(expected_data, processed_data) From d27f1c022b0f9b9524b7c62662f877db1cc2c7bf Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 12:27:43 +0100 Subject: [PATCH 08/14] Update Changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb2f2c78c..1d9b226a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ # Changelog All notable changes to this project will be documented in this file. +## [Unreleased] +### Fixed +- Issue with intent classification reducing classification accuracy + ## [0.19.2] - 2019-02-11 ### Fixed - Fix an issue regarding the way builtin entities were handled by the `CRFSlotFiller` @@ -236,6 +240,7 @@ several commands. - Fix compiling issue with `bindgen` dependency when installing from source - Fix issue in `CRFSlotFiller` when handling builtin entities +[Unreleased]: https://github.com/snipsco/snips-nlu/compare/0.19.2...HEAD [0.19.2]: https://github.com/snipsco/snips-nlu/compare/0.19.1...0.19.2 [0.19.1]: https://github.com/snipsco/snips-nlu/compare/0.19.0...0.19.1 [0.19.0]: https://github.com/snipsco/snips-nlu/compare/0.18.0...0.19.0 From 25a5888b25c7ea1478d0ce1fdec76f42d49d6f56 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 14:03:37 +0100 Subject: [PATCH 09/14] Fix stochastic test --- snips_nlu/tests/test_crf_slot_filler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/snips_nlu/tests/test_crf_slot_filler.py b/snips_nlu/tests/test_crf_slot_filler.py index 3b03137f2..02086ac38 100644 --- a/snips_nlu/tests/test_crf_slot_filler.py +++ b/snips_nlu/tests/test_crf_slot_filler.py @@ -97,6 +97,7 @@ def test_should_get_sub_builtin_slots(self): - find me something from [start](9am) to [end](12pm) - I need a break from [start](2pm) until [end](4pm) - Can you suggest something from [start](april 4th) until [end](april 6th) ? 
+- find an activity from [start](6pm) to [end](8pm) - Book me a trip from [start](this friday) to [end](next tuesday)""") dataset = Dataset.from_yaml_files("en", [dataset_stream]).json config = CRFSlotFillerConfig(random_seed=42) From 24415c2d1e572ac665a00573b7d50541a4533455 Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Tue, 5 Mar 2019 15:11:36 +0100 Subject: [PATCH 10/14] Data augmentation should be deterministic --- snips_nlu/data_augmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snips_nlu/data_augmentation.py b/snips_nlu/data_augmentation.py index 0ed6e9b99..68ff15d86 100644 --- a/snips_nlu/data_augmentation.py +++ b/snips_nlu/data_augmentation.py @@ -88,7 +88,7 @@ def get_intent_entities(dataset, intent_name): for chunk in utterance[DATA]: if ENTITY in chunk: intent_entities.add(chunk[ENTITY]) - return intent_entities + return sorted(intent_entities) def num_queries_to_generate(dataset, intent_name, min_utterances): From a7a45799fa72c26b993de1e4e4064a3975b52f5f Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Tue, 5 Mar 2019 15:13:05 +0100 Subject: [PATCH 11/14] Remove dataset specific noise --- .../log_reg_classifier_utils.py | 18 +------------- .../tests/test_log_reg_classifier_utils.py | 24 +------------------ 2 files changed, 2 insertions(+), 40 deletions(-) diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index dc35245b7..3353ff0fd 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py +++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py @@ -108,22 +108,6 @@ def add_unknown_word_to_utterances(utterances, replacement_string, return new_utterances -def get_dataset_specific_noise(dataset, resources): - """Return a noise list that excludes the dataset entity values""" - entities_values = set() - for ent_name, ent in iteritems(dataset[ENTITIES]): - if is_builtin_entity(ent_name): - continue - for k, v in iteritems(ent[UTTERANCES]): - entities_values.add(k) - entities_values.add(v) - original_noise = get_noise(resources) - specific_noise = [n for n in original_noise if n not in entities_values] - if not specific_noise: # Avoid returning an empty noise - return original_noise - return specific_noise - - def build_training_data(dataset, language, data_augmentation_config, resources, random_state): # Create class mapping @@ -164,7 +148,7 @@ def build_training_data(dataset, language, data_augmentation_config, resources, ) # Adding noise - noise = get_dataset_specific_noise(dataset, resources) + noise = get_noise(resources) noisy_utterances = generate_noise_utterances( augmented_utterances, noise, len(intents), data_augmentation_config, language, random_state) diff --git a/snips_nlu/tests/test_log_reg_classifier_utils.py b/snips_nlu/tests/test_log_reg_classifier_utils.py index 629e9e3e7..66e27baba 100644 --- a/snips_nlu/tests/test_log_reg_classifier_utils.py +++ b/snips_nlu/tests/test_log_reg_classifier_utils.py @@ -15,7 +15,7 @@ from snips_nlu.intent_classifier.log_reg_classifier_utils import ( add_unknown_word_to_utterances, build_training_data, generate_noise_utterances, generate_smart_noise, get_noise_it, - remove_builtin_slots, text_to_utterance, get_dataset_specific_noise) + remove_builtin_slots, text_to_utterance) from snips_nlu.pipeline.configs import ( IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig) from snips_nlu.tests.test_log_reg_intent_classifier import ( @@ -536,28 +536,6 @@ def test_remove_builtin_slots(self): 
self.assertDictEqual(expected_dataset, filtered_dataset) - def test_get_dataset_specific_noise(self): - # Given - dataset_stream = io.StringIO(""" ---- -type: intent -name: my_intent -utterances: -- what is the weather in [city](paris) -- give me the weather in [city](london) -- does it rain in [city](tokyo)?""") - dataset = Dataset.from_yaml_files("en", [dataset_stream]).json - dataset = validate_and_format_dataset(dataset) - resources = { - NOISE: ["paris", "tokyo", "yo"] - } - - # When - noise = get_dataset_specific_noise(dataset, resources) - - # Then - self.assertEqual(["yo"], noise) - def test_add_unknown_word_to_utterances_with_none_max_unknownword(self): # Given utterances = [text_to_utterance("yo")] From fd4486baab83d7726626b0905bd53868624c99eb Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Tue, 5 Mar 2019 15:14:33 +0100 Subject: [PATCH 12/14] Linting --- snips_nlu/intent_classifier/log_reg_classifier.py | 3 +-- snips_nlu/intent_classifier/log_reg_classifier_utils.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/snips_nlu/intent_classifier/log_reg_classifier.py b/snips_nlu/intent_classifier/log_reg_classifier.py index 3b0e85979..1e56dbcc8 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier.py +++ b/snips_nlu/intent_classifier/log_reg_classifier.py @@ -87,13 +87,12 @@ def fit(self, dataset): none_class = max(classes) try: - self.featurizer = self.featurizer.fit( + x = self.featurizer.fit_transform( dataset, utterances, classes, none_class) except _EmptyDatasetUtterancesError: self.featurizer = None return self - x = self.featurizer.transform(utterances) alpha = get_regularization_factor(dataset) self.classifier = SGDClassifier(random_state=random_state, alpha=alpha, **LOG_REG_ARGS) diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index 3353ff0fd..5d709b77b 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py +++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py @@ -9,7 +9,7 @@ import numpy as np from future.utils import iteritems, itervalues -from snips_nlu.constants import (DATA, ENTITIES, ENTITY, INTENTS, TEXT, +from snips_nlu.constants import (DATA, ENTITY, INTENTS, TEXT, UNKNOWNWORD, UTTERANCES) from snips_nlu.data_augmentation import augment_utterances from snips_nlu.dataset import get_text_from_chunks From 7227434b39731585d5b31d394d6f7b8837dfb613 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 17:19:54 +0100 Subject: [PATCH 13/14] Update Changelog --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d9b226a7..23cdc145c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,12 @@ # Changelog All notable changes to this project will be documented in this file. -## [Unreleased] +## [0.19.3] - 2019-03-05 ### Fixed - Issue with intent classification reducing classification accuracy +- Issue resulting in a mutation of the CRFSlotFillerConfig +- Wrong required resources of the `DeterministicIntentParser` +- Issue with non ASCII characters when using the parsing CLI with Python2 ## [0.19.2] - 2019-02-11 ### Fixed @@ -240,7 +243,7 @@ several commands. 
- Fix compiling issue with `bindgen` dependency when installing from source - Fix issue in `CRFSlotFiller` when handling builtin entities -[Unreleased]: https://github.com/snipsco/snips-nlu/compare/0.19.2...HEAD +[0.19.3]: https://github.com/snipsco/snips-nlu/compare/0.19.2...0.19.3 [0.19.2]: https://github.com/snipsco/snips-nlu/compare/0.19.1...0.19.2 [0.19.1]: https://github.com/snipsco/snips-nlu/compare/0.19.0...0.19.1 [0.19.0]: https://github.com/snipsco/snips-nlu/compare/0.18.0...0.19.0 From 4b850fb8320a5f6e0566a6ca9b645c174da9a722 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 17:20:08 +0100 Subject: [PATCH 14/14] Bump version to 0.19.3 --- snips_nlu/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py index c9d76cadc..0c0398efb 100644 --- a/snips_nlu/__about__.py +++ b/snips_nlu/__about__.py @@ -11,7 +11,7 @@ __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai" __license__ = "Apache License, Version 2.0" -__version__ = "0.19.2" +__version__ = "0.19.3" __model_version__ = "0.19.0" __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"
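Notes on PATCH 01/14: the new lookup tries the exact version first, then falls
back to a "major.minor" key in the compatibility table. A minimal sketch of
that behaviour, assuming `semantic_version` is installed; the table below is
hypothetical, not the real remote compatibility JSON:

    from semantic_version import Version

    def find_compatibility(version, nlu_table):
        # Try the full version first, then fall back to "major.minor"
        semver_version = Version(version)
        minor_version = "%d.%d" % (semver_version.major, semver_version.minor)
        return nlu_table.get(version, nlu_table.get(minor_version))

    # Hypothetical table keyed only by the minor version
    nlu_table = {"0.19": {"snips-nlu-en": "0.2.0"}}
    assert find_compatibility("0.19.3", nlu_table) == {"snips-nlu-en": "0.2.0"}

Notes on PATCH 02/14: fitting mutates the feature factories held inside the
config, so the slot filler now deep-copies the config it receives before use.
A minimal sketch of the failure mode being fixed, using a hypothetical config
dict rather than a real CRFSlotFillerConfig:

    from copy import deepcopy

    config = {"feature_factories": [{"args": {}}]}  # hypothetical shape
    fitted_config = deepcopy(config)                # the copy PATCH 02 adds
    # Mutation that previously leaked back into the caller's config at fit time
    fitted_config["feature_factories"][0]["args"]["language"] = "en"
    assert config["feature_factories"][0]["args"] == {}  # caller's config intact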