Skip to content

Commit

Permalink
Merge pull request #699 from snipsco/release/0.17.4
Browse files Browse the repository at this point in the history
Release 0.17.4
  • Loading branch information
adrienball authored Nov 20, 2018
2 parents a3e00e8 + ffdb55f commit ddcfcec
Show file tree
Hide file tree
Showing 36 changed files with 176 additions and 103 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Changelog
All notable changes to this project will be documented in this file.

## [0.17.4] - 2018-11-20
### Added
- Add a `--config` argument in the metrics CLI

### Changed
- Replace "parser_threshold" with "matching_strictness" in the dataset format
- Optimize loading and inference runtime
- Disable stemming for intent classification in default configs


## [0.17.3] - 2018-10-18
### Fixed
- Crash with num2words and floats
Expand Down Expand Up @@ -165,6 +175,7 @@ several commands.
- Fix compiling issue with `bindgen` dependency when installing from source
- Fix issue in `CRFSlotFiller` when handling builtin entities

[0.17.4]: https://github.com/snipsco/snips-nlu/compare/0.17.3...0.17.4
[0.17.3]: https://github.com/snipsco/snips-nlu/compare/0.17.2...0.17.3
[0.17.2]: https://github.com/snipsco/snips-nlu/compare/0.17.1...0.17.2
[0.17.1]: https://github.com/snipsco/snips-nlu/compare/0.17.0...0.17.1
Expand Down
3 changes: 2 additions & 1 deletion CONTRIBUTORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ Contributors
This is a list of everyone who has made significant contributions to Snips NLU, in alphabetical order. Thanks a lot for the great work!

* `Alice Coucke <https://github.com/choufractal>`_
* `ddorian <https://github.com/ddorian>`_
* `Josh Meyer <https://github.com/JRMeyer>`_
* `Matthieu Brouillard <https://github.com/McFoggy>`_
* `Matthieu Brouillard <https://github.com/McFoggy>`_
4 changes: 2 additions & 2 deletions docs/source/data_model.rst
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ entity in your dataset as follows:
"synonyms": []
}
],
"parser_threshold": 1.0
"matching_strictness": 1.0
}
}
Expand Down Expand Up @@ -363,6 +363,6 @@ not your custom entity is automatically extensible:
"automatically_extensible": true,
"use_synonyms": true,
"data": [],
"parser_threshold": 1.0
"matching_strictness": 1.0
}
}
4 changes: 2 additions & 2 deletions docs/source/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ Now, the ``"entities"`` part of the generated json looks like that:
"value": "garden"
}
],
"parser_threshold": 1.0,
"matching_strictness": 1.0,
"use_synonyms": true
},
"snips/temperature": {}
Expand Down Expand Up @@ -186,7 +186,7 @@ and that we want our assistant to cover. Additionally, we add some
"value": "garden"
}
],
"parser_threshold": 1.0,
"matching_strictness": 1.0,
"use_synonyms": true
},
"snips/temperature": {}
Expand Down
2 changes: 1 addition & 1 deletion sample_datasets/beverage_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
]
}
],
"parser_threshold": 1.0,
"matching_strictness": 1.0,
"use_synonyms": true
},
"snips/number": {}
Expand Down
2 changes: 1 addition & 1 deletion sample_datasets/flights_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"synonyms": ["new york", "big apple"]
}
],
"parser_threshold": 1.0,
"matching_strictness": 1.0,
"use_synonyms": true
},
"snips/datetime": {}
Expand Down
4 changes: 2 additions & 2 deletions sample_datasets/lights_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"color": {
"automatically_extensible": true,
"data": [],
"parser_threshold": 1.0,
"matching_strictness": 1.0,
"use_synonyms": true
},
"room": {
Expand All @@ -18,7 +18,7 @@
"synonyms": []
}
],
"parser_threshold": 1.0,
"matching_strictness": 1.0,
"use_synonyms": true
}
},
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
required = [
"enum34>=1.1,<2.0; python_version<'3.4'",
"future>=0.16,<0.17",
"numpy==1.14.0",
"numpy>=1.15,<1.16",
"scipy>=1.0,<2.0",
"scikit-learn>=0.19,<0.20",
"sklearn-crfsuite>=0.3.6,<0.4",
Expand Down
2 changes: 1 addition & 1 deletion snips_nlu/__about__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
__email__ = "[email protected], [email protected]"
__license__ = "Apache License, Version 2.0"

__version__ = "0.17.3"
__version__ = "0.17.4"
__model_version__ = "0.17.0"

__download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"
Expand Down
12 changes: 6 additions & 6 deletions snips_nlu/cli/dataset/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
import six
from future.utils import with_metaclass

from snips_nlu.constants import (AUTOMATICALLY_EXTENSIBLE, DATA,
PARSER_THRESHOLD, SYNONYMS, USE_SYNONYMS,
VALUE)
from snips_nlu.constants import (
AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS, SYNONYMS,
USE_SYNONYMS, VALUE)
from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity

AUTO_EXT_REGEX = re.compile(r'^#\sautomatically_extensible=(true|false)\s*$')
Expand All @@ -38,12 +38,12 @@ class CustomEntity(Entity):
"""

def __init__(self, name, utterances, automatically_extensible,
use_synonyms, parser_threshold=1.0):
use_synonyms, matching_strictness=1.0):
super(CustomEntity, self).__init__(name)
self.utterances = utterances
self.automatically_extensible = automatically_extensible
self.use_synonyms = use_synonyms
self.parser_threshold = parser_threshold
self.matching_strictness = matching_strictness

@classmethod
def from_file(cls, filepath):
Expand Down Expand Up @@ -86,7 +86,7 @@ def json(self):
AUTOMATICALLY_EXTENSIBLE: self.automatically_extensible,
USE_SYNONYMS: self.use_synonyms,
DATA: [u.json for u in self.utterances],
PARSER_THRESHOLD: self.parser_threshold
MATCHING_STRICTNESS: self.matching_strictness
}


Expand Down
42 changes: 38 additions & 4 deletions snips_nlu/cli/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,56 @@
from pathlib import Path

import plac
from snips_nlu_metrics import Engine

from snips_nlu import SnipsNLUEngine, load_resources
from snips_nlu.utils import json_string


def make_engine_cls(config):
    """Build an engine class pre-bound to a fixed NLU *config*.

    The returned class implements the ``snips_nlu_metrics`` ``Engine``
    interface (``fit`` / ``parse``) so it can be passed wherever a plain
    ``SnipsNLUEngine`` class is expected, while forcing the given config.
    """

    class ConfigEngine(Engine):
        def __init__(self):
            # The underlying engine is created lazily, at fit time.
            self.engine = None
            self.config = config

        def fit(self, dataset):
            engine = SnipsNLUEngine(self.config)
            self.engine = engine.fit(dataset)
            return self

        def parse(self, text):
            # Delegate parsing to the fitted underlying engine.
            return self.engine.parse(text)

    return ConfigEngine


@plac.annotations(
dataset_path=("Path to the dataset file", "positional", None, str),
output_path=("Destination path for the json metrics", "positional", None,
str),
config_path=("Path to a NLU engine config file", "option", "c", str),
nb_folds=("Number of folds to use for the cross-validation", "option", "n",
int),
train_size_ratio=("Fraction of the data that we want to use for training "
"(between 0 and 1)", "option", "t", float),
exclude_slot_metrics=("Exclude slot metrics and slot errors in the output",
"flag", "s", bool),
include_errors=("Include parsing errors in the output", "flag", "i", bool))
def cross_val_metrics(dataset_path, output_path, nb_folds=5,
def cross_val_metrics(dataset_path, output_path, config_path=None, nb_folds=5,
train_size_ratio=1.0, exclude_slot_metrics=False,
include_errors=False):
def progression_handler(progress):
print("%d%%" % int(progress * 100))

if config_path is not None:
with Path(config_path).open("r", encoding="utf-8") as f:
config = json.load(f)
engine_cls = make_engine_cls(config)
else:
engine_cls = SnipsNLUEngine

metrics_args = dict(
dataset=dataset_path,
engine_class=SnipsNLUEngine,
engine_class=engine_cls,
progression_handler=progression_handler,
nb_folds=nb_folds,
train_size_ratio=train_size_ratio,
Expand All @@ -55,15 +80,24 @@ def progression_handler(progress):
None, str),
output_path=("Destination path for the json metrics", "positional", None,
str),
config_path=("Path to a NLU engine config file", "option", "c", str),
exclude_slot_metrics=("Exclude slot metrics and slot errors in the output",
"flag", "s", bool),
include_errors=("Include parsing errors in the output", "flag", "i", bool))
def train_test_metrics(train_dataset_path, test_dataset_path, output_path,
exclude_slot_metrics=False, include_errors=False):
config_path=None, exclude_slot_metrics=False,
include_errors=False):
if config_path is not None:
with Path(config_path).open("r", encoding="utf-8") as f:
config = json.load(f)
engine_cls = make_engine_cls(config)
else:
engine_cls = SnipsNLUEngine

metrics_args = dict(
train_dataset=train_dataset_path,
test_dataset=test_dataset_path,
engine_class=SnipsNLUEngine,
engine_class=engine_cls,
include_slot_metrics=not exclude_slot_metrics
)

Expand Down
2 changes: 1 addition & 1 deletion snips_nlu/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
END = "end"
BUILTIN_ENTITY_PARSER = "builtin_entity_parser"
CUSTOM_ENTITY_PARSER = "custom_entity_parser"
PARSER_THRESHOLD = "parser_threshold"
MATCHING_STRICTNESS = "matching_strictness"

# resources
STOP_WORDS = "stop_words"
Expand Down
25 changes: 13 additions & 12 deletions snips_nlu/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,12 @@
from future.utils import iteritems, itervalues
from snips_nlu_ontology import get_all_languages

from snips_nlu.constants import (AUTOMATICALLY_EXTENSIBLE, CAPITALIZE, DATA,
ENTITIES, ENTITY, INTENTS, LANGUAGE,
PARSER_THRESHOLD, SLOT_NAME, SYNONYMS, TEXT,
USE_SYNONYMS, UTTERANCES, VALIDATED, VALUE)
from snips_nlu.entity_parser.builtin_entity_parser import (BuiltinEntityParser,
is_builtin_entity,
is_gazetteer_entity)
from snips_nlu.constants import (
AUTOMATICALLY_EXTENSIBLE, CAPITALIZE, DATA, ENTITIES, ENTITY, INTENTS,
LANGUAGE, MATCHING_STRICTNESS, SLOT_NAME, SYNONYMS, TEXT, USE_SYNONYMS,
UTTERANCES, VALIDATED, VALUE)
from snips_nlu.entity_parser.builtin_entity_parser import (
BuiltinEntityParser, is_builtin_entity, is_gazetteer_entity)
from snips_nlu.preprocessing import tokenize_light
from snips_nlu.string_variations import get_string_variations
from snips_nlu.utils import validate_key, validate_keys, validate_type
Expand Down Expand Up @@ -135,21 +134,23 @@ def validate_and_format_custom_entity(entity, queries_entities, language,
validate_type(entity, dict)

# TODO: this is here temporarily, only to allow backward compatibility
if PARSER_THRESHOLD not in entity:
entity[PARSER_THRESHOLD] = 1.0
if MATCHING_STRICTNESS not in entity:
strictness = entity.get("parser_threshold", 1.0)

entity[MATCHING_STRICTNESS] = strictness

mandatory_keys = [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA,
PARSER_THRESHOLD]
MATCHING_STRICTNESS]
validate_keys(entity, mandatory_keys, object_label="entity")
validate_type(entity[USE_SYNONYMS], bool)
validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
validate_type(entity[DATA], list)
validate_type(entity[PARSER_THRESHOLD], float)
validate_type(entity[MATCHING_STRICTNESS], float)

formatted_entity = dict()
formatted_entity[AUTOMATICALLY_EXTENSIBLE] = entity[
AUTOMATICALLY_EXTENSIBLE]
formatted_entity[PARSER_THRESHOLD] = entity[PARSER_THRESHOLD]
formatted_entity[MATCHING_STRICTNESS] = entity[MATCHING_STRICTNESS]
use_synonyms = entity[USE_SYNONYMS]

# Validate format and filter out unused data
Expand Down
2 changes: 1 addition & 1 deletion snips_nlu/default_configs/config_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@
"sublinear_tf": False,
"pvalue_threshold": 0.4,
"word_clusters_name": None,
"use_stemming": True
"use_stemming": False
},
"random_seed": None
}
Expand Down
2 changes: 1 addition & 1 deletion snips_nlu/default_configs/config_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@
"sublinear_tf": False,
"pvalue_threshold": 0.4,
"word_clusters_name": None,
"use_stemming": True
"use_stemming": False
},
"random_seed": None
}
Expand Down
2 changes: 1 addition & 1 deletion snips_nlu/default_configs/config_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@
"sublinear_tf": False,
"pvalue_threshold": 0.4,
"word_clusters_name": None,
"use_stemming": True
"use_stemming": False
},
"random_seed": None
}
Expand Down
2 changes: 1 addition & 1 deletion snips_nlu/default_configs/config_fr.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@
"sublinear_tf": False,
"pvalue_threshold": 0.4,
"word_clusters_name": None,
"use_stemming": True
"use_stemming": False
},
"random_seed": None
}
Expand Down
2 changes: 1 addition & 1 deletion snips_nlu/default_configs/config_it.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@
"sublinear_tf": False,
"pvalue_threshold": 0.4,
"word_clusters_name": None,
"use_stemming": True
"use_stemming": False
},
"random_seed": None
}
Expand Down
6 changes: 3 additions & 3 deletions snips_nlu/entity_parser/custom_entity_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from snips_nlu_ontology import GazetteerEntityParser

from snips_nlu.constants import (
END, ENTITIES, LANGUAGE, PARSER_THRESHOLD, RES_MATCH_RANGE, START,
END, ENTITIES, LANGUAGE, MATCHING_STRICTNESS, RES_MATCH_RANGE, START,
UTTERANCES, ENTITY_KIND)
from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
from snips_nlu.entity_parser.custom_entity_parser_usage import (
Expand Down Expand Up @@ -87,7 +87,7 @@ def parse(self, text, scope=None, use_cache=True):
if cache_key not in self._cache:
parser_result = self._parse(text, scope)
self._cache[cache_key] = parser_result
return deepcopy(self._cache[cache_key])
return self._cache[cache_key]

def _parse(self, text, scope):
tokens = tokenize(text, self.language)
Expand Down Expand Up @@ -123,7 +123,7 @@ def _create_custom_entity_parser_configuration(entities):
{
"entity_identifier": entity_name,
"entity_parser": {
"threshold": entity[PARSER_THRESHOLD],
"threshold": entity[MATCHING_STRICTNESS],
"gazetteer": [
{
"raw_value": k,
Expand Down
3 changes: 1 addition & 2 deletions snips_nlu/entity_parser/entity_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from __future__ import unicode_literals

from abc import ABCMeta, abstractmethod
from copy import deepcopy

from future.builtins import object
from future.utils import with_metaclass
Expand Down Expand Up @@ -34,7 +33,7 @@ def parse(self, text, scope=None, use_cache=True):
if cache_key not in self._cache:
parser_result = self._parser.parse(text, scope)
self._cache[cache_key] = parser_result
return deepcopy(self._cache[cache_key])
return self._cache[cache_key]

@abstractmethod
def persist(self, path):
Expand Down
Loading

0 comments on commit ddcfcec

Please sign in to comment.