From 8437e3947e9f0c84034a2af679cb0217b1431704 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 12 Feb 2019 10:17:30 +0100 Subject: [PATCH 01/14] Allow to fetch resources based only on major and minor --- snips_nlu/cli/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/snips_nlu/cli/utils.py b/snips_nlu/cli/utils.py index 8a99f51d3..7c26b5933 100644 --- a/snips_nlu/cli/utils.py +++ b/snips_nlu/cli/utils.py @@ -7,6 +7,7 @@ from enum import Enum, unique import requests +from semantic_version import Version import snips_nlu from snips_nlu import __about__ @@ -71,13 +72,16 @@ def get_json(url, desc): def get_compatibility(): version = __about__.__version__ + semver_version = Version(version) + minor_version = "%d.%d" % (semver_version.major, semver_version.minor) table = get_json(__about__.__compatibility__, "Compatibility table") - compatibility = table["snips-nlu"] - if version not in compatibility: + nlu_table = table["snips-nlu"] + compatibility = nlu_table.get(version, nlu_table.get(minor_version)) + if compatibility is None: pretty_print("No compatible resources found for version %s" % version, title="Resources compatibility error", exits=1, level=PrettyPrintLevel.ERROR) - return compatibility[version] + return compatibility def get_resources_version(resource_fullname, resource_alias, compatibility): From f502431b432381e6fb3021cf3d9fb5dded88842e Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Wed, 13 Feb 2019 15:37:19 +0100 Subject: [PATCH 02/14] Fix a bug which was mutating the CRFSlotFillerConfig --- snips_nlu/slot_filler/crf_slot_filler.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/snips_nlu/slot_filler/crf_slot_filler.py b/snips_nlu/slot_filler/crf_slot_filler.py index ff9b0a4c6..3b21e6821 100644 --- a/snips_nlu/slot_filler/crf_slot_filler.py +++ b/snips_nlu/slot_filler/crf_slot_filler.py @@ -7,6 +7,7 @@ import shutil import tempfile from builtins import range +from copy import deepcopy from pathlib import Path from future.utils import iteritems @@ -48,6 +49,9 @@ class CRFSlotFiller(SlotFiller): def __init__(self, config=None, **shared): """The CRF slot filler can be configured by passing a :class:`.CRFSlotFillerConfig`""" + # The CRFSlotFillerConfig must be deep-copied as it is mutated when + # fitting the feature factories + config = deepcopy(config) super(CRFSlotFiller, self).__init__(config, **shared) self.crf_model = None self.features_factories = [ From 48e97b4ca4dbc72549375fb96930507708cdb22b Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Thu, 14 Feb 2019 11:00:39 +0100 Subject: [PATCH 03/14] Allow matching_strictness property to be int --- snips_nlu/dataset/validation.py | 2 +- snips_nlu/tests/test_dataset_validation.py | 29 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/snips_nlu/dataset/validation.py b/snips_nlu/dataset/validation.py index 61babeec7..179763ce0 100644 --- a/snips_nlu/dataset/validation.py +++ b/snips_nlu/dataset/validation.py @@ -128,7 +128,7 @@ def _validate_and_format_custom_entity(entity, queries_entities, language, validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool, object_label="automatically_extensible") validate_type(entity[DATA], list, object_label="entity data") - validate_type(entity[MATCHING_STRICTNESS], float, + validate_type(entity[MATCHING_STRICTNESS], (float, int), object_label="matching_strictness") formatted_entity = dict() diff --git a/snips_nlu/tests/test_dataset_validation.py b/snips_nlu/tests/test_dataset_validation.py index 083f2f1e3..ee4981b39 
100644 --- a/snips_nlu/tests/test_dataset_validation.py +++ b/snips_nlu/tests/test_dataset_validation.py @@ -93,6 +93,35 @@ def test_missing_entity_key_should_raise_exception(self): self.assertEqual("Expected custom entity to have key: 'use_synonyms'", str(ctx.exception.args[0])) + def test_should_support_int_or_float_for_matching_strictness(self): + # Given + dataset = { + "intents": {}, + "entities": { + "entity1": { + "data": [], + "automatically_extensible": False, + "use_synonyms": True, + "matching_strictness": 0.5 + }, + "entity2": { + "data": [], + "automatically_extensible": False, + "use_synonyms": True, + "matching_strictness": 1 + } + }, + "language": "en", + } + + # When/Then + dataset = validate_and_format_dataset(dataset) + + self.assertEqual( + 0.5, dataset["entities"]["entity1"].get("matching_strictness")) + self.assertEqual( + 1, dataset["entities"]["entity2"].get("matching_strictness")) + def test_missing_matching_strictness_should_be_handled(self): # TODO: This test is temporary, and must be removed once the backward # compatibility with the previous dataset format, without From 6c4fcea4e014c7bb6c3d8b9b8602cdeb1bbbca30 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Mon, 18 Feb 2019 17:40:12 +0100 Subject: [PATCH 04/14] Fix issue with resources required by the deterministic intent parser --- snips_nlu/intent_parser/deterministic_intent_parser.py | 1 + snips_nlu/pipeline/configs/intent_parser.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/snips_nlu/intent_parser/deterministic_intent_parser.py b/snips_nlu/intent_parser/deterministic_intent_parser.py index 1f3e404a0..4c442ec58 100644 --- a/snips_nlu/intent_parser/deterministic_intent_parser.py +++ b/snips_nlu/intent_parser/deterministic_intent_parser.py @@ -127,6 +127,7 @@ def fit(self, dataset, force_retrain=True): """Fits the intent parser with a valid Snips dataset""" logger.info("Fitting deterministic parser...") dataset = validate_and_format_dataset(dataset) + self.load_resources_if_needed(dataset[LANGUAGE]) self.fit_builtin_entity_parser_if_needed(dataset) self.fit_custom_entity_parser_if_needed(dataset) self.language = dataset[LANGUAGE] diff --git a/snips_nlu/pipeline/configs/intent_parser.py b/snips_nlu/pipeline/configs/intent_parser.py index 9a4fea7d2..bdc56e083 100644 --- a/snips_nlu/pipeline/configs/intent_parser.py +++ b/snips_nlu/pipeline/configs/intent_parser.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from snips_nlu.common.from_dict import FromDict -from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE +from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE, STOP_WORDS from snips_nlu.entity_parser import CustomEntityParserUsage from snips_nlu.pipeline.configs import ProcessingUnitConfig from snips_nlu.resources import merge_required_resources @@ -84,7 +84,8 @@ def unit_name(self): def get_required_resources(self): return { - CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS + CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS, + STOP_WORDS: self.ignore_stop_words } def to_dict(self): From 812be2d19de9f1a3b54b9664b84e5da529248cba Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Fri, 1 Mar 2019 14:15:24 +0100 Subject: [PATCH 05/14] Fix issue with group names in deterministic parser --- .../deterministic_intent_parser.py | 2 +- .../tests/test_deterministic_intent_parser.py | 41 ++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/snips_nlu/intent_parser/deterministic_intent_parser.py 
b/snips_nlu/intent_parser/deterministic_intent_parser.py index 4c442ec58..63238b9e6 100644 --- a/snips_nlu/intent_parser/deterministic_intent_parser.py +++ b/snips_nlu/intent_parser/deterministic_intent_parser.py @@ -314,7 +314,7 @@ def _get_matching_result(self, text, processed_text, regex, intent, for group_name in found_result.groupdict(): ref_group_name = group_name if "_" in group_name: - ref_group_name = group_name[:(len(group_name) - 2)] + ref_group_name = group_name.split("_")[0] slot_name = self.group_names_to_slot_names[ref_group_name] entity = self.slot_names_to_entities[intent][slot_name] rng = (found_result.start(group_name), diff --git a/snips_nlu/tests/test_deterministic_intent_parser.py b/snips_nlu/tests/test_deterministic_intent_parser.py index 5af116d95..094b74e8c 100644 --- a/snips_nlu/tests/test_deterministic_intent_parser.py +++ b/snips_nlu/tests/test_deterministic_intent_parser.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import io - from builtins import range from mock import patch @@ -162,6 +161,46 @@ def test_should_parse_intent_with_stop_words(self, mock_get_stop_words): self.assertEqual(expected_intent, parsing[RES_INTENT]) + def test_should_parse_intent_with_duplicated_slot_names(self): + # Given + slots_dataset_stream = io.StringIO(""" +--- +type: intent +name: math_operation +slots: + - name: number + entity: snips/number +utterances: + - what is [number](one) plus [number](one)""") + dataset = Dataset.from_yaml_files("en", [slots_dataset_stream]).json + parser = DeterministicIntentParser().fit(dataset) + text = "what is one plus one" + + # When + parsing = parser.parse(text) + + # Then + probability = 1.0 + expected_intent = intent_classification_result( + intent_name="math_operation", probability=probability) + expected_slots = [ + { + "entity": "snips/number", + "range": {"end": 11, "start": 8}, + "slotName": "number", + "value": "one" + }, + { + "entity": "snips/number", + "range": {"end": 20, "start": 17}, + "slotName": "number", + "value": "one" + } + ] + + self.assertDictEqual(expected_intent, parsing[RES_INTENT]) + self.assertListEqual(expected_slots, parsing[RES_SLOTS]) + def test_should_ignore_ambiguous_utterances(self): # Given dataset_stream = io.StringIO(""" From 7ff270b67f0a8f0a44139ed392753e3febc6ddd7 Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Mon, 4 Mar 2019 14:00:17 +0100 Subject: [PATCH 06/14] Fix inference CLI --- snips_nlu/cli/inference.py | 3 +++ snips_nlu/common/log_utils.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/snips_nlu/cli/inference.py b/snips_nlu/cli/inference.py index 004526671..867fa9605 100644 --- a/snips_nlu/cli/inference.py +++ b/snips_nlu/cli/inference.py @@ -18,6 +18,7 @@ ) def parse(training_path, query, verbose=False): """Load a trained NLU engine and play with its parsing API interactively""" + from builtins import str if verbose: set_nlu_logger(logging.DEBUG) @@ -29,6 +30,8 @@ def parse(training_path, query, verbose=False): while True: query = input("Enter a query (type 'q' to quit): ").strip() + if not isinstance(query, str): + query = query.decode("utf-8") if query == "q": break print_parsing_result(engine, query) diff --git a/snips_nlu/common/log_utils.py b/snips_nlu/common/log_utils.py index 06d27ca18..6d29a9038 100644 --- a/snips_nlu/common/log_utils.py +++ b/snips_nlu/common/log_utils.py @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + +from builtins import str from datetime import datetime from functools import wraps From 5907ce254d4fccde5e2fb9cce39988701ccf0007 Mon Sep 17 
00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 12:24:16 +0100 Subject: [PATCH 07/14] Fix issue with intent classifier when preprocessing data --- snips_nlu/intent_classifier/featurizer.py | 106 ++--- .../test_intent_classifier_featurizer.py | 434 +----------------- 2 files changed, 55 insertions(+), 485 deletions(-) diff --git a/snips_nlu/intent_classifier/featurizer.py b/snips_nlu/intent_classifier/featurizer.py index a2c7a2e5f..b5458d0d8 100644 --- a/snips_nlu/intent_classifier/featurizer.py +++ b/snips_nlu/intent_classifier/featurizer.py @@ -17,8 +17,7 @@ json_string, fitted_required, replace_entities_with_placeholders, check_persisted_path) from snips_nlu.constants import ( - DATA, END, ENTITY, ENTITY_KIND, LANGUAGE, NGRAM, RES_MATCH_RANGE, - RES_VALUE, START, TEXT, ENTITIES) + DATA, ENTITY, ENTITY_KIND, LANGUAGE, NGRAM, TEXT, ENTITIES) from snips_nlu.dataset import get_text_from_chunks, validate_and_format_dataset from snips_nlu.entity_parser.builtin_entity_parser import ( is_builtin_entity) @@ -264,7 +263,7 @@ def fit(self, x, dataset): self._init_vectorizer(self._language) self.builtin_entity_scope = set( e for e in dataset[ENTITIES] if is_builtin_entity(e)) - preprocessed_data = self._preprocess(x, training=True) + preprocessed_data = self._preprocess(x) utterances = [ self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters) for u, builtin_ents, custom_ents, w_clusters @@ -296,7 +295,7 @@ def fit_transform(self, x, dataset): self._init_vectorizer(self._language) self.builtin_entity_scope = set( e for e in dataset[ENTITIES] if is_builtin_entity(e)) - preprocessed_data = self._preprocess(x, training=True) + preprocessed_data = self._preprocess(x) utterances = [ self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters) for u, builtin_ents, custom_ents, w_clusters @@ -330,31 +329,30 @@ def transform(self, x): for data in zip(*self._preprocess(x))] return self._tfidf_vectorizer.transform(utterances) - def _preprocess(self, utterances, training=False): + def _preprocess(self, utterances): normalized_utterances = deepcopy(utterances) for u in normalized_utterances: - for chunk in u[DATA]: + nb_chunks = len(u[DATA]) + for i, chunk in enumerate(u[DATA]): chunk[TEXT] = _normalize_stem( chunk[TEXT], self.language, self.resources, self.config.use_stemming) - - if training: - builtin_ents, custom_ents = zip( - *[_entities_from_utterance(u) for u in utterances]) - else: - # Extract builtin entities on unormalized utterances - builtin_ents = [ - self.builtin_entity_parser.parse( - get_text_from_chunks(u[DATA]), - self.builtin_entity_scope, use_cache=True) - for u in utterances - ] - # Extract builtin entities on normalized utterances - custom_ents = [ - self.custom_entity_parser.parse( - get_text_from_chunks(u[DATA]), use_cache=True) - for u in normalized_utterances - ] + if i < nb_chunks - 1: + chunk[TEXT] += " " + + # Extract builtin entities on unormalized utterances + builtin_ents = [ + self.builtin_entity_parser.parse( + get_text_from_chunks(u[DATA]), + self.builtin_entity_scope, use_cache=True) + for u in utterances + ] + # Extract builtin entities on normalized utterances + custom_ents = [ + self.custom_entity_parser.parse( + get_text_from_chunks(u[DATA]), use_cache=True) + for u in normalized_utterances + ] if self.config.word_clusters_name: # Extract world clusters on unormalized utterances original_utterances_text = [get_text_from_chunks(u[DATA]) @@ -582,7 +580,7 @@ def fit(self, x, dataset): self.builtin_entity_scope = set( e for e in dataset[ENTITIES] if 
is_builtin_entity(e)) - preprocessed = self._preprocess(list(x), training=True) + preprocessed = self._preprocess(list(x)) utterances = [ self._enrich_utterance(utterance, builtin_ents, custom_ent) for utterance, builtin_ents, custom_ent in zip(*preprocessed)] @@ -648,7 +646,7 @@ def transform(self, x): Raises: NotTrained: when the vectorizer is not fitted """ - preprocessed = self._preprocess(x, training=False) + preprocessed = self._preprocess(x) utterances = [ self._enrich_utterance(utterance, builtin_ents, custom_ent) for utterance, builtin_ents, custom_ent in zip(*preprocessed)] @@ -661,24 +659,20 @@ def transform(self, x): return x_coo.tocsr() - def _preprocess(self, x, training=False): - if training: - builtin_ents, custom_ents = zip( - *[_entities_from_utterance(u) for u in x]) - else: - # Extract all entities on unnormalized data - builtin_ents = [ - self.builtin_entity_parser.parse( - get_text_from_chunks(u[DATA]), - self.builtin_entity_scope, - use_cache=True - ) for u in x - ] - custom_ents = [ - self.custom_entity_parser.parse( - get_text_from_chunks(u[DATA]), use_cache=True) - for u in x - ] + def _preprocess(self, x): + # Extract all entities on unnormalized data + builtin_ents = [ + self.builtin_entity_parser.parse( + get_text_from_chunks(u[DATA]), + self.builtin_entity_scope, + use_cache=True + ) for u in x + ] + custom_ents = [ + self.custom_entity_parser.parse( + get_text_from_chunks(u[DATA]), use_cache=True) + for u in x + ] return x, builtin_ents, custom_ents def _extract_word_pairs(self, utterance): @@ -805,27 +799,3 @@ def _get_word_cluster_features(query_tokens, clusters_name, resources): if cluster is not None: cluster_features.append(cluster) return cluster_features - - -def _entities_from_utterance(utterance): - builtin_ents = [] - custom_ents = [] - current_ix = 0 - for chunk in utterance[DATA]: - text = chunk[TEXT] - text_length = len(text) - if ENTITY in chunk: - ent = { - ENTITY_KIND: chunk[ENTITY], - RES_VALUE: text, - RES_MATCH_RANGE: { - START: current_ix, - END: current_ix + text_length - } - } - if is_builtin_entity(ent[ENTITY_KIND]): - builtin_ents.append(ent) - else: - custom_ents.append(ent) - current_ix += text_length - return builtin_ents, custom_ents diff --git a/snips_nlu/tests/test_intent_classifier_featurizer.py b/snips_nlu/tests/test_intent_classifier_featurizer.py index 98c6160af..f2d571d44 100644 --- a/snips_nlu/tests/test_intent_classifier_featurizer.py +++ b/snips_nlu/tests/test_intent_classifier_featurizer.py @@ -538,9 +538,6 @@ def test_preprocess(self): --- type: entity name: entity_1 -automatically_extensible: false -use_synononyms: false -matching_strictness: 1.0 values: - [entity 1, alternative entity 1] - [éntity 1, alternative entity 1] @@ -548,9 +545,6 @@ def test_preprocess(self): --- type: entity name: entity_2 -automatically_extensible: false -use_synononyms: true -matching_strictness: 1.0 values: - entity 1 - [Éntity 2, Éntity_2, Alternative entity 2]""") @@ -579,8 +573,7 @@ def test_preprocess(self): vectorizer.builtin_entity_scope = {"snips/number"} # When - processed_data = vectorizer._preprocess( - utterances, training=False) + processed_data = vectorizer._preprocess(utterances) processed_data = list(zip(*processed_data)) # Then @@ -682,238 +675,6 @@ def test_preprocess(self): self.assertSequenceEqual(expected_data, processed_data) - def test_preprocess_for_training(self): - # Given - language = LANGUAGE_EN - resources = { - STEMS: { - "beautiful": "beauty", - "birdy": "bird", - "entity": "ent" - }, - WORD_CLUSTERS: { - 
"my_word_clusters": { - "beautiful": "cluster_1", - "birdy": "cluster_2", - "entity": "cluster_3" - } - }, - STOP_WORDS: set() - } - - dataset_stream = io.StringIO(""" ---- -type: intent -name: intent1 -utterances: - - dummy utterance - ---- -type: entity -name: entity_1 -automatically_extensible: false -use_synononyms: false -matching_strictness: 1.0 -values: - - [entity 1, alternative entity 1] - - [éntity 1, alternative entity 1] - ---- -type: entity -name: entity_2 -automatically_extensible: false -use_synononyms: true -matching_strictness: 1.0 -values: - - entity 1 - - [Éntity 2, Éntity_2, Alternative entity 2]""") - dataset = Dataset.from_yaml_files("en", [dataset_stream]).json - - custom_entity_parser = CustomEntityParser.build( - dataset, CustomEntityParserUsage.WITH_STEMS, resources) - - builtin_entity_parser = BuiltinEntityParser.build(dataset, language) - utterances = [ - { - "data": [ - { - "text": "hÉllo wOrld " - }, - { - "text": " yo " - }, - { - "text": " yo " - }, - { - "text": "yo " - }, - { - "text": "Éntity_2", - "entity": "entity_2" - }, - { - "text": " " - }, - { - "text": "Éntity_2", - "entity": "entity_2" - } - ] - }, - { - "data": [ - { - "text": "beauTiful World " - }, - { - "text": "entity 1", - "entity": "entity_1" - }, - { - "text": " " - }, - { - "text": "2", - "entity": "snips/number" - } - ] - }, - { - "data": [ - { - "text": "Bird bïrdy" - } - ] - }, - { - "data": [ - { - "text": "Bird birdy" - } - ] - } - ] - - config = TfidfVectorizerConfig( - use_stemming=True, word_clusters_name="my_word_clusters") - vectorizer = TfidfVectorizer( - config=config, - custom_entity_parser=custom_entity_parser, - builtin_entity_parser=builtin_entity_parser, - resources=resources - ) - vectorizer._language = language - - # When - processed_data = vectorizer._preprocess(utterances, training=True) - processed_data = list(zip(*processed_data)) - - # Then - u_0 = { - "data": [ - { - "text": "hello world" - }, - { - "text": "yo" - }, - { - "text": "yo" - }, - { - "text": "yo" - }, - { - "text": "entity_2", - "entity": "entity_2" - }, - { - "text": "" - }, - { - "text": "entity_2", - "entity": "entity_2" - } - ] - } - u_1 = { - "data": [ - { - "text": "beauty world" - }, - { - "text": "ent 1", - "entity": "entity_1" - }, - { - "text": "" - }, - { - "text": "2", - "entity": "snips/number" - } - ] - } - u_2 = { - "data": [ - { - "text": "bird bird" - } - ] - } - - ent_00 = { - "entity_kind": "entity_2", - "value": "Éntity_2", - "range": {"start": 23, "end": 31} - } - ent_01 = { - "entity_kind": "entity_2", - "value": "Éntity_2", - "range": {"start": 32, "end": 40} - } - - ent_1 = { - "entity_kind": "entity_1", - "value": "entity 1", - "range": {"start": 16, "end": 24} - } - num_1 = { - "entity_kind": "snips/number", - "value": "2", - "range": {"start": 25, "end": 26} - } - - expected_data = [ - ( - u_0, - [], - [ent_00, ent_01], - [] - ), - ( - u_1, - [num_1], - [ent_1], - ["cluster_1", "cluster_3"] - ), - ( - u_2, - [], - [], - [] - ), - ( - u_2, - [], - [], - ["cluster_2"] - ) - ] - - self.assertSequenceEqual(expected_data, processed_data) - class CooccurrenceVectorizerTest(FixtureTest): @@ -1086,8 +847,9 @@ def test_transform(self): expected = [[1, 1, 1, 0, 0, 0], [0, 1, 1, 0, 0, 0]] self.assertEqual(expected, x.todense().tolist()) - @patch("snips_nlu.intent_classifier.featurizer._entities_from_utterance") - def test_fit(self, mocked_entities_from_utterance): + @patch("snips_nlu.intent_classifier.featurizer.CooccurrenceVectorizer." 
+ "_preprocess") + def test_fit(self, mocked_preprocess): t = "a b c d e f" u = text_to_utterance(t) builtin_ents = [ @@ -1112,9 +874,8 @@ def test_fit(self, mocked_entities_from_utterance): "entity_kind": "the_c_entity" } ] - mocked_entities_from_utterance.return_value = builtin_ents, custom_ents - x = [u] + mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents] config = CooccurrenceVectorizerConfig( window_size=3, @@ -1136,13 +897,14 @@ def test_fit(self, mocked_entities_from_utterance): ("d", "THE_SNIPS_E_ENTITY"): 7, ("d", "f"): 8, } - vectorizer = CooccurrenceVectorizer(config, **shared).fit(x, dataset) + vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset) # Then self.assertDictEqual(expected_pairs, vectorizer.word_pairs) - @patch("snips_nlu.intent_classifier.featurizer._entities_from_utterance") - def test_fit_unordered(self, mocked_entities_from_utterance): + @patch("snips_nlu.intent_classifier.featurizer.CooccurrenceVectorizer." + "_preprocess") + def test_fit_unordered(self, mocked_preprocess): t = "a b c d e f" u = text_to_utterance(t) builtin_ents = [ @@ -1167,9 +929,7 @@ def test_fit_unordered(self, mocked_entities_from_utterance): "entity_kind": "the_c_entity" } ] - mocked_entities_from_utterance.return_value = builtin_ents, custom_ents - - x = [u] + mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents] config = CooccurrenceVectorizerConfig( window_size=3, @@ -1192,13 +952,14 @@ def test_fit_unordered(self, mocked_entities_from_utterance): ("a", "d"): 7, ("d", "f"): 8, } - vectorizer = CooccurrenceVectorizer(config, **shared).fit(x, dataset) + vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset) # Then self.assertDictEqual(expected_pairs, vectorizer.word_pairs) - @patch("snips_nlu.intent_classifier.featurizer._entities_from_utterance") - def test_fit_transform(self, mocked_entities_from_utterance): + @patch("snips_nlu.intent_classifier.featurizer.CooccurrenceVectorizer." 
+ "_preprocess") + def test_fit_transform(self, mocked_preprocess): t = "a b c d e f" u = text_to_utterance(t) builtin_ents = [ @@ -1223,9 +984,7 @@ def test_fit_transform(self, mocked_entities_from_utterance): "entity_kind": "the_c_entity" } ] - mocked_entities_from_utterance.return_value = builtin_ents, custom_ents - - x = [u] + mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents] config = CooccurrenceVectorizerConfig( window_size=3, @@ -1246,6 +1005,7 @@ def test_fit_transform(self, mocked_entities_from_utterance): custom_entity_parser=custom_parser, resources=resources) # When + x = [u] x_0 = vectorizer1.fit(x, dataset).transform(x).todense().tolist() x_1 = vectorizer2.fit_transform(x, dataset).todense().tolist() @@ -1368,7 +1128,7 @@ def test_preprocess(self): vectorizer._language = language # When - processed_data = vectorizer._preprocess(utterances, training=False) + processed_data = vectorizer._preprocess(utterances) processed_data = list(zip(*processed_data)) # Then @@ -1433,163 +1193,3 @@ def test_preprocess(self): ] self.assertSequenceEqual(expected_data, processed_data) - - def test_preprocess_for_training(self): - # Given - language = LANGUAGE_EN - resources = { - STEMS: { - "beautiful": "beauty", - "birdy": "bird", - "entity": "ent" - }, - WORD_CLUSTERS: { - "my_word_clusters": { - "beautiful": "cluster_1", - "birdy": "cluster_2", - "entity": "cluster_3" - } - }, - STOP_WORDS: set() - } - - dataset_stream = io.StringIO(""" ---- -type: intent -name: intent1 -utterances: - - dummy utterance - ---- -type: entity -name: entity_1 -automatically_extensible: false -use_synononyms: false -matching_strictness: 1.0 -values: - - [entity 1, alternative entity 1] - - [éntity 1, alternative entity 1] - ---- -type: entity -name: entity_2 -automatically_extensible: false -use_synononyms: true -matching_strictness: 1.0 -values: - - entity 1 - - [Éntity 2, Éntity_2, Alternative entity 2] - """) - dataset = Dataset.from_yaml_files("en", [dataset_stream]).json - - custom_entity_parser = CustomEntityParser.build( - dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources) - - builtin_entity_parser = BuiltinEntityParser.build(dataset, language) - utterances = [ - { - "data": [ - { - "text": "hÉllo wOrld " - }, - { - "text": " yo " - }, - { - "text": " yo " - }, - { - "text": "yo " - }, - { - "text": "Éntity_2", - "entity": "entity_2" - }, - { - "text": " " - }, - { - "text": "Éntity_2", - "entity": "entity_2" - } - ] - }, - { - "data": [ - { - "text": "beauTiful World " - }, - { - "text": "entity 1", - "entity": "entity_1" - } - ] - }, - { - "data": [ - { - "text": "Bird bïrdy" - } - ] - }, - { - "data": [ - { - "text": "Bird birdy" - } - ] - } - ] - - vectorizer = CooccurrenceVectorizer( - custom_entity_parser=custom_entity_parser, - builtin_entity_parser=builtin_entity_parser, - resources=resources - ) - vectorizer._language = language - - # When - processed_data = vectorizer._preprocess(utterances, training=True) - processed_data = list(zip(*processed_data)) - - # Then - ent_00 = { - "entity_kind": "entity_2", - "value": "Éntity_2", - "range": {"start": 23, "end": 31} - } - ent_01 = { - "entity_kind": "entity_2", - "value": "Éntity_2", - "range": {"start": 32, "end": 40} - } - ent_1 = { - "entity_kind": "entity_1", - "value": "entity 1", - "range": {"start": 16, "end": 24} - } - - expected_data = [ - ( - utterances[0], - [], - [ent_00, ent_01] - ), - ( - utterances[1], - [], - [ent_1] - ), - ( - utterances[2], - [], - [] - ), - ( - utterances[3], - [], - [] - ) - ] - - 
self.assertSequenceEqual(expected_data, processed_data) From d27f1c022b0f9b9524b7c62662f877db1cc2c7bf Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 12:27:43 +0100 Subject: [PATCH 08/14] Update Changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb2f2c78c..1d9b226a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ # Changelog All notable changes to this project will be documented in this file. +## [Unreleased] +### Fixed +- Issue with intent classification reducing classification accuracy + ## [0.19.2] - 2019-02-11 ### Fixed - Fix an issue regarding the way builtin entities were handled by the `CRFSlotFiller` @@ -236,6 +240,7 @@ several commands. - Fix compiling issue with `bindgen` dependency when installing from source - Fix issue in `CRFSlotFiller` when handling builtin entities +[Unreleased]: https://github.com/snipsco/snips-nlu/compare/0.19.2...HEAD [0.19.2]: https://github.com/snipsco/snips-nlu/compare/0.19.1...0.19.2 [0.19.1]: https://github.com/snipsco/snips-nlu/compare/0.19.0...0.19.1 [0.19.0]: https://github.com/snipsco/snips-nlu/compare/0.18.0...0.19.0 From 25a5888b25c7ea1478d0ce1fdec76f42d49d6f56 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 14:03:37 +0100 Subject: [PATCH 09/14] Fix stochastic test --- snips_nlu/tests/test_crf_slot_filler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/snips_nlu/tests/test_crf_slot_filler.py b/snips_nlu/tests/test_crf_slot_filler.py index 3b03137f2..02086ac38 100644 --- a/snips_nlu/tests/test_crf_slot_filler.py +++ b/snips_nlu/tests/test_crf_slot_filler.py @@ -97,6 +97,7 @@ def test_should_get_sub_builtin_slots(self): - find me something from [start](9am) to [end](12pm) - I need a break from [start](2pm) until [end](4pm) - Can you suggest something from [start](april 4th) until [end](april 6th) ? 
+- find an activity from [start](6pm) to [end](8pm) - Book me a trip from [start](this friday) to [end](next tuesday)""") dataset = Dataset.from_yaml_files("en", [dataset_stream]).json config = CRFSlotFillerConfig(random_seed=42) From 24415c2d1e572ac665a00573b7d50541a4533455 Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Tue, 5 Mar 2019 15:11:36 +0100 Subject: [PATCH 10/14] Data augmentation should be deterministic --- snips_nlu/data_augmentation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snips_nlu/data_augmentation.py b/snips_nlu/data_augmentation.py index 0ed6e9b99..68ff15d86 100644 --- a/snips_nlu/data_augmentation.py +++ b/snips_nlu/data_augmentation.py @@ -88,7 +88,7 @@ def get_intent_entities(dataset, intent_name): for chunk in utterance[DATA]: if ENTITY in chunk: intent_entities.add(chunk[ENTITY]) - return intent_entities + return sorted(intent_entities) def num_queries_to_generate(dataset, intent_name, min_utterances): From a7a45799fa72c26b993de1e4e4064a3975b52f5f Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Tue, 5 Mar 2019 15:13:05 +0100 Subject: [PATCH 11/14] Remove dataset specific noise --- .../log_reg_classifier_utils.py | 18 +------------- .../tests/test_log_reg_classifier_utils.py | 24 +------------------ 2 files changed, 2 insertions(+), 40 deletions(-) diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index dc35245b7..3353ff0fd 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py +++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py @@ -108,22 +108,6 @@ def add_unknown_word_to_utterances(utterances, replacement_string, return new_utterances -def get_dataset_specific_noise(dataset, resources): - """Return a noise list that excludes the dataset entity values""" - entities_values = set() - for ent_name, ent in iteritems(dataset[ENTITIES]): - if is_builtin_entity(ent_name): - continue - for k, v in iteritems(ent[UTTERANCES]): - entities_values.add(k) - entities_values.add(v) - original_noise = get_noise(resources) - specific_noise = [n for n in original_noise if n not in entities_values] - if not specific_noise: # Avoid returning an empty noise - return original_noise - return specific_noise - - def build_training_data(dataset, language, data_augmentation_config, resources, random_state): # Create class mapping @@ -164,7 +148,7 @@ def build_training_data(dataset, language, data_augmentation_config, resources, ) # Adding noise - noise = get_dataset_specific_noise(dataset, resources) + noise = get_noise(resources) noisy_utterances = generate_noise_utterances( augmented_utterances, noise, len(intents), data_augmentation_config, language, random_state) diff --git a/snips_nlu/tests/test_log_reg_classifier_utils.py b/snips_nlu/tests/test_log_reg_classifier_utils.py index 629e9e3e7..66e27baba 100644 --- a/snips_nlu/tests/test_log_reg_classifier_utils.py +++ b/snips_nlu/tests/test_log_reg_classifier_utils.py @@ -15,7 +15,7 @@ from snips_nlu.intent_classifier.log_reg_classifier_utils import ( add_unknown_word_to_utterances, build_training_data, generate_noise_utterances, generate_smart_noise, get_noise_it, - remove_builtin_slots, text_to_utterance, get_dataset_specific_noise) + remove_builtin_slots, text_to_utterance) from snips_nlu.pipeline.configs import ( IntentClassifierDataAugmentationConfig, LogRegIntentClassifierConfig) from snips_nlu.tests.test_log_reg_intent_classifier import ( @@ -536,28 +536,6 @@ def test_remove_builtin_slots(self): 
self.assertDictEqual(expected_dataset, filtered_dataset) - def test_get_dataset_specific_noise(self): - # Given - dataset_stream = io.StringIO(""" ---- -type: intent -name: my_intent -utterances: -- what is the weather in [city](paris) -- give me the weather in [city](london) -- does it rain in [city](tokyo)?""") - dataset = Dataset.from_yaml_files("en", [dataset_stream]).json - dataset = validate_and_format_dataset(dataset) - resources = { - NOISE: ["paris", "tokyo", "yo"] - } - - # When - noise = get_dataset_specific_noise(dataset, resources) - - # Then - self.assertEqual(["yo"], noise) - def test_add_unknown_word_to_utterances_with_none_max_unknownword(self): # Given utterances = [text_to_utterance("yo")] From fd4486baab83d7726626b0905bd53868624c99eb Mon Sep 17 00:00:00 2001 From: ClemDoum Date: Tue, 5 Mar 2019 15:14:33 +0100 Subject: [PATCH 12/14] Linting --- snips_nlu/intent_classifier/log_reg_classifier.py | 3 +-- snips_nlu/intent_classifier/log_reg_classifier_utils.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/snips_nlu/intent_classifier/log_reg_classifier.py b/snips_nlu/intent_classifier/log_reg_classifier.py index 3b0e85979..1e56dbcc8 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier.py +++ b/snips_nlu/intent_classifier/log_reg_classifier.py @@ -87,13 +87,12 @@ def fit(self, dataset): none_class = max(classes) try: - self.featurizer = self.featurizer.fit( + x = self.featurizer.fit_transform( dataset, utterances, classes, none_class) except _EmptyDatasetUtterancesError: self.featurizer = None return self - x = self.featurizer.transform(utterances) alpha = get_regularization_factor(dataset) self.classifier = SGDClassifier(random_state=random_state, alpha=alpha, **LOG_REG_ARGS) diff --git a/snips_nlu/intent_classifier/log_reg_classifier_utils.py b/snips_nlu/intent_classifier/log_reg_classifier_utils.py index 3353ff0fd..5d709b77b 100644 --- a/snips_nlu/intent_classifier/log_reg_classifier_utils.py +++ b/snips_nlu/intent_classifier/log_reg_classifier_utils.py @@ -9,7 +9,7 @@ import numpy as np from future.utils import iteritems, itervalues -from snips_nlu.constants import (DATA, ENTITIES, ENTITY, INTENTS, TEXT, +from snips_nlu.constants import (DATA, ENTITY, INTENTS, TEXT, UNKNOWNWORD, UTTERANCES) from snips_nlu.data_augmentation import augment_utterances from snips_nlu.dataset import get_text_from_chunks From 7227434b39731585d5b31d394d6f7b8837dfb613 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 17:19:54 +0100 Subject: [PATCH 13/14] Update Changelog --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d9b226a7..23cdc145c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,12 @@ # Changelog All notable changes to this project will be documented in this file. -## [Unreleased] +## [0.19.3] - 2019-03-05 ### Fixed - Issue with intent classification reducing classification accuracy +- Issue resulting in a mutation of the CRFSlotFillerConfig +- Wrong required resources of the `DeterministicIntentParser` +- Issue with non ASCII characters when using the parsing CLI with Python2 ## [0.19.2] - 2019-02-11 ### Fixed @@ -240,7 +243,7 @@ several commands. 
- Fix compiling issue with `bindgen` dependency when installing from source - Fix issue in `CRFSlotFiller` when handling builtin entities -[Unreleased]: https://github.com/snipsco/snips-nlu/compare/0.19.2...HEAD +[0.19.3]: https://github.com/snipsco/snips-nlu/compare/0.19.2...0.19.3 [0.19.2]: https://github.com/snipsco/snips-nlu/compare/0.19.1...0.19.2 [0.19.1]: https://github.com/snipsco/snips-nlu/compare/0.19.0...0.19.1 [0.19.0]: https://github.com/snipsco/snips-nlu/compare/0.18.0...0.19.0 From 4b850fb8320a5f6e0566a6ca9b645c174da9a722 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 5 Mar 2019 17:20:08 +0100 Subject: [PATCH 14/14] Bump version to 0.19.3 --- snips_nlu/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py index c9d76cadc..0c0398efb 100644 --- a/snips_nlu/__about__.py +++ b/snips_nlu/__about__.py @@ -11,7 +11,7 @@ __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai" __license__ = "Apache License, Version 2.0" -__version__ = "0.19.2" +__version__ = "0.19.3" __model_version__ = "0.19.0" __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"
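Notes on PATCH 01/14: the new lookup tries the exact version first, then falls
back to a "major.minor" key in the compatibility table. A minimal sketch of
that behaviour, assuming `semantic_version` is installed; the table below is
hypothetical, not the real remote compatibility JSON:

    from semantic_version import Version

    def find_compatibility(version, nlu_table):
        # Try the full version first, then fall back to "major.minor"
        semver_version = Version(version)
        minor_version = "%d.%d" % (semver_version.major, semver_version.minor)
        return nlu_table.get(version, nlu_table.get(minor_version))

    # Hypothetical table keyed only by the minor version
    nlu_table = {"0.19": {"snips-nlu-en": "0.2.0"}}
    assert find_compatibility("0.19.3", nlu_table) == {"snips-nlu-en": "0.2.0"}

Notes on PATCH 02/14: fitting mutates the feature factories held inside the
config, so the slot filler now deep-copies the config it receives before use.
A minimal sketch of the failure mode being fixed, using a hypothetical config
dict rather than a real CRFSlotFillerConfig:

    from copy import deepcopy

    config = {"feature_factories": [{"args": {}}]}  # hypothetical shape
    fitted_config = deepcopy(config)                # the copy PATCH 02 adds
    # Mutation that previously leaked back into the caller's config at fit time
    fitted_config["feature_factories"][0]["args"]["language"] = "en"
    assert config["feature_factories"][0]["args"] == {}  # caller's config intact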