diff --git a/CHANGELOG.md b/CHANGELOG.md
index 17b56ed33..064f07f92 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,13 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 
+## [0.17.1] - 2018-10-09
+### Fixed
+- `DeterministicIntentParser` now relies on the custom entity parser
+
+### Changed
+- Bump `snips-nlu-ontology` to `0.60`
+
 ## [0.17.0] - 2018-10-05
 ### Added
 - Support for 3 new builtin entities in French: `snips/musicAlbum`, `snips/musicArtist` and `snips/musicTrack`
@@ -150,6 +157,7 @@ several commands.
 - Fix compiling issue with `bindgen` dependency when installing from source
 - Fix issue in `CRFSlotFiller` when handling builtin entities
 
+[0.17.1]: https://github.com/snipsco/snips-nlu/compare/0.17.0...0.17.1
 [0.17.0]: https://github.com/snipsco/snips-nlu/compare/0.16.5...0.17.0
 [0.16.5]: https://github.com/snipsco/snips-nlu/compare/0.16.4...0.16.5
 [0.16.4]: https://github.com/snipsco/snips-nlu/compare/0.16.3...0.16.4
diff --git a/setup.py b/setup.py
index 02b27ae1e..46caa545a 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@
     "sklearn-crfsuite>=0.3.6,<0.4",
     "semantic_version>=2.6,<3.0",
     "snips_nlu_utils>=0.7,<0.8",
-    "snips_nlu_ontology==0.59.0",
+    "snips_nlu_ontology>=0.60,<0.61",
    "num2words>=0.5.6,<0.6",
     "plac>=0.9.6,<1.0",
     "requests>=2.0,<3.0",
diff --git a/snips_nlu/__about__.py b/snips_nlu/__about__.py
index e0f36721c..2622c9afa 100644
--- a/snips_nlu/__about__.py
+++ b/snips_nlu/__about__.py
@@ -11,7 +11,7 @@
 __email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai"
 __license__ = "Apache License, Version 2.0"
 
-__version__ = "0.17.0"
+__version__ = "0.17.1"
 __model_version__ = "0.17.0"
 
 __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"
diff --git a/snips_nlu/constants.py b/snips_nlu/constants.py
index d2694b5a7..a572b544a 100644
--- a/snips_nlu/constants.py
+++ b/snips_nlu/constants.py
@@ -30,7 +30,6 @@
 ENTITIES = "entities"
 ENTITY = "entity"
 ENTITY_KIND = "entity_kind"
-ENTITY_IDENTIFIER = "entity_identifier"
 RESOLVED_VALUE = "resolved_value"
 SLOT_NAME = "slot_name"
 TEXT = "text"
diff --git a/snips_nlu/entity_parser/custom_entity_parser.py b/snips_nlu/entity_parser/custom_entity_parser.py
index 59200ccd4..c5ab72a54 100644
--- a/snips_nlu/entity_parser/custom_entity_parser.py
+++ b/snips_nlu/entity_parser/custom_entity_parser.py
@@ -10,7 +10,7 @@
 
 from snips_nlu.constants import (
     END, ENTITIES, LANGUAGE, PARSER_THRESHOLD, RES_MATCH_RANGE, START,
-    UTTERANCES)
+    UTTERANCES, ENTITY_KIND)
 from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
 from snips_nlu.entity_parser.custom_entity_parser_usage import (
     CustomEntityParserUsage)
@@ -97,6 +97,7 @@ def _parse(self, text, scope):
         for entity in entities:
             start = entity[RES_MATCH_RANGE][START]
             end = entity[RES_MATCH_RANGE][END]
+            entity[ENTITY_KIND] = entity.pop("entity_identifier")
             entity[RES_MATCH_RANGE][START] -= shifts[start]
             entity[RES_MATCH_RANGE][END] -= shifts[end - 1]
         return entities
diff --git a/snips_nlu/intent_classifier/featurizer.py b/snips_nlu/intent_classifier/featurizer.py
index b157a1286..5306b169f 100644
--- a/snips_nlu/intent_classifier/featurizer.py
+++ b/snips_nlu/intent_classifier/featurizer.py
@@ -201,7 +201,7 @@ def _preprocess_utterance(utterance, language, builtin_entity_parser,
     custom_entities = [e for e in custom_entities
                        if e["value"] != unknownword_replacement_string]
     custom_entities_features = [
-        _entity_name_to_feature(e["entity_identifier"], language)
+        _entity_name_to_feature(e[ENTITY_KIND], language)
         for e in custom_entities]
 
     builtin_entities = builtin_entity_parser.parse(
diff --git a/snips_nlu/intent_parser/deterministic_intent_parser.py b/snips_nlu/intent_parser/deterministic_intent_parser.py
index 8926a7160..452543c11 100644
--- a/snips_nlu/intent_parser/deterministic_intent_parser.py
+++ b/snips_nlu/intent_parser/deterministic_intent_parser.py
@@ -9,11 +9,11 @@
 from future.utils import iteritems
 
 from snips_nlu.constants import (
-    BUILTIN_ENTITY_PARSER, DATA, END, ENTITIES, ENTITY, ENTITY_KIND, INTENTS,
-    LANGUAGE, RES_MATCH_RANGE, RES_VALUE, SLOT_NAME, START, TEXT, UTTERANCES)
+    BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER, DATA, END, ENTITIES, ENTITY,
+    ENTITY_KIND, INTENTS, LANGUAGE, RES_MATCH_RANGE, RES_VALUE, SLOT_NAME,
+    START, TEXT, UTTERANCES)
 from snips_nlu.dataset import validate_and_format_dataset
 from snips_nlu.intent_parser.intent_parser import IntentParser
-from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
 from snips_nlu.pipeline.configs import DeterministicIntentParserConfig
 from snips_nlu.preprocessing import tokenize, tokenize_light
 from snips_nlu.result import (
@@ -82,16 +82,16 @@ def fit(self, dataset, force_retrain=True):
         logger.info("Fitting deterministic parser...")
         dataset = validate_and_format_dataset(dataset)
         self.fit_builtin_entity_parser_if_needed(dataset)
+        self.fit_custom_entity_parser_if_needed(dataset)
         self.language = dataset[LANGUAGE]
         self.regexes_per_intent = dict()
         self.group_names_to_slot_names = dict()
-        joined_entity_utterances = _get_joined_entity_utterances(
-            dataset, self.language)
+        entity_placeholders = _get_entity_placeholders(dataset, self.language)
         self.slot_names_to_entities = get_slot_name_mappings(dataset)
         for intent_name, intent in iteritems(dataset[INTENTS]):
             utterances = intent[UTTERANCES]
             patterns, self.group_names_to_slot_names = _generate_patterns(
-                utterances, joined_entity_utterances,
+                utterances, entity_placeholders,
                 self.group_names_to_slot_names, self.language)
             patterns = [p for p in patterns
                         if len(p) < self.config.max_pattern_length]
@@ -128,8 +128,11 @@ def parse(self, text, intents=None):
 
         builtin_entities = self.builtin_entity_parser.parse(
             text, use_cache=True)
-        ranges_mapping, processed_text = _replace_builtin_entities(
-            text, self.language, builtin_entities)
+        custom_entities = self.custom_entity_parser.parse(
+            text, use_cache=True)
+        all_entities = builtin_entities + custom_entities
+        ranges_mapping, processed_text = _replace_entities_with_placeholders(
+            text, self.language, all_entities)
 
         # We try to match both the input text and the preprocessed text to
         # cover inconsistencies between labeled data and builtin entity parsing
@@ -151,7 +154,7 @@
         return empty_result(text)
 
     def _get_matching_result(self, text, processed_text, regex, intent,
-                             builtin_entities_ranges_mapping=None):
+                             entities_ranges_mapping=None):
         found_result = regex.match(processed_text)
         if found_result is None:
             return None
@@ -163,12 +166,12 @@
             entity = self.slot_names_to_entities[intent][slot_name]
             rng = (found_result.start(group_name),
                    found_result.end(group_name))
-            if builtin_entities_ranges_mapping is not None:
-                if rng in builtin_entities_ranges_mapping:
-                    rng = builtin_entities_ranges_mapping[rng]
+            if entities_ranges_mapping is not None:
+                if rng in entities_ranges_mapping:
+                    rng = entities_ranges_mapping[rng]
                 else:
                     shift = _get_range_shift(
-                        rng, builtin_entities_ranges_mapping)
+                        rng, entities_ranges_mapping)
                     rng = {START: rng[0] + shift, END: rng[1] + shift}
             else:
                 rng = {START: rng[0], END: rng[1]}
@@ -229,8 +232,11 @@ def from_dict(cls, unit_dict, **shared):
         :func:`~DeterministicIntentParser.to_dict`
         """
         config = cls.config_type.from_dict(unit_dict["config"])
-        parser = cls(config=config,
-                     builtin_entity_parser=shared.get(BUILTIN_ENTITY_PARSER))
+        parser = cls(
+            config=config,
+            builtin_entity_parser=shared.get(BUILTIN_ENTITY_PARSER),
+            custom_entity_parser=shared.get(CUSTOM_ENTITY_PARSER),
+        )
         parser.patterns = unit_dict["patterns"]
         parser.language = unit_dict["language_code"]
         parser.group_names_to_slot_names = unit_dict[
@@ -299,8 +305,8 @@ def _generate_new_index(slots_name_to_labels):
     return index
 
 
-def _query_to_pattern(query, joined_entity_utterances,
-                      group_names_to_slot_names, language):
+def _query_to_pattern(query, entity_placeholders, group_names_to_slot_names,
+                      language):
     pattern = []
     for chunk in query[DATA]:
         if SLOT_NAME in chunk:
@@ -309,7 +315,7 @@ def _query_to_pattern(query, joined_entity_utterances,
             entity = chunk[ENTITY]
             group_names_to_slot_names[max_index] = slot_name
             pattern.append(
-                r"(?P<%s>%s)" % (max_index, joined_entity_utterances[entity]))
+                r"(?P<%s>%s)" % (max_index, entity_placeholders[entity]))
         else:
             tokens = tokenize_light(chunk[TEXT], language)
             pattern += [regex_escape(t) for t in tokens]
@@ -338,53 +344,38 @@ def _get_queries_with_unique_context(intent_queries, language):
 
     return queries
 
 
-def _generate_patterns(intent_queries, joined_entity_utterances,
+def _generate_patterns(intent_queries, entity_placeholders,
                        group_names_to_labels, language):
     queries = _get_queries_with_unique_context(intent_queries, language)
     # Join all the entities utterances with a "|" to create the patterns
     patterns = set()
     for query in queries:
         pattern, group_names_to_labels = _query_to_pattern(
-            query, joined_entity_utterances, group_names_to_labels, language)
+            query, entity_placeholders, group_names_to_labels, language)
         patterns.add(pattern)
     return list(patterns), group_names_to_labels
 
 
-def _get_joined_entity_utterances(dataset, language):
-    joined_entity_utterances = dict()
-    for entity_name, entity in iteritems(dataset[ENTITIES]):
-        # matches are performed in a case insensitive manner
-        utterances = set(u.lower() for u in entity[UTTERANCES])
-        patterns = []
-        if is_builtin_entity(entity_name):
-            # We add a placeholder value for builtin entities
-            placeholder = _get_entity_name_placeholder(entity_name, language)
-            patterns.append(regex_escape(placeholder))
-        else:
-            for utterance in utterances:
-                tokens = tokenize_light(utterance, language)
-                pattern = WHITESPACE_PATTERN.join(regex_escape(t)
-                                                  for t in tokens)
-                patterns.append(pattern)
-        patterns = (p for p in patterns if p)
-        joined_entity_utterances[entity_name] = r"|".join(
-            sorted(patterns, key=len, reverse=True))
-    return joined_entity_utterances
-
-
-def _replace_builtin_entities(text, language, builtin_entities):
-    if not builtin_entities:
+def _get_entity_placeholders(dataset, language):
+    return {
+        e: _get_entity_name_placeholder(e, language)
+        for e in dataset[ENTITIES]
+    }
+
+
+def _replace_entities_with_placeholders(text, language, entities):
+    if not entities:
         return dict(), text
-    builtin_entities = _deduplicate_overlapping_entities(builtin_entities)
-    builtin_entities = sorted(builtin_entities,
-                              key=lambda e: e[RES_MATCH_RANGE][START])
+    entities = _deduplicate_overlapping_entities(entities)
+    entities = sorted(
+        entities, key=lambda e: e[RES_MATCH_RANGE][START])
 
     range_mapping = dict()
     processed_text = ""
     offset = 0
     current_ix = 0
-    for ent in builtin_entities:
+    for ent in entities:
         ent_start = ent[RES_MATCH_RANGE][START]
         ent_end = ent[RES_MATCH_RANGE][END]
         rng_start = ent_start + offset
@@ -392,8 +383,8 @@
         processed_text += text[current_ix:ent_start]
 
         entity_length = ent_end - ent_start
-        entity_place_holder = _get_entity_name_placeholder(ent[ENTITY_KIND],
-                                                           language)
+        entity_place_holder = _get_entity_name_placeholder(
+            ent[ENTITY_KIND], language)
 
         offset += len(entity_place_holder) - entity_length
 
diff --git a/snips_nlu/nlu_engine/nlu_engine.py b/snips_nlu/nlu_engine/nlu_engine.py
index f3cecda49..44181e28f 100644
--- a/snips_nlu/nlu_engine/nlu_engine.py
+++ b/snips_nlu/nlu_engine/nlu_engine.py
@@ -11,7 +11,7 @@
 
 from snips_nlu.__about__ import __model_version__, __version__
 from snips_nlu.constants import (
     AUTOMATICALLY_EXTENSIBLE, BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER,
-    ENTITIES, ENTITY, ENTITY_IDENTIFIER, ENTITY_KIND, LANGUAGE, RESOLVED_VALUE,
+    ENTITIES, ENTITY, ENTITY_KIND, LANGUAGE, RESOLVED_VALUE,
     RES_ENTITY, RES_INTENT, RES_MATCH_RANGE, RES_SLOTS, RES_VALUE)
 from snips_nlu.dataset import validate_and_format_dataset
@@ -173,7 +173,6 @@ def resolve_slots(self, text, slots):
             parser = self.builtin_entity_parser
             slot_builder = builtin_slot
             use_cache = False
-            entity_name_key = ENTITY_KIND
             extensible = False
             resolved_value_key = ENTITY
         else:
@@ -181,14 +180,13 @@
             parser = self.custom_entity_parser
             slot_builder = custom_slot
             use_cache = True
-            entity_name_key = ENTITY_IDENTIFIER
             extensible = self._dataset_metadata[ENTITIES][entity_name][
                 AUTOMATICALLY_EXTENSIBLE]
             resolved_value_key = RESOLVED_VALUE
 
         resolved_slot = None
         for ent in entities:
-            if ent[entity_name_key] == entity_name and \
+            if ent[ENTITY_KIND] == entity_name and \
                     ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                 resolved_slot = slot_builder(slot, ent[resolved_value_key])
                 break
diff --git a/snips_nlu/pipeline/configs/intent_parser.py b/snips_nlu/pipeline/configs/intent_parser.py
index 613cdc150..9c5722be7 100644
--- a/snips_nlu/pipeline/configs/intent_parser.py
+++ b/snips_nlu/pipeline/configs/intent_parser.py
@@ -2,6 +2,8 @@
 
 from copy import deepcopy
 
+from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE
+from snips_nlu.entity_parser import CustomEntityParserUsage
 from snips_nlu.pipeline.configs import ProcessingUnitConfig
 from snips_nlu.pipeline.processing_unit import get_processing_unit_config
 from snips_nlu.resources import merge_required_resources
@@ -92,6 +94,11 @@ def unit_name(cls):  # pylint:disable=no-self-argument
             DeterministicIntentParser
         return DeterministicIntentParser.unit_name
 
+    def get_required_resources(self):
+        return {
+            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
+        }
+
     def to_dict(self):
         return {
             "unit_name": self.unit_name,
diff --git a/snips_nlu/tests/test_custom_entity_parser.py b/snips_nlu/tests/test_custom_entity_parser.py
index ad4a59540..0f7b06868 100644
--- a/snips_nlu/tests/test_custom_entity_parser.py
+++ b/snips_nlu/tests/test_custom_entity_parser.py
@@ -64,7 +64,7 @@ def test_should_parse_without_stems(self):
                     "start": 0,
                     "end": 14
                 },
-                "entity_identifier": "dummy_entity_1"
+                "entity_kind": "dummy_entity_1"
             },
             {
                 "value": "dummy_1",
@@ -73,7 +73,7 @@
                     "start": 15,
                     "end": 22
                 },
-                "entity_identifier": "dummy_entity_1"
+                "entity_kind": "dummy_entity_1"
             },
             {
                 "value": "dummy_entity_2",
@@ -82,7 +82,7 @@
                     "start": 23,
                     "end": 37
                 },
-                "entity_identifier": "dummy_entity_2"
+                "entity_kind": "dummy_entity_2"
             },
             {
                 "value": "dummy_2",
@@ -91,7 +91,7 @@
                     "start": 38,
                     "end": 45
                 },
-                "entity_identifier": "dummy_entity_2"
+                "entity_kind": "dummy_entity_2"
             }
         ]
         self.assertListEqual(expected_entities, result)
@@ -117,7 +117,7 @@ def test_should_parse_with_stems(self, mocked_stem):
                     "start": 0,
                     "end": 13
                 },
-                "entity_identifier": "dummy_entity_1"
+                "entity_kind": "dummy_entity_1"
             }
         ]
         self.assertListEqual(expected_entities, result)
@@ -143,7 +143,7 @@ def test_should_parse_with_and_without_stems(self, mocked_stem):
                     "start": 0,
                     "end": 13
                 },
-                "entity_identifier": "dummy_entity_1"
+                "entity_kind": "dummy_entity_1"
             },
             {
                 "value": "dummy_1",
@@ -152,7 +152,7 @@
                     "start": 14,
                     "end": 21
                 },
-                "entity_identifier": "dummy_entity_1"
+                "entity_kind": "dummy_entity_1"
             }
         ]
         self.assertListEqual(expected_entities, result)
@@ -176,7 +176,7 @@ def test_should_parse_with_proper_tokenization(self):
                     "start": 2,
                     "end": 9
                 },
-                "entity_identifier": "dummy_entity_1"
+                "entity_kind": "dummy_entity_1"
             },
             {
                 "value": "dummy_2",
@@ -185,7 +185,7 @@
                     "start": 10,
                     "end": 17
                 },
-                "entity_identifier": "dummy_entity_2"
+                "entity_kind": "dummy_entity_2"
             }
         ]
         self.assertListEqual(expected_entities, result)
@@ -242,7 +242,7 @@ def test_should_be_serializable(self):
                     "start": 0,
                     "end": 14
                 },
-                "entity_identifier": "dummy_entity_1"
+                "entity_kind": "dummy_entity_1"
             },
             {
                 "value": "dummy_1",
@@ -251,7 +251,7 @@
                     "start": 15,
                     "end": 22
                 },
-                "entity_identifier": "dummy_entity_1"
+                "entity_kind": "dummy_entity_1"
             }
         ]
         self.assertListEqual(expected_entities, result)
diff --git a/snips_nlu/tests/test_deterministic_intent_parser.py b/snips_nlu/tests/test_deterministic_intent_parser.py
index 87c8fb509..a5fad5444 100644
--- a/snips_nlu/tests/test_deterministic_intent_parser.py
+++ b/snips_nlu/tests/test_deterministic_intent_parser.py
@@ -12,7 +12,7 @@
 from snips_nlu.entity_parser import BuiltinEntityParser
 from snips_nlu.intent_parser.deterministic_intent_parser import (
     DeterministicIntentParser, _deduplicate_overlapping_slots,
-    _get_range_shift, _replace_builtin_entities,
+    _get_range_shift, _replace_entities_with_placeholders,
     _replace_tokenized_out_characters)
 from snips_nlu.pipeline.configs import DeterministicIntentParserConfig
 from snips_nlu.result import intent_classification_result, unresolved_slot
@@ -231,10 +231,12 @@ def test_should_get_intent_after_deserialization(self):
         dataset = validate_and_format_dataset(self.slots_dataset)
 
         parser = DeterministicIntentParser().fit(dataset)
+        custom_entity_parser = parser.custom_entity_parser
         parser.persist(self.tmp_file_path)
         deserialized_parser = DeterministicIntentParser.from_path(
             self.tmp_file_path,
-            builtin_entity_parser=BuiltinEntityParser.build(language="en"))
+            builtin_entity_parser=BuiltinEntityParser.build(language="en"),
+            custom_entity_parser=custom_entity_parser)
 
         text = "this is a dummy_a query with another dummy_c at 10p.m. or " \
                "at 12p.m."
@@ -332,10 +334,12 @@ def test_should_get_slots_after_deserialization(self):
         dataset = validate_and_format_dataset(dataset)
 
         parser = DeterministicIntentParser().fit(dataset)
+        custom_entity_parser = parser.custom_entity_parser
         parser.persist(self.tmp_file_path)
         deserialized_parser = DeterministicIntentParser.from_path(
             self.tmp_file_path,
-            builtin_entity_parser=BuiltinEntityParser.build(language="en"))
+            builtin_entity_parser=BuiltinEntityParser.build(language="en"),
+            custom_entity_parser=custom_entity_parser)
 
         texts = [
             (
@@ -403,12 +407,15 @@ def test_should_be_serializable_into_bytearray(self):
         # Given
         dataset = BEVERAGE_DATASET
         intent_parser = DeterministicIntentParser().fit(dataset)
+        custom_entity_parser = intent_parser.custom_entity_parser
 
         # When
         intent_parser_bytes = intent_parser.to_byte_array()
         loaded_intent_parser = DeterministicIntentParser.from_byte_array(
             intent_parser_bytes,
-            builtin_entity_parser=BuiltinEntityParser.build(language="en"))
+            builtin_entity_parser=BuiltinEntityParser.build(language="en"),
+            custom_entity_parser=custom_entity_parser
+        )
         result = loaded_intent_parser.parse("make me two cups of coffee")
 
         # Then
@@ -750,15 +757,15 @@ def test_should_limit_patterns_length(self):
         parser = DeterministicIntentParser(config=config).fit(dataset)
 
         # Then
-        self.assertEqual(3, len(parser.regexes_per_intent["dummy_intent_1"]))
+        self.assertEqual(4, len(parser.regexes_per_intent["dummy_intent_1"]))
         self.assertEqual(1, len(parser.regexes_per_intent["dummy_intent_2"]))
 
-    def test_should_replace_builtin_entities(self):
+    def test_should_replace_entities(self):
         # Given
         text = "Be the first to be there at 9pm"
 
         # When
-        builtin_entities = [
+        entities = [
             {
                 "entity_kind": "snips/ordinal",
                 "value": "the first",
                 "range": {
                     "start": 3,
                     "end": 12
                 }
             },
             {
-                "entity_kind": "snips/musicAlbum",
+                "entity_kind": "my_custom_entity",
                 "value": "first",
                 "range": {
                     "start": 7,
                     "end": 12
                 }
             },
             {
                 "entity_kind": "snips/datetime",
                 "value": "at 9pm",
                 "range": {
                     "start": 25,
                     "end": 31
                 }
             }
         ]
-        range_mapping, processed_text = _replace_builtin_entities(
-            text=text, language=LANGUAGE_EN,
-            builtin_entities=builtin_entities)
+        range_mapping, processed_text = _replace_entities_with_placeholders(
+            text=text, language=LANGUAGE_EN, entities=entities)
 
         # Then
         expected_mapping = {
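
For reviewers, here is a minimal, self-contained sketch of the bookkeeping that the new `_replace_entities_with_placeholders` performs: every entity match (builtin or custom) is swapped for a placeholder token, and a mapping from placeholder ranges back to the original character ranges is kept so that slots matched on the processed text can be re-anchored in the raw text. The placeholder format is an assumption for illustration only — the real `_get_entity_name_placeholder` is not shown in this diff — and `make_placeholder`/`replace_with_placeholders` are hypothetical names, not the library API.

```python
# Sketch only, assuming entities are non-overlapping dicts shaped like the
# parser output in this diff: {"entity_kind": ..., "range": {"start", "end"}}.


def make_placeholder(entity_kind):
    # Hypothetical placeholder format, e.g. "snips/ordinal" -> "%SNIPSORDINAL%".
    return "%" + "".join(c for c in entity_kind.upper() if c.isalnum()) + "%"


def replace_with_placeholders(text, entities):
    # Process matches left to right, tracking how much each substitution
    # shifts the text so later ranges can be remapped.
    entities = sorted(entities, key=lambda e: e["range"]["start"])
    range_mapping = {}
    processed_text = ""
    offset = 0
    current_ix = 0
    for ent in entities:
        start, end = ent["range"]["start"], ent["range"]["end"]
        placeholder = make_placeholder(ent["entity_kind"])
        processed_text += text[current_ix:start] + placeholder
        # Range of the placeholder in the processed text -> original range.
        key = (start + offset, start + offset + len(placeholder))
        range_mapping[key] = {"start": start, "end": end}
        offset += len(placeholder) - (end - start)
        current_ix = end
    processed_text += text[current_ix:]
    return range_mapping, processed_text


if __name__ == "__main__":
    text = "Be the first to be there at 9pm"
    entities = [
        {"entity_kind": "snips/ordinal", "range": {"start": 3, "end": 12}},
        {"entity_kind": "snips/datetime", "range": {"start": 25, "end": 31}},
    ]
    mapping, processed = replace_with_placeholders(text, entities)
    print(processed)  # Be %SNIPSORDINAL% to be there %SNIPSDATETIME%
    print(mapping)    # {(3, 17): {'start': 3, 'end': 12}, (30, 45): {'start': 25, 'end': 31}}
```

Because the placeholders are plain tokens, the `DeterministicIntentParser` can now compile one short pattern per entity slot instead of joining every custom-entity utterance with `|`, which is what makes replacing `_get_joined_entity_utterances` with `_get_entity_placeholders` possible in this change.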