diff --git a/setup.py b/setup.py
index ce8f7d276..bb0efd6bd 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,8 @@
     "scipy==0.19.0",
     "scikit-learn==0.18.1",
     "sklearn-crfsuite==0.3.5",
-    "snips-queries==0.4.0"
+    "snips-queries==0.4.0",
+    "builtin_entities_ontology==0.1.1"
 ]
 
 test_required = [
diff --git a/snips_nlu/__init__.py b/snips_nlu/__init__.py
index 536c4cf0d..5086c03f1 100644
--- a/snips_nlu/__init__.py
+++ b/snips_nlu/__init__.py
@@ -1,6 +1,7 @@
 import io
 import os
 
+import builtin_entities_ontology
 from duckling import core
 
 from snips_nlu.resources import load_resources
@@ -14,3 +15,5 @@
     __version__ = f.readline().strip()
 
 load_resources()
+
+__builtin_entities_version__ = builtin_entities_ontology.__ontology_version__
diff --git a/snips_nlu/__version__ b/snips_nlu/__version__
index 448a0fa11..60a2d3e96 100644
--- a/snips_nlu/__version__
+++ b/snips_nlu/__version__
@@ -1 +1 @@
-0.3.4
\ No newline at end of file
+0.4.0
\ No newline at end of file
diff --git a/snips_nlu/built_in_entities.py b/snips_nlu/built_in_entities.py
index 0b155fc47..c39e10968 100644
--- a/snips_nlu/built_in_entities.py
+++ b/snips_nlu/built_in_entities.py
@@ -10,21 +10,8 @@
 class BuiltInEntity(Enum):
     DURATION = {LABEL: "snips/duration", DUCKLING_DIM: "duration"}
 
-    TIME_CYCLE = {LABEL: "snips/time-cycle", DUCKLING_DIM: "cycle"}
 
-    NUMBER = {LABEL: "snips/number", DUCKLING_DIM: "number"}
 
-    ORDINAL = {LABEL: "snips/ordinal", DUCKLING_DIM: "ordinal"}
-
-    TEMPERATURE = {LABEL: "snips/temperature", DUCKLING_DIM: "temperature"}
-
-    UNIT = {LABEL: "snips/unit", DUCKLING_DIM: "unit"}
-
-    AMOUNT_OF_MONEY = {
-        LABEL: "snips/amount-of-money",
-        DUCKLING_DIM: "amount-of-money"
-    }
-
     @property
     def label(self):
         return self.value[LABEL]
@@ -117,3 +104,7 @@
 
 def clear_cache():
     _DUCKLING_CACHE.clear()
+
+
+def is_built_in_entity(entity_label):
+    return entity_label in BuiltInEntity.built_in_entity_by_label
diff --git a/snips_nlu/dataset.py b/snips_nlu/dataset.py
index 8423bd3e2..6806ba83b 100644
--- a/snips_nlu/dataset.py
+++ b/snips_nlu/dataset.py
@@ -1,7 +1,7 @@
 import re
 from copy import deepcopy
 
-from snips_nlu.built_in_entities import BuiltInEntity
+from snips_nlu.built_in_entities import BuiltInEntity, is_built_in_entity
 from snips_nlu.constants import (TEXT, USE_SYNONYMS, SYNONYMS, DATA, INTENTS,
                                  ENTITIES, ENTITY, SLOT_NAME, UTTERANCES,
                                  LANGUAGE, VALUE, AUTOMATICALLY_EXTENSIBLE,
@@ -24,7 +24,12 @@
     entities = set()
     for entity_name, entity in dataset[ENTITIES].iteritems():
         entities.add(entity_name)
-        dataset[ENTITIES][entity_name] = validate_and_format_entity(entity)
+        if is_built_in_entity(entity_name):
+            validate_entity = validate_and_format_builtin_entity
+        else:
+            validate_entity = validate_and_format_custom_entity
+        dataset[ENTITIES][entity_name] = validate_entity(entity)
+
     for intent_name, intent in dataset[INTENTS].iteritems():
         validate_intent_name(intent_name)
         validate_and_format_intent(intent, dataset[ENTITIES])
@@ -67,7 +72,7 @@
     return ''.join(chunk[TEXT] for chunk in chunks)
 
 
-def validate_and_format_entity(entity):
+def validate_and_format_custom_entity(entity):
     validate_type(entity, dict)
     mandatory_keys = [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA]
     validate_keys(entity, mandatory_keys, object_label="entity")
@@ -85,6 +90,11 @@
     return entity
 
 
+def validate_and_format_builtin_entity(entity):
+    validate_type(entity, dict)
+    return entity
+
+
 def validate_language(language):
     if language not in Language.language_by_iso_code:
         raise ValueError("Language name must be ISO 639-1,"
diff --git a/snips_nlu/intent_parser/probabilistic_intent_parser.py b/snips_nlu/intent_parser/probabilistic_intent_parser.py
index aacb30f15..0ce1276a1 100644
--- a/snips_nlu/intent_parser/probabilistic_intent_parser.py
+++ b/snips_nlu/intent_parser/probabilistic_intent_parser.py
@@ -14,7 +14,7 @@
     '_DataAugmentationConfig',
     'max_utterances noise_prob min_noise_size max_noise_size',
     {
-        'max_utterances': 0,
+        'max_utterances': 200,
         'noise_prob': 0.,
         'min_noise_size': 0,
         'max_noise_size': 0
@@ -31,14 +31,6 @@
         return cls(**obj_dict)
 
 
-def default_data_augmentation_config(language):
-    if language == Language.EN:
-        return DataAugmentationConfig(max_utterances=200, noise_prob=0.05,
-                                      min_noise_size=1, max_noise_size=3)
-    else:
-        return DataAugmentationConfig()
-
-
 class ProbabilisticIntentParser(IntentParser):
     def __init__(self, language, intent_classifier, crf_taggers,
                  slot_name_to_entity_mapping, data_augmentation_config=None):
@@ -49,8 +41,7 @@
         self.crf_taggers = crf_taggers
         self.slot_name_to_entity_mapping = slot_name_to_entity_mapping
         if data_augmentation_config is None:
-            data_augmentation_config = default_data_augmentation_config(
-                self.language)
+            data_augmentation_config = DataAugmentationConfig()
         self.data_augmentation_config = data_augmentation_config
 
     @property
diff --git a/snips_nlu/slot_filler/data_augmentation.py b/snips_nlu/slot_filler/data_augmentation.py
index 10c58d782..a7334966d 100644
--- a/snips_nlu/slot_filler/data_augmentation.py
+++ b/snips_nlu/slot_filler/data_augmentation.py
@@ -33,8 +33,10 @@
             space_before = " " if has_entity else ""
 
     if noise_prob > 0 and random.random() < noise_prob:
-        noise = deepcopy(next(noise_iterator))
-        context_data.append({"text": space_before + noise + space_after})
+        noise = deepcopy(next(noise_iterator, None))
+        if noise is not None:
+            context_data.append(
+                {"text": space_before + noise + space_after})
 
     context[DATA] = context_data
     return context
@@ -81,19 +83,18 @@
 def augment_utterances(dataset, intent_name, language, max_utterances,
                        noise_prob, min_noise_size, max_noise_size):
     utterances = dataset[INTENTS][intent_name][UTTERANCES]
-    if max_utterances < len(utterances):
-        return utterances
-
-    num_to_generate = max_utterances - len(utterances)
+    nb_utterances = len(utterances)
+    nb_to_generate = max(nb_utterances, max_utterances)
     contexts_it = get_contexts_iterator(utterances)
     noise_iterator = get_noise_iterator(language, min_noise_size,
                                         max_noise_size)
    intent_entities = get_intent_entities(dataset, intent_name)
     entities_its = get_entities_iterators(dataset, intent_entities)
-
-    while num_to_generate > 0:
-        utterances.append(generate_utterance(contexts_it, entities_its,
-                                             noise_iterator, noise_prob))
-        num_to_generate -= 1
-
-    return utterances
+    generated_utterances = []
+    while nb_to_generate > 0:
+        generated_utterance = generate_utterance(contexts_it, entities_its,
+                                                 noise_iterator, noise_prob)
+        generated_utterances.append(generated_utterance)
+        nb_to_generate -= 1
+
+    return generated_utterances
diff --git a/snips_nlu/tests/test_built_in_entities.py b/snips_nlu/tests/test_built_in_entities.py
index 6e4100ace..a5bc8ceec 100644
--- a/snips_nlu/tests/test_built_in_entities.py
+++ b/snips_nlu/tests/test_built_in_entities.py
@@ -1,5 +1,8 @@
+from __future__ import unicode_literals
+
 import unittest
 
+from builtin_entities_ontology import get_ontology
 from mock import patch
 
 from snips_nlu.built_in_entities import (get_built_in_entities, BuiltInEntity,
@@ -17,7 +20,7 @@
         text = "let's meet at 2p.m in the bronx"
 
         mocked_parse = [{
-            'body': u'at 2p.m.',
+            'body': 'at 2p.m.',
             'dim': 'time',
             'end': 17,
             'value': {
@@ -88,3 +91,14 @@
         # Then
         mocked_duckling_parse.assert_called_once_with(language.duckling_code,
                                                       text)
+
+    def test_builtins_should_have_exactly_ontology_entities(self):
+        # Given
+        ontology = get_ontology()
+        ontology_entities = [e["label"] for e in ontology["entities"]]
+
+        # When
+        entities = [e.label for e in BuiltInEntity]
+
+        # Then
+        self.assertItemsEqual(ontology_entities, entities)
diff --git a/snips_nlu/tests/test_dataset.py b/snips_nlu/tests/test_dataset.py
index a9ed30d05..1c09549ef 100644
--- a/snips_nlu/tests/test_dataset.py
+++ b/snips_nlu/tests/test_dataset.py
@@ -1,5 +1,6 @@
 import unittest
 
+from snips_nlu.built_in_entities import BuiltInEntity
 from snips_nlu.constants import CUSTOM_ENGINE
 from snips_nlu.dataset import validate_and_format_dataset
 
@@ -269,6 +270,40 @@
         # Then
         self.assertEqual(dataset, expected_dataset)
 
+    def test_should_not_require_data_for_builtin_entities(self):
+        # Given
+        dataset = {
+            "intents": {
+                "intent1": {
+                    "utterances": [
+                        {
+                            "data": [
+                                {
+                                    "text": "this is ",
+                                },
+                                {
+                                    "text": "10p.m",
+                                    "entity": BuiltInEntity.DATETIME.label,
+                                    "slot_name": "startTime"
+                                }
+                            ]
+                        }
+                    ],
+                    "engineType": CUSTOM_ENGINE
+                }
+            },
+            "entities": {
+                BuiltInEntity.DATETIME.label: {}
+            },
+            "language": "en"
+        }
+
+        # When / Then
+        try:
+            validate_and_format_dataset(dataset)
+        except Exception:
+            self.fail("Could not validate dataset")
+
 
 if __name__ == '__main__':
     unittest.main()
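
Reviewer note (not part of the patch): a minimal sketch of how the new
validator dispatch in dataset.py is expected to behave. The label
"snips/datetime" is taken from the tests; pick_validator is a hypothetical
stand-in for the inline dispatch in validate_and_format_dataset.

    from snips_nlu.built_in_entities import is_built_in_entity

    def pick_validator(entity_name):
        # Built-in entities (labels known to the ontology) skip the custom
        # checks for USE_SYNONYMS / AUTOMATICALLY_EXTENSIBLE / DATA.
        if is_built_in_entity(entity_name):
            return "validate_and_format_builtin_entity"
        return "validate_and_format_custom_entity"

    assert pick_validator("snips/datetime") == \
        "validate_and_format_builtin_entity"
    assert pick_validator("my_custom_entity") == \
        "validate_and_format_custom_entity"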
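Reviewer note (not part of the patch): augment_utterances now returns only
the generated utterances, max(len(utterances), max_utterances) of them,
instead of appending to the originals in place. A hypothetical caller
sketch, assuming the training code is expected to combine the two lists
itself:

    from snips_nlu.constants import INTENTS, UTTERANCES
    from snips_nlu.slot_filler.data_augmentation import augment_utterances

    def build_training_utterances(dataset, intent_name, language):
        original = dataset[INTENTS][intent_name][UTTERANCES]
        generated = augment_utterances(
            dataset, intent_name, language,
            max_utterances=200,  # new _DataAugmentationConfig default
            noise_prob=0., min_noise_size=0, max_noise_size=0)
        # The originals are left untouched; combine explicitly.
        return original + generated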