Skip to content

Commit

Permalink
Merge pull request #210 from snipsco/release/0.4.0
Browse files Browse the repository at this point in the history
Release/0.4.0
  • Loading branch information
Adrien Ball authored May 5, 2017
2 parents 71f1acd + 0eb78e9 commit e3303d7
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 43 deletions.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
"scipy==0.19.0",
"scikit-learn==0.18.1",
"sklearn-crfsuite==0.3.5",
"snips-queries==0.4.0"
"snips-queries==0.4.0",
"builtin_entities_ontology==0.1.1"
]

test_required = [
Expand Down
3 changes: 3 additions & 0 deletions snips_nlu/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import io
import os

import builtin_entities_ontology
from duckling import core

from snips_nlu.resources import load_resources
Expand All @@ -14,3 +15,5 @@
__version__ = f.readline().strip()

load_resources()

__builtin_entities_version__ = builtin_entities_ontology.__ontology_version__
2 changes: 1 addition & 1 deletion snips_nlu/__version__
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.4
0.4.0
17 changes: 4 additions & 13 deletions snips_nlu/built_in_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,8 @@ class BuiltInEntity(Enum):

DURATION = {LABEL: "snips/duration", DUCKLING_DIM: "duration"}

TIME_CYCLE = {LABEL: "snips/time-cycle", DUCKLING_DIM: "cycle"}

NUMBER = {LABEL: "snips/number", DUCKLING_DIM: "number"}

ORDINAL = {LABEL: "snips/ordinal", DUCKLING_DIM: "ordinal"}

TEMPERATURE = {LABEL: "snips/temperature", DUCKLING_DIM: "temperature"}

UNIT = {LABEL: "snips/unit", DUCKLING_DIM: "unit"}

AMOUNT_OF_MONEY = {
LABEL: "snips/amount-of-money",
DUCKLING_DIM: "amount-of-money"
}

@property
def label(self):
return self.value[LABEL]
Expand Down Expand Up @@ -117,3 +104,7 @@ def get_built_in_entities(text, language, scope=None):

def clear_cache():
    """Drop every memoized duckling parse from the module-level cache."""
    _DUCKLING_CACHE.clear()


def is_built_in_entity(entity_label):
    """Return True when *entity_label* is the label of a Snips built-in entity."""
    known_labels = BuiltInEntity.built_in_entity_by_label
    return entity_label in known_labels
16 changes: 13 additions & 3 deletions snips_nlu/dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from copy import deepcopy

from snips_nlu.built_in_entities import BuiltInEntity
from snips_nlu.built_in_entities import BuiltInEntity, is_built_in_entity
from snips_nlu.constants import (TEXT, USE_SYNONYMS, SYNONYMS, DATA, INTENTS,
ENTITIES, ENTITY, SLOT_NAME, UTTERANCES,
LANGUAGE, VALUE, AUTOMATICALLY_EXTENSIBLE,
Expand All @@ -24,7 +24,12 @@ def validate_and_format_dataset(dataset):
entities = set()
for entity_name, entity in dataset[ENTITIES].iteritems():
entities.add(entity_name)
dataset[ENTITIES][entity_name] = validate_and_format_entity(entity)
if is_built_in_entity(entity_name):
validate_entity = validate_and_format_builtin_entity
else:
validate_entity = validate_and_format_custom_entity
dataset[ENTITIES][entity_name] = validate_entity(entity)

for intent_name, intent in dataset[INTENTS].iteritems():
validate_intent_name(intent_name)
validate_and_format_intent(intent, dataset[ENTITIES])
Expand Down Expand Up @@ -67,7 +72,7 @@ def get_text_from_chunks(chunks):
return ''.join(chunk[TEXT] for chunk in chunks)


def validate_and_format_entity(entity):
def validate_and_format_custom_entity(entity):
validate_type(entity, dict)
mandatory_keys = [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA]
validate_keys(entity, mandatory_keys, object_label="entity")
Expand All @@ -85,6 +90,11 @@ def validate_and_format_entity(entity):
return entity


def validate_and_format_builtin_entity(entity):
    """Check that a built-in entity description is a dict and return it.

    Built-in entities carry no user-provided values, so unlike custom
    entities no further normalization is performed.
    """
    validate_type(entity, dict)
    return entity


def validate_language(language):
if language not in Language.language_by_iso_code:
raise ValueError("Language name must be ISO 639-1,"
Expand Down
13 changes: 2 additions & 11 deletions snips_nlu/intent_parser/probabilistic_intent_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
'_DataAugmentationConfig',
'max_utterances noise_prob min_noise_size max_noise_size',
{
'max_utterances': 0,
'max_utterances': 200,
'noise_prob': 0.,
'min_noise_size': 0,
'max_noise_size': 0
Expand All @@ -31,14 +31,6 @@ def from_dict(cls, obj_dict):
return cls(**obj_dict)


def default_data_augmentation_config(language):
    """Return the data-augmentation settings to use for *language*.

    English gets noise-based augmentation; every other language falls
    back to the namedtuple defaults.
    """
    if language != Language.EN:
        return DataAugmentationConfig()
    return DataAugmentationConfig(max_utterances=200, noise_prob=0.05,
                                  min_noise_size=1, max_noise_size=3)


class ProbabilisticIntentParser(IntentParser):
def __init__(self, language, intent_classifier, crf_taggers,
slot_name_to_entity_mapping, data_augmentation_config=None):
Expand All @@ -49,8 +41,7 @@ def __init__(self, language, intent_classifier, crf_taggers,
self.crf_taggers = crf_taggers
self.slot_name_to_entity_mapping = slot_name_to_entity_mapping
if data_augmentation_config is None:
data_augmentation_config = default_data_augmentation_config(
self.language)
data_augmentation_config = DataAugmentationConfig()
self.data_augmentation_config = data_augmentation_config

@property
Expand Down
27 changes: 14 additions & 13 deletions snips_nlu/slot_filler/data_augmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,10 @@ def generate_utterance(contexts_iterator, entities_iterators, noise_iterator,
space_before = " " if has_entity else ""

if noise_prob > 0 and random.random() < noise_prob:
noise = deepcopy(next(noise_iterator))
context_data.append({"text": space_before + noise + space_after})
noise = deepcopy(next(noise_iterator, None))
if noise is not None:
context_data.append(
{"text": space_before + noise + space_after})
context[DATA] = context_data
return context

Expand Down Expand Up @@ -81,19 +83,18 @@ def get_noise_iterator(language, min_size, max_size):
def augment_utterances(dataset, intent_name, language, max_utterances,
noise_prob, min_noise_size, max_noise_size):
utterances = dataset[INTENTS][intent_name][UTTERANCES]
if max_utterances < len(utterances):
return utterances

num_to_generate = max_utterances - len(utterances)
nb_utterances = len(utterances)
nb_to_generate = max(nb_utterances, max_utterances)
contexts_it = get_contexts_iterator(utterances)
noise_iterator = get_noise_iterator(language, min_noise_size,
max_noise_size)
intent_entities = get_intent_entities(dataset, intent_name)
entities_its = get_entities_iterators(dataset, intent_entities)

while num_to_generate > 0:
utterances.append(generate_utterance(contexts_it, entities_its,
noise_iterator, noise_prob))
num_to_generate -= 1

return utterances
generated_utterances = []
while nb_to_generate > 0:
generated_utterance = generate_utterance(contexts_it, entities_its,
noise_iterator, noise_prob)
generated_utterances.append(generated_utterance)
nb_to_generate -= 1

return generated_utterances
16 changes: 15 additions & 1 deletion snips_nlu/tests/test_built_in_entities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import unicode_literals

import unittest

from builtin_entities_ontology import get_ontology
from mock import patch

from snips_nlu.built_in_entities import (get_built_in_entities, BuiltInEntity,
Expand All @@ -17,7 +20,7 @@ def test_get_built_in_entities(self, mocked_duckling_parse):
text = "let's meet at 2p.m in the bronx"

mocked_parse = [{
'body': u'at 2p.m.',
'body': 'at 2p.m.',
'dim': 'time',
'end': 17,
'value': {
Expand Down Expand Up @@ -88,3 +91,14 @@ def test_duckling_cache(self, mocked_duckling_parse):
# Then
mocked_duckling_parse.assert_called_once_with(language.duckling_code,
text)

def test_builtins_should_have_exactly_ontology_entities(self):
    """The BuiltInEntity enum must expose exactly the ontology's entity labels."""
    # Given
    ontology = get_ontology()
    ontology_entities = [e["label"] for e in ontology["entities"]]

    # When
    entities = [e.label for e in BuiltInEntity]

    # Then
    # assertItemsEqual exists only on Python 2 (renamed assertCountEqual
    # in Python 3); compare sorted lists so the test runs on both.
    self.assertEqual(sorted(ontology_entities), sorted(entities))
35 changes: 35 additions & 0 deletions snips_nlu/tests/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest

from snips_nlu.built_in_entities import BuiltInEntity
from snips_nlu.constants import CUSTOM_ENGINE
from snips_nlu.dataset import validate_and_format_dataset

Expand Down Expand Up @@ -269,6 +270,40 @@ def test_should_format_dataset_by_adding_entity_values(self):
# Then
self.assertEqual(dataset, expected_dataset)

def test_should_not_require_data_for_builtin_entities(self):
    """A built-in entity declared as an empty dict (no "data" key) must
    pass dataset validation."""
    # Given
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {
                                "text": "this is ",
                            },
                            {
                                "text": "10p.m",
                                "entity": BuiltInEntity.DATETIME.label,
                                "slot_name": "startTime"
                            }
                        ]
                    }
                ],
                "engineType": CUSTOM_ENGINE
            }
        },
        "entities": {
            BuiltInEntity.DATETIME.label: {}
        },
        "language": "en"
    }

    # When / Then
    try:
        validate_and_format_dataset(dataset)
    except Exception as e:
        # Catch Exception, not a bare "except:", so SystemExit and
        # KeyboardInterrupt still propagate; include the actual error
        # so a failure is diagnosable.
        self.fail("Could not validate dataset: %s" % e)


if __name__ == '__main__':
unittest.main()

0 comments on commit e3303d7

Please sign in to comment.