Merge pull request #673 from snipsco/release/0.17.1
Release 0.17.1
adrienball authored Oct 9, 2018
2 parents 5b97d48 + f6e39f6 commit 121a676
Showing 11 changed files with 91 additions and 81 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,13 @@
# Changelog
All notable changes to this project will be documented in this file.

## [0.17.1] - 2018-10-09
### Fixed
- `DeterministicIntentParser` now relies on the custom entity parser

### Changed
- Bump `snips-nlu-ontology` to `0.60`

## [0.17.0] - 2018-10-05
### Added
- Support for 3 new builtin entities in French: `snips/musicAlbum`, `snips/musicArtist` and `snips/musicTrack`
@@ -150,6 +157,7 @@ several commands.
- Fix compiling issue with `bindgen` dependency when installing from source
- Fix issue in `CRFSlotFiller` when handling builtin entities

[0.17.1]: https://github.com/snipsco/snips-nlu/compare/0.17.0...0.17.1
[0.17.0]: https://github.com/snipsco/snips-nlu/compare/0.16.5...0.17.0
[0.16.5]: https://github.com/snipsco/snips-nlu/compare/0.16.4...0.16.5
[0.16.4]: https://github.com/snipsco/snips-nlu/compare/0.16.3...0.16.4
2 changes: 1 addition & 1 deletion setup.py
@@ -25,7 +25,7 @@
"sklearn-crfsuite>=0.3.6,<0.4",
"semantic_version>=2.6,<3.0",
"snips_nlu_utils>=0.7,<0.8",
"snips_nlu_ontology==0.59.0",
"snips_nlu_ontology>=0.60,<0.61",
"num2words>=0.5.6,<0.6",
"plac>=0.9.6,<1.0",
"requests>=2.0,<3.0",
2 changes: 1 addition & 1 deletion snips_nlu/__about__.py
@@ -11,7 +11,7 @@
__email__ = "[email protected], [email protected]"
__license__ = "Apache License, Version 2.0"

__version__ = "0.17.0"
__version__ = "0.17.1"
__model_version__ = "0.17.0"

__download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"
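The two constants above tell the story of this patch release: the package version moves to 0.17.1 while the model version stays at 0.17.0, which suggests models trained with 0.17.0 remain loadable. A minimal check after upgrading (a sketch that only reads the constants changed in this file):

```python
from snips_nlu.__about__ import __model_version__, __version__

print(__version__)        # "0.17.1" after this release
print(__model_version__)  # still "0.17.0": the persisted model format is unchanged
```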
1 change: 0 additions & 1 deletion snips_nlu/constants.py
@@ -30,7 +30,6 @@
ENTITIES = "entities"
ENTITY = "entity"
ENTITY_KIND = "entity_kind"
ENTITY_IDENTIFIER = "entity_identifier"
RESOLVED_VALUE = "resolved_value"
SLOT_NAME = "slot_name"
TEXT = "text"
3 changes: 2 additions & 1 deletion snips_nlu/entity_parser/custom_entity_parser.py
@@ -10,7 +10,7 @@

from snips_nlu.constants import (
END, ENTITIES, LANGUAGE, PARSER_THRESHOLD, RES_MATCH_RANGE, START,
UTTERANCES)
UTTERANCES, ENTITY_KIND)
from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
from snips_nlu.entity_parser.custom_entity_parser_usage import (
CustomEntityParserUsage)
@@ -97,6 +97,7 @@ def _parse(self, text, scope):
for entity in entities:
start = entity[RES_MATCH_RANGE][START]
end = entity[RES_MATCH_RANGE][END]
entity[ENTITY_KIND] = entity.pop("entity_identifier")
entity[RES_MATCH_RANGE][START] -= shifts[start]
entity[RES_MATCH_RANGE][END] -= shifts[end - 1]
return entities
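The new line in `_parse` above renames the raw `entity_identifier` key returned by the underlying parser to the shared `ENTITY_KIND` constant, so custom matches carry the same field as builtin ones. A standalone sketch of that normalization (the sample entity dict is illustrative, not actual parser output):

```python
ENTITY_KIND = "entity_kind"  # mirrors snips_nlu.constants.ENTITY_KIND

def normalize_custom_entity(entity):
    # Rename the key emitted by the low-level parser so that custom and
    # builtin entities expose their kind under the same "entity_kind" field.
    entity = dict(entity)
    entity[ENTITY_KIND] = entity.pop("entity_identifier")
    return entity

# Hypothetical parser output, for illustration only
raw = {"entity_identifier": "beverage", "value": "tea",
       "range": {"start": 7, "end": 10}}
print(normalize_custom_entity(raw)[ENTITY_KIND])  # -> "beverage"
```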
2 changes: 1 addition & 1 deletion snips_nlu/intent_classifier/featurizer.py
@@ -201,7 +201,7 @@ def _preprocess_utterance(utterance, language, builtin_entity_parser,
custom_entities = [e for e in custom_entities
if e["value"] != unknownword_replacement_string]
custom_entities_features = [
_entity_name_to_feature(e["entity_identifier"], language)
_entity_name_to_feature(e[ENTITY_KIND], language)
for e in custom_entities]

builtin_entities = builtin_entity_parser.parse(
91 changes: 41 additions & 50 deletions snips_nlu/intent_parser/deterministic_intent_parser.py
@@ -9,11 +9,11 @@
from future.utils import iteritems

from snips_nlu.constants import (
BUILTIN_ENTITY_PARSER, DATA, END, ENTITIES, ENTITY, ENTITY_KIND, INTENTS,
LANGUAGE, RES_MATCH_RANGE, RES_VALUE, SLOT_NAME, START, TEXT, UTTERANCES)
BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER, DATA, END, ENTITIES, ENTITY,
ENTITY_KIND, INTENTS, LANGUAGE, RES_MATCH_RANGE, RES_VALUE, SLOT_NAME,
START, TEXT, UTTERANCES)
from snips_nlu.dataset import validate_and_format_dataset
from snips_nlu.intent_parser.intent_parser import IntentParser
from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
from snips_nlu.pipeline.configs import DeterministicIntentParserConfig
from snips_nlu.preprocessing import tokenize, tokenize_light
from snips_nlu.result import (
@@ -82,16 +82,16 @@ def fit(self, dataset, force_retrain=True):
logger.info("Fitting deterministic parser...")
dataset = validate_and_format_dataset(dataset)
self.fit_builtin_entity_parser_if_needed(dataset)
self.fit_custom_entity_parser_if_needed(dataset)
self.language = dataset[LANGUAGE]
self.regexes_per_intent = dict()
self.group_names_to_slot_names = dict()
joined_entity_utterances = _get_joined_entity_utterances(
dataset, self.language)
entity_placeholders = _get_entity_placeholders(dataset, self.language)
self.slot_names_to_entities = get_slot_name_mappings(dataset)
for intent_name, intent in iteritems(dataset[INTENTS]):
utterances = intent[UTTERANCES]
patterns, self.group_names_to_slot_names = _generate_patterns(
utterances, joined_entity_utterances,
utterances, entity_placeholders,
self.group_names_to_slot_names, self.language)
patterns = [p for p in patterns
if len(p) < self.config.max_pattern_length]
@@ -128,8 +128,11 @@ def parse(self, text, intents=None):

builtin_entities = self.builtin_entity_parser.parse(
text, use_cache=True)
ranges_mapping, processed_text = _replace_builtin_entities(
text, self.language, builtin_entities)
custom_entities = self.custom_entity_parser.parse(
text, use_cache=True)
all_entities = builtin_entities + custom_entities
ranges_mapping, processed_text = _replace_entities_with_placeholders(
text, self.language, all_entities)

# We try to match both the input text and the preprocessed text to
# cover inconsistencies between labeled data and builtin entity parsing
@@ -151,7 +154,7 @@
return empty_result(text)

def _get_matching_result(self, text, processed_text, regex, intent,
builtin_entities_ranges_mapping=None):
entities_ranges_mapping=None):
found_result = regex.match(processed_text)
if found_result is None:
return None
@@ -163,12 +166,12 @@ def _get_matching_result(self, text, processed_text, regex, intent,
entity = self.slot_names_to_entities[intent][slot_name]
rng = (found_result.start(group_name),
found_result.end(group_name))
if builtin_entities_ranges_mapping is not None:
if rng in builtin_entities_ranges_mapping:
rng = builtin_entities_ranges_mapping[rng]
if entities_ranges_mapping is not None:
if rng in entities_ranges_mapping:
rng = entities_ranges_mapping[rng]
else:
shift = _get_range_shift(
rng, builtin_entities_ranges_mapping)
rng, entities_ranges_mapping)
rng = {START: rng[0] + shift, END: rng[1] + shift}
else:
rng = {START: rng[0], END: rng[1]}
@@ -229,8 +232,11 @@ def from_dict(cls, unit_dict, **shared):
:func:`~DeterministicIntentParser.to_dict`
"""
config = cls.config_type.from_dict(unit_dict["config"])
parser = cls(config=config,
builtin_entity_parser=shared.get(BUILTIN_ENTITY_PARSER))
parser = cls(
config=config,
builtin_entity_parser=shared.get(BUILTIN_ENTITY_PARSER),
custom_entity_parser=shared.get(CUSTOM_ENTITY_PARSER),
)
parser.patterns = unit_dict["patterns"]
parser.language = unit_dict["language_code"]
parser.group_names_to_slot_names = unit_dict[
@@ -299,8 +305,8 @@ def _generate_new_index(slots_name_to_labels):
return index


def _query_to_pattern(query, joined_entity_utterances,
group_names_to_slot_names, language):
def _query_to_pattern(query, entity_placeholders, group_names_to_slot_names,
language):
pattern = []
for chunk in query[DATA]:
if SLOT_NAME in chunk:
@@ -309,7 +315,7 @@ def _query_to_pattern(query, joined_entity_utterances,
entity = chunk[ENTITY]
group_names_to_slot_names[max_index] = slot_name
pattern.append(
r"(?P<%s>%s)" % (max_index, joined_entity_utterances[entity]))
r"(?P<%s>%s)" % (max_index, entity_placeholders[entity]))
else:
tokens = tokenize_light(chunk[TEXT], language)
pattern += [regex_escape(t) for t in tokens]
@@ -338,62 +344,47 @@ def _get_queries_with_unique_context(intent_queries, language):
return queries


def _generate_patterns(intent_queries, joined_entity_utterances,
def _generate_patterns(intent_queries, entity_placeholders,
group_names_to_labels, language):
queries = _get_queries_with_unique_context(intent_queries, language)
# Join all the entities utterances with a "|" to create the patterns
patterns = set()
for query in queries:
pattern, group_names_to_labels = _query_to_pattern(
query, joined_entity_utterances, group_names_to_labels, language)
query, entity_placeholders, group_names_to_labels, language)
patterns.add(pattern)
return list(patterns), group_names_to_labels


def _get_joined_entity_utterances(dataset, language):
joined_entity_utterances = dict()
for entity_name, entity in iteritems(dataset[ENTITIES]):
# matches are performed in a case insensitive manner
utterances = set(u.lower() for u in entity[UTTERANCES])
patterns = []
if is_builtin_entity(entity_name):
# We add a placeholder value for builtin entities
placeholder = _get_entity_name_placeholder(entity_name, language)
patterns.append(regex_escape(placeholder))
else:
for utterance in utterances:
tokens = tokenize_light(utterance, language)
pattern = WHITESPACE_PATTERN.join(regex_escape(t)
for t in tokens)
patterns.append(pattern)
patterns = (p for p in patterns if p)
joined_entity_utterances[entity_name] = r"|".join(
sorted(patterns, key=len, reverse=True))
return joined_entity_utterances


def _replace_builtin_entities(text, language, builtin_entities):
if not builtin_entities:
def _get_entity_placeholders(dataset, language):
return {
e: _get_entity_name_placeholder(e, language)
for e in dataset[ENTITIES]
}


def _replace_entities_with_placeholders(text, language, entities):
if not entities:
return dict(), text

builtin_entities = _deduplicate_overlapping_entities(builtin_entities)
builtin_entities = sorted(builtin_entities,
key=lambda e: e[RES_MATCH_RANGE][START])
entities = _deduplicate_overlapping_entities(entities)
entities = sorted(
entities, key=lambda e: e[RES_MATCH_RANGE][START])

range_mapping = dict()
processed_text = ""
offset = 0
current_ix = 0
for ent in builtin_entities:
for ent in entities:
ent_start = ent[RES_MATCH_RANGE][START]
ent_end = ent[RES_MATCH_RANGE][END]
rng_start = ent_start + offset

processed_text += text[current_ix:ent_start]

entity_length = ent_end - ent_start
entity_place_holder = _get_entity_name_placeholder(ent[ENTITY_KIND],
language)
entity_place_holder = _get_entity_name_placeholder(
ent[ENTITY_KIND], language)

offset += len(entity_place_holder) - entity_length

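The generalized `_replace_entities_with_placeholders` helper above substitutes each matched entity span (builtin or custom) with a placeholder and records how character ranges shift, so regex matches on the processed text can be mapped back to the original one. A simplified, self-contained sketch of the same idea; the placeholder format and the sample entity are assumptions, not the library's actual `_get_entity_name_placeholder` output:

```python
def replace_with_placeholders(text, entities):
    # `entities` is a list of dicts with "entity_kind" and "range" keys,
    # assumed non-overlapping and sorted by start offset.
    range_mapping = {}
    processed_text = ""
    offset = 0
    current_ix = 0
    for ent in entities:
        start = ent["range"]["start"]
        end = ent["range"]["end"]
        # Hypothetical placeholder naming, e.g. "snips/number" -> "%SNIPSNUMBER%"
        placeholder = "%" + "".join(
            c for c in ent["entity_kind"].upper() if c.isalnum()) + "%"
        processed_text += text[current_ix:start] + placeholder
        # Map the placeholder's range in the processed text back to the
        # entity's range in the original text.
        new_start = start + offset
        range_mapping[(new_start, new_start + len(placeholder))] = {
            "start": start, "end": end}
        offset += len(placeholder) - (end - start)
        current_ix = end
    processed_text += text[current_ix:]
    return range_mapping, processed_text

mapping, processed = replace_with_placeholders(
    "book a table for 2 people",
    [{"entity_kind": "snips/number", "range": {"start": 17, "end": 18}}])
print(processed)  # "book a table for %SNIPSNUMBER% people"
print(mapping)    # {(17, 30): {'start': 17, 'end': 18}}
```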
6 changes: 2 additions & 4 deletions snips_nlu/nlu_engine/nlu_engine.py
@@ -11,7 +11,7 @@
from snips_nlu.__about__ import __model_version__, __version__
from snips_nlu.constants import (
AUTOMATICALLY_EXTENSIBLE, BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER,
ENTITIES, ENTITY, ENTITY_IDENTIFIER, ENTITY_KIND, LANGUAGE, RESOLVED_VALUE,
ENTITIES, ENTITY, ENTITY_KIND, LANGUAGE, RESOLVED_VALUE,
RES_ENTITY, RES_INTENT, RES_MATCH_RANGE,
RES_SLOTS, RES_VALUE)
from snips_nlu.dataset import validate_and_format_dataset
@@ -173,22 +173,20 @@ def resolve_slots(self, text, slots):
parser = self.builtin_entity_parser
slot_builder = builtin_slot
use_cache = False
entity_name_key = ENTITY_KIND
extensible = False
resolved_value_key = ENTITY
else:
entities = custom_entities
parser = self.custom_entity_parser
slot_builder = custom_slot
use_cache = True
entity_name_key = ENTITY_IDENTIFIER
extensible = self._dataset_metadata[ENTITIES][entity_name][
AUTOMATICALLY_EXTENSIBLE]
resolved_value_key = RESOLVED_VALUE

resolved_slot = None
for ent in entities:
if ent[entity_name_key] == entity_name and \
if ent[ENTITY_KIND] == entity_name and \
ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
resolved_slot = slot_builder(slot, ent[resolved_value_key])
break
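With `ENTITY_IDENTIFIER` gone, both parsers expose the matched kind under `ENTITY_KIND`, which is what lets `resolve_slots` compare a slot against parsed entities with a single key. A minimal sketch of that matching step (the dictionaries are illustrative, not real engine output):

```python
ENTITY_KIND = "entity_kind"
RES_MATCH_RANGE = "range"

def find_matching_entity(slot, entities, entity_name):
    # Return the parsed entity whose kind and character range both match the slot.
    for ent in entities:
        if ent[ENTITY_KIND] == entity_name \
                and ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
            return ent
    return None

slot = {"range": {"start": 17, "end": 18}, "value": "2"}
entities = [{"entity_kind": "snips/number",
             "range": {"start": 17, "end": 18},
             "resolved_value": {"kind": "Number", "value": 2.0}}]
print(find_matching_entity(slot, entities, "snips/number"))
```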
7 changes: 7 additions & 0 deletions snips_nlu/pipeline/configs/intent_parser.py
@@ -2,6 +2,8 @@

from copy import deepcopy

from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE
from snips_nlu.entity_parser import CustomEntityParserUsage
from snips_nlu.pipeline.configs import ProcessingUnitConfig
from snips_nlu.pipeline.processing_unit import get_processing_unit_config
from snips_nlu.resources import merge_required_resources
@@ -92,6 +94,11 @@ def unit_name(cls): # pylint:disable=no-self-argument
DeterministicIntentParser
return DeterministicIntentParser.unit_name

def get_required_resources(self):
return {
CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
}

def to_dict(self):
return {
"unit_name": self.unit_name,
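The new `get_required_resources` above is how the deterministic parser now advertises that it needs a custom entity parser built without stems. Assuming the config can be instantiated with its defaults, inspecting it might look like this (a sketch, not verified output):

```python
from snips_nlu.entity_parser import CustomEntityParserUsage
from snips_nlu.pipeline.configs import DeterministicIntentParserConfig

config = DeterministicIntentParserConfig()
resources = config.get_required_resources()
# Expected to contain the custom entity parser usage declared above
assert CustomEntityParserUsage.WITHOUT_STEMS in resources.values()
```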