Merge pull request #765 from snipsco/release/0.19.3
Release 0.19.3
adrienball authored Mar 5, 2019
2 parents 04087e1 + 4b850fb commit 71f9599
Showing 18 changed files with 162 additions and 538 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,13 @@
 # Changelog
 All notable changes to this project will be documented in this file.

+## [0.19.3] - 2019-03-05
+### Fixed
+- Issue with intent classification reducing classification accuracy
+- Issue resulting in a mutation of the CRFSlotFillerConfig
+- Wrong required resources of the `DeterministicIntentParser`
+- Issue with non ASCII characters when using the parsing CLI with Python2
+
 ## [0.19.2] - 2019-02-11
 ### Fixed
 - Fix an issue regarding the way builtin entities were handled by the `CRFSlotFiller`
@@ -236,6 +243,7 @@ several commands.
 - Fix compiling issue with `bindgen` dependency when installing from source
 - Fix issue in `CRFSlotFiller` when handling builtin entities

+[0.19.3]: https://github.com/snipsco/snips-nlu/compare/0.19.2...0.19.3
 [0.19.2]: https://github.com/snipsco/snips-nlu/compare/0.19.1...0.19.2
 [0.19.1]: https://github.com/snipsco/snips-nlu/compare/0.19.0...0.19.1
 [0.19.0]: https://github.com/snipsco/snips-nlu/compare/0.18.0...0.19.0
2 changes: 1 addition & 1 deletion snips_nlu/__about__.py
@@ -11,7 +11,7 @@
 __email__ = "[email protected], [email protected]"
 __license__ = "Apache License, Version 2.0"

-__version__ = "0.19.2"
+__version__ = "0.19.3"
 __model_version__ = "0.19.0"

 __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"
3 changes: 3 additions & 0 deletions snips_nlu/cli/inference.py
@@ -18,6 +18,7 @@
 )
 def parse(training_path, query, verbose=False):
     """Load a trained NLU engine and play with its parsing API interactively"""
+    from builtins import str
     if verbose:
         set_nlu_logger(logging.DEBUG)

@@ -29,6 +30,8 @@ def parse(training_path, query, verbose=False):

     while True:
         query = input("Enter a query (type 'q' to quit): ").strip()
+        if not isinstance(query, str):
+            query = query.decode("utf-8")
         if query == "q":
             break
         print_parsing_result(engine, query)
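For context on this fix: on Python 2, `input` can hand back raw bytes, so a non-ASCII query crashed later unicode handling, while on Python 3 it is already text. The added guard decodes bytes exactly once. A minimal standalone sketch of the same pattern (the sample bytes are illustrative):

```python
# -*- coding: utf-8 -*-
from builtins import str  # on Python 2, the "future" package makes this unicode

raw = b"caf\xc3\xa9"  # what a Python 2 terminal may return for "café"
if not isinstance(raw, str):
    raw = raw.decode("utf-8")  # bytes -> text; this branch is a no-op on Python 3
assert raw == u"café"
```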
10 changes: 7 additions & 3 deletions snips_nlu/cli/utils.py
@@ -7,6 +7,7 @@
 from enum import Enum, unique

 import requests
+from semantic_version import Version

 import snips_nlu
 from snips_nlu import __about__
@@ -71,13 +72,16 @@ def get_json(url, desc):

 def get_compatibility():
     version = __about__.__version__
+    semver_version = Version(version)
+    minor_version = "%d.%d" % (semver_version.major, semver_version.minor)
     table = get_json(__about__.__compatibility__, "Compatibility table")
-    compatibility = table["snips-nlu"]
-    if version not in compatibility:
+    nlu_table = table["snips-nlu"]
+    compatibility = nlu_table.get(version, nlu_table.get(minor_version))
+    if compatibility is None:
         pretty_print("No compatible resources found for version %s" % version,
                      title="Resources compatibility error", exits=1,
                      level=PrettyPrintLevel.ERROR)
-    return compatibility[version]
+    return compatibility


 def get_resources_version(resource_fullname, resource_alias, compatibility):
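The new lookup tries the exact version first and then falls back to its `major.minor` key, so a patch release such as 0.19.3 can reuse the resources published for the 0.19 line instead of failing outright. A standalone sketch of the fallback under an assumed table layout (the helper name and table contents are hypothetical):

```python
from semantic_version import Version

def find_compatibility(version, table):
    """Exact match first, then the major.minor line (hypothetical helper)."""
    semver = Version(version)
    minor_key = "%d.%d" % (semver.major, semver.minor)
    return table.get(version, table.get(minor_key))

# "0.19.3" has no dedicated entry, so the "0.19" entry is used instead
table = {"0.19": {"snips_nlu_en": "0.2.0"}}
assert find_compatibility("0.19.3", table) == {"snips_nlu_en": "0.2.0"}
```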
3 changes: 3 additions & 0 deletions snips_nlu/common/log_utils.py
@@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+from builtins import str
 from datetime import datetime
 from functools import wraps

2 changes: 1 addition & 1 deletion snips_nlu/data_augmentation.py
@@ -88,7 +88,7 @@ def get_intent_entities(dataset, intent_name):
         for chunk in utterance[DATA]:
             if ENTITY in chunk:
                 intent_entities.add(chunk[ENTITY])
-    return intent_entities
+    return sorted(intent_entities)


 def num_queries_to_generate(dataset, intent_name, min_utterances):
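Returning `sorted(...)` rather than the raw set gives callers a stable iteration order: string sets hash differently across interpreter runs (depending on `PYTHONHASHSEED`), which can make anything derived from the iteration order non-reproducible. A quick illustration:

```python
# Set iteration order for strings varies between interpreter runs;
# sorting pins it down so downstream processing stays reproducible.
entities = {"snips/datetime", "airport", "beverage"}
assert sorted(entities) == ["airport", "beverage", "snips/datetime"]
```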
2 changes: 1 addition & 1 deletion snips_nlu/dataset/validation.py
@@ -128,7 +128,7 @@ def _validate_and_format_custom_entity(entity, queries_entities, language,
     validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool,
                   object_label="automatically_extensible")
     validate_type(entity[DATA], list, object_label="entity data")
-    validate_type(entity[MATCHING_STRICTNESS], float,
+    validate_type(entity[MATCHING_STRICTNESS], (float, int),
                   object_label="matching_strictness")

     formatted_entity = dict()
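Widening the expected type to the tuple `(float, int)` mirrors `isinstance` semantics, so a `matching_strictness` written as `1` in a JSON dataset now validates instead of being rejected for not being a float (assuming `validate_type` checks with `isinstance`, as its usage suggests):

```python
# isinstance accepts a tuple of types, which is what the widened check relies on
assert isinstance(1, (float, int))        # an integer strictness now passes
assert isinstance(0.5, (float, int))      # floats still pass
assert not isinstance("1", (float, int))  # strings are still rejected
```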
106 changes: 38 additions & 68 deletions snips_nlu/intent_classifier/featurizer.py
@@ -17,8 +17,7 @@
     json_string, fitted_required, replace_entities_with_placeholders,
     check_persisted_path)
 from snips_nlu.constants import (
-    DATA, END, ENTITY, ENTITY_KIND, LANGUAGE, NGRAM, RES_MATCH_RANGE,
-    RES_VALUE, START, TEXT, ENTITIES)
+    DATA, ENTITY, ENTITY_KIND, LANGUAGE, NGRAM, TEXT, ENTITIES)
 from snips_nlu.dataset import get_text_from_chunks, validate_and_format_dataset
 from snips_nlu.entity_parser.builtin_entity_parser import (
     is_builtin_entity)
@@ -264,7 +263,7 @@ def fit(self, x, dataset):
         self._init_vectorizer(self._language)
         self.builtin_entity_scope = set(
             e for e in dataset[ENTITIES] if is_builtin_entity(e))
-        preprocessed_data = self._preprocess(x, training=True)
+        preprocessed_data = self._preprocess(x)
         utterances = [
             self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters)
             for u, builtin_ents, custom_ents, w_clusters
@@ -296,7 +295,7 @@ def fit_transform(self, x, dataset):
         self._init_vectorizer(self._language)
         self.builtin_entity_scope = set(
             e for e in dataset[ENTITIES] if is_builtin_entity(e))
-        preprocessed_data = self._preprocess(x, training=True)
+        preprocessed_data = self._preprocess(x)
         utterances = [
             self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters)
             for u, builtin_ents, custom_ents, w_clusters
@@ -330,31 +329,30 @@ def transform(self, x):
             for data in zip(*self._preprocess(x))]
         return self._tfidf_vectorizer.transform(utterances)

-    def _preprocess(self, utterances, training=False):
+    def _preprocess(self, utterances):
         normalized_utterances = deepcopy(utterances)
         for u in normalized_utterances:
-            for chunk in u[DATA]:
+            nb_chunks = len(u[DATA])
+            for i, chunk in enumerate(u[DATA]):
                 chunk[TEXT] = _normalize_stem(
                     chunk[TEXT], self.language, self.resources,
                     self.config.use_stemming)
-
-        if training:
-            builtin_ents, custom_ents = zip(
-                *[_entities_from_utterance(u) for u in utterances])
-        else:
-            # Extract builtin entities on unormalized utterances
-            builtin_ents = [
-                self.builtin_entity_parser.parse(
-                    get_text_from_chunks(u[DATA]),
-                    self.builtin_entity_scope, use_cache=True)
-                for u in utterances
-            ]
-            # Extract builtin entities on normalized utterances
-            custom_ents = [
-                self.custom_entity_parser.parse(
-                    get_text_from_chunks(u[DATA]), use_cache=True)
-                for u in normalized_utterances
-            ]
+                if i < nb_chunks - 1:
+                    chunk[TEXT] += " "
+
+        # Extract builtin entities on unormalized utterances
+        builtin_ents = [
+            self.builtin_entity_parser.parse(
+                get_text_from_chunks(u[DATA]),
+                self.builtin_entity_scope, use_cache=True)
+            for u in utterances
+        ]
+        # Extract builtin entities on normalized utterances
+        custom_ents = [
+            self.custom_entity_parser.parse(
+                get_text_from_chunks(u[DATA]), use_cache=True)
+            for u in normalized_utterances
+        ]
         if self.config.word_clusters_name:
             # Extract world clusters on unormalized utterances
             original_utterances_text = [get_text_from_chunks(u[DATA])
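Two things happen in this rework: entities are now always extracted by the parsers (the `training=True` shortcut that read them from annotations is gone, keeping fit-time and transform-time features identical), and every non-final chunk gets a trailing space after stemming so that chunks do not glue together when re-joined into one text. A sketch of the spacing problem, with illustrative chunk values:

```python
# After stemming/normalization a chunk's trailing whitespace can be lost,
# so adjacent annotated chunks would merge when concatenated back together.
chunks = ["weather in ", "Paris"]       # raw annotated chunks
stemmed = [c.strip() for c in chunks]   # a stemmer typically drops the spacing
print("".join(stemmed))                 # "weather inParis"  <- words merged

# Re-inserting a space after every non-final chunk restores the boundaries
repaired = [c + " " if i < len(stemmed) - 1 else c
            for i, c in enumerate(stemmed)]
print("".join(repaired))                # "weather in Paris"
```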
@@ -582,7 +580,7 @@ def fit(self, x, dataset):
         self.builtin_entity_scope = set(
             e for e in dataset[ENTITIES] if is_builtin_entity(e))

-        preprocessed = self._preprocess(list(x), training=True)
+        preprocessed = self._preprocess(list(x))
         utterances = [
             self._enrich_utterance(utterance, builtin_ents, custom_ent)
             for utterance, builtin_ents, custom_ent in zip(*preprocessed)]
@@ -648,7 +646,7 @@ def transform(self, x):
         Raises:
             NotTrained: when the vectorizer is not fitted
         """
-        preprocessed = self._preprocess(x, training=False)
+        preprocessed = self._preprocess(x)
         utterances = [
             self._enrich_utterance(utterance, builtin_ents, custom_ent)
             for utterance, builtin_ents, custom_ent in zip(*preprocessed)]
@@ -661,24 +659,20 @@ def transform(self, x):

         return x_coo.tocsr()

-    def _preprocess(self, x, training=False):
-        if training:
-            builtin_ents, custom_ents = zip(
-                *[_entities_from_utterance(u) for u in x])
-        else:
-            # Extract all entities on unnormalized data
-            builtin_ents = [
-                self.builtin_entity_parser.parse(
-                    get_text_from_chunks(u[DATA]),
-                    self.builtin_entity_scope,
-                    use_cache=True
-                ) for u in x
-            ]
-            custom_ents = [
-                self.custom_entity_parser.parse(
-                    get_text_from_chunks(u[DATA]), use_cache=True)
-                for u in x
-            ]
+    def _preprocess(self, x):
+        # Extract all entities on unnormalized data
+        builtin_ents = [
+            self.builtin_entity_parser.parse(
+                get_text_from_chunks(u[DATA]),
+                self.builtin_entity_scope,
+                use_cache=True
+            ) for u in x
+        ]
+        custom_ents = [
+            self.custom_entity_parser.parse(
+                get_text_from_chunks(u[DATA]), use_cache=True)
+            for u in x
+        ]
         return x, builtin_ents, custom_ents

     def _extract_word_pairs(self, utterance):
@@ -805,27 +799,3 @@ def _get_word_cluster_features(query_tokens, clusters_name, resources):
         if cluster is not None:
             cluster_features.append(cluster)
     return cluster_features
-
-
-def _entities_from_utterance(utterance):
-    builtin_ents = []
-    custom_ents = []
-    current_ix = 0
-    for chunk in utterance[DATA]:
-        text = chunk[TEXT]
-        text_length = len(text)
-        if ENTITY in chunk:
-            ent = {
-                ENTITY_KIND: chunk[ENTITY],
-                RES_VALUE: text,
-                RES_MATCH_RANGE: {
-                    START: current_ix,
-                    END: current_ix + text_length
-                }
-            }
-            if is_builtin_entity(ent[ENTITY_KIND]):
-                builtin_ents.append(ent)
-            else:
-                custom_ents.append(ent)
-            current_ix += text_length
-    return builtin_ents, custom_ents
3 changes: 1 addition & 2 deletions snips_nlu/intent_classifier/log_reg_classifier.py
@@ -87,13 +87,12 @@ def fit(self, dataset):

         none_class = max(classes)
         try:
-            self.featurizer = self.featurizer.fit(
+            x = self.featurizer.fit_transform(
                 dataset, utterances, classes, none_class)
         except _EmptyDatasetUtterancesError:
             self.featurizer = None
             return self

-        x = self.featurizer.transform(utterances)
         alpha = get_regularization_factor(dataset)
         self.classifier = SGDClassifier(random_state=random_state,
                                         alpha=alpha, **LOG_REG_ARGS)
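This ties into the intent-classification accuracy fix from the changelog: previously the featurizer was fitted with annotation-derived entities (`training=True`) while `transform` re-parsed the utterances (`training=False`), so training features were computed differently from inference features. Fusing the two calls, with the `training` flag gone from `_preprocess`, guarantees a single code path, and it also featurizes the training set only once. The general scikit-learn-style pattern, shown on a stock vectorizer rather than the snips featurizer:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["book a table for two", "what's the weather in Paris"]
vectorizer = TfidfVectorizer()

# One pass: fit the vocabulary and return the training matrix
x_train = vectorizer.fit_transform(docs)

# Equivalent to, but cheaper than, the two-step version
x_train_2 = vectorizer.fit(docs).transform(docs)
assert (x_train != x_train_2).nnz == 0  # same sparse matrix contents
```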
20 changes: 2 additions & 18 deletions snips_nlu/intent_classifier/log_reg_classifier_utils.py
@@ -9,7 +9,7 @@
 import numpy as np
 from future.utils import iteritems, itervalues

-from snips_nlu.constants import (DATA, ENTITIES, ENTITY, INTENTS, TEXT,
+from snips_nlu.constants import (DATA, ENTITY, INTENTS, TEXT,
                                  UNKNOWNWORD, UTTERANCES)
 from snips_nlu.data_augmentation import augment_utterances
 from snips_nlu.dataset import get_text_from_chunks
@@ -108,22 +108,6 @@ def add_unknown_word_to_utterances(utterances, replacement_string,
     return new_utterances


-def get_dataset_specific_noise(dataset, resources):
-    """Return a noise list that excludes the dataset entity values"""
-    entities_values = set()
-    for ent_name, ent in iteritems(dataset[ENTITIES]):
-        if is_builtin_entity(ent_name):
-            continue
-        for k, v in iteritems(ent[UTTERANCES]):
-            entities_values.add(k)
-            entities_values.add(v)
-    original_noise = get_noise(resources)
-    specific_noise = [n for n in original_noise if n not in entities_values]
-    if not specific_noise:  # Avoid returning an empty noise
-        return original_noise
-    return specific_noise
-
-
 def build_training_data(dataset, language, data_augmentation_config, resources,
                         random_state):
     # Create class mapping
@@ -164,7 +148,7 @@ def build_training_data(dataset, language, data_augmentation_config, resources,
     )

     # Adding noise
-    noise = get_dataset_specific_noise(dataset, resources)
+    noise = get_noise(resources)
     noisy_utterances = generate_noise_utterances(
         augmented_utterances, noise, len(intents), data_augmentation_config,
         language, random_state)
3 changes: 2 additions & 1 deletion snips_nlu/intent_parser/deterministic_intent_parser.py
@@ -127,6 +127,7 @@ def fit(self, dataset, force_retrain=True):
         """Fits the intent parser with a valid Snips dataset"""
         logger.info("Fitting deterministic parser...")
         dataset = validate_and_format_dataset(dataset)
+        self.load_resources_if_needed(dataset[LANGUAGE])
         self.fit_builtin_entity_parser_if_needed(dataset)
         self.fit_custom_entity_parser_if_needed(dataset)
         self.language = dataset[LANGUAGE]
@@ -313,7 +314,7 @@ def _get_matching_result(self, text, processed_text, regex, intent,
         for group_name in found_result.groupdict():
             ref_group_name = group_name
             if "_" in group_name:
-                ref_group_name = group_name[:(len(group_name) - 2)]
+                ref_group_name = group_name.split("_")[0]
             slot_name = self.group_names_to_slot_names[ref_group_name]
             entity = self.slot_names_to_entities[intent][slot_name]
             rng = (found_result.start(group_name),
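The old slicing assumed every disambiguating suffix was exactly two characters (`_1` through `_9`), so with ten or more groups a name like `group2_10` was cut to `group2_` and the slot-name lookup failed; splitting on the underscore recovers the base name for suffixes of any length (group names here are assumed to follow the parser's `<base>_<index>` convention):

```python
def old_ref(group_name):
    return group_name[:(len(group_name) - 2)]  # strips exactly two characters

def new_ref(group_name):
    return group_name.split("_")[0]

assert old_ref("group2_1") == "group2"    # fine for single-digit suffixes
assert old_ref("group2_10") == "group2_"  # broken: leftover underscore
assert new_ref("group2_10") == "group2"   # works for any suffix length
```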
5 changes: 3 additions & 2 deletions snips_nlu/pipeline/configs/intent_parser.py
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals

 from snips_nlu.common.from_dict import FromDict
-from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE
+from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE, STOP_WORDS
 from snips_nlu.entity_parser import CustomEntityParserUsage
 from snips_nlu.pipeline.configs import ProcessingUnitConfig
 from snips_nlu.resources import merge_required_resources
@@ -84,7 +84,8 @@ def unit_name(self):

     def get_required_resources(self):
         return {
-            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
+            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS,
+            STOP_WORDS: self.ignore_stop_words
         }

     def to_dict(self):
4 changes: 4 additions & 0 deletions snips_nlu/slot_filler/crf_slot_filler.py
@@ -7,6 +7,7 @@
 import shutil
 import tempfile
 from builtins import range
+from copy import deepcopy
 from pathlib import Path

 from future.utils import iteritems
@@ -48,6 +49,9 @@ class CRFSlotFiller(SlotFiller):
     def __init__(self, config=None, **shared):
         """The CRF slot filler can be configured by passing a
         :class:`.CRFSlotFillerConfig`"""
+        # The CRFSlotFillerConfig must be deep-copied as it is mutated when
+        # fitting the feature factories
+        config = deepcopy(config)
         super(CRFSlotFiller, self).__init__(config, **shared)
         self.crf_model = None
         self.features_factories = [
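Without the defensive copy, fitting one slot filler mutated the `CRFSlotFillerConfig` passed in, so a second unit built from the same config object inherited the mutated state (the changelog's `CRFSlotFillerConfig` mutation bug). A generic illustration of the hazard, not the actual config class:

```python
from copy import deepcopy

shared_config = {"feature_factory_configs": []}

class Unit(object):
    def __init__(self, config):
        self.config = deepcopy(config)  # defensive copy, as in the fix

    def fit(self):
        # fitting appends factory state to the (copied) config
        self.config["feature_factory_configs"].append("fitted-state")

unit = Unit(shared_config)
unit.fit()
assert shared_config["feature_factory_configs"] == []  # caller's config untouched
```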
1 change: 1 addition & 0 deletions snips_nlu/tests/test_crf_slot_filler.py
@@ -97,6 +97,7 @@ def test_should_get_sub_builtin_slots(self):
             - find me something from [start](9am) to [end](12pm)
             - I need a break from [start](2pm) until [end](4pm)
             - Can you suggest something from [start](april 4th) until [end](april 6th) ?
+            - find an activity from [start](6pm) to [end](8pm)
             - Book me a trip from [start](this friday) to [end](next tuesday)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
         config = CRFSlotFillerConfig(random_seed=42)
(Diffs for the remaining 4 of the 18 changed files were not loaded.)
