Initial implementation of NLU YAML parser #5989

Merged: 4 commits, Jul 6, 2020
Changes from all commits
10 changes: 10 additions & 0 deletions data/rasa_yaml_examples/nlu.yml
@@ -0,0 +1,10 @@
nlu:
- intent: estimate_emissions
  # Arbitrary metadata
  metadata:
    author: Some example metadata!
    key: value
  # Multiline examples, each line is a separate training example.
  examples: |
    how much CO2 will that use?
    how much carbon will a one way flight from [new york]{"entity": "city", "role": "from"} to california produce?
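The inline annotation [new york]{"entity": "city", "role": "from"} marks an entity span together with a JSON dict of entity attributes (type, role, group, value). Below is a rough, hypothetical usage sketch, assuming the new RasaYAMLReader follows the existing TrainingDataReader interface (a read() method that returns a TrainingData object):

# Sketch only: assumes RasaYAMLReader exposes the same read()/reads()
# interface as the other TrainingDataReader subclasses.
from rasa.nlu.training_data.formats import RasaYAMLReader

training_data = RasaYAMLReader().read("data/rasa_yaml_examples/nlu.yml")

for message in training_data.training_examples:
    # Each line under "examples: |" is expected to become one Message
    # carrying its intent and any annotated entities.
    print(message.text, message.get("intent"), message.get("entities"))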
17 changes: 4 additions & 13 deletions rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -4,12 +4,12 @@
from typing import Any, Dict, List, Optional, Text, Union, Type, Tuple

import numpy as np
import scipy.sparse

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
import rasa.utils.io
import rasa.utils.io
import scipy.sparse
from rasa.nlu import utils
from rasa.nlu.components import Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.constants import (
    RESPONSE,
@@ -19,12 +19,10 @@
    FEATURE_TYPE_SEQUENCE,
    FEATURIZER_CLASS_ALIAS,
)
from rasa.nlu.tokenizers.tokenizer import Tokenizer
from rasa.nlu.components import Component
from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features
from rasa.nlu.training_data import Message, TrainingData
import rasa.utils.common as common_utils
from rasa.nlu.model import Metadata
from rasa.nlu.tokenizers.tokenizer import Tokenizer
from rasa.nlu.training_data import Message, TrainingData

logger = logging.getLogger(__name__)

@@ -158,13 +156,6 @@ def _generate_lookup_regex(
        # if it's a list, it should be the elements directly
        if isinstance(lookup_elements, list):
            elements_to_regex = lookup_elements
            common_utils.raise_warning(
                "Directly including lookup tables as a list is deprecated since Rasa "
                "1.6.",
                FutureWarning,
                docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
            )

        # otherwise it's a file path.
        else:

3 changes: 3 additions & 0 deletions rasa/nlu/training_data/__init__.py
@@ -1,3 +1,6 @@
import rasa.nlu.training_data.entities_parser
import rasa.nlu.training_data.synonyms_parser
import rasa.nlu.training_data.lookup_tables_parser
from rasa.nlu.training_data.loading import load_data
from rasa.nlu.training_data.message import Message
from rasa.nlu.training_data.training_data import TrainingData
167 changes: 167 additions & 0 deletions rasa/nlu/training_data/entities_parser.py
@@ -0,0 +1,167 @@
import re
from json import JSONDecodeError
from typing import Text, List, Dict, Match, Optional, NamedTuple, Any

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
from rasa.nlu.constants import (
    ENTITY_ATTRIBUTE_GROUP,
    ENTITY_ATTRIBUTE_TYPE,
    ENTITY_ATTRIBUTE_ROLE,
    ENTITY_ATTRIBUTE_VALUE,
)
from rasa.utils.common import raise_warning

GROUP_ENTITY_VALUE = "value"
GROUP_ENTITY_TYPE = "entity"
GROUP_ENTITY_DICT = "entity_dict"
GROUP_ENTITY_TEXT = "entity_text"
GROUP_COMPLETE_MATCH = 0

# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
ENTITY_REGEX = re.compile(
r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"
)


class EntityAttributes(NamedTuple):
"""Attributes of an entity defined in markdown data."""

type: Text
value: Text
text: Text
group: Optional[Text]
role: Optional[Text]


def find_entities_in_training_example(example: Text) -> List[Dict[Text, Any]]:
"""Extracts entities from an intent example.

Args:
example: Intent example.

Returns:
Extracted entities.
"""
import rasa.nlu.utils as rasa_nlu_utils

entities = []
offset = 0

for match in re.finditer(ENTITY_REGEX, example):
entity_attributes = extract_entity_attributes(match)

start_index = match.start() - offset
end_index = start_index + len(entity_attributes.text)
offset += len(match.group(0)) - len(entity_attributes.text)

entity = rasa_nlu_utils.build_entity(
start_index,
end_index,
entity_attributes.value,
entity_attributes.type,
entity_attributes.role,
entity_attributes.group,
)
entities.append(entity)

return entities


def extract_entity_attributes(match: Match) -> EntityAttributes:
"""Extract the entity attributes, i.e. type, value, etc., from the
regex match.

Args:
match: Regex match to extract the entity attributes from.

Returns:
EntityAttributes object.
"""
entity_text = match.groupdict()[GROUP_ENTITY_TEXT]

if match.groupdict()[GROUP_ENTITY_DICT]:
return extract_entity_attributes_from_dict(entity_text, match)

entity_type = match.groupdict()[GROUP_ENTITY_TYPE]

if match.groupdict()[GROUP_ENTITY_VALUE]:
entity_value = match.groupdict()[GROUP_ENTITY_VALUE]
else:
entity_value = entity_text

return EntityAttributes(entity_type, entity_value, entity_text, None, None)


def extract_entity_attributes_from_dict(
    entity_text: Text, match: Match
) -> EntityAttributes:
"""Extract entity attributes from dict format.

Args:
entity_text: Original entity text.
match: Regex match.

Returns:
Extracted entity attributes.
"""
entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT]
entity_dict = get_validated_dict(entity_dict_str)
return EntityAttributes(
entity_dict.get(ENTITY_ATTRIBUTE_TYPE),
entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
entity_text,
entity_dict.get(ENTITY_ATTRIBUTE_GROUP),
entity_dict.get(ENTITY_ATTRIBUTE_ROLE),
)


def get_validated_dict(json_str: Text) -> Dict[Text, Text]:
"""Converts the provided `json_str` to a valid dict containing the entity
attributes.

Users can specify entity roles, synonyms, groups for an entity in a dict, e.g.
[LA]{"entity": "city", "role": "to", "value": "Los Angeles"}.

Args:
json_str: The entity dict as string without "{}".

Raises:
ValidationError if validation of entity dict fails.
JSONDecodeError if provided entity dict is not valid json.

Returns:
Deserialized and validated `json_str`.
"""
import json
import rasa.utils.validation as validation_utils
import rasa.nlu.schemas.data_schema as schema

# add {} as they are not part of the regex
try:
data = json.loads(f"{{{json_str}}}")
except JSONDecodeError as e:
raise_warning(
f"Incorrect training data format ('{{{json_str}}}'). Make sure your "
f"data is valid.",
docs=DOCS_URL_TRAINING_DATA_NLU,
)
raise e

validation_utils.validate_training_data(data, schema.entity_dict_schema())

return data


def replace_entities(training_example: Text) -> Text:
"""Replace special symbols related to the entities in the provided
training example.

Args:
training_example: Original training example with special symbols.

Returns:
String with removed special symbols.
"""
return re.sub(
ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], training_example
)
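For illustration, here is a small hypothetical usage sketch of the module above, reusing the second training example from data/rasa_yaml_examples/nlu.yml:

from rasa.nlu.training_data.entities_parser import (
    find_entities_in_training_example,
    replace_entities,
)

example = (
    'how much carbon will a one way flight from '
    '[new york]{"entity": "city", "role": "from"} to california produce?'
)

# Returns a list of entity dicts (via build_entity) with start/end offsets
# computed relative to the text with the annotation markup removed.
entities = find_entities_in_training_example(example)

# Strips the [entity_text](...) / [entity_text]{...} markup, leaving plain text.
plain_text = replace_entities(example)

print(entities)    # e.g. one entity with entity="city", role="from", value="new york"
print(plain_text)  # "how much carbon will a one way flight from new york to california produce?"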
2 changes: 2 additions & 0 deletions rasa/nlu/training_data/formats/__init__.py
@@ -1,3 +1,4 @@
from rasa.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
from rasa.nlu.training_data.formats.dialogflow import DialogflowReader
from rasa.nlu.training_data.formats.luis import LuisReader
from rasa.nlu.training_data.formats.markdown import MarkdownReader, MarkdownWriter
@@ -7,3 +8,4 @@
    NLGMarkdownReader,
    NLGMarkdownWriter,
)
from rasa.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
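Re-exporting RasaYAMLReader here makes it available alongside the other format readers. A rough sketch of the intended effect, assuming the generic load_data entry point is (or will be) wired up to recognize the new YAML format; that wiring is not part of this diff:

# Sketch only: format detection for YAML files is an assumption here.
from rasa.nlu.training_data import load_data

training_data = load_data("data/rasa_yaml_examples/nlu.yml")
print(len(training_data.training_examples))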