Initial implementation of NLU YAML parser
- Created new YAML parser
- Extracted common parts of the YAML and MD parsers into separate classes
- Added unit tests
Alexander Khizov committed Jun 17, 2020
1 parent 4151446 commit ac8aae0
Showing 19 changed files with 765 additions and 142 deletions.
10 changes: 10 additions & 0 deletions data/rasa_yaml_examples/nlu.yml
@@ -0,0 +1,10 @@
nlu:
- intent: estimate_emissions
  # Arbitrary metadata
  metadata:
    author: Some example metadata!
    key: value
  # Multiline examples, each line is a separate training example.
  examples: |
    how much CO2 will that use?
    how much carbon will a one way flight from [new york]{"entity": "city", "role": "from"} to california produce?
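For context (not part of the diff): a minimal sketch of how an example file like this could be loaded with the new reader, assuming RasaYAMLReader follows the usual TrainingDataReader interface, i.e. a read() method that returns a TrainingData object.

from rasa.nlu.training_data.formats import RasaYAMLReader

# Hypothetical usage; the reader's implementation is not rendered in this view.
reader = RasaYAMLReader()
training_data = reader.read("data/rasa_yaml_examples/nlu.yml")

# Each line of the `examples: |` block should become its own training example.
for message in training_data.training_examples:
    print(message.get("intent"), "->", message.text)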
7 changes: 5 additions & 2 deletions rasa/data.py
@@ -11,6 +11,7 @@

logger = logging.getLogger(__name__)
MARKDOWN_FILE_EXTENSION = ".md"
YAML_FILE_EXTENSION = ".yml"
JSON_FILE_EXTENSION = ".json"


@@ -123,8 +124,10 @@ def _find_core_nlu_files_in_directory(directory: Text,) -> Tuple[Set[Text], Set[

def _is_valid_filetype(path: Text) -> bool:
    is_file = os.path.isfile(path)
    is_datafile = path.endswith(JSON_FILE_EXTENSION) or path.endswith(
        MARKDOWN_FILE_EXTENSION
    is_datafile = (
        path.endswith(JSON_FILE_EXTENSION)
        or path.endswith(MARKDOWN_FILE_EXTENSION)
        or path.endswith(YAML_FILE_EXTENSION)
    )

    return is_file and is_datafile
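Illustration (not part of the diff): with the extra check, .yml training data files now pass the filetype filter alongside .md and .json. A small sketch of the intended behaviour, assuming the listed paths exist on disk, since the helper also requires os.path.isfile() to be true:

_is_valid_filetype("data/nlu.yml")    # True with this change
_is_valid_filetype("data/nlu.md")     # True, as before
_is_valid_filetype("data/notes.txt")  # False, not a recognised data extension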
12 changes: 6 additions & 6 deletions rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -145,12 +145,12 @@ def _generate_lookup_regex(
        # if it's a list, it should be the elements directly
        if isinstance(lookup_elements, list):
            elements_to_regex = lookup_elements
            common_utils.raise_warning(
                "Directly including lookup tables as a list is deprecated since Rasa "
                "1.6.",
                FutureWarning,
                docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
            )
            # common_utils.raise_warning(
            #     "Directly including lookup tables as a list is deprecated since Rasa "
            #     "1.6.",
            #     FutureWarning,
            #     docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
            # )

        # otherwise it's a file path.
        else:
149 changes: 149 additions & 0 deletions rasa/nlu/training_data/entities_parser.py
@@ -0,0 +1,149 @@
# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
import re
from json import JSONDecodeError
from typing import Text, List, Dict, Match, Optional, NamedTuple

from rasa.nlu.utils import build_entity

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU

from rasa.nlu.constants import (
    ENTITY_ATTRIBUTE_GROUP,
    ENTITY_ATTRIBUTE_TYPE,
    ENTITY_ATTRIBUTE_ROLE,
    ENTITY_ATTRIBUTE_VALUE,
)

from rasa.utils.common import raise_warning

GROUP_ENTITY_VALUE = "value"
GROUP_ENTITY_TYPE = "entity"
GROUP_ENTITY_DICT = "entity_dict"
GROUP_ENTITY_TEXT = "entity_text"
GROUP_COMPLETE_MATCH = 0

# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
ENTITY_REGEX = re.compile(
    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"
)


class EntityAttributes(NamedTuple):
    """Attributes of an entity defined in the markdown data."""

    type: Text
    value: Text
    text: Text
    group: Optional[Text]
    role: Optional[Text]


class EntitiesParser:
    @staticmethod
    def find_entities_in_training_example(example: Text) -> List[Dict]:
        """Extracts entities from an intent example.
        Args:
            example: intent example
        Returns: list of extracted entities
        """
        entities = []
        offset = 0

        for match in re.finditer(ENTITY_REGEX, example):
            entity_attributes = EntitiesParser._extract_entity_attributes(match)

            start_index = match.start() - offset
            end_index = start_index + len(entity_attributes.text)
            offset += len(match.group(0)) - len(entity_attributes.text)

            entity = build_entity(
                start_index,
                end_index,
                entity_attributes.value,
                entity_attributes.type,
                entity_attributes.role,
                entity_attributes.group,
            )
            entities.append(entity)

        return entities

    @staticmethod
    def _extract_entity_attributes(match: Match) -> EntityAttributes:
        """Extract the entity attributes, i.e. type, value, etc., from the
        regex match."""
        entity_text = match.groupdict()[GROUP_ENTITY_TEXT]

        if match.groupdict()[GROUP_ENTITY_DICT]:
            return EntitiesParser._extract_entity_attributes_from_dict(
                entity_text, match
            )

        entity_type = match.groupdict()[GROUP_ENTITY_TYPE]

        if match.groupdict()[GROUP_ENTITY_VALUE]:
            entity_value = match.groupdict()[GROUP_ENTITY_VALUE]
        else:
            entity_value = entity_text

        return EntityAttributes(entity_type, entity_value, entity_text, None, None)

    @staticmethod
    def _extract_entity_attributes_from_dict(
        entity_text: Text, match: Match
    ) -> EntityAttributes:
        """Extract the entity attributes from the dict format."""
        entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT]
        entity_dict = EntitiesParser._get_validated_dict(entity_dict_str)
        return EntityAttributes(
            entity_dict.get(ENTITY_ATTRIBUTE_TYPE),
            entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
            entity_text,
            entity_dict.get(ENTITY_ATTRIBUTE_GROUP),
            entity_dict.get(ENTITY_ATTRIBUTE_ROLE),
        )

    @staticmethod
    def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
        """Converts the provided json_str to a valid dict containing the entity
        attributes.
        Users can specify entity roles, synonyms, groups for an entity in a dict, e.g.
        [LA]{"entity": "city", "role": "to", "value": "Los Angeles"}
        Args:
            json_str: the entity dict as string without "{}"
        Raises:
            ValidationError if validation of entity dict fails.
            JSONDecodeError if provided entity dict is not valid json.
        Returns:
            a proper python dict
        """
        import json
        import rasa.utils.validation as validation_utils
        import rasa.nlu.schemas.data_schema as schema

        # add {} as they are not part of the regex
        try:
            data = json.loads(f"{{{json_str}}}")
        except JSONDecodeError as e:
            raise_warning(
                f"Incorrect training data format ('{{{json_str}}}'), make sure your "
                f"data is valid. For more information about the format visit "
                f"{DOCS_URL_TRAINING_DATA_NLU}."
            )
            raise e

        validation_utils.validate_training_data(data, schema.entity_dict_schema())

        return data

    @staticmethod
    def replace_entities(training_example: Text) -> Text:
        return re.sub(
            ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], training_example
        )
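A usage sketch for the new helper (illustrative only, not part of the diff), based on the static methods above:

from rasa.nlu.training_data.entities_parser import EntitiesParser

example = 'fly from [new york]{"entity": "city", "role": "from"} to [california](city)'

# Extract the annotated entities; each entry is built via build_entity() and
# carries the start/end offsets in the plain text plus the entity type, value
# and any role/group.
entities = EntitiesParser.find_entities_in_training_example(example)

# Strip the markdown-style annotations, leaving the text the model trains on.
plain_text = EntitiesParser.replace_entities(example)
# -> "fly from new york to california"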
1 change: 1 addition & 0 deletions rasa/nlu/training_data/formats/__init__.py
@@ -1,3 +1,4 @@
from rasa.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
from rasa.nlu.training_data.formats.dialogflow import DialogflowReader
from rasa.nlu.training_data.formats.luis import LuisReader
from rasa.nlu.training_data.formats.markdown import MarkdownReader, MarkdownWriter
134 changes: 20 additions & 114 deletions rasa/nlu/training_data/formats/markdown.py
@@ -3,17 +3,10 @@
import typing
from collections import OrderedDict
from json import JSONDecodeError
from typing import Any, Text, Optional, Tuple, List, Dict, NamedTuple, Match
from typing import Any, Text, Optional, Tuple, Dict, Match

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
from rasa.core.constants import INTENT_MESSAGE_PREFIX

from rasa.nlu.training_data.formats.readerwriter import (
    TrainingDataReader,
    TrainingDataWriter,
)
from rasa.nlu.utils import build_entity
from rasa.utils.common import raise_warning
from rasa.nlu.constants import (
    ENTITY_ATTRIBUTE_GROUP,
    ENTITY_ATTRIBUTE_TYPE,
@@ -22,6 +15,14 @@
    ENTITY_ATTRIBUTE_END,
    ENTITY_ATTRIBUTE_START,
)
from rasa.nlu.training_data.entities_parser import EntitiesParser, ENTITY_REGEX
from rasa.nlu.training_data.formats.readerwriter import (
    TrainingDataReader,
    TrainingDataWriter,
)
from rasa.nlu.training_data.lookup_tables_parser import LookupTablesParser
from rasa.nlu.training_data.synonyms_parser import SynonymsParser
from rasa.utils.common import raise_warning

GROUP_ENTITY_VALUE = "value"
GROUP_ENTITY_TYPE = "entity"
Expand All @@ -38,10 +39,6 @@
LOOKUP = "lookup"
available_sections = [INTENT, SYNONYM, REGEX, LOOKUP]

# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
entity_regex = re.compile(
    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"
)
item_regex = re.compile(r"\s*[-*+]\s*(.+)")
comment_regex = re.compile(r"<!--[\s\S]*?--!*>", re.MULTILINE)
fname_regex = re.compile(r"\s*([^-*+]+)")
Expand All @@ -51,16 +48,6 @@
ESCAPE = re.compile(r"[\b\f\n\r\t]")


class EntityAttributes(NamedTuple):
    """Attributes of an entity defined in the markdown data."""

    type: Text
    value: Text
    text: Text
    group: Optional[Text]
    role: Optional[Text]


def encode_string(s: Text) -> Text:
    """Return a encoded python string."""

@@ -158,25 +145,17 @@ def _parse_item(self, line: Text) -> None:
                parsed = self.parse_training_example(item)
                self.training_examples.append(parsed)
            elif self.current_section == SYNONYM:
                self._add_synonym(item, self.current_title)
                SynonymsParser.add_synonym(
                    item, self.current_title, self.entity_synonyms
                )
            elif self.current_section == REGEX:
                self.regex_features.append(
                    {"name": self.current_title, "pattern": item}
                )
            elif self.current_section == LOOKUP:
                self._add_item_to_lookup(item)

    def _add_item_to_lookup(self, item: Text) -> None:
        """Takes a list of lookup table dictionaries. Finds the one associated
        with the current lookup, then adds the item to the list."""
        matches = [
            table for table in self.lookup_tables if table["name"] == self.current_title
        ]
        if not matches:
            self.lookup_tables.append({"name": self.current_title, "elements": [item]})
        else:
            elements = matches[0]["elements"]
            elements.append(item)
                LookupTablesParser.add_item_to_lookup_tables(
                    self.current_title, item, self.lookup_tables
                )

    @staticmethod
    def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
@@ -215,90 +194,17 @@ def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:

        return data

    def _find_entities_in_training_example(self, example: Text) -> List[Dict]:
        """Extracts entities from a markdown intent example.
        Args:
            example: markdown intent example
        Returns: list of extracted entities
        """
        entities = []
        offset = 0

        for match in re.finditer(entity_regex, example):
            entity_attributes = self._extract_entity_attributes(match)

            start_index = match.start() - offset
            end_index = start_index + len(entity_attributes.text)
            offset += len(match.group(0)) - len(entity_attributes.text)

            entity = build_entity(
                start_index,
                end_index,
                entity_attributes.value,
                entity_attributes.type,
                entity_attributes.role,
                entity_attributes.group,
            )
            entities.append(entity)

        return entities

    def _extract_entity_attributes(self, match: Match) -> EntityAttributes:
        """Extract the entity attributes, i.e. type, value, etc., from the
        regex match."""
        entity_text = match.groupdict()[GROUP_ENTITY_TEXT]

        if match.groupdict()[GROUP_ENTITY_DICT]:
            return self._extract_entity_attributes_from_dict(entity_text, match)

        entity_type = match.groupdict()[GROUP_ENTITY_TYPE]

        if match.groupdict()[GROUP_ENTITY_VALUE]:
            entity_value = match.groupdict()[GROUP_ENTITY_VALUE]
            self._deprecated_synonym_format_was_used = True
        else:
            entity_value = entity_text

        return EntityAttributes(entity_type, entity_value, entity_text, None, None)

    def _extract_entity_attributes_from_dict(
        self, entity_text: Text, match: Match
    ) -> EntityAttributes:
        """Extract the entity attributes from the dict format."""
        entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT]
        entity_dict = self._get_validated_dict(entity_dict_str)
        return EntityAttributes(
            entity_dict.get(ENTITY_ATTRIBUTE_TYPE),
            entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
            entity_text,
            entity_dict.get(ENTITY_ATTRIBUTE_GROUP),
            entity_dict.get(ENTITY_ATTRIBUTE_ROLE),
        )

    def _add_synonym(self, text: Text, value: Text) -> None:
        from rasa.nlu.training_data.util import check_duplicate_synonym

        check_duplicate_synonym(self.entity_synonyms, text, value, "reading markdown")
        self.entity_synonyms[text] = value

    def _add_synonyms(self, plain_text: Text, entities: List[Dict]) -> None:
        """Adds synonyms found in intent examples"""
        for e in entities:
            e_text = plain_text[e[ENTITY_ATTRIBUTE_START] : e[ENTITY_ATTRIBUTE_END]]
            if e_text != e[ENTITY_ATTRIBUTE_VALUE]:
                self._add_synonym(e_text, e[ENTITY_ATTRIBUTE_VALUE])

    def parse_training_example(self, example: Text) -> "Message":
        """Extract entities and synonyms, and convert to plain text."""
        from rasa.nlu.training_data import Message

        entities = self._find_entities_in_training_example(example)
        entities = EntitiesParser.find_entities_in_training_example(example)
        plain_text = re.sub(
            entity_regex, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], example
            ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], example,
        )
        SynonymsParser.add_synonyms_from_entities(
            plain_text, entities, self.entity_synonyms
        )
        self._add_synonyms(plain_text, entities)

        message = Message.build(plain_text, self.current_title)

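The remaining changed files, including the new rasa_yaml reader, the synonyms_parser and lookup_tables_parser modules, and the unit tests, are collapsed in this view. Judging by the new call sites above and the markdown.py methods they replace, the lookup-table helper presumably looks roughly like this sketch (parameter names are guesses; only the argument order is visible in the diff):

from typing import Any, Dict, List, Text


class LookupTablesParser:
    @staticmethod
    def add_item_to_lookup_tables(
        title: Text, item: Text, existing_lookup_tables: List[Dict[Text, Any]]
    ) -> None:
        """Add an item to the lookup table with the given title, creating the
        table first if it does not exist yet (mirrors the removed
        _add_item_to_lookup)."""
        matches = [t for t in existing_lookup_tables if t["name"] == title]
        if not matches:
            existing_lookup_tables.append({"name": title, "elements": [item]})
        else:
            matches[0]["elements"].append(item)

SynonymsParser.add_synonym and SynonymsParser.add_synonyms_from_entities presumably take over the removed _add_synonym/_add_synonyms logic in the same way, writing into the entity_synonyms dict that is passed in.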
