Initial implementation of NLU YAML parser
- Created new YAML parser
- Extracted common parts of the YAML and MD parsers into separate classes
- Added unit tests
Alexander Khizov committed Jun 17, 2020
1 parent 4151446 commit ac8aae0
Showing 19 changed files with 765 additions and 142 deletions.
10 changes: 10 additions & 0 deletions data/rasa_yaml_examples/nlu.yml
@@ -0,0 +1,10 @@
nlu:
- intent: estimate_emissions
  # Arbitrary metadata
  metadata:
    author: Some example metadata!
    key: value
  # Multiline examples, each line is a separate training example.
  examples: |
    how much CO2 will that use?
    how much carbon will a one way flight from [new york]{"entity": "city", "role": "from"} to california produce?
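For context (not part of the diff): a minimal sketch of how an example file like this could be loaded with the new reader, assuming RasaYAMLReader follows the usual TrainingDataReader interface, i.e. a read() method that returns a TrainingData object.

from rasa.nlu.training_data.formats import RasaYAMLReader

# Hypothetical usage; the reader's implementation is not rendered in this view.
reader = RasaYAMLReader()
training_data = reader.read("data/rasa_yaml_examples/nlu.yml")

# Each line of the `examples: |` block should become its own training example.
for message in training_data.training_examples:
    print(message.get("intent"), "->", message.text)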
7 changes: 5 additions & 2 deletions rasa/data.py
@@ -11,6 +11,7 @@

logger = logging.getLogger(__name__)
MARKDOWN_FILE_EXTENSION = ".md"
YAML_FILE_EXTENSION = ".yml"
JSON_FILE_EXTENSION = ".json"


@@ -123,8 +124,10 @@ def _find_core_nlu_files_in_directory(directory: Text,) -> Tuple[Set[Text], Set[

def _is_valid_filetype(path: Text) -> bool:
    is_file = os.path.isfile(path)
    is_datafile = path.endswith(JSON_FILE_EXTENSION) or path.endswith(
        MARKDOWN_FILE_EXTENSION
    is_datafile = (
        path.endswith(JSON_FILE_EXTENSION)
        or path.endswith(MARKDOWN_FILE_EXTENSION)
        or path.endswith(YAML_FILE_EXTENSION)
    )

    return is_file and is_datafile
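Illustration (not part of the diff): with the extra check, .yml training data files now pass the filetype filter alongside .md and .json. A small sketch of the intended behaviour, assuming the listed paths exist on disk, since the helper also requires os.path.isfile() to be true:

_is_valid_filetype("data/nlu.yml")    # True with this change
_is_valid_filetype("data/nlu.md")     # True, as before
_is_valid_filetype("data/notes.txt")  # False, not a recognised data extension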
12 changes: 6 additions & 6 deletions rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -145,12 +145,12 @@ def _generate_lookup_regex(
        # if it's a list, it should be the elements directly
        if isinstance(lookup_elements, list):
            elements_to_regex = lookup_elements
            common_utils.raise_warning(
                "Directly including lookup tables as a list is deprecated since Rasa "
                "1.6.",
                FutureWarning,
                docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
            )
            # common_utils.raise_warning(
            #     "Directly including lookup tables as a list is deprecated since Rasa "
            #     "1.6.",
            #     FutureWarning,
            #     docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
            # )

        # otherwise it's a file path.
        else:
149 changes: 149 additions & 0 deletions rasa/nlu/training_data/entities_parser.py
@@ -0,0 +1,149 @@
# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
import re
from json import JSONDecodeError
from typing import Text, List, Dict, Match, Optional, NamedTuple

from rasa.nlu.utils import build_entity

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU

from rasa.nlu.constants import (
    ENTITY_ATTRIBUTE_GROUP,
    ENTITY_ATTRIBUTE_TYPE,
    ENTITY_ATTRIBUTE_ROLE,
    ENTITY_ATTRIBUTE_VALUE,
)

from rasa.utils.common import raise_warning

GROUP_ENTITY_VALUE = "value"
GROUP_ENTITY_TYPE = "entity"
GROUP_ENTITY_DICT = "entity_dict"
GROUP_ENTITY_TEXT = "entity_text"
GROUP_COMPLETE_MATCH = 0

# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
ENTITY_REGEX = re.compile(
    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"
)


class EntityAttributes(NamedTuple):
    """Attributes of an entity defined in the markdown data."""

    type: Text
    value: Text
    text: Text
    group: Optional[Text]
    role: Optional[Text]


class EntitiesParser:
    @staticmethod
    def find_entities_in_training_example(example: Text) -> List[Dict]:
        """Extracts entities from an intent example.
        Args:
            example: intent example
        Returns: list of extracted entities
        """
        entities = []
        offset = 0

        for match in re.finditer(ENTITY_REGEX, example):
            entity_attributes = EntitiesParser._extract_entity_attributes(match)

            start_index = match.start() - offset
            end_index = start_index + len(entity_attributes.text)
            offset += len(match.group(0)) - len(entity_attributes.text)

            entity = build_entity(
                start_index,
                end_index,
                entity_attributes.value,
                entity_attributes.type,
                entity_attributes.role,
                entity_attributes.group,
            )
            entities.append(entity)

        return entities

    @staticmethod
    def _extract_entity_attributes(match: Match) -> EntityAttributes:
        """Extract the entity attributes, i.e. type, value, etc., from the
        regex match."""
        entity_text = match.groupdict()[GROUP_ENTITY_TEXT]

        if match.groupdict()[GROUP_ENTITY_DICT]:
            return EntitiesParser._extract_entity_attributes_from_dict(
                entity_text, match
            )

        entity_type = match.groupdict()[GROUP_ENTITY_TYPE]

        if match.groupdict()[GROUP_ENTITY_VALUE]:
            entity_value = match.groupdict()[GROUP_ENTITY_VALUE]
        else:
            entity_value = entity_text

        return EntityAttributes(entity_type, entity_value, entity_text, None, None)

    @staticmethod
    def _extract_entity_attributes_from_dict(
        entity_text: Text, match: Match
    ) -> EntityAttributes:
        """Extract the entity attributes from the dict format."""
        entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT]
        entity_dict = EntitiesParser._get_validated_dict(entity_dict_str)
        return EntityAttributes(
            entity_dict.get(ENTITY_ATTRIBUTE_TYPE),
            entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
            entity_text,
            entity_dict.get(ENTITY_ATTRIBUTE_GROUP),
            entity_dict.get(ENTITY_ATTRIBUTE_ROLE),
        )

    @staticmethod
    def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
        """Converts the provided json_str to a valid dict containing the entity
        attributes.
        Users can specify entity roles, synonyms, groups for an entity in a dict, e.g.
        [LA]{"entity": "city", "role": "to", "value": "Los Angeles"}
        Args:
            json_str: the entity dict as string without "{}"
        Raises:
            ValidationError if validation of entity dict fails.
            JSONDecodeError if provided entity dict is not valid json.
        Returns:
            a proper python dict
        """
        import json
        import rasa.utils.validation as validation_utils
        import rasa.nlu.schemas.data_schema as schema

        # add {} as they are not part of the regex
        try:
            data = json.loads(f"{{{json_str}}}")
        except JSONDecodeError as e:
            raise_warning(
                f"Incorrect training data format ('{{{json_str}}}'), make sure your "
                f"data is valid. For more information about the format visit "
                f"{DOCS_URL_TRAINING_DATA_NLU}."
            )
            raise e

        validation_utils.validate_training_data(data, schema.entity_dict_schema())

        return data

    @staticmethod
    def replace_entities(training_example: Text) -> Text:
        return re.sub(
            ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], training_example
        )
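A usage sketch for the new helper (illustrative only, not part of the diff), based on the static methods above:

from rasa.nlu.training_data.entities_parser import EntitiesParser

example = 'fly from [new york]{"entity": "city", "role": "from"} to [california](city)'

# Extract the annotated entities; each entry is built via build_entity() and
# carries the start/end offsets in the plain text plus the entity type, value
# and any role/group.
entities = EntitiesParser.find_entities_in_training_example(example)

# Strip the markdown-style annotations, leaving the text the model trains on.
plain_text = EntitiesParser.replace_entities(example)
# -> "fly from new york to california"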
1 change: 1 addition & 0 deletions rasa/nlu/training_data/formats/__init__.py
@@ -1,3 +1,4 @@
from rasa.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
from rasa.nlu.training_data.formats.dialogflow import DialogflowReader
from rasa.nlu.training_data.formats.luis import LuisReader
from rasa.nlu.training_data.formats.markdown import MarkdownReader, MarkdownWriter
134 changes: 20 additions & 114 deletions rasa/nlu/training_data/formats/markdown.py
@@ -3,17 +3,10 @@
import typing
from collections import OrderedDict
from json import JSONDecodeError
from typing import Any, Text, Optional, Tuple, List, Dict, NamedTuple, Match
from typing import Any, Text, Optional, Tuple, Dict, Match

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
from rasa.core.constants import INTENT_MESSAGE_PREFIX

from rasa.nlu.training_data.formats.readerwriter import (
    TrainingDataReader,
    TrainingDataWriter,
)
from rasa.nlu.utils import build_entity
from rasa.utils.common import raise_warning
from rasa.nlu.constants import (
    ENTITY_ATTRIBUTE_GROUP,
    ENTITY_ATTRIBUTE_TYPE,
@@ -22,6 +15,14 @@
    ENTITY_ATTRIBUTE_END,
    ENTITY_ATTRIBUTE_START,
)
from rasa.nlu.training_data.entities_parser import EntitiesParser, ENTITY_REGEX
from rasa.nlu.training_data.formats.readerwriter import (
    TrainingDataReader,
    TrainingDataWriter,
)
from rasa.nlu.training_data.lookup_tables_parser import LookupTablesParser
from rasa.nlu.training_data.synonyms_parser import SynonymsParser
from rasa.utils.common import raise_warning

GROUP_ENTITY_VALUE = "value"
GROUP_ENTITY_TYPE = "entity"
Expand All @@ -38,10 +39,6 @@
LOOKUP = "lookup"
available_sections = [INTENT, SYNONYM, REGEX, LOOKUP]

# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
entity_regex = re.compile(
    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"
)
item_regex = re.compile(r"\s*[-*+]\s*(.+)")
comment_regex = re.compile(r"<!--[\s\S]*?--!*>", re.MULTILINE)
fname_regex = re.compile(r"\s*([^-*+]+)")
Expand All @@ -51,16 +48,6 @@
ESCAPE = re.compile(r"[\b\f\n\r\t]")


class EntityAttributes(NamedTuple):
    """Attributes of an entity defined in the markdown data."""

    type: Text
    value: Text
    text: Text
    group: Optional[Text]
    role: Optional[Text]


def encode_string(s: Text) -> Text:
    """Return a encoded python string."""

@@ -158,25 +145,17 @@ def _parse_item(self, line: Text) -> None:
                parsed = self.parse_training_example(item)
                self.training_examples.append(parsed)
            elif self.current_section == SYNONYM:
                self._add_synonym(item, self.current_title)
                SynonymsParser.add_synonym(
                    item, self.current_title, self.entity_synonyms
                )
            elif self.current_section == REGEX:
                self.regex_features.append(
                    {"name": self.current_title, "pattern": item}
                )
            elif self.current_section == LOOKUP:
                self._add_item_to_lookup(item)

    def _add_item_to_lookup(self, item: Text) -> None:
        """Takes a list of lookup table dictionaries. Finds the one associated
        with the current lookup, then adds the item to the list."""
        matches = [
            table for table in self.lookup_tables if table["name"] == self.current_title
        ]
        if not matches:
            self.lookup_tables.append({"name": self.current_title, "elements": [item]})
        else:
            elements = matches[0]["elements"]
            elements.append(item)
                LookupTablesParser.add_item_to_lookup_tables(
                    self.current_title, item, self.lookup_tables
                )

    @staticmethod
    def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
@@ -215,90 +194,17 @@ def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:

        return data

    def _find_entities_in_training_example(self, example: Text) -> List[Dict]:
        """Extracts entities from a markdown intent example.
        Args:
            example: markdown intent example
        Returns: list of extracted entities
        """
        entities = []
        offset = 0

        for match in re.finditer(entity_regex, example):
            entity_attributes = self._extract_entity_attributes(match)

            start_index = match.start() - offset
            end_index = start_index + len(entity_attributes.text)
            offset += len(match.group(0)) - len(entity_attributes.text)

            entity = build_entity(
                start_index,
                end_index,
                entity_attributes.value,
                entity_attributes.type,
                entity_attributes.role,
                entity_attributes.group,
            )
            entities.append(entity)

        return entities

    def _extract_entity_attributes(self, match: Match) -> EntityAttributes:
        """Extract the entity attributes, i.e. type, value, etc., from the
        regex match."""
        entity_text = match.groupdict()[GROUP_ENTITY_TEXT]

        if match.groupdict()[GROUP_ENTITY_DICT]:
            return self._extract_entity_attributes_from_dict(entity_text, match)

        entity_type = match.groupdict()[GROUP_ENTITY_TYPE]

        if match.groupdict()[GROUP_ENTITY_VALUE]:
            entity_value = match.groupdict()[GROUP_ENTITY_VALUE]
            self._deprecated_synonym_format_was_used = True
        else:
            entity_value = entity_text

        return EntityAttributes(entity_type, entity_value, entity_text, None, None)

    def _extract_entity_attributes_from_dict(
        self, entity_text: Text, match: Match
    ) -> EntityAttributes:
        """Extract the entity attributes from the dict format."""
        entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT]
        entity_dict = self._get_validated_dict(entity_dict_str)
        return EntityAttributes(
            entity_dict.get(ENTITY_ATTRIBUTE_TYPE),
            entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
            entity_text,
            entity_dict.get(ENTITY_ATTRIBUTE_GROUP),
            entity_dict.get(ENTITY_ATTRIBUTE_ROLE),
        )

    def _add_synonym(self, text: Text, value: Text) -> None:
        from rasa.nlu.training_data.util import check_duplicate_synonym

        check_duplicate_synonym(self.entity_synonyms, text, value, "reading markdown")
        self.entity_synonyms[text] = value

    def _add_synonyms(self, plain_text: Text, entities: List[Dict]) -> None:
        """Adds synonyms found in intent examples"""
        for e in entities:
            e_text = plain_text[e[ENTITY_ATTRIBUTE_START] : e[ENTITY_ATTRIBUTE_END]]
            if e_text != e[ENTITY_ATTRIBUTE_VALUE]:
                self._add_synonym(e_text, e[ENTITY_ATTRIBUTE_VALUE])

    def parse_training_example(self, example: Text) -> "Message":
        """Extract entities and synonyms, and convert to plain text."""
        from rasa.nlu.training_data import Message

        entities = self._find_entities_in_training_example(example)
        entities = EntitiesParser.find_entities_in_training_example(example)
        plain_text = re.sub(
            entity_regex, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], example
            ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], example,
        )
        SynonymsParser.add_synonyms_from_entities(
            plain_text, entities, self.entity_synonyms
        )
        self._add_synonyms(plain_text, entities)

        message = Message.build(plain_text, self.current_title)

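The remaining changed files, including the new rasa_yaml reader, the synonyms_parser and lookup_tables_parser modules, and the unit tests, are collapsed in this view. Judging by the new call sites above and the markdown.py methods they replace, the lookup-table helper presumably looks roughly like this sketch (parameter names are guesses; only the argument order is visible in the diff):

from typing import Any, Dict, List, Text


class LookupTablesParser:
    @staticmethod
    def add_item_to_lookup_tables(
        title: Text, item: Text, existing_lookup_tables: List[Dict[Text, Any]]
    ) -> None:
        """Add an item to the lookup table with the given title, creating the
        table first if it does not exist yet (mirrors the removed
        _add_item_to_lookup)."""
        matches = [t for t in existing_lookup_tables if t["name"] == title]
        if not matches:
            existing_lookup_tables.append({"name": title, "elements": [item]})
        else:
            matches[0]["elements"].append(item)

SynonymsParser.add_synonym and SynonymsParser.add_synonyms_from_entities presumably take over the removed _add_synonym/_add_synonyms logic in the same way, writing into the entity_synonyms dict that is passed in.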
