Initial commit

RasaHQ · Jun 10, 2020 · 1611414 · 1611414
1 parent 2725bda
commit 1611414
Show file tree

Hide file tree

Showing 9 changed files with 372 additions and 7 deletions.
diff --git a/rasa/data.py b/rasa/data.py
@@ -120,7 +120,9 @@ def _find_core_nlu_files_in_directory(directory: Text,) -> Tuple[Set[Text], Set[
 
 def _is_valid_filetype(path: Text) -> bool:
     is_file = os.path.isfile(path)
-    is_datafile = path.endswith(".json") or path.endswith(".md")
+    is_datafile = (
+        path.endswith(".json") or path.endswith(".md") or path.endswith(".yml")
+    )
 
     return is_file and is_datafile
 

diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -145,12 +145,12 @@ def _generate_lookup_regex(
         # if it's a list, it should be the elements directly
         if isinstance(lookup_elements, list):
             elements_to_regex = lookup_elements
-            common_utils.raise_warning(
-                "Directly including lookup tables as a list is deprecated since Rasa "
-                "1.6.",
-                FutureWarning,
-                docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
-            )
+            # common_utils.raise_warning(
+            #     "Directly including lookup tables as a list is deprecated since Rasa "
+            #     "1.6.",
+            #     FutureWarning,
+            #     docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
+            # )
 
         # otherwise it's a file path.
         else:

diff --git a/rasa/nlu/training_data/entities_parser.py b/rasa/nlu/training_data/entities_parser.py
@@ -0,0 +1,141 @@
+# Helpers for parsing inline entity annotations in NLU training examples, i.e.
+# `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
+import re
+from json import JSONDecodeError
+from typing import Text, List, Dict, Match
+
+from rasa.nlu.training_data.formats.markdown import EntityAttributes
+from rasa.nlu.utils import build_entity
+
+from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
+
+from rasa.nlu.constants import (
+    ENTITY_ATTRIBUTE_GROUP,
+    ENTITY_ATTRIBUTE_TYPE,
+    ENTITY_ATTRIBUTE_ROLE,
+    ENTITY_ATTRIBUTE_VALUE,
+)
+
+from rasa.utils.common import raise_warning
+
+# Names of the named capture groups used in `entity_regex` below.
+GROUP_ENTITY_VALUE = "value"
+GROUP_ENTITY_TYPE = "entity"
+GROUP_ENTITY_DICT = "entity_dict"
+GROUP_ENTITY_TEXT = "entity_text"
+# Index of the complete match (group 0) of a regex match object.
+GROUP_COMPLETE_MATCH = 0
+
+# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
+# The bracketed text is followed either by `(entity_type)` /
+# `(entity_type:entity_synonym)` or by `{entity_dict}`; the surrounding braces
+# of the dict variant are not part of the `entity_dict` group.
+entity_regex = re.compile(
+    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"
+)
+
+
+class EntitiesParser:
+    """Static helpers for reading inline entity annotations in training examples."""
+
+    @staticmethod
+    def find_entities_in_training_example(example: Text) -> List[Dict]:
+        """Collects all entities annotated in an intent example.
+
+        Args:
+            example: intent example, possibly containing entity annotations
+
+        Returns: list of extracted entities, in order of appearance; their
+            indices refer to the example with the annotation markup removed
+        """
+        found = []
+        # number of markup characters dropped by the annotations seen so far
+        removed_chars = 0
+
+        for annotation in entity_regex.finditer(example):
+            attributes = EntitiesParser._extract_entity_attributes(annotation)
+            text_length = len(attributes.text)
+
+            start = annotation.start() - removed_chars
+            removed_chars += len(annotation.group(0)) - text_length
+
+            found.append(
+                build_entity(
+                    start,
+                    start + text_length,
+                    attributes.value,
+                    attributes.type,
+                    attributes.role,
+                    attributes.group,
+                )
+            )
+
+        return found
+
+    @staticmethod
+    def _extract_entity_attributes(match: Match) -> EntityAttributes:
+        """Builds the entity attributes (type, value, text, ...) for one regex
+        match of `entity_regex`."""
+        groups = match.groupdict()
+        entity_text = groups[GROUP_ENTITY_TEXT]
+
+        # the `{...}` variant carries all attributes in a JSON dict
+        if groups[GROUP_ENTITY_DICT]:
+            return EntitiesParser._extract_entity_attributes_from_dict(
+                entity_text, match
+            )
+
+        # `(type:value)` variant: without an explicit value the annotated text
+        # itself is used as the value
+        entity_value = groups[GROUP_ENTITY_VALUE] or entity_text
+
+        return EntityAttributes(
+            groups[GROUP_ENTITY_TYPE], entity_value, entity_text, None, None
+        )
+
+    @staticmethod
+    def _extract_entity_attributes_from_dict(
+        entity_text: Text, match: Match
+    ) -> EntityAttributes:
+        """Builds the entity attributes from the dict variant of an annotation."""
+        attributes = EntitiesParser._get_validated_dict(
+            match.groupdict()[GROUP_ENTITY_DICT]
+        )
+
+        entity_type = attributes.get(ENTITY_ATTRIBUTE_TYPE)
+        # the annotated text doubles as value when the dict does not define one
+        entity_value = attributes.get(ENTITY_ATTRIBUTE_VALUE, entity_text)
+        entity_group = attributes.get(ENTITY_ATTRIBUTE_GROUP)
+        entity_role = attributes.get(ENTITY_ATTRIBUTE_ROLE)
+
+        return EntityAttributes(
+            entity_type, entity_value, entity_text, entity_group, entity_role
+        )
+
+    @staticmethod
+    def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
+        """Turns the provided json_str into a validated dict of entity attributes.
+
+        Entity roles, synonyms and groups can be given for an entity in a dict, e.g.
+        [LA]{"entity": "city", "role": "to", "value": "Los Angeles"}
+
+        Args:
+            json_str: the entity dict as string without "{}"
+
+        Raises:
+            ValidationError if validation of entity dict fails.
+            JSONDecodeError if provided entity dict is not valid json.
+
+        Returns:
+            the parsed and validated python dict
+        """
+        import json
+        import rasa.utils.validation as validation_utils
+        import rasa.nlu.schemas.data_schema as schema
+
+        try:
+            # the regex strips the surrounding braces, so put them back
+            parsed = json.loads(f"{{{json_str}}}")
+        except JSONDecodeError:
+            raise_warning(
+                f"Incorrect training data format ('{{{json_str}}}'), make sure your "
+                f"data is valid. For more information about the format visit "
+                f"{DOCS_URL_TRAINING_DATA_NLU}."
+            )
+            raise
+
+        validation_utils.validate_training_data(parsed, schema.entity_dict_schema())
+
+        return parsed
+
+    @staticmethod
+    def replace_entities(training_example: Text) -> Text:
+        """Returns the example with every entity annotation replaced by its
+        plain annotated text."""
+        return entity_regex.sub(
+            lambda annotation: annotation.group(GROUP_ENTITY_TEXT), training_example
+        )
diff --git a/rasa/nlu/training_data/formats/__init__.py b/rasa/nlu/training_data/formats/__init__.py
@@ -1,3 +1,4 @@
+from rasa.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
 from rasa.nlu.training_data.formats.dialogflow import DialogflowReader
 from rasa.nlu.training_data.formats.luis import LuisReader
 from rasa.nlu.training_data.formats.markdown import MarkdownReader, MarkdownWriter

diff --git a/rasa/nlu/training_data/formats/rasa_yaml.py b/rasa/nlu/training_data/formats/rasa_yaml.py
@@ -0,0 +1,161 @@
+import logging
+from typing import Text, Any, List, Dict, Tuple
+
+from rasa.nlu.training_data.entities_parser import EntitiesParser
+from rasa.nlu.training_data.formats.readerwriter import TrainingDataReader
+import rasa.utils.io
+from rasa.nlu.training_data.lookup_tables_parser import LookupTablesParser
+from rasa.nlu.training_data.synonyms_parser import SynonymsParser
+
+logger = logging.getLogger(__name__)
+
+# Top-level key whose value holds the list of NLU items.
+KEY_NLU = "nlu"
+# Each NLU item is identified by one of the keys `intent`, `synonym`, `regex`
+# or `lookup`; its entries are stored under the `examples` key.
+KEY_INTENT = "intent"
+KEY_INTENT_EXAMPLES = "examples"
+# Key of the example text when intent examples are given as a list of dicts.
+KEY_INTENT_TEXT = "text"
+KEY_SYNONYM = "synonym"
+KEY_SYNONYM_EXAMPLES = "examples"
+KEY_REGEX = "regex"
+KEY_REGEX_EXAMPLES = "examples"
+KEY_LOOKUP = "lookup"
+KEY_LOOKUP_EXAMPLES = "examples"
+
+
+class RasaYAMLReader(TrainingDataReader):
+    """Reads NLU training data from a YAML document.
+
+    Intent examples, entity synonyms, regex features and lookup tables found
+    under the top-level `nlu` key are collected and returned as `TrainingData`.
+    """
+
+    def __init__(self) -> None:
+        self.training_examples = []
+        self.entity_synonyms = {}
+        self.regex_features = []
+        self.lookup_tables = []
+
+    def reads(self, s: Text, **kwargs: Any) -> "TrainingData":
+        """Parses the YAML string `s` into a `TrainingData` object.
+
+        Args:
+            s: content of the YAML training data file
+
+        Returns: the training data collected from the `nlu` section; it is
+            empty if the document has no usable `nlu` section
+        """
+        from rasa.nlu.training_data import TrainingData
+
+        # reset the collected state so that a reader instance can be reused
+        self.__init__()
+
+        yaml_content = rasa.utils.io.read_yaml(s)
+
+        if isinstance(yaml_content, dict):
+            for key, value in yaml_content.items():
+                if key == KEY_NLU:
+                    self._parse_nlu(value)
+                else:
+                    logger.warning(f"Unexpected key {key} found in {self.filename}")
+        elif yaml_content:
+            # a top-level list or scalar has no keys to dispatch on
+            logger.warning(
+                f"Unexpected content found in {self.filename}: the top level "
+                f"must be a YAML dictionary. It will be skipped."
+            )
+
+        return TrainingData(
+            self.training_examples,
+            self.entity_synonyms,
+            self.regex_features,
+            self.lookup_tables,
+        )
+
+    def _warn_unexpected_block(self, block: Any) -> None:
+        """Logs that `block` is not of the expected shape and is skipped."""
+        logger.warning(
+            f"Unexpected block found in {self.filename}:\n"
+            f"{block}\n"
+            f"It will be skipped."
+        )
+
+    def _parse_nlu(self, nlu_data: List[Dict[Text, Any]]) -> None:
+        """Dispatches every item under the `nlu` key to its specific parser."""
+        if not isinstance(nlu_data, list):
+            # e.g. an empty `nlu:` key (None) or a dictionary instead of a list
+            if nlu_data:
+                self._warn_unexpected_block(nlu_data)
+            return
+
+        for nlu_item in nlu_data:
+            if not isinstance(nlu_item, dict):
+                logger.warning(
+                    f"Unexpected block found in {self.filename}: \n"
+                    f"{nlu_item}\n"
+                    f"Items under the `nlu` key must be YAML dictionaries. "
+                    f"It will be skipped."
+                )
+                continue
+
+            if KEY_INTENT in nlu_item:
+                self._parse_intent(nlu_item)
+            elif KEY_SYNONYM in nlu_item:
+                self._parse_synonym(nlu_item)
+            elif KEY_REGEX in nlu_item:
+                self._parse_regex(nlu_item)
+            elif KEY_LOOKUP in nlu_item:
+                self._parse_lookup(nlu_item)
+            else:
+                # do not drop items of an unknown kind silently
+                self._warn_unexpected_block(nlu_item)
+
+    def _parse_intent(self, data: Dict[Text, Any]) -> None:
+        """Adds one training example per example of the intent item `data`."""
+        from rasa.nlu.training_data import Message
+
+        intent = data.get(KEY_INTENT, "")
+        examples = data.get(KEY_INTENT_EXAMPLES, "")
+
+        for example, entities in self._parse_training_examples(examples):
+
+            SynonymsParser.add_synonyms_from_entities(
+                example, entities, self.entity_synonyms
+            )
+
+            plain_text = EntitiesParser.replace_entities(example)
+
+            message = Message.build(plain_text, intent)
+            message.set("entities", entities)
+            self.training_examples.append(message)
+
+    def _parse_training_examples(
+        self, examples: Any
+    ) -> List[Tuple[Text, List[Dict]]]:
+        """Extracts the example texts and their annotated entities.
+
+        Args:
+            examples: either a multi-line string with one example per line or
+                a list of dictionaries holding the example under the `text` key
+
+        Returns: list of `(example, entities)` tuples; empty if `examples` has
+            an unexpected type
+        """
+        if isinstance(examples, list):
+            texts = []
+            for example in examples:
+                if not example:
+                    continue
+                if not isinstance(example, dict):
+                    self._warn_unexpected_block(example)
+                    continue
+                # `text:` without a value is loaded as None
+                texts.append(example.get(KEY_INTENT_TEXT) or "")
+        elif isinstance(examples, str):
+            texts = examples.splitlines()
+        else:
+            self._warn_unexpected_block(examples)
+            return []
+
+        return [
+            (text, EntitiesParser.find_entities_in_training_example(text))
+            for text in texts
+        ]
+
+    def _parse_synonym(self, nlu_item: Dict[Text, Any]) -> None:
+        """Registers every line of the item's examples as a synonym."""
+        synonym_name = nlu_item[KEY_SYNONYM]
+        # a missing `examples` key is treated like an empty list of examples
+        examples = nlu_item.get(KEY_SYNONYM_EXAMPLES, "")
+
+        if not isinstance(examples, str):
+            self._warn_unexpected_block(examples)
+            return
+
+        for example in examples.splitlines():
+            SynonymsParser.add_synonym(example, synonym_name, self.entity_synonyms)
+
+    def _parse_regex(self, nlu_item: Dict[Text, Any]) -> None:
+        """Adds one regex feature per line of the item's examples."""
+        regex_name = nlu_item[KEY_REGEX]
+        # a missing `examples` key is treated like an empty list of examples
+        examples = nlu_item.get(KEY_REGEX_EXAMPLES, "")
+
+        if not isinstance(examples, str):
+            self._warn_unexpected_block(examples)
+            return
+
+        for example in examples.splitlines():
+            self.regex_features.append({"name": regex_name, "pattern": example})
+
+    def _parse_lookup(self, nlu_item: Dict[Text, Any]) -> None:
+        """Adds every line of the item's examples to the named lookup table."""
+        lookup_item_name = nlu_item[KEY_LOOKUP]
+        # a missing `examples` key is treated like an empty list of examples
+        examples = nlu_item.get(KEY_LOOKUP_EXAMPLES, "")
+
+        if not isinstance(examples, str):
+            self._warn_unexpected_block(examples)
+            return
+
+        for example in examples.splitlines():
+            LookupTablesParser.add_item_to_lookup_tables(
+                lookup_item_name, example, self.lookup_tables
+            )
diff --git a/rasa/nlu/training_data/formats/readerwriter.py b/rasa/nlu/training_data/formats/readerwriter.py
@@ -10,8 +10,12 @@
 
 
 class TrainingDataReader:
+
+    filename = ""
+
     def read(self, filename: Text, **kwargs: Any) -> "TrainingData":
         """Reads TrainingData from a file."""
+        self.filename = filename
         return self.reads(rasa.utils.io.read_file(filename), **kwargs)
 
     def reads(self, s: Text, **kwargs: Any) -> "TrainingData":

diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py
@@ -30,6 +30,7 @@
 LUIS = "luis"
 RASA = "rasa_nlu"
 MARKDOWN = "md"
+RASA_YAML = "rasa_yml"
 UNK = "unk"
 MARKDOWN_NLG = "nlg.md"
 DIALOGFLOW_RELEVANT = {DIALOGFLOW_ENTITIES, DIALOGFLOW_INTENT}
@@ -101,6 +102,7 @@ async def load_data_from_endpoint(
 def _reader_factory(fformat: Text) -> Optional["TrainingDataReader"]:
     """Generates the appropriate reader class based on the file format."""
     from rasa.nlu.training_data.formats import (
+        RasaYAMLReader,
         MarkdownReader,
         WitReader,
         LuisReader,
@@ -122,6 +124,8 @@ def _reader_factory(fformat: Text) -> Optional["TrainingDataReader"]:
         reader = MarkdownReader()
     elif fformat == MARKDOWN_NLG:
         reader = NLGMarkdownReader()
+    elif fformat == RASA_YAML:
+        reader = RasaYAMLReader()
     return reader
 
 
@@ -173,6 +177,14 @@ def guess_format(filename: Text) -> Text:
                 guess = fformat
                 break
 
+    if guess == UNK:
+        try:
+            io_utils.read_yaml_file(filename)
+        except ValueError:
+            pass
+        else:
+            guess = RASA_YAML
+
     logger.debug(f"Training data format of '{filename}' is '{guess}'.")
 
     return guess