Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Khizov committed Jun 10, 2020
1 parent 2725bda commit 1611414
Show file tree
Hide file tree
Showing 9 changed files with 372 additions and 7 deletions.
4 changes: 3 additions & 1 deletion rasa/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ def _find_core_nlu_files_in_directory(directory: Text,) -> Tuple[Set[Text], Set[

def _is_valid_filetype(path: Text) -> bool:
is_file = os.path.isfile(path)
is_datafile = path.endswith(".json") or path.endswith(".md")
is_datafile = (
path.endswith(".json") or path.endswith(".md") or path.endswith(".yml")
)

return is_file and is_datafile

Expand Down
12 changes: 6 additions & 6 deletions rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,12 @@ def _generate_lookup_regex(
# if it's a list, it should be the elements directly
if isinstance(lookup_elements, list):
elements_to_regex = lookup_elements
common_utils.raise_warning(
"Directly including lookup tables as a list is deprecated since Rasa "
"1.6.",
FutureWarning,
docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
)
# common_utils.raise_warning(
# "Directly including lookup tables as a list is deprecated since Rasa "
# "1.6.",
# FutureWarning,
# docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
# )

# otherwise it's a file path.
else:
Expand Down
141 changes: 141 additions & 0 deletions rasa/nlu/training_data/entities_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
import re
from json import JSONDecodeError
from typing import Text, List, Dict, Match

from rasa.nlu.training_data.formats.markdown import EntityAttributes
from rasa.nlu.utils import build_entity

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU

from rasa.nlu.constants import (
ENTITY_ATTRIBUTE_GROUP,
ENTITY_ATTRIBUTE_TYPE,
ENTITY_ATTRIBUTE_ROLE,
ENTITY_ATTRIBUTE_VALUE,
)

from rasa.utils.common import raise_warning

# Names of the named capture groups defined in `entity_regex` below; they are
# used to look up the individual parts of a match.
GROUP_ENTITY_VALUE = "value"
GROUP_ENTITY_TYPE = "entity"
GROUP_ENTITY_DICT = "entity_dict"
GROUP_ENTITY_TEXT = "entity_text"
# Group index 0 of a regex match is the complete matched text.
GROUP_COMPLETE_MATCH = 0

# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
# The text in square brackets is captured as `entity_text`. It must be followed
# either by `(entity)` / `(entity:value)` or by `{entity_dict}`; for the latter
# the captured group holds the content between the braces, without the braces.
entity_regex = re.compile(
r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"
)


class EntitiesParser:
    """Helpers for reading inline entity annotations in training examples.

    Annotations have the form `[text](entity)`, `[text](entity:value)` or
    `[text]{...entity dict...}` as described by `entity_regex`.
    """

    @staticmethod
    def find_entities_in_training_example(example: Text) -> List[Dict]:
        """Collect every annotated entity of an intent example.

        The reported start and end positions refer to the example with all
        annotation markup removed, i.e. to the text produced by
        `replace_entities`.

        Args:
            example: intent example, possibly containing entity annotations

        Returns:
            list of extracted entities, in order of appearance
        """
        found = []
        # Number of markup characters in the annotations seen so far. They
        # are absent from the plain text, so positions shift left by it.
        markup_length = 0

        for annotation in entity_regex.finditer(example):
            attributes = EntitiesParser._extract_entity_attributes(annotation)
            text_length = len(attributes.text)

            start = annotation.start() - markup_length
            end = start + text_length
            markup_length += len(annotation.group(GROUP_COMPLETE_MATCH)) - text_length

            found.append(
                build_entity(
                    start,
                    end,
                    attributes.value,
                    attributes.type,
                    attributes.role,
                    attributes.group,
                )
            )

        return found

    @staticmethod
    def _extract_entity_attributes(match: Match) -> EntityAttributes:
        """Build the entity attributes (type, value, text, ...) for one match.

        Args:
            match: a match of `entity_regex`

        Returns:
            the attributes described by the annotation
        """
        groups = match.groupdict()
        text = groups[GROUP_ENTITY_TEXT]

        # Dict format: `[text]{...}`; all attributes come from the dict.
        if groups[GROUP_ENTITY_DICT]:
            return EntitiesParser._extract_entity_attributes_from_dict(text, match)

        # Parenthesised format: `[text](entity)` or `[text](entity:value)`.
        # Without an explicit value the annotated text itself is the value.
        value = groups[GROUP_ENTITY_VALUE] or text

        return EntityAttributes(groups[GROUP_ENTITY_TYPE], value, text, None, None)

    @staticmethod
    def _extract_entity_attributes_from_dict(
        entity_text: Text, match: Match
    ) -> EntityAttributes:
        """Build the entity attributes from an annotation in dict format.

        Args:
            entity_text: the annotated text (content of the square brackets)
            match: a match of `entity_regex` whose `entity_dict` group is set

        Returns:
            the attributes given in the dict; the value falls back to
            `entity_text` if the dict does not provide one
        """
        raw_dict = match.groupdict()[GROUP_ENTITY_DICT]
        attributes = EntitiesParser._get_validated_dict(raw_dict)

        entity_type = attributes.get(ENTITY_ATTRIBUTE_TYPE)
        entity_value = attributes.get(ENTITY_ATTRIBUTE_VALUE, entity_text)
        entity_group = attributes.get(ENTITY_ATTRIBUTE_GROUP)
        entity_role = attributes.get(ENTITY_ATTRIBUTE_ROLE)

        return EntityAttributes(
            entity_type, entity_value, entity_text, entity_group, entity_role
        )

    @staticmethod
    def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
        """Parse and validate the entity dict of an annotation.

        Users can specify entity roles, synonyms, groups for an entity in a
        dict, e.g. [LA]{"entity": "city", "role": "to", "value": "Los Angeles"}

        Args:
            json_str: the entity dict as string without "{}"

        Raises:
            JSONDecodeError: if the provided entity dict is not valid json;
                a warning is issued before the error is re-raised.
            Whatever `validate_training_data` raises if the parsed dict does
            not satisfy the entity dict schema.

        Returns:
            the parsed entity dict
        """
        import json
        import rasa.utils.validation as validation_utils
        import rasa.nlu.schemas.data_schema as schema

        try:
            # The regex captures only the content between the braces, so
            # they have to be added again before parsing.
            parsed = json.loads(f"{{{json_str}}}")
        except JSONDecodeError:
            raise_warning(
                f"Incorrect training data format ('{{{json_str}}}'), make sure your "
                f"data is valid. For more information about the format visit "
                f"{DOCS_URL_TRAINING_DATA_NLU}."
            )
            raise

        validation_utils.validate_training_data(parsed, schema.entity_dict_schema())

        return parsed

    @staticmethod
    def replace_entities(training_example: Text) -> Text:
        """Strip all entity annotations from a training example.

        Args:
            training_example: example that may contain entity annotations

        Returns:
            the example with every annotation replaced by its annotated text
        """

        def _annotated_text(match: Match) -> Text:
            return match.group(GROUP_ENTITY_TEXT)

        return entity_regex.sub(_annotated_text, training_example)
1 change: 1 addition & 0 deletions rasa/nlu/training_data/formats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from rasa.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
from rasa.nlu.training_data.formats.dialogflow import DialogflowReader
from rasa.nlu.training_data.formats.luis import LuisReader
from rasa.nlu.training_data.formats.markdown import MarkdownReader, MarkdownWriter
Expand Down
161 changes: 161 additions & 0 deletions rasa/nlu/training_data/formats/rasa_yaml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import logging
from typing import Text, Any, List, Dict, Tuple

from rasa.nlu.training_data.entities_parser import EntitiesParser
from rasa.nlu.training_data.formats.readerwriter import TrainingDataReader
import rasa.utils.io
from rasa.nlu.training_data.lookup_tables_parser import LookupTablesParser
from rasa.nlu.training_data.synonyms_parser import SynonymsParser

logger = logging.getLogger(__name__)

# Top-level key of the YAML document that holds the list of NLU items.
KEY_NLU = "nlu"
# Keys of an intent item; examples given as a list carry their text under
# the `text` key.
KEY_INTENT = "intent"
KEY_INTENT_EXAMPLES = "examples"
KEY_INTENT_TEXT = "text"
# Keys of a synonym item.
KEY_SYNONYM = "synonym"
KEY_SYNONYM_EXAMPLES = "examples"
# Keys of a regex feature item.
KEY_REGEX = "regex"
KEY_REGEX_EXAMPLES = "examples"
# Keys of a lookup table item.
KEY_LOOKUP = "lookup"
KEY_LOOKUP_EXAMPLES = "examples"


class RasaYAMLReader(TrainingDataReader):
    """Reads NLU training data from a YAML document.

    The document is expected to be a dictionary with an `nlu` key holding a
    list of items. Each item is a dictionary that describes an intent, a
    synonym, a regex feature or a lookup table together with its `examples`.
    Malformed parts of the document are reported with a warning and skipped
    instead of aborting the whole read.
    """

    def __init__(self) -> None:
        self._reset_state()

    def _reset_state(self) -> None:
        """Discard everything collected by a previous `reads` call."""
        self.training_examples = []
        self.entity_synonyms = {}
        self.regex_features = []
        self.lookup_tables = []

    def _warn_skipped_block(self, block: Any, reason: Text = "") -> None:
        """Log that `block` has an unexpected shape and is being skipped.

        Args:
            block: the offending part of the document
            reason: optional explanation; must end with a space so that it
                does not run into the closing sentence of the message
        """
        logger.warning(
            f"Unexpected block found in {self.filename}:\n"
            f"{block}\n"
            f"{reason}It will be skipped."
        )

    def reads(self, s: Text, **kwargs: Any) -> "TrainingData":
        """Parse training data from the YAML string `s`.

        Args:
            s: content of a YAML training data document
            **kwargs: accepted for interface compatibility; not used here

        Returns:
            the training data found under the `nlu` key; empty training data
            if the document is empty or is not a dictionary
        """
        from rasa.nlu.training_data import TrainingData

        # A reader instance can be reused for several documents, so start
        # from a clean slate on every call.
        self._reset_state()

        yaml_content = rasa.utils.io.read_yaml(s)

        if isinstance(yaml_content, dict):
            for key, value in yaml_content.items():
                if key == KEY_NLU:
                    self._parse_nlu(value)
                else:
                    logger.warning(f"Unexpected key {key} found in {self.filename}")
        elif yaml_content is not None:
            # E.g. a bare scalar or a top-level list: nothing to iterate over.
            # A `None` result is treated as an empty document and stays silent.
            self._warn_skipped_block(
                yaml_content,
                f"The document must be a YAML dictionary with the key `{KEY_NLU}`. ",
            )

        return TrainingData(
            self.training_examples,
            self.entity_synonyms,
            self.regex_features,
            self.lookup_tables,
        )

    def _parse_nlu(self, nlu_data: List[Dict[Text, Any]]) -> None:
        """Dispatch every item under the `nlu` key to its parsing method.

        Args:
            nlu_data: value of the `nlu` key; expected to be a list of
                dictionaries, anything else is reported and skipped
        """
        if nlu_data is None:
            # An `nlu` key without content simply contributes nothing.
            return

        if not isinstance(nlu_data, list):
            self._warn_skipped_block(
                nlu_data, "The `nlu` key must contain a list of items. "
            )
            return

        for nlu_item in nlu_data:
            if not isinstance(nlu_item, dict):
                self._warn_skipped_block(
                    nlu_item, "Items under the `nlu` key must be YAML dictionaries. "
                )
                continue

            if KEY_INTENT in nlu_item:
                self._parse_intent(nlu_item)
            elif KEY_SYNONYM in nlu_item:
                self._parse_synonym(nlu_item)
            elif KEY_REGEX in nlu_item:
                self._parse_regex(nlu_item)
            elif KEY_LOOKUP in nlu_item:
                self._parse_lookup(nlu_item)
            else:
                self._warn_skipped_block(
                    nlu_item,
                    f"Items under the `nlu` key must contain one of the keys "
                    f"`{KEY_INTENT}`, `{KEY_SYNONYM}`, `{KEY_REGEX}` or "
                    f"`{KEY_LOOKUP}`. ",
                )

    def _parse_intent(self, data: Dict[Text, Any]) -> None:
        """Add one message per example of an intent item.

        Synonyms implied by the entity annotations of the examples are
        recorded in `self.entity_synonyms` along the way.

        Args:
            data: an item of the `nlu` list that contains the `intent` key
        """
        from rasa.nlu.training_data import Message

        intent = data.get(KEY_INTENT, "")
        examples = data.get(KEY_INTENT_EXAMPLES, "")

        for example, entities in self._parse_training_examples(examples):

            SynonymsParser.add_synonyms_from_entities(
                example, entities, self.entity_synonyms
            )

            # The message text must not contain the annotation markup.
            plain_text = EntitiesParser.replace_entities(example)

            message = Message.build(plain_text, intent)
            message.set("entities", entities)
            self.training_examples.append(message)

    def _parse_training_examples(
        self, examples: Any
    ) -> List[Tuple[Text, List[Dict]]]:
        """Extract the examples of an intent together with their entities.

        Args:
            examples: either a multi-line string with one example per line,
                or a list of dictionaries that carry the example under the
                `text` key

        Returns:
            a list of `(example, entities)` tuples; empty if `examples` has
            an unexpected type
        """
        if isinstance(examples, list):
            texts = []
            for entry in examples:
                if not entry:
                    # Empty list entries carry no example.
                    continue
                if not isinstance(entry, dict):
                    self._warn_skipped_block(
                        entry,
                        f"Examples given as a list must be YAML dictionaries "
                        f"with the key `{KEY_INTENT_TEXT}`. ",
                    )
                    continue
                texts.append(entry.get(KEY_INTENT_TEXT, ""))
        elif isinstance(examples, str):
            texts = examples.splitlines()
        else:
            self._warn_skipped_block(examples)
            return []

        return [
            (text, EntitiesParser.find_entities_in_training_example(text))
            for text in texts
        ]

    def _get_example_lines(
        self, nlu_item: Dict[Text, Any], examples_key: Text
    ) -> List[Text]:
        """Return the lines of the multi-line examples string of `nlu_item`.

        Args:
            nlu_item: an item of the `nlu` list
            examples_key: key under which the item stores its examples

        Returns:
            the individual lines of the examples; an empty list (after
            logging a warning) if the examples are missing or not a string
        """
        examples = nlu_item.get(examples_key)

        if not isinstance(examples, str):
            # Without any examples, show the whole item so the warning
            # points at something recognisable.
            self._warn_skipped_block(nlu_item if examples is None else examples)
            return []

        return examples.splitlines()

    def _parse_synonym(self, nlu_item: Dict[Text, Any]) -> None:
        """Register every example line of a synonym item as a synonym.

        Args:
            nlu_item: an item of the `nlu` list that contains the `synonym` key
        """
        synonym_name = nlu_item[KEY_SYNONYM]

        for example in self._get_example_lines(nlu_item, KEY_SYNONYM_EXAMPLES):
            SynonymsParser.add_synonym(example, synonym_name, self.entity_synonyms)

    def _parse_regex(self, nlu_item: Dict[Text, Any]) -> None:
        """Add one regex feature per example line of a regex item.

        Args:
            nlu_item: an item of the `nlu` list that contains the `regex` key
        """
        regex_name = nlu_item[KEY_REGEX]

        for example in self._get_example_lines(nlu_item, KEY_REGEX_EXAMPLES):
            self.regex_features.append({"name": regex_name, "pattern": example})

    def _parse_lookup(self, nlu_item: Dict[Text, Any]) -> None:
        """Add every example line of a lookup item to the lookup tables.

        Args:
            nlu_item: an item of the `nlu` list that contains the `lookup` key
        """
        lookup_item_name = nlu_item[KEY_LOOKUP]

        for example in self._get_example_lines(nlu_item, KEY_LOOKUP_EXAMPLES):
            LookupTablesParser.add_item_to_lookup_tables(
                lookup_item_name, example, self.lookup_tables
            )
4 changes: 4 additions & 0 deletions rasa/nlu/training_data/formats/readerwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@


class TrainingDataReader:

filename = ""

def read(self, filename: Text, **kwargs: Any) -> "TrainingData":
    """Reads TrainingData from a file."""
    # Store the path on the instance before the content is parsed.
    self.filename = filename
    file_content = rasa.utils.io.read_file(filename)
    return self.reads(file_content, **kwargs)

def reads(self, s: Text, **kwargs: Any) -> "TrainingData":
Expand Down
12 changes: 12 additions & 0 deletions rasa/nlu/training_data/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
LUIS = "luis"
RASA = "rasa_nlu"
MARKDOWN = "md"
RASA_YAML = "rasa_yml"
UNK = "unk"
MARKDOWN_NLG = "nlg.md"
DIALOGFLOW_RELEVANT = {DIALOGFLOW_ENTITIES, DIALOGFLOW_INTENT}
Expand Down Expand Up @@ -101,6 +102,7 @@ async def load_data_from_endpoint(
def _reader_factory(fformat: Text) -> Optional["TrainingDataReader"]:
"""Generates the appropriate reader class based on the file format."""
from rasa.nlu.training_data.formats import (
RasaYAMLReader,
MarkdownReader,
WitReader,
LuisReader,
Expand All @@ -122,6 +124,8 @@ def _reader_factory(fformat: Text) -> Optional["TrainingDataReader"]:
reader = MarkdownReader()
elif fformat == MARKDOWN_NLG:
reader = NLGMarkdownReader()
elif fformat == RASA_YAML:
reader = RasaYAMLReader()
return reader


Expand Down Expand Up @@ -173,6 +177,14 @@ def guess_format(filename: Text) -> Text:
guess = fformat
break

if guess == UNK:
try:
io_utils.read_yaml_file(filename)
except ValueError:
pass
else:
guess = RASA_YAML

logger.debug(f"Training data format of '{filename}' is '{guess}'.")

return guess
Expand Down
Loading

0 comments on commit 1611414

Please sign in to comment.