-
Notifications
You must be signed in to change notification settings - Fork 4.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Alexander Khizov
committed
Jun 10, 2020
1 parent
2725bda
commit 1611414
Showing
9 changed files
with
372 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})` | ||
import re | ||
from json import JSONDecodeError | ||
from typing import Text, List, Dict, Match | ||
|
||
from rasa.nlu.training_data.formats.markdown import EntityAttributes | ||
from rasa.nlu.utils import build_entity | ||
|
||
from rasa.constants import DOCS_URL_TRAINING_DATA_NLU | ||
|
||
from rasa.nlu.constants import ( | ||
ENTITY_ATTRIBUTE_GROUP, | ||
ENTITY_ATTRIBUTE_TYPE, | ||
ENTITY_ATTRIBUTE_ROLE, | ||
ENTITY_ATTRIBUTE_VALUE, | ||
) | ||
|
||
from rasa.utils.common import raise_warning | ||
|
||
# Names of the named capture groups used in `entity_regex` below.
GROUP_ENTITY_VALUE = "value"  # synonym value after the ":" separator
GROUP_ENTITY_TYPE = "entity"  # entity type, e.g. "city"
GROUP_ENTITY_DICT = "entity_dict"  # inline dict form: `{...}` after the text
GROUP_ENTITY_TEXT = "entity_text"  # literal annotated text inside `[...]`
GROUP_COMPLETE_MATCH = 0  # group index of the whole regex match

# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
entity_regex = re.compile(
    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\})"
)
|
||
|
||
class EntitiesParser:
    """Parses inline entity annotations in NLU training examples.

    Supported markup (see `entity_regex`):
    `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
    """

    @staticmethod
    def find_entities_in_training_example(example: Text) -> List[Dict]:
        """Extracts entities from an intent example.

        Args:
            example: Intent example text with inline entity annotations.

        Returns:
            List of extracted entities; start/end indices refer to the
            plain text, i.e. with the annotation markup removed.
        """
        entities = []
        # Number of markup characters stripped so far; subtracted from match
        # positions so indices refer to the markup-free text.
        offset = 0

        for match in re.finditer(entity_regex, example):
            entity_attributes = EntitiesParser._extract_entity_attributes(match)

            start_index = match.start() - offset
            end_index = start_index + len(entity_attributes.text)
            # Use the named constant instead of the magic literal 0.
            offset += len(match.group(GROUP_COMPLETE_MATCH)) - len(
                entity_attributes.text
            )

            entity = build_entity(
                start_index,
                end_index,
                entity_attributes.value,
                entity_attributes.type,
                entity_attributes.role,
                entity_attributes.group,
            )
            entities.append(entity)

        return entities

    @staticmethod
    def _extract_entity_attributes(match: Match) -> EntityAttributes:
        """Extract the entity attributes, i.e. type, value, etc., from the
        regex match."""
        entity_text = match.groupdict()[GROUP_ENTITY_TEXT]

        # `[text]{...}` dict form takes precedence over `[text](type:value)`.
        if match.groupdict()[GROUP_ENTITY_DICT]:
            return EntitiesParser._extract_entity_attributes_from_dict(
                entity_text, match
            )

        entity_type = match.groupdict()[GROUP_ENTITY_TYPE]

        if match.groupdict()[GROUP_ENTITY_VALUE]:
            # `[text](type:synonym)` form: the synonym is the entity value.
            entity_value = match.groupdict()[GROUP_ENTITY_VALUE]
        else:
            entity_value = entity_text

        return EntityAttributes(entity_type, entity_value, entity_text, None, None)

    @staticmethod
    def _extract_entity_attributes_from_dict(
        entity_text: Text, match: Match
    ) -> EntityAttributes:
        """Extract the entity attributes from the dict format."""
        entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT]
        entity_dict = EntitiesParser._get_validated_dict(entity_dict_str)
        return EntityAttributes(
            entity_dict.get(ENTITY_ATTRIBUTE_TYPE),
            # The annotated text itself is the fallback value.
            entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
            entity_text,
            entity_dict.get(ENTITY_ATTRIBUTE_GROUP),
            entity_dict.get(ENTITY_ATTRIBUTE_ROLE),
        )

    @staticmethod
    def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
        """Converts the provided json_str to a valid dict containing the entity
        attributes.

        Users can specify entity roles, synonyms, groups for an entity in a dict, e.g.
        [LA]{"entity": "city", "role": "to", "value": "Los Angeles"}

        Args:
            json_str: the entity dict as string without "{}"

        Raises:
            ValidationError if validation of entity dict fails.
            JSONDecodeError if provided entity dict is not valid json.

        Returns:
            a proper python dict
        """
        # Imported lazily — presumably to avoid circular imports; TODO confirm.
        import json
        import rasa.utils.validation as validation_utils
        import rasa.nlu.schemas.data_schema as schema

        # add {} as they are not part of the regex
        try:
            data = json.loads(f"{{{json_str}}}")
        except JSONDecodeError as e:
            raise_warning(
                f"Incorrect training data format ('{{{json_str}}}'), make sure your "
                f"data is valid. For more information about the format visit "
                f"{DOCS_URL_TRAINING_DATA_NLU}."
            )
            raise e

        validation_utils.validate_training_data(data, schema.entity_dict_schema())

        return data

    @staticmethod
    def replace_entities(training_example: Text) -> Text:
        """Strips entity annotations, leaving only the plain entity text."""
        return re.sub(
            entity_regex, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], training_example
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
import logging | ||
from typing import Text, Any, List, Dict, Tuple | ||
|
||
from rasa.nlu.training_data.entities_parser import EntitiesParser | ||
from rasa.nlu.training_data.formats.readerwriter import TrainingDataReader | ||
import rasa.utils.io | ||
from rasa.nlu.training_data.lookup_tables_parser import LookupTablesParser | ||
from rasa.nlu.training_data.synonyms_parser import SynonymsParser | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
KEY_NLU = "nlu" | ||
KEY_INTENT = "intent" | ||
KEY_INTENT_EXAMPLES = "examples" | ||
KEY_INTENT_TEXT = "text" | ||
KEY_SYNONYM = "synonym" | ||
KEY_SYNONYM_EXAMPLES = "examples" | ||
KEY_REGEX = "regex" | ||
KEY_REGEX_EXAMPLES = "examples" | ||
KEY_LOOKUP = "lookup" | ||
KEY_LOOKUP_EXAMPLES = "examples" | ||
|
||
|
||
class RasaYAMLReader(TrainingDataReader):
    """Reads YAML-format NLU training data into a `TrainingData` object."""

    def __init__(self) -> None:
        self._reset()

    def _reset(self) -> None:
        """Clears all state accumulated by a previous `reads` call."""
        self.training_examples = []
        self.entity_synonyms = {}
        self.regex_features = []
        self.lookup_tables = []

    def reads(self, s: Text, **kwargs: Any) -> "TrainingData":
        """Parses a YAML string into a `TrainingData` object.

        Args:
            s: YAML document; NLU data is expected under the `nlu` key.

        Returns:
            The parsed training data.
        """
        from rasa.nlu.training_data import TrainingData

        # Reset instead of calling self.__init__() explicitly so the reader
        # can be reused for multiple documents.
        self._reset()

        yaml_content = rasa.utils.io.read_yaml(s)

        for key, value in yaml_content.items():
            if key == KEY_NLU:
                self._parse_nlu(value)
            else:
                # NOTE(review): `self.filename` is presumably set by the
                # `TrainingDataReader` base class — confirm.
                logger.warning(f"Unexpected key {key} found in {self.filename}")

        return TrainingData(
            self.training_examples,
            self.entity_synonyms,
            self.regex_features,
            self.lookup_tables,
        )

    def _parse_nlu(self, nlu_data: List[Dict[Text, Any]]) -> None:
        """Dispatches each item under the `nlu` key to its specific parser."""
        for nlu_item in nlu_data:
            if not isinstance(nlu_item, dict):
                logger.warning(
                    f"Unexpected block found in {self.filename}: \n"
                    f"{nlu_item}\n"
                    # Fixed: missing space previously produced "dictionaries.It".
                    f"Items under the `nlu` key must be YAML dictionaries. "
                    f"It will be skipped."
                )
                continue

            if KEY_INTENT in nlu_item.keys():
                self._parse_intent(nlu_item)
            elif KEY_SYNONYM in nlu_item.keys():
                self._parse_synonym(nlu_item)
            elif KEY_REGEX in nlu_item.keys():
                self._parse_regex(nlu_item)
            elif KEY_LOOKUP in nlu_item.keys():
                self._parse_lookup(nlu_item)

    def _parse_intent(self, data: Dict[Text, Any]) -> None:
        """Converts one `intent` block into `Message` training examples."""
        from rasa.nlu.training_data import Message

        intent = data.get(KEY_INTENT, "")
        examples = data.get(KEY_INTENT_EXAMPLES, "")

        for example, entities in self._parse_training_examples(examples):
            SynonymsParser.add_synonyms_from_entities(
                example, entities, self.entity_synonyms
            )

            # Strip the entity markup; the Message stores plain text plus
            # the extracted entities.
            plain_text = EntitiesParser.replace_entities(example)

            message = Message.build(plain_text, intent)
            message.set("entities", entities)
            self.training_examples.append(message)

    def _parse_training_examples(
        self, examples: Text
    ) -> List[Tuple[Text, List[Dict]]]:
        """Returns (example text, extracted entities) pairs for an `examples`
        value given either as a list of dicts or as a multi-line string."""
        if isinstance(examples, list):
            iterable = [
                example.get(KEY_INTENT_TEXT, "") for example in examples if example
            ]
        elif isinstance(examples, str):
            iterable = examples.splitlines()
        else:
            logger.warning(
                f"Unexpected block found in {self.filename}:\n"
                f"{examples}\n"
                f"It will be skipped."
            )
            return []

        results = []
        for example in iterable:
            entities = EntitiesParser.find_entities_in_training_example(example)
            results.append((example, entities))

        return results

    def _example_lines(self, examples: Any) -> List[Text]:
        """Splits a multi-line `examples` string into lines.

        Logs a warning and returns an empty list when `examples` is not a
        string (shared validation for synonym/regex/lookup blocks)."""
        if not isinstance(examples, str):
            logger.warning(
                f"Unexpected block found in {self.filename}:\n"
                f"{examples}\n"
                f"It will be skipped."
            )
            return []
        return examples.splitlines()

    def _parse_synonym(self, nlu_item: Dict[Text, Any]) -> None:
        """Registers every example line of a `synonym` block."""
        synonym_name = nlu_item[KEY_SYNONYM]

        for example in self._example_lines(nlu_item[KEY_SYNONYM_EXAMPLES]):
            SynonymsParser.add_synonym(example, synonym_name, self.entity_synonyms)

    def _parse_regex(self, nlu_item: Dict[Text, Any]) -> None:
        """Adds one regex feature per example line of a `regex` block."""
        regex_name = nlu_item[KEY_REGEX]

        for example in self._example_lines(nlu_item[KEY_REGEX_EXAMPLES]):
            self.regex_features.append({"name": regex_name, "pattern": example})

    def _parse_lookup(self, nlu_item: Dict[Text, Any]) -> None:
        """Adds every example line of a `lookup` block to the lookup tables."""
        lookup_item_name = nlu_item[KEY_LOOKUP]

        for example in self._example_lines(nlu_item[KEY_LOOKUP_EXAMPLES]):
            LookupTablesParser.add_item_to_lookup_tables(
                lookup_item_name, example, self.lookup_tables
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.