Addressing PR comments
Alexander Khizov committed Jul 2, 2020
1 parent 0f2db99 commit 7e5e5b1
Showing 17 changed files with 444 additions and 392 deletions.
2 changes: 1 addition & 1 deletion data/rasa_yaml_examples/nlu.yml
@@ -7,4 +7,4 @@ nlu:
# Multiline examples, each line is a separate training example.
examples: |
how much CO2 will that use?
how much carbon will a one way flight from [new york]{"entity": "city", "role": "from"} to california produce?
how much carbon will a one way flight from [new york]{"entity": "city", "role": "from"} to california produce?
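The bracket-plus-dict markup in the example above is what the entity parser (refactored later in this commit) turns into an entity dictionary. As a quick orientation, here is a minimal sketch of the expected result, assuming the standard keys produced by rasa.nlu.utils.build_entity (start, end, value, entity, role) and with offsets computed by hand against the plain text rather than taken from the diff:

```python
# Illustrative only, not part of the commit: the entity expected for the
# annotated example once the markup is stripped.
plain_text = (
    "how much carbon will a one way flight from new york to california produce?"
)
expected_entity = {
    "start": 43,  # index of "new york" in plain_text
    "end": 51,    # start + len("new york")
    "value": "new york",
    "entity": "city",
    "role": "from",
}
assert plain_text[expected_entity["start"]:expected_entity["end"]] == "new york"
```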
17 changes: 4 additions & 13 deletions rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py
@@ -4,12 +4,12 @@
from typing import Any, Dict, List, Optional, Text, Union, Type, Tuple

import numpy as np
import scipy.sparse

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
import rasa.utils.io
import rasa.utils.io
import scipy.sparse
from rasa.nlu import utils
from rasa.nlu.components import Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.constants import (
RESPONSE,
@@ -19,12 +19,10 @@
FEATURE_TYPE_SEQUENCE,
FEATURIZER_CLASS_ALIAS,
)
from rasa.nlu.tokenizers.tokenizer import Tokenizer
from rasa.nlu.components import Component
from rasa.nlu.featurizers.featurizer import SparseFeaturizer, Features
from rasa.nlu.training_data import Message, TrainingData
import rasa.utils.common as common_utils
from rasa.nlu.model import Metadata
from rasa.nlu.tokenizers.tokenizer import Tokenizer
from rasa.nlu.training_data import Message, TrainingData

logger = logging.getLogger(__name__)

@@ -158,13 +156,6 @@ def _generate_lookup_regex(
# if it's a list, it should be the elements directly
if isinstance(lookup_elements, list):
elements_to_regex = lookup_elements
# common_utils.raise_warning(
# "Directly including lookup tables as a list is deprecated since Rasa "
# "1.6.",
# FutureWarning,
# docs=DOCS_URL_TRAINING_DATA_NLU + "#lookup-tables",
# )

# otherwise it's a file path.
else:

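The hunk above removes a commented-out deprecation warning from _generate_lookup_regex, which distinguishes the two ways a lookup table can be supplied: the elements inline as a list, or a string path to a file with one element per line. A hedged sketch of those two shapes (the `name`/`elements` keys follow Rasa's usual lookup-table convention and are not copied from this diff):

```python
# Hypothetical lookup-table entries illustrating the two branches above.
lookup_inline = {
    "name": "city",
    "elements": ["new york", "los angeles", "berlin"],  # a list is used directly
}
lookup_from_file = {
    "name": "city",
    "elements": "data/lookup_tables/cities.txt",  # a string is treated as a file path
}
```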
3 changes: 3 additions & 0 deletions rasa/nlu/training_data/__init__.py
@@ -1,3 +1,6 @@
import rasa.nlu.training_data.entities_parser
import rasa.nlu.training_data.synonyms_parser
import rasa.nlu.training_data.lookup_tables_parser
from rasa.nlu.training_data.loading import load_data
from rasa.nlu.training_data.message import Message
from rasa.nlu.training_data.training_data import TrainingData
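Importing the three parser modules here binds them as attributes of the rasa.nlu.training_data package, so callers can reach them through the package itself. A small, hedged sketch (the synonym call mirrors the one in markdown.py further down; the direction of the resulting mapping follows Rasa's usual text-to-canonical-value convention):

```python
import rasa.nlu.training_data as training_data

entity_synonyms = {}
# add_synonym(<text as it appears in the data>, <canonical value>, <target dict>)
training_data.synonyms_parser.add_synonym("NYC", "new york", entity_synonyms)
# entity_synonyms is now expected to map "NYC" -> "new york"
```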
240 changes: 129 additions & 111 deletions rasa/nlu/training_data/entities_parser.py
@@ -1,19 +1,14 @@
# regex for: `[entity_text]((entity_type(:entity_synonym)?)|{entity_dict})`
import re
from json import JSONDecodeError
from typing import Text, List, Dict, Match, Optional, NamedTuple

from rasa.nlu.utils import build_entity

from rasa.constants import DOCS_URL_TRAINING_DATA_NLU

from rasa.nlu.constants import (
ENTITY_ATTRIBUTE_GROUP,
ENTITY_ATTRIBUTE_TYPE,
ENTITY_ATTRIBUTE_ROLE,
ENTITY_ATTRIBUTE_VALUE,
)

from rasa.utils.common import raise_warning

GROUP_ENTITY_VALUE = "value"
@@ -38,112 +33,135 @@ class EntityAttributes(NamedTuple):
role: Optional[Text]


class EntitiesParser:
@staticmethod
def find_entities_in_training_example(example: Text) -> List[Dict]:
"""Extracts entities from an intent example.
Args:
example: intent example
Returns: list of extracted entities
"""
entities = []
offset = 0

for match in re.finditer(ENTITY_REGEX, example):
entity_attributes = EntitiesParser._extract_entity_attributes(match)

start_index = match.start() - offset
end_index = start_index + len(entity_attributes.text)
offset += len(match.group(0)) - len(entity_attributes.text)

entity = build_entity(
start_index,
end_index,
entity_attributes.value,
entity_attributes.type,
entity_attributes.role,
entity_attributes.group,
)
entities.append(entity)

return entities

@staticmethod
def _extract_entity_attributes(match: Match) -> EntityAttributes:
"""Extract the entity attributes, i.e. type, value, etc., from the
regex match."""
entity_text = match.groupdict()[GROUP_ENTITY_TEXT]

if match.groupdict()[GROUP_ENTITY_DICT]:
return EntitiesParser._extract_entity_attributes_from_dict(
entity_text, match
)

entity_type = match.groupdict()[GROUP_ENTITY_TYPE]

if match.groupdict()[GROUP_ENTITY_VALUE]:
entity_value = match.groupdict()[GROUP_ENTITY_VALUE]
else:
entity_value = entity_text

return EntityAttributes(entity_type, entity_value, entity_text, None, None)

@staticmethod
def _extract_entity_attributes_from_dict(
entity_text: Text, match: Match
) -> EntityAttributes:
"""Extract the entity attributes from the dict format."""
entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT]
entity_dict = EntitiesParser._get_validated_dict(entity_dict_str)
return EntityAttributes(
entity_dict.get(ENTITY_ATTRIBUTE_TYPE),
entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
entity_text,
entity_dict.get(ENTITY_ATTRIBUTE_GROUP),
entity_dict.get(ENTITY_ATTRIBUTE_ROLE),
def find_entities_in_training_example(example: Text) -> List[Dict]:
"""Extracts entities from an intent example.
Args:
example: Intent example.
Returns:
Extracted entities.
"""
import rasa.nlu.utils as rasa_nlu_utils

entities = []
offset = 0

for match in re.finditer(ENTITY_REGEX, example):
entity_attributes = extract_entity_attributes(match)

start_index = match.start() - offset
end_index = start_index + len(entity_attributes.text)
offset += len(match.group(0)) - len(entity_attributes.text)

entity = rasa_nlu_utils.build_entity(
start_index,
end_index,
entity_attributes.value,
entity_attributes.type,
entity_attributes.role,
entity_attributes.group,
)
entities.append(entity)

return entities


def extract_entity_attributes(match: Match) -> EntityAttributes:
"""Extract the entity attributes, i.e. type, value, etc., from the
regex match.
Args:
match: Regex match to extract the entity attributes from.
Returns:
EntityAttributes object.
"""
entity_text = match.groupdict()[GROUP_ENTITY_TEXT]

if match.groupdict()[GROUP_ENTITY_DICT]:
return extract_entity_attributes_from_dict(entity_text, match)

entity_type = match.groupdict()[GROUP_ENTITY_TYPE]

if match.groupdict()[GROUP_ENTITY_VALUE]:
entity_value = match.groupdict()[GROUP_ENTITY_VALUE]
else:
entity_value = entity_text

@staticmethod
def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
"""Converts the provided json_str to a valid dict containing the entity
attributes.
Users can specify entity roles, synonyms, groups for an entity in a dict, e.g.
[LA]{"entity": "city", "role": "to", "value": "Los Angeles"}
Args:
json_str: the entity dict as string without "{}"
Raises:
ValidationError if validation of entity dict fails.
JSONDecodeError if provided entity dict is not valid json.
Returns:
a proper python dict
"""
import json
import rasa.utils.validation as validation_utils
import rasa.nlu.schemas.data_schema as schema

# add {} as they are not part of the regex
try:
data = json.loads(f"{{{json_str}}}")
except JSONDecodeError as e:
raise_warning(
f"Incorrect training data format ('{{{json_str}}}'), make sure your "
f"data is valid. For more information about the format visit "
f"{DOCS_URL_TRAINING_DATA_NLU}."
)
raise e

validation_utils.validate_training_data(data, schema.entity_dict_schema())

return data

@staticmethod
def replace_entities(training_example: Text) -> Text:
return re.sub(
ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], training_example
return EntityAttributes(entity_type, entity_value, entity_text, None, None)


def extract_entity_attributes_from_dict(
entity_text: Text, match: Match
) -> EntityAttributes:
"""Extract the entity attributes from the dict format.
Args:
entity_text: Original entity text.
match: Regex match.
Returns:
Extracted entity attributes.
"""
entity_dict_str = match.groupdict()[GROUP_ENTITY_DICT]
entity_dict = get_validated_dict(entity_dict_str)
return EntityAttributes(
entity_dict.get(ENTITY_ATTRIBUTE_TYPE),
entity_dict.get(ENTITY_ATTRIBUTE_VALUE, entity_text),
entity_text,
entity_dict.get(ENTITY_ATTRIBUTE_GROUP),
entity_dict.get(ENTITY_ATTRIBUTE_ROLE),
)


def get_validated_dict(json_str: Text) -> Dict[Text, Text]:
"""Converts the provided `json_str` to a valid dict containing the entity
attributes.
Users can specify entity roles, synonyms, groups for an entity in a dict, e.g.
[LA]{"entity": "city", "role": "to", "value": "Los Angeles"}.
Args:
json_str: The entity dict as string without "{}".
Raises:
ValidationError if validation of entity dict fails.
JSONDecodeError if provided entity dict is not valid json.
Returns:
Deserialized and validated `json_str`.
"""
import json
import rasa.utils.validation as validation_utils
import rasa.nlu.schemas.data_schema as schema

# add {} as they are not part of the regex
try:
data = json.loads(f"{{{json_str}}}")
except JSONDecodeError as e:
raise_warning(
f"Incorrect training data format ('{{{json_str}}}'). Make sure your "
f"data is valid.",
docs=DOCS_URL_TRAINING_DATA_NLU,
)
raise e

validation_utils.validate_training_data(data, schema.entity_dict_schema())

return data


def replace_entities(training_example: Text) -> Text:
"""Replace special symbols related to the entities in the provided
training example.
Args:
training_example: Original training example with special symbols.
Returns:
String with removed special symbols.
"""
return re.sub(
ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], training_example
)
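The net effect of this file is that the former EntitiesParser static methods become plain module-level functions. A hedged usage sketch combining the two public entry points shown above (the example string and the commented results are illustrative):

```python
import rasa.nlu.training_data.entities_parser as entities_parser

example = (
    'how much carbon will a flight from '
    '[new york]{"entity": "city", "role": "from"} produce?'
)

entities = entities_parser.find_entities_in_training_example(example)
plain_text = entities_parser.replace_entities(example)

# plain_text: "how much carbon will a flight from new york produce?"
# entities:   a list with one dict holding start/end offsets into plain_text,
#             entity type "city", value "new york", and role "from".
```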
1 change: 1 addition & 0 deletions rasa/nlu/training_data/formats/__init__.py
@@ -8,3 +8,4 @@
NLGMarkdownReader,
NLGMarkdownWriter,
)
from rasa.nlu.training_data.formats.rasa_yaml import RasaYAMLReader
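formats/__init__.py now re-exports the new YAML reader next to the markdown ones. A hedged sketch of reading the nlu.yml snippet from the top of this commit (the intent name is made up, and `reads()` is assumed from the shared TrainingDataReader interface):

```python
from rasa.nlu.training_data.formats import RasaYAMLReader

nlu_yaml = """
nlu:
- intent: ask_carbon_footprint  # hypothetical intent name
  examples: |
    how much CO2 will that use?
    how much carbon will a one way flight from [new york]{"entity": "city", "role": "from"} to california produce?
"""

training_data = RasaYAMLReader().reads(nlu_yaml)
print(len(training_data.training_examples))  # one Message per example line
```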
19 changes: 11 additions & 8 deletions rasa/nlu/training_data/formats/markdown.py
@@ -15,13 +15,10 @@
ENTITY_ATTRIBUTE_END,
ENTITY_ATTRIBUTE_START,
)
from rasa.nlu.training_data.entities_parser import EntitiesParser, ENTITY_REGEX
from rasa.nlu.training_data.formats.readerwriter import (
TrainingDataReader,
TrainingDataWriter,
)
from rasa.nlu.training_data.lookup_tables_parser import LookupTablesParser
from rasa.nlu.training_data.synonyms_parser import SynonymsParser
from rasa.utils.common import raise_warning

GROUP_ENTITY_VALUE = "value"
@@ -38,6 +35,7 @@
REGEX = "regex"
LOOKUP = "lookup"
available_sections = [INTENT, SYNONYM, REGEX, LOOKUP]
markdown_section_markers = [f"## {s}:" for s in available_sections]

item_regex = re.compile(r"\s*[-*+]\s*(.+)")
comment_regex = re.compile(r"<!--[\s\S]*?--!*>", re.MULTILINE)
@@ -138,22 +136,25 @@ def _load_files(self, line: Text) -> None:

def _parse_item(self, line: Text) -> None:
"""Parses an md list item line based on the current section type."""
import rasa.nlu.training_data.lookup_tables_parser as lookup_tables_parser
import rasa.nlu.training_data.synonyms_parser as synonyms_parser

match = re.match(item_regex, line)
if match:
item = match.group(1)
if self.current_section == INTENT:
parsed = self.parse_training_example(item)
self.training_examples.append(parsed)
elif self.current_section == SYNONYM:
SynonymsParser.add_synonym(
synonyms_parser.add_synonym(
item, self.current_title, self.entity_synonyms
)
elif self.current_section == REGEX:
self.regex_features.append(
{"name": self.current_title, "pattern": item}
)
elif self.current_section == LOOKUP:
LookupTablesParser.add_item_to_lookup_tables(
lookup_tables_parser.add_item_to_lookup_tables(
self.current_title, item, self.lookup_tables
)

@@ -197,10 +198,12 @@ def _get_validated_dict(json_str: Text) -> Dict[Text, Text]:
def parse_training_example(self, example: Text) -> "Message":
"""Extract entities and synonyms, and convert to plain text."""
from rasa.nlu.training_data import Message
import rasa.nlu.training_data.entities_parser as entities_parser
import rasa.nlu.training_data.synonyms_parser as synonyms_parser

entities = EntitiesParser.find_entities_in_training_example(example)
plain_text = EntitiesParser.replace_entities(example)
SynonymsParser.add_synonyms_from_entities(
entities = entities_parser.find_entities_in_training_example(example)
plain_text = entities_parser.replace_entities(example)
synonyms_parser.add_synonyms_from_entities(
plain_text, entities, self.entity_synonyms
)

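With parse_training_example now delegating to the module-level parsers, the markdown path and the new YAML path share the same entity and synonym handling. A hedged end-to-end sketch through the existing markdown reader (the section names are made up; the commented results describe the expected outcome rather than captured output):

```python
from rasa.nlu.training_data.formats import MarkdownReader

md = """
## intent:ask_carbon_footprint
- how much carbon will a flight from [new york]{"entity": "city", "role": "from"} produce?

## synonym:new york
- NYC
"""

training_data = MarkdownReader().reads(md)
print(training_data.training_examples[0].text)
# expected: "how much carbon will a flight from new york produce?"
print(training_data.entity_synonyms)
# expected to contain {"NYC": "new york"}
```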
Loading

0 comments on commit 7e5e5b1
