
Use tokens for story structure validation #7436

Merged: 30 commits, Dec 14, 2020

Changes from 8 commits

Commits (30)
d6ab8cf
Add tests
Nov 27, 2020
a72b5fd
Draft first implementation
Dec 2, 2020
d1280a3
Fix random sorting before hash
Dec 3, 2020
71d7638
Update doc strings
Dec 3, 2020
c0e5d98
Merge branch 'e2e' into johannes-res114
Dec 3, 2020
6ffc60f
Add doc strings
Dec 3, 2020
8b223cb
Merge branch 'johannes-res114' of github.com:RasaHQ/rasa into johanne…
Dec 3, 2020
292865b
Merge branch 'e2e' into johannes-res114
Dec 4, 2020
3e32e4f
Merge branch 'e2e' into johannes-res114
Dec 9, 2020
56f8d9c
Fix minor issues
Dec 9, 2020
bccfc2d
Add config file loading
Dec 9, 2020
9e6d850
Fix some docstrings
Dec 9, 2020
87b429f
Make test stories part of the test
Dec 9, 2020
d8cace6
Update tests
Dec 9, 2020
b10b82e
Merge remote-tracking branch 'github/e2e' into johannes-res114
Dec 9, 2020
34c3b60
Merge branch 'e2e' into johannes-res114
Dec 9, 2020
0f98d40
Merge branch 'e2e' into johannes-res114
Dec 14, 2020
166ec5d
Fix minor issues
Dec 14, 2020
3b18c16
Fix config param argument
Dec 14, 2020
f41fe53
Add TrainingType to tests to avoid config change
Dec 14, 2020
3d3db9e
Delete hash again
Dec 14, 2020
4bdb830
Update docs
Dec 14, 2020
cfd899f
Merge branch 'e2e' into johannes-res114
Dec 14, 2020
1a3dc1f
Merge branch 'e2e' into johannes-res114
Dec 14, 2020
4a180cf
Merge branch 'e2e' into johannes-res114
Dec 14, 2020
58ae7f2
Merge branch 'e2e' into johannes-res114
Dec 14, 2020
09ab28b
Merge branch 'e2e' into johannes-res114
Dec 14, 2020
6af7577
Update rasa/core/training/story_conflict.py
Dec 14, 2020
b79c476
Update rasa/shared/nlu/training_data/features.py
Dec 14, 2020
cd16176
Fix minor issues
Dec 14, 2020
4 changes: 4 additions & 0 deletions changelog/7436.improvement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Make `rasa data validate stories` work for end-to-end.

The `rasa data validate stories` function now considers the tokenized user text instead of the plain text that is part of a state.
This is closer to what Rasa Core actually uses to distinguish states and thus captures more story structure problems.
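The changelog entry above can be illustrated with a rough sketch. Here `tokenize` is a hypothetical stand-in for Rasa's `WhitespaceTokenizer`, not the actual implementation: two user texts that differ only in whitespace are distinct as plain strings, but collide once reduced to their tokens, so the token-based comparison reports the conflict.

```python
import re

def tokenize(text: str) -> list:
    """Whitespace tokenization, roughly what Rasa's WhitespaceTokenizer does."""
    return re.findall(r"\S+", text)

def tokenized_key(text: str) -> str:
    # Concatenate tokens, so texts that differ only in whitespace collide.
    return "".join(tokenize(text))

a = "truly amazing"
b = "truly   amazing"  # extra spaces

assert a != b                                # plain-text comparison: no conflict found
assert tokenized_key(a) == tokenized_key(b)  # token-based comparison: conflict detected
```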
9 changes: 9 additions & 0 deletions data/test_config/config_defaults.yml
Expand Up @@ -21,3 +21,12 @@ pipeline: []
# ambiguity_threshold: 0.1

data:
policies:
# # No configuration for policies was provided. The following default policies were used to train your model.
# # If you'd like to customize them, uncomment and adjust the policies.
# # See https://rasa.com/docs/rasa/policies for more information.
# - name: MemoizationPolicy
# - name: TEDPolicy
# max_history: 5
# epochs: 100
# - name: RulePolicy
13 changes: 13 additions & 0 deletions data/test_stories/stories_e2e_conflicting_1.yml
@@ -0,0 +1,13 @@
version: "2.0"

stories:
- story: path 1
steps:
- user: |
amazing!
- action: utter_happy
- story: path 2 (should always conflict path 1)
steps:
- user: |
amazing!
- action: utter_cheer_up
13 changes: 13 additions & 0 deletions data/test_stories/stories_e2e_conflicting_2.yml
@@ -0,0 +1,13 @@
version: "2.0"

stories:
- story: path 1
steps:
- user: |
truly amazing
- action: utter_greet
- story: path 2 (should conflict path 1 when WhitespaceTokenizer is used)
steps:
- user: |
truly amazing
- action: utter_goodbye
4 changes: 4 additions & 0 deletions docs/docs/command-line-interface.mdx
Expand Up @@ -338,6 +338,10 @@ rasa data validate stories
:::note
Running `rasa data validate` does **not** test if your [rules](./rules.mdx) are consistent with your stories.
However, during training, the `RulePolicy` checks for conflicts between rules and stories. Any such conflict will abort training.

Furthermore, if you use end-to-end stories, then this might not capture all conflicts. Specifically, if two user inputs
result in different tokens yet exactly the same featurization, then conflicting actions after these inputs
may exist but will not be reported by the tool.
:::

To interrupt validation even for minor issues such as unused intents or responses, use the `--fail-on-warnings` flag.
Expand Down
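The limitation described in the note above can be sketched in a few lines. `VOCABULARY` and `bag_of_words` here are hypothetical stand-ins for a trained featurizer, not Rasa's actual featurization: two user inputs whose tokens are all out of vocabulary produce identical (all-zero) feature vectors, so a conflict between the actions that follow them cannot be detected from the features.

```python
VOCABULARY = {"hello", "goodbye", "thanks"}

def bag_of_words(tokens):
    """Count-vector featurization over a fixed vocabulary; OOV tokens contribute nothing."""
    return tuple(sum(1 for t in tokens if t == word) for word in sorted(VOCABULARY))

# Two different user inputs whose tokens are all out of vocabulary:
features_a = bag_of_words(["flibber"])
features_b = bag_of_words(["wobble"])

# Identical featurization, so any conflicting follow-up actions are invisible:
assert features_a == features_b == (0, 0, 0)
```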
4 changes: 4 additions & 0 deletions docs/docs/setting-up-ci-cd.mdx
Expand Up @@ -59,6 +59,10 @@ always good to run this check before training a model. By including the
:::note
Running `rasa data validate` does **not** test if your [rules](./rules.mdx) are consistent with your stories.
However, during training, the `RulePolicy` checks for conflicts between rules and stories. Any such conflict will abort training.

Furthermore, if you use end-to-end stories, then this might not capture all conflicts. Specifically, if two user inputs
result in different tokens yet exactly the same featurization, then conflicting actions after these inputs
may exist but will not be reported by the tool.
:::

To read more about the validator and all of the available options, see [the documentation for
Expand Down
6 changes: 4 additions & 2 deletions rasa/core/featurizers/single_state_featurizer.py
Expand Up @@ -292,11 +292,13 @@ def encode_entities(
# we cannot build a classifier if there are fewer than 2 classes
return {}

parsed_text = interpreter.featurize_message(Message({TEXT: entity_data[TEXT]}))
parsed_text = (
interpreter.featurize_message(Message({TEXT: entity_data[TEXT]})) or {}
)
Contributor: I'd rather do afterwards

if parsed_text is None:
    return {}

Contributor: why?

Contributor Author: I think returning {} instead of {ENTITY_TAGS: Features(...)} with zero-features saves some bandwidth and computation time.

Contributor Author: I've made that change as vova suggested.

entities = entity_data.get(ENTITIES, [])

_tags = []
for token in parsed_text.get(TOKENS_NAMES[TEXT]):
for token in parsed_text.get(TOKENS_NAMES[TEXT], []):
_tag = determine_token_labels(
token, entities, attribute_key=ENTITY_ATTRIBUTE_TYPE
)
Expand Down
132 changes: 120 additions & 12 deletions rasa/core/training/story_conflict.py
@@ -1,6 +1,6 @@
from collections import defaultdict
import logging
from typing import Dict, Generator, List, NamedTuple, Optional, Text, Tuple
from typing import Dict, Generator, List, NamedTuple, Optional, Text, Tuple, Any

from rasa.core.featurizers.tracker_featurizers import MaxHistoryTrackerFeaturizer
from rasa.shared.core.constants import ACTION_LISTEN_NAME, PREVIOUS_ACTION, USER
Expand All @@ -9,6 +9,14 @@
from rasa.shared.core.generator import TrackerWithCachedStates
from rasa.shared.nlu.constants import INTENT

from rasa.nlu.model import Trainer
from rasa.nlu.components import Component
from rasa.nlu.tokenizers.tokenizer import Tokenizer
from rasa.nlu.config import RasaNLUModelConfig
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.utils.io import raise_warning
Contributor: please import the module instead of the function directly.

Contributor Author: Damn! I always overlook those! Sorry :/


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -124,7 +132,32 @@ class TrackerEventStateTuple(NamedTuple):

@property
def sliced_states_hash(self) -> int:
return hash(str(list(self.sliced_states)))
"""Returns the hash of the sliced states."""
return hash(_as_sorted_text(self.sliced_states))


def _as_sorted_text(obj: Any) -> Text:
"""Returns the string of `obj` after sorting lists and dicts.

Args:
obj: Something made up of lists and dicts and stringifiable objects.

Returns:
A string representation of the object that doesn't change
randomly due to unsorted dicts or sets.
"""
if isinstance(obj, str):
return obj
elif isinstance(obj, dict):
return str(
[
(_as_sorted_text(key), _as_sorted_text(value))
for key, value in sorted(obj.items())
]
)
elif isinstance(obj, (list, set)):
return str(sorted([_as_sorted_text(element) for element in obj]))
return str(obj)


def _get_length_of_longest_story(
Expand All @@ -146,56 +179,100 @@ def find_story_conflicts(
trackers: List[TrackerWithCachedStates],
domain: Domain,
max_history: Optional[int] = None,
nlu_config: Optional[RasaNLUModelConfig] = None,
) -> List[StoryConflict]:
"""Generates `StoryConflict` objects, describing conflicts in the given trackers.

Args:
trackers: Trackers in which to search for conflicts.
domain: The domain.
max_history: The maximum history length to be taken into account.
nlu_config: NLU config.

Returns:
StoryConflict objects.
"""
if not max_history:
max_history = _get_length_of_longest_story(trackers, domain)
if max_history:
logger.info(
f"Considering the preceding {max_history} turns for conflict analysis."
)
else:
logger.info("Considering all preceding turns for conflict analysis.")

logger.info(f"Considering the preceding {max_history} turns for conflict analysis.")
tokenizing_function = _get_tokenizing_function_from_nlu_config(nlu_config)

# We do this in two steps, to reduce memory consumption:

# Create a 'state -> list of actions' dict, where the state is
# represented by its hash
conflicting_state_action_mapping = _find_conflicting_states(
trackers, domain, max_history
trackers, domain, max_history, tokenizing_function
)

# Iterate once more over all states and note the (unhashed) state,
# for which a conflict occurs
conflicts = _build_conflicts_from_states(
trackers, domain, max_history, conflicting_state_action_mapping
trackers,
domain,
max_history,
conflicting_state_action_mapping,
tokenizing_function,
)

return conflicts


def _get_tokenizing_function_from_nlu_config(
nlu_config: Optional[RasaNLUModelConfig] = None,
) -> Optional[callable]:
"""Extracts the `tokenize` function of the first Tokenizer in the pipeline.

Args:
nlu_config: NLU Config.
"""
if not nlu_config:
return None

pipeline: List[Component] = Trainer(
nlu_config, skip_validation=True
).pipeline # ToDo: ComponentBuilder?
Contributor Author: I am not sure if / how I should provide some ComponentBuilder here.

Contributor Author: @wochinge Thanks for the review! Will implement changes tomorrow morning. What about this? Should I somehow add the ComponentBuilder here? It doesn't seem to do anything and I don't know where to get it.

Contributor: No, it's optional and the default will be created automatically. Seems fine for me.
tokenizer: Optional[Tokenizer] = None
for component in pipeline:
if isinstance(component, Tokenizer) and tokenizer:
raise_warning(
"The pipeline contains more than one tokenizer. "
"Only the first tokenizer will be used for story validation.",
category=UserWarning,
)
elif isinstance(component, Tokenizer):
tokenizer = component

return tokenizer.tokenize if tokenizer else None
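The selection logic above (take the first `Tokenizer`, warn about extras) can be sketched without the Rasa `Trainer` machinery. `Tokenizer` and `first_tokenizer` below are simplified stand-ins, not the actual Rasa classes:

```python
import warnings
from typing import Callable, List, Optional

class Tokenizer:
    """Stand-in for rasa.nlu.tokenizers.tokenizer.Tokenizer."""
    def tokenize(self, message: dict, attribute: str) -> list:
        return message.get(attribute, "").split()

def first_tokenizer(pipeline: List[object]) -> Optional[Callable]:
    """Return the `tokenize` method of the first Tokenizer, warning on duplicates."""
    tokenizer = None
    for component in pipeline:
        if isinstance(component, Tokenizer):
            if tokenizer is not None:
                warnings.warn(
                    "The pipeline contains more than one tokenizer. "
                    "Only the first tokenizer will be used for story validation."
                )
            else:
                tokenizer = component
    return tokenizer.tokenize if tokenizer else None

assert first_tokenizer([]) is None                      # no tokenizer in the pipeline
assert first_tokenizer([object(), Tokenizer()]) is not None
```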


def _find_conflicting_states(
trackers: List[TrackerWithCachedStates], domain: Domain, max_history: int
trackers: List[TrackerWithCachedStates],
domain: Domain,
max_history: Optional[int],
tokenizing_function: Optional[callable],
) -> Dict[int, Optional[List[Text]]]:
"""Identifies all states from which different actions follow.

Args:
trackers: Trackers that contain the states.
domain: The domain object.
max_history: Number of turns to take into account for the state descriptions.
tokenizing_function: A `Tokenizer.tokenize` function.

Returns:
A dictionary mapping state-hashes to a list of actions that follow from each state.
"""
# Create a 'state -> list of actions' dict, where the state is
# represented by its hash
state_action_mapping = defaultdict(list)
for element in _sliced_states_iterator(trackers, domain, max_history):
for element in _sliced_states_iterator(
trackers, domain, max_history, tokenizing_function
):
hashed_state = element.sliced_states_hash
if element.event.as_story_string() not in state_action_mapping[hashed_state]:
state_action_mapping[hashed_state] += [element.event.as_story_string()]
Expand All @@ -211,8 +288,9 @@ def _find_conflicting_states(
def _build_conflicts_from_states(
trackers: List[TrackerWithCachedStates],
domain: Domain,
max_history: int,
max_history: Optional[int],
conflicting_state_action_mapping: Dict[int, Optional[List[Text]]],
tokenizing_function: Optional[callable],
) -> List["StoryConflict"]:
"""Builds a list of `StoryConflict` objects for each given conflict.

Expand All @@ -222,6 +300,7 @@ def _build_conflicts_from_states(
max_history: Number of turns to take into account for the state descriptions.
conflicting_state_action_mapping: A dictionary mapping state-hashes to a list of actions
that follow from each state.
tokenizing_function: A `Tokenizer.tokenize` function.

Returns:
A list of `StoryConflict` objects that describe inconsistencies in the story
Expand All @@ -230,7 +309,9 @@ def _build_conflicts_from_states(
# Iterate once more over all states and note the (unhashed) state,
# for which a conflict occurs
conflicts = {}
for element in _sliced_states_iterator(trackers, domain, max_history):
for element in _sliced_states_iterator(
trackers, domain, max_history, tokenizing_function
):
hashed_state = element.sliced_states_hash

if hashed_state in conflicting_state_action_mapping:
Expand All @@ -252,7 +333,10 @@ def _build_conflicts_from_states(


def _sliced_states_iterator(
trackers: List[TrackerWithCachedStates], domain: Domain, max_history: int
trackers: List[TrackerWithCachedStates],
domain: Domain,
max_history: Optional[int],
tokenizing_function: Optional[callable],
) -> Generator[TrackerEventStateTuple, None, None]:
"""Creates an iterator over sliced states.

Expand All @@ -263,6 +347,7 @@ def _sliced_states_iterator(
trackers: List of trackers.
domain: Domain (used for tracker.past_states).
max_history: Assumed `max_history` value for slicing.
tokenizing_function: A `Tokenizer.tokenize` function.

Yields:
A (tracker, event, sliced_states) triplet.
Expand All @@ -276,10 +361,33 @@ def _sliced_states_iterator(
sliced_states = MaxHistoryTrackerFeaturizer.slice_state_history(
states[: idx + 1], max_history
)
if tokenizing_function:
_apply_tokenizer_to_states(tokenizing_function, sliced_states)
# ToDo: deal with oov (different tokens can lead to identical features if some of those tokens are out of vocabulary for all featurizers)
yield TrackerEventStateTuple(tracker, event, sliced_states)
idx += 1


def _apply_tokenizer_to_states(
tokenizing_function: callable, states: List[State]
) -> None:
"""Split each user text into tokens and concatenate them again.

Args:
tokenizing_function: Should take a message and an attribute and return the tokens,
just like `Tokenizer.tokenize`.
states: The states to be tokenized.
"""
for state in states:
if USER in state:
state[USER][TEXT] = "".join(
token.text
for token in tokenizing_function(
Message({TEXT: state[USER][TEXT]}), TEXT
)
)
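A minimal sketch of the normalization performed by `_apply_tokenizer_to_states`, with `whitespace_tokens` as a hypothetical stand-in for `Tokenizer.tokenize` (the real function takes a `Message` and returns `Token` objects): each user text is split into tokens and concatenated again, so texts that differ only in whitespace map to the same state.

```python
USER, TEXT = "user", "text"

def whitespace_tokens(text: str) -> list:
    # Hypothetical stand-in for `Tokenizer.tokenize`: the words of the text.
    return text.split()

def apply_tokenizer_to_states(states: list) -> None:
    """Replace each user text with its tokens joined back together (in place)."""
    for state in states:
        if USER in state:
            state[USER][TEXT] = "".join(whitespace_tokens(state[USER][TEXT]))

states = [
    {USER: {TEXT: "truly   amazing"}},
    {"prev_action": {"action_name": "utter_greet"}},  # no user text: left untouched
]
apply_tokenizer_to_states(states)
assert states[0][USER][TEXT] == "trulyamazing"
```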


def _get_previous_event(
state: Optional[State],
) -> Tuple[Optional[Text], Optional[Text]]:
Expand Down
18 changes: 14 additions & 4 deletions rasa/shared/nlu/training_data/features.py
Expand Up @@ -7,7 +7,7 @@


class Features:
"""Stores the features produces by any featurizer."""
"""Stores the features produced by any featurizer."""

def __init__(
self,
Expand All @@ -16,6 +16,14 @@ def __init__(
attribute: Text,
origin: Union[Text, List[Text]],
) -> None:
"""Initializes the Features object.

Args:
features: The features.
feature_type: Type of the feature, e.g. FEATURE_TYPE_SENTENCE.
attribute: Message attribute, e.g. INTENT or TEXT.
origin: Name of the component that created the features.
"""
self.features = features
self.type = feature_type
self.origin = origin
Expand Down Expand Up @@ -83,10 +91,12 @@ def __key__(
) -> Tuple[
Text, Text, Union[np.ndarray, scipy.sparse.spmatrix], Union[Text, List[Text]]
]:
return (self.type, self.attribute, self.features, self.origin)
"""Returns a 4-tuple of defining properties.

def __hash__(self) -> int:
return hash(self.__key__())
Returns:
Tuple of type, attribute, features, and origin properties.
"""
return (self.type, self.attribute, self.features, self.origin)

def __eq__(self, other: Any) -> bool:
if not isinstance(other, Features):
Expand Down