Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/book/reference/all-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ Check for regular expression matches.
| **DoesNotContain()** <ul><li>Checks if the text does not contain any or all specified items. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `DoesNotContain(items=["as a large language model"])` | **Required:** <br> `items: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`case_sensitive = True` or `False`</li></ul> |
| **IncludesWords()** <ul><li> Checks if the text includes **any** (default) or **all** specified words. </li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li> By default, considers inflected and variant forms of the same word. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `IncludesWords(words_list=['booking', 'hotel', 'flight'])` | **Required:** <br> `words_list: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`lemmatize = True` or `False`</li></ul> |
| **ExcludesWords()** <ul><li>Checks if the text excludes all specified words.</li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li>By default, considers inflected and variant forms of the same word. </li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExcludesWords(words_list=['buy', 'sell', 'bet'])`| **Required:** <br>`words_list: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`lemmatize = True` or `False`</li></ul> |
| **ItemMatch()** <ul><li>Checks whether the text contains **any** (default) or **all** specified items that are specific to each row (represented as tuples). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemMatch(with_column="expected")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`case_sensitive = True` or `False`</li></ul> |
| **ItemNoMatch()** <ul><li>Checks whether the text excludes **any** (default) or **all** specified items that are specific to each row (represented as tuples). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemNoMatch(with_column="forbidden")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`case_sensitive = True` or `False`</li></ul> |

## Descriptors: Text stats

Expand Down
4 changes: 4 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from .sentiment_descriptor import Sentiment
from .text_contains_descriptor import Contains
from .text_contains_descriptor import DoesNotContain
from .text_contains_descriptor import ItemMatch
from .text_contains_descriptor import ItemNoMatch
from .text_length_descriptor import TextLength
from .text_part_descriptor import BeginsWith
from .text_part_descriptor import EndsWith
Expand Down Expand Up @@ -47,6 +49,8 @@
"EndsWith",
"DoesNotContain",
"IncludesWords",
"ItemMatch",
"ItemNoMatch",
"ExcludesWords",
"TextLength",
"TriggerWordsPresence",
Expand Down
6 changes: 6 additions & 0 deletions src/evidently/descriptors/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@
"evidently.descriptors.text_contains_descriptor.DoesNotContain",
"evidently:descriptor:DoesNotContain",
)
register_type_alias(
FeatureDescriptor, "evidently.descriptors.text_contains_descriptor.ItemMatch", "evidently:descriptor:ItemMatch"
)
register_type_alias(
FeatureDescriptor, "evidently.descriptors.text_contains_descriptor.ItemNoMatch", "evidently:descriptor:ItemNoMatch"
)
register_type_alias(
FeatureDescriptor, "evidently.descriptors.text_length_descriptor.TextLength", "evidently:descriptor:TextLength"
)
Expand Down
34 changes: 34 additions & 0 deletions src/evidently/descriptors/text_contains_descriptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,37 @@ def feature(self, column_name: str) -> GeneratedFeature:
self.mode,
self.display_name,
)


class ItemMatch(FeatureDescriptor):
    """Descriptor that builds the ``ItemMatch`` generated feature.

    The described text column is paired with ``with_column``; both names are
    handed to ``text_contains_feature.ItemMatch`` together with the matching
    options configured here.
    """

    class Config:
        type_alias = "evidently:descriptor:ItemMatch"

    # Name of the second column passed to the feature alongside the text column.
    with_column: str
    # Forwarded unchanged to the feature ("any" by default).
    mode: str = "any"
    # Forwarded unchanged to the feature.
    case_sensitive: bool = True

    def feature(self, column_name: str) -> GeneratedFeature:
        """Return the ``ItemMatch`` feature for ``column_name`` and ``with_column``."""
        # Order matters: the described text column comes first.
        paired_columns = [column_name, self.with_column]
        return text_contains_feature.ItemMatch(
            columns=paired_columns,
            mode=self.mode,
            case_sensitive=self.case_sensitive,
            display_name=self.display_name,
        )


class ItemNoMatch(FeatureDescriptor):
    """Descriptor that builds the ``ItemNoMatch`` generated feature.

    The described text column is paired with ``with_column``; both names are
    handed to ``text_contains_feature.ItemNoMatch`` together with the matching
    options configured here.
    """

    class Config:
        type_alias = "evidently:descriptor:ItemNoMatch"

    # Name of the second column passed to the feature alongside the text column.
    with_column: str
    # Forwarded unchanged to the feature ("any" by default).
    mode: str = "any"
    # Forwarded unchanged to the feature.
    case_sensitive: bool = True

    def feature(self, column_name: str) -> GeneratedFeature:
        """Return the ``ItemNoMatch`` feature for ``column_name`` and ``with_column``."""
        # Order matters: the described text column comes first.
        paired_columns = [column_name, self.with_column]
        return text_contains_feature.ItemNoMatch(
            columns=paired_columns,
            mode=self.mode,
            case_sensitive=self.case_sensitive,
            display_name=self.display_name,
        )
6 changes: 6 additions & 0 deletions src/evidently/features/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@
register_type_alias(
GeneratedFeatures, "evidently.features.text_contains_feature.DoesNotContain", "evidently:feature:DoesNotContain"
)
register_type_alias(
GeneratedFeatures, "evidently.features.text_contains_feature.ItemMatch", "evidently:feature:ItemMatch"
)
register_type_alias(
GeneratedFeatures, "evidently.features.text_contains_feature.ItemNoMatch", "evidently:feature:ItemNoMatch"
)
register_type_alias(
GeneratedFeatures, "evidently.features.text_length_feature.TextLength", "evidently:feature:TextLength"
)
Expand Down
106 changes: 106 additions & 0 deletions src/evidently/features/text_contains_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,109 @@ def comparison(self, item: str, string: str):
if self.case_sensitive:
return item in string
return item.casefold() in string.casefold()


class ItemMatch(GeneratedFeature):
    """Row-wise check that a text column contains items listed in a second column.

    ``columns`` holds exactly two names: the text column first, then the column
    whose cells are iterated to obtain that row's items. With ``mode="any"``
    a row is True when at least one item is found in the text; with
    ``mode="all"`` every item must be found.
    """

    class Config:
        type_alias = "evidently:feature:ItemMatch"

    __feature_type__: ClassVar = ColumnType.Categorical
    columns: List[str]
    case_sensitive: bool
    mode: str

    def __init__(
        self,
        columns: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Validate and store the configuration.

        Raises:
            ValueError: if ``columns`` does not hold exactly two names, or if
                ``mode`` is neither ``"any"`` nor ``"all"``.
        """
        if len(columns) != 2:
            raise ValueError("two columns must be provided")
        self.columns = columns
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        if mode not in ("any", "all"):
            raise ValueError("mode must be either 'any' or 'all'")
        self.mode = mode
        super().__init__()

    def _feature_column_name(self) -> str:
        """Build the output column name from both column names and the options."""
        return f"{self.columns[0]}_{self.columns[1]}_item_match_{self.case_sensitive}_{self.mode}"

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Return a one-column frame of per-row match results."""
        # ``mode`` only decides how the per-item results are combined.
        combine = any if self.mode == "any" else all

        def row_matches(row):
            return combine(self.comparison(item, row[self.columns[0]]) for item in row[self.columns[1]])

        calculated = data.apply(row_matches, axis=1)
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Describe the generated column, with a mode-dependent default display name."""
        default_title = f"Text contains {self.mode} of defined items"
        return self._create_column(self._feature_column_name(), default_display_name=default_title)

    def comparison(self, item: str, string: str):
        """Tell whether ``item`` occurs in ``string``, casefolding both when case-insensitive."""
        if not self.case_sensitive:
            item, string = item.casefold(), string.casefold()
        return item in string


class ItemNoMatch(GeneratedFeature):
    """Row-wise check that a text column does NOT contain items from a second column.

    ``columns`` holds exactly two names: the text column first, then the column
    whose cells are iterated to obtain that row's items. The result is the
    negation of the corresponding match:

    * ``mode="any"``: True when none of the row's items is found in the text.
    * ``mode="all"``: True unless every one of the row's items is found.
    """

    class Config:
        type_alias = "evidently:feature:ItemNoMatch"

    __feature_type__: ClassVar = ColumnType.Categorical
    columns: List[str]
    case_sensitive: bool
    mode: str

    def __init__(
        self,
        columns: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Validate and store the configuration.

        Raises:
            ValueError: if ``columns`` does not hold exactly two names, or if
                ``mode`` is neither ``"any"`` nor ``"all"``.
        """
        # Same guard as ItemMatch: fail early with a clear message instead of
        # an IndexError (too few names) or silently ignored extras (too many).
        if len(columns) != 2:
            raise ValueError("two columns must be provided")
        self.columns = columns
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        if mode not in ("any", "all"):
            raise ValueError("mode must be either 'any' or 'all'")
        self.mode = mode
        super().__init__()

    def _feature_column_name(self) -> str:
        """Build the output column name from both column names and the options."""
        return f"{self.columns[0]}_{self.columns[1]}_item_no_match_{self.case_sensitive}_{self.mode}"

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Return a one-column frame of per-row "no match" results."""
        # ``mode`` only decides how the per-item results are combined before
        # the combined result is negated.
        combine = any if self.mode == "any" else all

        def row_has_no_match(row):
            return not combine(self.comparison(item, row[self.columns[0]]) for item in row[self.columns[1]])

        calculated = data.apply(row_has_no_match, axis=1)
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Describe the generated column, with a mode-dependent default display name."""
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"Text does not contain {self.mode} of defined items",
        )

    def comparison(self, item: str, string: str):
        """Tell whether ``item`` occurs in ``string``, casefolding both when case-insensitive."""
        if self.case_sensitive:
            return item in string
        return item.casefold() in string.casefold()
82 changes: 82 additions & 0 deletions tests/features/test_text_contains_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from evidently.features.text_contains_feature import Contains
from evidently.features.text_contains_feature import DoesNotContain
from evidently.features.text_contains_feature import ItemMatch
from evidently.features.text_contains_feature import ItemNoMatch
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.utils.data_preprocessing import create_data_definition

Expand Down Expand Up @@ -61,3 +63,83 @@ def test_text_not_contains_feature(items: List[str], case: bool, mode: str, expe
column_expected = feature_generator._feature_column_name()
expected_df = pd.DataFrame({column_expected: expected})
assert result.equals(expected_df)


@pytest.mark.parametrize(
    ("case", "mode", "expected"),
    [
        (True, "any", [False, True, False, True, False]),
        (True, "all", [False, True, False, False, False]),
        (False, "any", [True, True, True, True, False]),
        (False, "all", [False, True, True, False, False]),
    ],
)
def test_item_match(case: bool, mode: str, expected: List[bool]):
    """ItemMatch yields the expected flags and default display name per case/mode combination."""
    texts = [
        "You should consider purchasing Nike or Adidas shoes.",
        "I eat apples, grapes, and oranges",
        "grapes, oranges, apples.",
        "Oranges are more sour than grapes.",
        "This test doesn't have the words.",
    ]
    items_per_row = [
        ["nike", "adidas", "puma"],
        ["grapes", "apples", "oranges"],
        ["Apples", "Oranges", "Grapes"],
        ["orange", "sweet", "grape"],
        ["none", "of", "these"],
    ]
    df = pd.DataFrame({"generated": texts, "expected": items_per_row})
    # Per-row items are stored as tuples.
    df["expected"] = df["expected"].apply(tuple)

    feature_generator = ItemMatch(columns=["generated", "expected"], case_sensitive=case, mode=mode)
    definition = create_data_definition(None, df, ColumnMapping())
    result = feature_generator.generate_feature(data=df, data_definition=definition)

    expected_df = pd.DataFrame({feature_generator._feature_column_name(): expected})
    column_name_obj = feature_generator._as_column()
    assert result.equals(expected_df)
    assert column_name_obj.display_name == f"Text contains {mode} of defined items"


@pytest.mark.parametrize(
    ("case", "mode", "expected"),
    [
        (True, "any", [True, False, True, False, True]),
        (True, "all", [True, False, True, True, True]),
        (False, "any", [False, False, False, False, True]),
        (False, "all", [True, False, False, True, True]),
    ],
)
def test_item_no_match(case: bool, mode: str, expected: List[bool]):
    """ItemNoMatch yields the expected flags and default display name per case/mode combination."""
    texts = [
        "You should consider purchasing Nike or Adidas shoes.",
        "I eat apples, grapes, and oranges",
        "grapes, oranges, apples.",
        "Oranges are more sour than grapes.",
        "This test doesn't have the words.",
    ]
    forbidden_per_row = [
        ["nike", "adidas", "puma"],
        ["grapes", "apples", "oranges"],
        ["Apples", "Oranges", "Grapes"],
        ["orange", "sweet", "grape"],
        ["none", "of", "these"],
    ]
    feature_generator = ItemNoMatch(columns=["generated", "forbidden"], case_sensitive=case, mode=mode)
    df = pd.DataFrame({"generated": texts, "forbidden": forbidden_per_row})
    # Per-row items are stored as tuples.
    df["forbidden"] = df["forbidden"].apply(tuple)

    definition = create_data_definition(None, df, ColumnMapping())
    result = feature_generator.generate_feature(data=df, data_definition=definition)

    expected_df = pd.DataFrame({feature_generator._feature_column_name(): expected})
    column_name_obj = feature_generator._as_column()
    assert result.equals(expected_df)
    assert column_name_obj.display_name == f"Text does not contain {mode} of defined items"