From d0729f1f192cce0a69079a9f36e5eb8ffc3d754a Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 12:05:48 +0200
Subject: [PATCH 01/63] base

---
 src/evidently/dataset_generators/__init__.py |  0
 src/evidently/dataset_generators/base.py     | 12 ++++++++++++
 2 files changed, 12 insertions(+)
 create mode 100644 src/evidently/dataset_generators/__init__.py
 create mode 100644 src/evidently/dataset_generators/base.py

diff --git a/src/evidently/dataset_generators/__init__.py b/src/evidently/dataset_generators/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/evidently/dataset_generators/base.py b/src/evidently/dataset_generators/base.py
new file mode 100644
index 0000000000..1ea71b4edc
--- /dev/null
+++ b/src/evidently/dataset_generators/base.py
@@ -0,0 +1,12 @@
+from abc import ABC
+
+from evidently.pydantic_utils import EvidentlyBaseModel
+
+
+class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
+    class Config:
+        type_alias = "evidently:dataset_generator:BaseDatasetGenerator"
+        is_base_type = True
+
+    def generate(self):
+        pass

From 3c59469b6dc1478bfaddd35bc034200d54754fdc Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 13:01:10 +0200
Subject: [PATCH 02/63] add LLM question-pair dataset generator; move LLM wrappers to utils.llm

---
 examples/synth_data.py                        |  26 ++++
 src/evidently/dataset_generators/base.py      |  13 +-
 .../dataset_generators/llm/__init__.py        |   0
 src/evidently/dataset_generators/llm/aaa.py   | 104 +++++++++++++++
 src/evidently/dataset_generators/llm/base.py  |  25 ++++
 src/evidently/features/llm_judge.py           | 119 +----------------
 src/evidently/utils/llm.py                    | 124 ++++++++++++++++++
 7 files changed, 295 insertions(+), 116 deletions(-)
 create mode 100644 examples/synth_data.py
 create mode 100644 src/evidently/dataset_generators/llm/__init__.py
 create mode 100644 src/evidently/dataset_generators/llm/aaa.py
 create mode 100644 src/evidently/dataset_generators/llm/base.py
 create mode 100644 src/evidently/utils/llm.py

diff --git a/examples/synth_data.py b/examples/synth_data.py
new file mode 100644
index 0000000000..830710dd61
--- /dev/null
+++ b/examples/synth_data.py
@@ -0,0 +1,26 @@
+import os
+
+from evidently.dataset_generators.llm.aaa import PromptQuestionGenerator, QuestionPairGenerator, SimpleChunkGenerator, SimpleQuestionPrompt
+from evidently.options.base import Options
+from evidently.ui.workspace import CloudWorkspace
+
+
+def main():
+    generator = QuestionPairGenerator(
+        chunks=SimpleChunkGenerator(chunks=["I am a banana"]),
+        questions=PromptQuestionGenerator(prompt=SimpleQuestionPrompt()),
+        num_questions=2,
+        provider="openai",
+        model="gpt-4o-mini",
+        options=Options.from_any_options(None)
+    )
+    generated = generator.generate()
+    print(generated)
+
+    client = CloudWorkspace(token=os.environ.get("EVIDENTLY_TOKEN"))
+
+    client.add_dataset(generated, "synth data", project_id=...)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/src/evidently/dataset_generators/base.py b/src/evidently/dataset_generators/base.py
index 1ea71b4edc..9dcd5094d0 100644
--- a/src/evidently/dataset_generators/base.py
+++ b/src/evidently/dataset_generators/base.py
@@ -1,12 +1,21 @@
 from abc import ABC
+from abc import abstractmethod
 
+import pandas as pd
+
+from evidently.options.base import Options
 from evidently.pydantic_utils import EvidentlyBaseModel
 
+DatasetGeneratorResult = pd.DataFrame
+
 
 class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
     class Config:
         type_alias = "evidently:dataset_generator:BaseDatasetGenerator"
         is_base_type = True
 
-    def generate(self):
-        pass
+    options: Options
+
+    @abstractmethod
+    def generate(self) -> DatasetGeneratorResult:
+        raise NotImplementedError
diff --git a/src/evidently/dataset_generators/llm/__init__.py b/src/evidently/dataset_generators/llm/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
new file mode 100644
index 0000000000..c22ed1d416
--- /dev/null
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -0,0 +1,104 @@
+import abc
+import json
+from abc import ABC
+from pathlib import Path
+from typing import ClassVar
+from typing import Iterator
+from typing import List
+from typing import Tuple
+
+import pandas as pd
+
+from evidently.dataset_generators.base import DatasetGeneratorResult
+from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
+from evidently.pydantic_utils import EvidentlyBaseModel
+from evidently.utils.llm import LLMWrapper
+
+LLMChunk = str
+
+
+class ChunkGenerator(EvidentlyBaseModel, ABC):
+    @abc.abstractmethod
+    def generate_chunks(self) -> Iterator[LLMChunk]:
+        raise NotImplementedError
+
+
+class FileContextGenerator(ChunkGenerator):
+    class Config:
+        type_alias = "asdfasdfasd"
+
+    path: str
+
+    def generate_chunks(self) -> Iterator[LLMChunk]:
+        data = Path(self.path).read_text()
+        for i in range(10):
+            yield data
+
+
+class SimpleChunkGenerator(ChunkGenerator):
+    class Config:
+        type_alias = "asdfasdasdfafasd"
+
+    chunks: List[LLMChunk]
+
+    def generate_chunks(self) -> Iterator[LLMChunk]:
+        yield from self.chunks
+
+
+Question = str
+Answer = str
+GeneratedQuestion = Tuple[Question, Answer, LLMChunk]
+
+
+class QuestionGenerator(EvidentlyBaseModel, ABC):
+    @abc.abstractmethod
+    def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQuestion:
+        raise NotImplementedError
+
+
+class QuestionPrompt(EvidentlyBaseModel):
+    class Config:
+        type_alias = "asdfasdasdfaadsfasfasd"
+
+    template: ClassVar[str] = ""
+
+
+class SimpleQuestionPrompt(QuestionPrompt):
+    class Config:
+        type_alias = "asdfasdasdfaaasdfadsfasfasd"
+
+    template: ClassVar[str] = (
+        'please generate a json with two fields "question" and "answer" with '
+        "question and answer about this: {chunk}. dont use markdown in resposne"
+    )
+
+
+class PromptQuestionGenerator(QuestionGenerator):
+    class Config:
+        type_alias = "asdfasdasdfaaasdfdsfasfasd"
+
+    prompt: QuestionPrompt
+
+    def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQuestion:
+        rendered = self.prompt.template.format(chunk=chunk)
+
+        result = wrapper.complete([("user", rendered)])
+        data = json.loads(result)
+        return data["question"], data["answer"], chunk
+
+
+class QuestionPairGenerator(BaseLLMDatasetGenerator):
+    class Config:
+        type_alias = "asdfasdasdfaaasdfdsfasfasd"
+
+    chunks: ChunkGenerator
+    questions: QuestionGenerator
+    num_questions: int
+
+    def generate(self) -> DatasetGeneratorResult:
+        qs: List[GeneratedQuestion] = []
+        for chunk in self.chunks.generate_chunks():
+            for i in range(self.num_questions):
+                qs.append(self.questions.generate_question(self.wrapper, chunk))
+
+        return pd.DataFrame(qs, columns=["question", "answer", "context"])
diff --git a/src/evidently/dataset_generators/llm/base.py b/src/evidently/dataset_generators/llm/base.py
new file mode 100644
index 0000000000..8be9f7aad6
--- /dev/null
+++ b/src/evidently/dataset_generators/llm/base.py
@@ -0,0 +1,25 @@
+from typing import Optional
+
+from evidently._pydantic_compat import PrivateAttr
+from evidently.dataset_generators.base import BaseDatasetGenerator
+from evidently.options.base import Options
+from evidently.utils.llm import LLMWrapper
+from evidently.utils.llm import get_llm_wrapper
+
+
+class BaseLLMDatasetGenerator(
+    # fixme WithLLMWrapper,
+    BaseDatasetGenerator
+):
+    provider: str
+    model: str
+    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
+
+    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
+        if self._llm_wrapper is None:
+            self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
+        return self._llm_wrapper
+
+    @property
+    def wrapper(self):
+        return self.get_llm_wrapper(self.options)
diff --git a/src/evidently/features/llm_judge.py b/src/evidently/features/llm_judge.py
index 5a193c0853..4f64a75d0c 100644
--- a/src/evidently/features/llm_judge.py
+++ b/src/evidently/features/llm_judge.py
@@ -2,81 +2,29 @@
 from abc import ABC
 from abc import abstractmethod
 from enum import Enum
-from typing import Callable
 from typing import ClassVar
 from typing import Dict
 from typing import Iterator
 from typing import List
 from typing import Optional
 from typing import Tuple
-from typing import Type
 from typing import Union
 
 import pandas as pd
 
 from evidently import ColumnType
 from evidently._pydantic_compat import Field
-from evidently._pydantic_compat import PrivateAttr
-from evidently._pydantic_compat import SecretStr
 from evidently.base_metric import ColumnName
-from evidently.errors import EvidentlyError
 from evidently.features.generated_features import GeneratedFeatures
 from evidently.options.base import Options
-from evidently.options.option import Option
 from evidently.pydantic_utils import EnumValueMixin
 from evidently.pydantic_utils import EvidentlyBaseModel
 from evidently.pydantic_utils import autoregister
 from evidently.utils.data_preprocessing import DataDefinition
-
-LLMMessage = Tuple[str, str]
-LLMResponse = Dict[str, Union[str, float]]
-
-
-class EvidentlyLLMError(EvidentlyError):
-    pass
-
-
-class LLMResponseParseError(EvidentlyLLMError):
-    pass
-
-
-class LLMRequestError(EvidentlyLLMError):
-    pass
-
-
-class LLMWrapper(ABC):
-    __used_options__: ClassVar[List[Type[Option]]] = []
-
-    @abstractmethod
-    def complete(self, messages: List[LLMMessage]) -> str:
-        raise NotImplementedError
-
-    def get_used_options(self) -> List[Type[Option]]:
-        return self.__used_options__
-
-
-LLMProvider = str
-LLMModel = str
-LLMWrapperProvider = Callable[[LLMModel, Options], LLMWrapper]
-_wrappers: Dict[Tuple[LLMProvider, Optional[LLMModel]], LLMWrapperProvider] = {}
-
-
-def llm_provider(name: LLMProvider, model: Optional[LLMModel]):
-    def dec(f: LLMWrapperProvider):
-        _wrappers[(name, model)] = f
-        return f
-
-    return dec
-
-
-def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) -> LLMWrapper:
-    key: Tuple[str, Optional[str]] = (provider, model)
-    if key in _wrappers:
-        return _wrappers[key](model, options)
-    key = (provider, None)
-    if key in _wrappers:
-        return _wrappers[key](model, options)
-    raise ValueError(f"LLM wrapper for provider {provider} model {model} not found")
+from evidently.utils.llm import LLMMessage
+from evidently.utils.llm import LLMResponse
+from evidently.utils.llm import LLMResponseParseError
+from evidently.utils.llm import WithLLMWrapper
 
 
 class BaseLLMPromptTemplate(EvidentlyBaseModel, ABC):
@@ -251,7 +199,7 @@ def get_system_prompts(self) -> List[LLMMessage]:
         return self.pre_messages
 
 
-class LLMJudge(GeneratedFeatures):
+class LLMJudge(GeneratedFeatures, WithLLMWrapper):
     class Config:
         type_alias = "evidently:feature:LLMJudge"
 
@@ -259,20 +207,10 @@ class Config:
 
     DEFAULT_INPUT_COLUMN: ClassVar = "input"
 
-    provider: str
-    model: str
-
     input_column: Optional[str] = None
     input_columns: Optional[Dict[str, str]] = None
     template: BaseLLMPromptTemplate
 
-    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
-
-    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
-        if self._llm_wrapper is None:
-            self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
-        return self._llm_wrapper
-
     def get_input_columns(self):
         if self.input_column is None:
             assert self.input_columns is not None  # todo: validate earlier
@@ -300,50 +238,3 @@ def get_type(self, subcolumn: Optional[str] = None) -> ColumnType:
             subcolumn = self._extract_subcolumn_name(subcolumn)
 
         return self.template.get_type(subcolumn)
-
-
-class OpenAIKey(Option):
-    api_key: Optional[SecretStr] = None
-
-    def __init__(self, api_key: Optional[str] = None):
-        self.api_key = SecretStr(api_key) if api_key is not None else None
-        super().__init__()
-
-    def get_value(self) -> Optional[str]:
-        if self.api_key is None:
-            return None
-        return self.api_key.get_secret_value()
-
-
-@llm_provider("openai", None)
-class OpenAIWrapper(LLMWrapper):
-    __used_options__: ClassVar = [OpenAIKey]
-
-    def __init__(self, model: str, options: Options):
-        import openai
-
-        self.model = model
-        self.client = openai.OpenAI(api_key=options.get(OpenAIKey).get_value())
-
-    def complete(self, messages: List[LLMMessage]) -> str:
-        import openai
-
-        messages = [{"role": user, "content": msg} for user, msg in messages]
-        try:
-            response = self.client.chat.completions.create(model=self.model, messages=messages)  # type: ignore[arg-type]
-        except openai.OpenAIError as e:
-            raise LLMRequestError("Failed to call OpenAI complete API") from e
-        content = response.choices[0].message.content
-        assert content is not None  # todo: better error
-        return content
-
-
-@llm_provider("litellm", None)
-class LiteLLMWrapper(LLMWrapper):
-    def __init__(self, model: str):
-        self.model = model
-
-    def complete(self, messages: List[LLMMessage]) -> str:
-        from litellm import completion
-
-        return completion(model=self.model, messages=messages).choices[0].message.content
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
new file mode 100644
index 0000000000..a0e6f06eac
--- /dev/null
+++ b/src/evidently/utils/llm.py
@@ -0,0 +1,124 @@
+from abc import ABC
+from abc import abstractmethod
+from typing import Callable
+from typing import ClassVar
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Type
+from typing import Union
+
+from evidently._pydantic_compat import PrivateAttr
+from evidently._pydantic_compat import SecretStr
+from evidently.errors import EvidentlyError
+from evidently.options.base import Options
+from evidently.options.option import Option
+
+LLMMessage = Tuple[str, str]
+LLMResponse = Dict[str, Union[str, float]]
+
+
+class EvidentlyLLMError(EvidentlyError):
+    pass
+
+
+class LLMResponseParseError(EvidentlyLLMError):
+    pass
+
+
+class LLMRequestError(EvidentlyLLMError):
+    pass
+
+
+class LLMWrapper(ABC):
+    __used_options__: ClassVar[List[Type[Option]]] = []
+
+    @abstractmethod
+    def complete(self, messages: List[LLMMessage]) -> str:
+        raise NotImplementedError
+
+    def get_used_options(self) -> List[Type[Option]]:
+        return self.__used_options__
+
+
+LLMProvider = str
+LLMModel = str
+LLMWrapperProvider = Callable[[LLMModel, Options], LLMWrapper]
+_wrappers: Dict[Tuple[LLMProvider, Optional[LLMModel]], LLMWrapperProvider] = {}
+
+
+def llm_provider(name: LLMProvider, model: Optional[LLMModel]):
+    def dec(f: LLMWrapperProvider):
+        _wrappers[(name, model)] = f
+        return f
+
+    return dec
+
+
+def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) -> LLMWrapper:
+    key: Tuple[str, Optional[str]] = (provider, model)
+    if key in _wrappers:
+        return _wrappers[key](model, options)
+    key = (provider, None)
+    if key in _wrappers:
+        return _wrappers[key](model, options)
+    raise ValueError(f"LLM wrapper for provider {provider} model {model} not found")
+
+
+class WithLLMWrapper:
+    provider: str
+    model: str
+    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
+
+    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
+        if self._llm_wrapper is None:
+            self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
+        return self._llm_wrapper
+
+
+class OpenAIKey(Option):
+    api_key: Optional[SecretStr] = None
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = SecretStr(api_key) if api_key is not None else None
+        super().__init__()
+
+    def get_value(self) -> Optional[str]:
+        if self.api_key is None:
+            return None
+        return self.api_key.get_secret_value()
+
+
+@llm_provider("openai", None)
+class OpenAIWrapper(LLMWrapper):
+    __used_options__: ClassVar = [OpenAIKey]
+
+    def __init__(self, model: str, options: Options):
+        import openai
+
+        self.model = model
+        self.client = openai.OpenAI(api_key=options.get(OpenAIKey).get_value())
+
+    def complete(self, messages: List[LLMMessage]) -> str:
+        import openai
+
+        messages = [{"role": user, "content": msg} for user, msg in messages]
+        try:
+            response = self.client.chat.completions.create(model=self.model, messages=messages)  # type: ignore[arg-type]
+        except openai.OpenAIError as e:
+            raise LLMRequestError("Failed to call OpenAI complete API") from e
+        content = response.choices[0].message.content
+        assert content is not None  # todo: better error
+        return content
+
+
+@llm_provider("litellm", None)
+class LiteLLMWrapper(LLMWrapper):
+    def __init__(self, model: str):
+        self.model = model
+
+    def complete(self, messages: List[LLMMessage]) -> str:
+        from litellm import completion
+
+        return completion(model=self.model, messages=messages).choices[0].message.content

From 57e975a6bb09057ebc70815c2574dd0d014d539e Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 13:13:04 +0200
Subject: [PATCH 03/63] fix example

---
 examples/synth_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 830710dd61..66b61263eb 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -17,9 +17,9 @@ def main():
     generated = generator.generate()
     print(generated)
 
-    client = CloudWorkspace(token=os.environ.get("EVIDENTLY_TOKEN"))
+    client = CloudWorkspace(token=os.environ["EVIDENTLY_TOKEN"], url="https://app.evidently.dev")
 
-    client.add_dataset(generated, "synth data", project_id=...)
+    client.add_dataset(generated, "synth data", project_id="019270f6-6dda-7516-854b-aea2d84a4671")
 
 
 if __name__ == '__main__':

From 5f85465216ce52f9e47ec4fa1b6946c73c718063 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Wed, 9 Oct 2024 14:29:40 +0200
Subject: [PATCH 04/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/aaa.py   | 11 +++++----
 .../dataset_generators/llm/generator.py       | 23 +++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 src/evidently/dataset_generators/llm/generator.py

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index c22ed1d416..e47aee6671 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -1,13 +1,13 @@
 import abc
 import json
 from abc import ABC
-from pathlib import Path
 from typing import ClassVar
 from typing import Iterator
 from typing import List
 from typing import Tuple
 
 import pandas as pd
+from llama_index.core.node_parser import SentenceSplitter
 
 from evidently.dataset_generators.base import DatasetGeneratorResult
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
@@ -30,9 +30,11 @@ class Config:
     path: str
 
     def generate_chunks(self) -> Iterator[LLMChunk]:
-        data = Path(self.path).read_text()
-        for i in range(10):
-            yield data
+        with open(self.path) as f:
+            text = f.read()
+        splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
+        text_nodes = splitter.split_text(text)
+        yield from text_nodes
 
 
 class SimpleChunkGenerator(ChunkGenerator):
@@ -83,6 +85,7 @@ def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQu
         rendered = self.prompt.template.format(chunk=chunk)
 
         result = wrapper.complete([("user", rendered)])
+        print(result)
         data = json.loads(result)
         return data["question"], data["answer"], chunk
 
diff --git a/src/evidently/dataset_generators/llm/generator.py b/src/evidently/dataset_generators/llm/generator.py
new file mode 100644
index 0000000000..401b7857e4
--- /dev/null
+++ b/src/evidently/dataset_generators/llm/generator.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+import pandas as pd
+
+from evidently.dataset_generators.llm.aaa import FileContextGenerator
+from evidently.dataset_generators.llm.aaa import PromptQuestionGenerator
+from evidently.dataset_generators.llm.aaa import QuestionPairGenerator
+from evidently.dataset_generators.llm.aaa import SimpleQuestionPrompt
+from evidently.options.base import Options
+
+
+def generate_dataset_from_docs(file_path: Path, num_questions: 2) -> pd.DataFrame:
+    chunks = FileContextGenerator(path=file_path)
+    generator = QuestionPairGenerator(
+        chunks=chunks,
+        questions=PromptQuestionGenerator(prompt=SimpleQuestionPrompt()),
+        num_questions=num_questions,
+        provider="openai",
+        model="gpt-4o-mini",
+        options=Options.from_any_options(None),
+    )
+    generated = generator.generate()
+    return generated

From 79d58b0e51579b882cd71f674321ae27bd6b57f5 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 15:57:28 +0200
Subject: [PATCH 05/63] wip: re-inline LLM wrappers in llm_judge; add prompt blocks to utils.llm

---
 src/evidently/features/llm_judge.py | 127 +++++++++++++++++++++-
 src/evidently/utils/llm.py          | 157 +++++++++++++++++++++++++---
 2 files changed, 266 insertions(+), 18 deletions(-)

diff --git a/src/evidently/features/llm_judge.py b/src/evidently/features/llm_judge.py
index 4f64a75d0c..e7ad1c7370 100644
--- a/src/evidently/features/llm_judge.py
+++ b/src/evidently/features/llm_judge.py
@@ -2,29 +2,92 @@
 from abc import ABC
 from abc import abstractmethod
 from enum import Enum
+from typing import Callable
 from typing import ClassVar
 from typing import Dict
 from typing import Iterator
 from typing import List
 from typing import Optional
 from typing import Tuple
+from typing import Type
 from typing import Union
 
 import pandas as pd
 
 from evidently import ColumnType
 from evidently._pydantic_compat import Field
+from evidently._pydantic_compat import PrivateAttr
+from evidently._pydantic_compat import SecretStr
 from evidently.base_metric import ColumnName
+from evidently.errors import EvidentlyError
 from evidently.features.generated_features import GeneratedFeatures
 from evidently.options.base import Options
+from evidently.options.option import Option
 from evidently.pydantic_utils import EnumValueMixin
 from evidently.pydantic_utils import EvidentlyBaseModel
 from evidently.pydantic_utils import autoregister
 from evidently.utils.data_preprocessing import DataDefinition
-from evidently.utils.llm import LLMMessage
-from evidently.utils.llm import LLMResponse
-from evidently.utils.llm import LLMResponseParseError
-from evidently.utils.llm import WithLLMWrapper
+
+LLMMessage = Tuple[str, str]
+LLMResponse = Dict[str, Union[str, float]]
+
+
+class EvidentlyLLMError(EvidentlyError):
+    pass
+
+
+class LLMResponseParseError(EvidentlyLLMError):
+    pass
+
+
+class LLMRequestError(EvidentlyLLMError):
+    pass
+
+
+class LLMWrapper(ABC):
+    __used_options__: ClassVar[List[Type[Option]]] = []
+
+    @abstractmethod
+    def complete(self, messages: List[LLMMessage]) -> str:
+        raise NotImplementedError
+
+    def get_used_options(self) -> List[Type[Option]]:
+        return self.__used_options__
+
+
+LLMProvider = str
+LLMModel = str
+LLMWrapperProvider = Callable[[LLMModel, Options], LLMWrapper]
+_wrappers: Dict[Tuple[LLMProvider, Optional[LLMModel]], LLMWrapperProvider] = {}
+
+
+def llm_provider(name: LLMProvider, model: Optional[LLMModel]):
+    def dec(f: LLMWrapperProvider):
+        _wrappers[(name, model)] = f
+        return f
+
+    return dec
+
+
+def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) -> LLMWrapper:
+    key: Tuple[str, Optional[str]] = (provider, model)
+    if key in _wrappers:
+        return _wrappers[key](model, options)
+    key = (provider, None)
+    if key in _wrappers:
+        return _wrappers[key](model, options)
+    raise ValueError(f"LLM wrapper for provider {provider} model {model} not found")
+
+
+class WithLLMWrapper:
+    provider: str
+    model: str
+    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
+
+    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
+        if self._llm_wrapper is None:
+            self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
+        return self._llm_wrapper
 
 
 class BaseLLMPromptTemplate(EvidentlyBaseModel, ABC):
@@ -211,6 +274,15 @@ class Config:
     input_columns: Optional[Dict[str, str]] = None
     template: BaseLLMPromptTemplate
 
+    provider: str
+    model: str
+    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
+
+    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
+        if self._llm_wrapper is None:
+            self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
+        return self._llm_wrapper
+
     def get_input_columns(self):
         if self.input_column is None:
             assert self.input_columns is not None  # todo: validate earlier
@@ -238,3 +310,50 @@ def get_type(self, subcolumn: Optional[str] = None) -> ColumnType:
             subcolumn = self._extract_subcolumn_name(subcolumn)
 
         return self.template.get_type(subcolumn)
+
+
+class OpenAIKey(Option):
+    api_key: Optional[SecretStr] = None
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = SecretStr(api_key) if api_key is not None else None
+        super().__init__()
+
+    def get_value(self) -> Optional[str]:
+        if self.api_key is None:
+            return None
+        return self.api_key.get_secret_value()
+
+
+@llm_provider("openai", None)
+class OpenAIWrapper(LLMWrapper):
+    __used_options__: ClassVar = [OpenAIKey]
+
+    def __init__(self, model: str, options: Options):
+        import openai
+
+        self.model = model
+        self.client = openai.OpenAI(api_key=options.get(OpenAIKey).get_value())
+
+    def complete(self, messages: List[LLMMessage]) -> str:
+        import openai
+
+        messages = [{"role": user, "content": msg} for user, msg in messages]
+        try:
+            response = self.client.chat.completions.create(model=self.model, messages=messages)  # type: ignore[arg-type]
+        except openai.OpenAIError as e:
+            raise LLMRequestError("Failed to call OpenAI complete API") from e
+        content = response.choices[0].message.content
+        assert content is not None  # todo: better error
+        return content
+
+
+@llm_provider("litellm", None)
+class LiteLLMWrapper(LLMWrapper):
+    def __init__(self, model: str):
+        self.model = model
+
+    def complete(self, messages: List[LLMMessage]) -> str:
+        from litellm import completion
+
+        return completion(model=self.model, messages=messages).choices[0].message.content
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index a0e6f06eac..703df8b70b 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -1,22 +1,34 @@
+import dataclasses
 from abc import ABC
 from abc import abstractmethod
+from typing import Any
 from typing import Callable
 from typing import ClassVar
 from typing import Dict
+from typing import Iterator
 from typing import List
 from typing import Optional
+from typing import Sequence
 from typing import Tuple
 from typing import Type
 from typing import Union
 
-from evidently._pydantic_compat import PrivateAttr
+from evidently._pydantic_compat import Field
 from evidently._pydantic_compat import SecretStr
 from evidently.errors import EvidentlyError
+from evidently.features.llm_judge import Uncertainty
 from evidently.options.base import Options
 from evidently.options.option import Option
+from evidently.pydantic_utils import EvidentlyBaseModel
 
-LLMMessage = Tuple[str, str]
-LLMResponse = Dict[str, Union[str, float]]
+
+@dataclasses.dataclass
+class LLMMessage:
+    role: str
+    content: str
+
+
+LLMResponse = Dict[str, Any]
 
 
 class EvidentlyLLMError(EvidentlyError):
@@ -66,17 +78,6 @@ def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) ->
     raise ValueError(f"LLM wrapper for provider {provider} model {model} not found")
 
 
-class WithLLMWrapper:
-    provider: str
-    model: str
-    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
-
-    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
-        if self._llm_wrapper is None:
-            self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
-        return self._llm_wrapper
-
-
 class OpenAIKey(Option):
     api_key: Optional[SecretStr] = None
 
@@ -122,3 +123,131 @@ def complete(self, messages: List[LLMMessage]) -> str:
         from litellm import completion
 
         return completion(model=self.model, messages=messages).choices[0].message.content
+
+
+class PromptBlock(EvidentlyBaseModel):
+    class Config:
+        alias_required = False  # fixme
+
+    def render(self) -> str:
+        raise NotImplementedError
+
+    @classmethod
+    def simple(cls, value: str):
+        return SimpleBlock(value=value)
+
+    @classmethod
+    def input(cls, placeholder_name: str = "input"):
+        return SimpleBlock(value=f"{{{placeholder_name}}}")
+
+    def anchored(self, start: str, end: str):
+        return Anchor(start=start, block=self, end=end)
+
+
+class Anchor(PromptBlock):
+    start: str
+    block: PromptBlock
+    end: str
+
+    def render(self) -> str:
+        return f"{self.start}\n{self.block.render()}\n{self.end}"
+
+
+class SimpleBlock(PromptBlock):
+    value: str
+
+    def render(self) -> str:
+        return self.value
+
+
+class OutputFormatBlock(PromptBlock):
+    def parse_response(self, response: str) -> Dict[str, str]:
+        raise NotImplementedError
+
+
+class JsonOutputFormatBlock(PromptBlock):
+    fields: Dict[str, Union[Tuple[str, str], str]]
+
+    def render(self) -> str:
+        values = []
+        example_rows = []
+        for field, descr in self.fields.items():
+            if isinstance(descr, tuple):
+                descr, field_key = descr
+            else:
+                field_key = field
+            values.append(field)
+            example_rows.append(f'"{field_key}": "{descr}"')
+
+        example_rows_str = "\n".join(example_rows)
+        return f"Return {', '.join(values)} formatted as json without formatting as follows:\n{{{{\n{example_rows_str}\n}}}}"
+
+
+class PromptTemplate(EvidentlyBaseModel):
+    class Config:
+        alias_required = False  # fixme
+
+    def get_blocks(self) -> Sequence[PromptBlock]:
+        raise NotImplementedError
+
+    def iterate(self, values: Sequence[Dict[str, str]]) -> Iterator[str]:
+        template = self.render()
+        for vals in values:
+            yield template.format(**vals)
+
+    def render(self) -> str:
+        return "\n".join(block.render() for block in self.get_blocks())
+
+    def parse(self, response: str) -> Dict[str, str]:
+        output = next((b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None)
+        if output is None:
+            return {"": response}
+        return output.parse_response(response)
+
+
+class BinaryClassificationPromtTemplate(PromptTemplate):
+    def get_blocks(self) -> Sequence[PromptBlock]:
+        fields = {}
+        if self.include_category:
+            cat = f"{self.target_category} or {self.non_target_category}"
+            if self.uncertainty == Uncertainty.UNKNOWN:
+                cat += " or UNKNOWN"
+            fields["category"] = (cat, self.output_column)
+        if self.include_score:
+            fields["score"] = ("<score here>", self.output_score_column)
+        if self.include_reasoning:
+            fields["reasoning"] = ('"<reasoning here>"', self.output_reasoning_column)
+        return [
+            PromptBlock.simple(self.criteria),
+            PromptBlock.simple(
+                f"Classify text between {self.anchor_start} and {self.anchor_end} "
+                f"into two categories: {self.target_category} and {self.non_target_category}."
+            ),
+            PromptBlock.input().anchored(self.anchor_start, self.anchor_end),
+            PromptBlock.func(self._instructions),
+            JsonOutputFormatBlock(fields=fields),
+        ]
+
+    criteria: str = ""
+    instructions_template: str = (
+        "Use the following categories for classification:\n{__categories__}\n{__scoring__}\nThink step by step."
+    )
+    anchor_start: str = "___text_starts_here___"
+    anchor_end: str = "___text_ends_here___"
+
+    placeholders: Dict[str, str] = {}
+    target_category: str
+    non_target_category: str
+
+    uncertainty: Uncertainty = Uncertainty.UNKNOWN
+
+    include_category: bool = True
+    include_reasoning: bool = False
+    include_score: bool = False
+    score_range: Tuple[float, float] = (0.0, 1.0)
+
+    output_column: str = "category"
+    output_reasoning_column: str = "reasoning"
+    output_score_column: str = "score"
+
+    pre_messages: List[LLMMessage] = Field(default_factory=list)

From 39271214418acf4a35c84f03f1f78f982b8af45a Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 15:58:37 +0200
Subject: [PATCH 06/63] wip: restore llm_judge.py to its original contents (drop WithLLMWrapper)

---
 src/evidently/features/llm_judge.py | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/src/evidently/features/llm_judge.py b/src/evidently/features/llm_judge.py
index e7ad1c7370..5a193c0853 100644
--- a/src/evidently/features/llm_judge.py
+++ b/src/evidently/features/llm_judge.py
@@ -79,17 +79,6 @@ def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) ->
     raise ValueError(f"LLM wrapper for provider {provider} model {model} not found")
 
 
-class WithLLMWrapper:
-    provider: str
-    model: str
-    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
-
-    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
-        if self._llm_wrapper is None:
-            self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
-        return self._llm_wrapper
-
-
 class BaseLLMPromptTemplate(EvidentlyBaseModel, ABC):
     class Config:
         is_base_type = True
@@ -262,7 +251,7 @@ def get_system_prompts(self) -> List[LLMMessage]:
         return self.pre_messages
 
 
-class LLMJudge(GeneratedFeatures, WithLLMWrapper):
+class LLMJudge(GeneratedFeatures):
     class Config:
         type_alias = "evidently:feature:LLMJudge"
 
@@ -270,12 +259,13 @@ class Config:
 
     DEFAULT_INPUT_COLUMN: ClassVar = "input"
 
+    provider: str
+    model: str
+
     input_column: Optional[str] = None
     input_columns: Optional[Dict[str, str]] = None
     template: BaseLLMPromptTemplate
 
-    provider: str
-    model: str
     _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
 
     def get_llm_wrapper(self, options: Options) -> LLMWrapper:

From 74660bfe4607cb1c4263926787a363bd8375aee8 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 16:02:48 +0200
Subject: [PATCH 07/63] wip

---
 src/evidently/dataset_generators/llm/aaa.py | 3 ++-
 src/evidently/utils/llm.py                  | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index e47aee6671..892a9d28b3 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -12,6 +12,7 @@
 from evidently.dataset_generators.base import DatasetGeneratorResult
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
 from evidently.pydantic_utils import EvidentlyBaseModel
+from evidently.utils.llm import LLMMessage
 from evidently.utils.llm import LLMWrapper
 
 LLMChunk = str
@@ -84,7 +85,7 @@ class Config:
     def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQuestion:
         rendered = self.prompt.template.format(chunk=chunk)
 
-        result = wrapper.complete([("user", rendered)])
+        result = wrapper.complete([LLMMessage.user(rendered)])
         print(result)
         data = json.loads(result)
         return data["question"], data["answer"], chunk
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index 703df8b70b..8c96d2555f 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -27,6 +27,10 @@ class LLMMessage:
     role: str
     content: str
 
+    @classmethod
+    def user(cls, message: str):
+        return LLMMessage("user", message)
+
 
 LLMResponse = Dict[str, Any]
 

From 9d1971828eb2a99be10deeef861f3edfcdc651c5 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 17:19:10 +0200
Subject: [PATCH 08/63] wip

---
 examples/synth_data.py                      |   6 +-
 src/evidently/dataset_generators/llm/aaa.py |  33 ++---
 src/evidently/utils/llm.py                  | 131 ++++++++++++--------
 3 files changed, 92 insertions(+), 78 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 66b61263eb..b53d42dbca 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -17,9 +17,9 @@ def main():
     generated = generator.generate()
     print(generated)
 
-    client = CloudWorkspace(token=os.environ["EVIDENTLY_TOKEN"], url="https://app.evidently.dev")
-
-    client.add_dataset(generated, "synth data", project_id="019270f6-6dda-7516-854b-aea2d84a4671")
+    # client = CloudWorkspace(token=os.environ["EVIDENTLY_TOKEN"], url="https://app.evidently.dev")
+    #
+    # client.add_dataset(generated, "synth data", project_id="019270f6-6dda-7516-854b-aea2d84a4671")
 
 
 if __name__ == '__main__':
diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index 892a9d28b3..2f8be9e463 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -1,5 +1,4 @@
 import abc
-import json
 from abc import ABC
 from typing import ClassVar
 from typing import Iterator
@@ -12,8 +11,11 @@
 from evidently.dataset_generators.base import DatasetGeneratorResult
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
 from evidently.pydantic_utils import EvidentlyBaseModel
+from evidently.utils.llm import BlockPromptTemplate
 from evidently.utils.llm import LLMMessage
 from evidently.utils.llm import LLMWrapper
+from evidently.utils.llm import PromptBlock
+from evidently.utils.llm import PromptTemplate
 
 LLMChunk = str
 
@@ -59,35 +61,24 @@ def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQu
         raise NotImplementedError
 
 
-class QuestionPrompt(EvidentlyBaseModel):
-    class Config:
-        type_alias = "asdfasdasdfaadsfasfasd"
-
-    template: ClassVar[str] = ""
-
-
-class SimpleQuestionPrompt(QuestionPrompt):
-    class Config:
-        type_alias = "asdfasdasdfaaasdfadsfasfasd"
-
-    template: ClassVar[str] = (
-        'please generate a json with two fields "question" and "answer" with '
-        "question and answer about this: {chunk}. dont use markdown in resposne"
-    )
+class SimpleQuestionPrompt(BlockPromptTemplate):
+    blocks: ClassVar = [
+        PromptBlock.simple("Please generate a question about this:"),
+        PromptBlock.input("chunk").anchored(),
+        PromptBlock.json_output(question="question text", answer="answer text"),
+    ]
 
 
 class PromptQuestionGenerator(QuestionGenerator):
     class Config:
         type_alias = "asdfasdasdfaaasdfdsfasfasd"
 
-    prompt: QuestionPrompt
+    prompt: PromptTemplate
 
     def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQuestion:
-        rendered = self.prompt.template.format(chunk=chunk)
-
+        rendered = self.prompt.render(chunk=chunk)
         result = wrapper.complete([LLMMessage.user(rendered)])
-        print(result)
-        data = json.loads(result)
+        data = self.prompt.parse(result)
         return data["question"], data["answer"], chunk
 
 
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index 8c96d2555f..e6873a0a80 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -1,4 +1,5 @@
 import dataclasses
+import json
 from abc import ABC
 from abc import abstractmethod
 from typing import Any
@@ -13,10 +14,8 @@
 from typing import Type
 from typing import Union
 
-from evidently._pydantic_compat import Field
 from evidently._pydantic_compat import SecretStr
 from evidently.errors import EvidentlyError
-from evidently.features.llm_judge import Uncertainty
 from evidently.options.base import Options
 from evidently.options.option import Option
 from evidently.pydantic_utils import EvidentlyBaseModel
@@ -108,7 +107,7 @@ def __init__(self, model: str, options: Options):
     def complete(self, messages: List[LLMMessage]) -> str:
         import openai
 
-        messages = [{"role": user, "content": msg} for user, msg in messages]
+        messages = [{"role": msg.role, "content": msg.content} for msg in messages]
         try:
             response = self.client.chat.completions.create(model=self.model, messages=messages)  # type: ignore[arg-type]
         except openai.OpenAIError as e:
@@ -144,7 +143,11 @@ def simple(cls, value: str):
     def input(cls, placeholder_name: str = "input"):
         return SimpleBlock(value=f"{{{placeholder_name}}}")
 
-    def anchored(self, start: str, end: str):
+    @classmethod
+    def json_output(cls, **fields: Union[str, Tuple[str, str]]):
+        return JsonOutputFormatBlock(fields=fields)
+
+    def anchored(self, start: str = "__start__", end: str = "__end__"):
         return Anchor(start=start, block=self, end=end)
 
 
@@ -169,7 +172,7 @@ def parse_response(self, response: str) -> Dict[str, str]:
         raise NotImplementedError
 
 
-class JsonOutputFormatBlock(PromptBlock):
+class JsonOutputFormatBlock(OutputFormatBlock):
     fields: Dict[str, Union[Tuple[str, str], str]]
 
     def render(self) -> str:
@@ -186,72 +189,92 @@ def render(self) -> str:
         example_rows_str = "\n".join(example_rows)
         return f"Return {', '.join(values)} formatted as json without formatting as follows:\n{{{{\n{example_rows_str}\n}}}}"
 
+    def parse_response(self, response: str) -> Dict[str, str]:
+        try:
+            return json.loads(response)
+        except json.JSONDecodeError as e:
+            raise LLMResponseParseError(f"Failed to parse response '{response}' as json") from e
+
 
 class PromptTemplate(EvidentlyBaseModel):
     class Config:
         alias_required = False  # fixme
 
+    @abstractmethod
     def get_blocks(self) -> Sequence[PromptBlock]:
         raise NotImplementedError
 
     def iterate(self, values: Sequence[Dict[str, str]]) -> Iterator[str]:
-        template = self.render()
+        template = self.get_template()
         for vals in values:
             yield template.format(**vals)
 
-    def render(self) -> str:
+    def render(self, **values: str):
+        return self.get_template().format(**values)
+
+    def get_template(self) -> str:
         return "\n".join(block.render() for block in self.get_blocks())
 
-    def parse(self, response: str) -> Dict[str, str]:
+    def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, str]:
         output = next((b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None)
         if output is None:
             return {"": response}
-        return output.parse_response(response)
+        parsed = output.parse_response(response)
+        if keys is not None and set(keys) != set(parsed.keys()):
+            raise LLMResponseParseError(f"Keys {keys} are required but got {list(parsed.keys())}")
+        return parsed
+
 
+class BlockPromptTemplate(PromptTemplate):
+    blocks: ClassVar[List[PromptBlock]]
 
-class BinaryClassificationPromtTemplate(PromptTemplate):
     def get_blocks(self) -> Sequence[PromptBlock]:
-        fields = {}
-        if self.include_category:
-            cat = f"{self.target_category} or {self.non_target_category}"
-            if self.uncertainty == Uncertainty.UNKNOWN:
-                cat += " or UNKNOWN"
-            fields["category"] = (cat, self.output_column)
-        if self.include_score:
-            fields["score"] = ("<score here>", self.output_score_column)
-        if self.include_reasoning:
-            fields["reasoning"] = ('"<reasoning here>"', self.output_reasoning_column)
-        return [
-            PromptBlock.simple(self.criteria),
-            PromptBlock.simple(
-                f"Classify text between {self.anchor_start} and {self.anchor_end} "
-                f"into two categories: {self.target_category} and {self.non_target_category}."
-            ),
-            PromptBlock.input().anchored(self.anchor_start, self.anchor_end),
-            PromptBlock.func(self._instructions),
-            JsonOutputFormatBlock(fields=fields),
-        ]
-
-    criteria: str = ""
-    instructions_template: str = (
-        "Use the following categories for classification:\n{__categories__}\n{__scoring__}\nThink step by step."
-    )
-    anchor_start: str = "___text_starts_here___"
-    anchor_end: str = "___text_ends_here___"
-
-    placeholders: Dict[str, str] = {}
-    target_category: str
-    non_target_category: str
-
-    uncertainty: Uncertainty = Uncertainty.UNKNOWN
-
-    include_category: bool = True
-    include_reasoning: bool = False
-    include_score: bool = False
-    score_range: Tuple[float, float] = (0.0, 1.0)
-
-    output_column: str = "category"
-    output_reasoning_column: str = "reasoning"
-    output_score_column: str = "score"
-
-    pre_messages: List[LLMMessage] = Field(default_factory=list)
+        return self.blocks
+
+
+# class BinaryClassificationPromtTemplate(PromptTemplate):
+#     def get_blocks(self) -> Sequence[PromptBlock]:
+#         fields = {}
+#         if self.include_category:
+#             cat = f"{self.target_category} or {self.non_target_category}"
+#             if self.uncertainty == Uncertainty.UNKNOWN:
+#                 cat += " or UNKNOWN"
+#             fields["category"] = (cat, self.output_column)
+#         if self.include_score:
+#             fields["score"] = ("<score here>", self.output_score_column)
+#         if self.include_reasoning:
+#             fields["reasoning"] = ('"<reasoning here>"', self.output_reasoning_column)
+#         return [
+#             PromptBlock.simple(self.criteria),
+#             PromptBlock.simple(
+#                 f"Classify text between {self.anchor_start} and {self.anchor_end} "
+#                 f"into two categories: {self.target_category} and {self.non_target_category}."
+#             ),
+#             PromptBlock.input().anchored(self.anchor_start, self.anchor_end),
+#             PromptBlock.func(self._instructions),
+#             JsonOutputFormatBlock(fields=fields),
+#         ]
+#
+#     criteria: str = ""
+#     instructions_template: str = (
+#         "Use the following categories for classification:\n{__categories__}\n{__scoring__}\nThink step by step."
+#     )
+#     anchor_start: str = "___text_starts_here___"
+#     anchor_end: str = "___text_ends_here___"
+#
+#     placeholders: Dict[str, str] = {}
+#     target_category: str
+#     non_target_category: str
+#
+#     uncertainty: Uncertainty = Uncertainty.UNKNOWN
+#
+#     include_category: bool = True
+#     include_reasoning: bool = False
+#     include_score: bool = False
+#     score_range: Tuple[float, float] = (0.0, 1.0)
+#
+#     output_column: str = "category"
+#     output_reasoning_column: str = "reasoning"
+#     output_score_column: str = "score"
+#
+#     pre_messages: List[LLMMessage] = Field(default_factory=list)

From b827f8f79e37034b38256454db2394de18185752 Mon Sep 17 00:00:00 2001
From: Emeli Dral <emeli.dral@gmail.com>
Date: Wed, 9 Oct 2024 17:46:18 +0200
Subject: [PATCH 09/63] a draft code for a RAG dataset generation

---
 .../llm/data_generation_for_RAG.ipynb         | 1238 +++++++++++++++++
 1 file changed, 1238 insertions(+)
 create mode 100644 src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb

diff --git a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
new file mode 100644
index 0000000000..d9bae54b0b
--- /dev/null
+++ b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
@@ -0,0 +1,1238 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "697ef555-f62c-424f-90da-bec9fbdace28",
+   "metadata": {},
+   "source": [
+    "## Extra Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bf4855a8-0d91-4d88-8fa2-05d2eb2ddbad",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting chromadb\n",
+      "  Obtaining dependency information for chromadb from https://files.pythonhosted.org/packages/43/cd/a875ed1f61365c9fdb46ee2de0cbea1735a9575ff718886f7eb218d4ef45/chromadb-0.5.12-py3-none-any.whl.metadata\n",
+      "  Downloading chromadb-0.5.12-py3-none-any.whl.metadata (6.8 kB)\n",
+      "Collecting build>=1.0.3 (from chromadb)\n",
+      "  Obtaining dependency information for build>=1.0.3 from https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl.metadata\n",
+      "  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)\n",
+      "Requirement already satisfied: pydantic>=1.9 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.10.14)\n",
+      "Collecting chroma-hnswlib==0.7.6 (from chromadb)\n",
+      "  Obtaining dependency information for chroma-hnswlib==0.7.6 from https://files.pythonhosted.org/packages/0d/19/aa6f2139f1ff7ad23a690ebf2a511b2594ab359915d7979f76f3213e46c4/chroma_hnswlib-0.7.6-cp311-cp311-macosx_11_0_arm64.whl.metadata\n",
+      "  Downloading chroma_hnswlib-0.7.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (252 bytes)\n",
+      "Requirement already satisfied: fastapi>=0.95.2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (0.104.1)\n",
+      "Requirement already satisfied: uvicorn[standard]>=0.18.3 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (0.23.2)\n",
+      "Requirement already satisfied: numpy>=1.22.5 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.26.2)\n",
+      "Collecting posthog>=2.4.0 (from chromadb)\n",
+      "  Obtaining dependency information for posthog>=2.4.0 from https://files.pythonhosted.org/packages/c2/11/a8d4283b324cda992fbb72611c46c5c68f87902a10383dba1bde91660cc6/posthog-3.7.0-py2.py3-none-any.whl.metadata\n",
+      "  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)\n",
+      "Requirement already satisfied: typing-extensions>=4.5.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (4.8.0)\n",
+      "Collecting onnxruntime>=1.14.1 (from chromadb)\n",
+      "  Obtaining dependency information for onnxruntime>=1.14.1 from https://files.pythonhosted.org/packages/f0/ff/77bee5df55f034ee81d2e1bc58b2b8511b9c54f06ce6566cb562c5d95aa5/onnxruntime-1.19.2-cp311-cp311-macosx_11_0_universal2.whl.metadata\n",
+      "  Downloading onnxruntime-1.19.2-cp311-cp311-macosx_11_0_universal2.whl.metadata (4.5 kB)\n",
+      "Requirement already satisfied: opentelemetry-api>=1.2.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.25.0)\n",
+      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.25.0)\n",
+      "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation-fastapi>=0.41b0 from https://files.pythonhosted.org/packages/ee/50/745ab075a3041b7a5f29a579d2c28eaad54f64b4589d8f9fd364c62cf0f3/opentelemetry_instrumentation_fastapi-0.48b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation_fastapi-0.48b0-py3-none-any.whl.metadata (2.1 kB)\n",
+      "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.25.0)\n",
+      "Requirement already satisfied: tokenizers>=0.13.2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (0.15.2)\n",
+      "Collecting pypika>=0.48.9 (from chromadb)\n",
+      "  Downloading PyPika-0.48.9.tar.gz (67 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Installing build dependencies ... \u001b[?25ldone\n",
+      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
+      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: tqdm>=4.65.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (4.66.1)\n",
+      "Requirement already satisfied: overrides>=7.3.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (7.4.0)\n",
+      "Requirement already satisfied: importlib-resources in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (6.1.3)\n",
+      "Requirement already satisfied: grpcio>=1.58.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.65.0)\n",
+      "Collecting bcrypt>=4.0.1 (from chromadb)\n",
+      "  Obtaining dependency information for bcrypt>=4.0.1 from https://files.pythonhosted.org/packages/96/86/8c6a84daed4dd878fbab094400c9174c43d9b838ace077a2f8ee8bc3ae12/bcrypt-4.2.0-cp39-abi3-macosx_10_12_universal2.whl.metadata\n",
+      "  Downloading bcrypt-4.2.0-cp39-abi3-macosx_10_12_universal2.whl.metadata (9.6 kB)\n",
+      "Requirement already satisfied: typer>=0.9.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (0.12.3)\n",
+      "Requirement already satisfied: kubernetes>=28.1.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (29.0.0)\n",
+      "Collecting tenacity>=8.2.3 (from chromadb)\n",
+      "  Obtaining dependency information for tenacity>=8.2.3 from https://files.pythonhosted.org/packages/b6/cb/b86984bed139586d01532a587464b5805f12e397594f19f931c4c2fbfa61/tenacity-9.0.0-py3-none-any.whl.metadata\n",
+      "  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)\n",
+      "Requirement already satisfied: PyYAML>=6.0.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (6.0.1)\n",
+      "Collecting mmh3>=4.0.1 (from chromadb)\n",
+      "  Obtaining dependency information for mmh3>=4.0.1 from https://files.pythonhosted.org/packages/13/f0/2d3daca276a4673f82af859e4b0b18befd4e6e54f1017ba48ea9735b2f1b/mmh3-5.0.1-cp311-cp311-macosx_11_0_arm64.whl.metadata\n",
+      "  Downloading mmh3-5.0.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (14 kB)\n",
+      "Requirement already satisfied: orjson>=3.9.12 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (3.10.1)\n",
+      "Collecting httpx>=0.27.0 (from chromadb)\n",
+      "  Obtaining dependency information for httpx>=0.27.0 from https://files.pythonhosted.org/packages/56/95/9377bcb415797e44274b51d46e3249eba641711cf3348050f76ee7b15ffc/httpx-0.27.2-py3-none-any.whl.metadata\n",
+      "  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)\n",
+      "Requirement already satisfied: rich>=10.11.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (13.5.2)\n",
+      "Requirement already satisfied: packaging>=19.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from build>=1.0.3->chromadb) (23.1)\n",
+      "Collecting pyproject_hooks (from build>=1.0.3->chromadb)\n",
+      "  Obtaining dependency information for pyproject_hooks from https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl.metadata\n",
+      "  Downloading pyproject_hooks-1.2.0-py3-none-any.whl.metadata (1.3 kB)\n",
+      "Requirement already satisfied: anyio<4.0.0,>=3.7.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from fastapi>=0.95.2->chromadb) (3.7.1)\n",
+      "Requirement already satisfied: starlette<0.28.0,>=0.27.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from fastapi>=0.95.2->chromadb) (0.27.0)\n",
+      "Requirement already satisfied: certifi in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (2024.7.4)\n",
+      "Collecting httpcore==1.* (from httpx>=0.27.0->chromadb)\n",
+      "  Obtaining dependency information for httpcore==1.* from https://files.pythonhosted.org/packages/06/89/b161908e2f51be56568184aeb4a880fd287178d176fd1c860d2217f41106/httpcore-1.0.6-py3-none-any.whl.metadata\n",
+      "  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)\n",
+      "Requirement already satisfied: idna in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (3.4)\n",
+      "Requirement already satisfied: sniffio in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (1.3.0)\n",
+      "Requirement already satisfied: h11<0.15,>=0.13 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.27.0->chromadb) (0.14.0)\n",
+      "Requirement already satisfied: six>=1.9.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\n",
+      "Requirement already satisfied: python-dateutil>=2.5.3 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.8.2)\n",
+      "Requirement already satisfied: google-auth>=1.0.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.25.1)\n",
+      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.6.1)\n",
+      "Requirement already satisfied: requests in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.32.1)\n",
+      "Requirement already satisfied: requests-oauthlib in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.3.1)\n",
+      "Requirement already satisfied: oauthlib>=3.2.2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\n",
+      "Requirement already satisfied: urllib3>=1.24.2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.0.4)\n",
+      "Collecting coloredlogs (from onnxruntime>=1.14.1->chromadb)\n",
+      "  Obtaining dependency information for coloredlogs from https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl.metadata\n",
+      "  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)\n",
+      "Collecting flatbuffers (from onnxruntime>=1.14.1->chromadb)\n",
+      "  Obtaining dependency information for flatbuffers from https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl.metadata\n",
+      "  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)\n",
+      "Requirement already satisfied: protobuf in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (4.25.1)\n",
+      "Requirement already satisfied: sympy in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (1.12)\n",
+      "Requirement already satisfied: deprecated>=1.2.6 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-api>=1.2.0->chromadb) (1.2.14)\n",
+      "Requirement already satisfied: importlib-metadata<=7.1,>=6.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-api>=1.2.0->chromadb) (6.8.0)\n",
+      "Requirement already satisfied: googleapis-common-protos~=1.52 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.61.0)\n",
+      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.25.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.25.0)\n",
+      "Requirement already satisfied: opentelemetry-proto==1.25.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.25.0)\n",
+      "Collecting opentelemetry-instrumentation-asgi==0.48b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation-asgi==0.48b0 from https://files.pythonhosted.org/packages/db/74/a0e0d38622856597dd8e630f2bd793760485eb165708e11b8be1696bbb5a/opentelemetry_instrumentation_asgi-0.48b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation_asgi-0.48b0-py3-none-any.whl.metadata (2.0 kB)\n",
+      "Collecting opentelemetry-instrumentation==0.48b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation==0.48b0 from https://files.pythonhosted.org/packages/0a/7f/405c41d4f359121376c9d5117dcf68149b8122d3f6c718996d037bd4d800/opentelemetry_instrumentation-0.48b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation-0.48b0-py3-none-any.whl.metadata (6.1 kB)\n",
+      "Collecting opentelemetry-semantic-conventions==0.48b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-semantic-conventions==0.48b0 from https://files.pythonhosted.org/packages/b7/7a/4f0063dbb0b6c971568291a8bc19a4ca70d3c185db2d956230dd67429dfc/opentelemetry_semantic_conventions-0.48b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_semantic_conventions-0.48b0-py3-none-any.whl.metadata (2.4 kB)\n",
+      "Collecting opentelemetry-util-http==0.48b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-util-http==0.48b0 from https://files.pythonhosted.org/packages/ad/2e/36097c0a4d0115b8c7e377c90bab7783ac183bc5cb4071308f8959454311/opentelemetry_util_http-0.48b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_util_http-0.48b0-py3-none-any.whl.metadata (2.5 kB)\n",
+      "Requirement already satisfied: setuptools>=16.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-instrumentation==0.48b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (65.5.1)\n",
+      "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-instrumentation==0.48b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.16.0)\n",
+      "Collecting asgiref~=3.0 (from opentelemetry-instrumentation-asgi==0.48b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for asgiref~=3.0 from https://files.pythonhosted.org/packages/39/e3/893e8757be2612e6c266d9bb58ad2e3651524b5b40cf56761e985a28b13e/asgiref-3.8.1-py3-none-any.whl.metadata\n",
+      "  Downloading asgiref-3.8.1-py3-none-any.whl.metadata (9.3 kB)\n",
+      "Collecting opentelemetry-api>=1.2.0 (from chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-api>=1.2.0 from https://files.pythonhosted.org/packages/fb/1f/737dcdbc9fea2fa96c1b392ae47275165a7c641663fbb08a8d252968eed2/opentelemetry_api-1.27.0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_api-1.27.0-py3-none-any.whl.metadata (1.4 kB)\n",
+      "INFO: pip is looking at multiple versions of opentelemetry-sdk to determine which version is compatible with other requirements. This could take a while.\n",
+      "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation-fastapi>=0.41b0 from https://files.pythonhosted.org/packages/a5/29/a97842d6dfa679bf0f3624ce1ea3458eb185befd536cafe580daa9ab68ae/opentelemetry_instrumentation_fastapi-0.47b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation_fastapi-0.47b0-py3-none-any.whl.metadata (2.1 kB)\n",
+      "Collecting opentelemetry-instrumentation-asgi==0.47b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation-asgi==0.47b0 from https://files.pythonhosted.org/packages/ba/d9/c74cb6d69589cc97d856cb3f427dfcef37ec16f9564586290c9c075d9020/opentelemetry_instrumentation_asgi-0.47b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation_asgi-0.47b0-py3-none-any.whl.metadata (2.0 kB)\n",
+      "Collecting opentelemetry-instrumentation==0.47b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation==0.47b0 from https://files.pythonhosted.org/packages/1f/6a/be31a84ddd13e9018fcca6885e4710f227eb0fd06eda1896da67287faa2e/opentelemetry_instrumentation-0.47b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation-0.47b0-py3-none-any.whl.metadata (6.1 kB)\n",
+      "Collecting opentelemetry-semantic-conventions==0.47b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-semantic-conventions==0.47b0 from https://files.pythonhosted.org/packages/00/c2/ca5cef8e4cd8eec5a95deed95ec3f6005e499fd9d17ca08731ced03a6921/opentelemetry_semantic_conventions-0.47b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_semantic_conventions-0.47b0-py3-none-any.whl.metadata (2.4 kB)\n",
+      "Collecting opentelemetry-util-http==0.47b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-util-http==0.47b0 from https://files.pythonhosted.org/packages/10/7e/98749e14a4e3f4db8bc016e6b42aba40e4d934baeb8767b8658a99d0dfac/opentelemetry_util_http-0.47b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_util_http-0.47b0-py3-none-any.whl.metadata (2.5 kB)\n",
+      "Collecting opentelemetry-api>=1.2.0 (from chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-api>=1.2.0 from https://files.pythonhosted.org/packages/e3/a7/6322d1d7a1fb926e8b99208c27730f21217da2f1e0e11dab48a78a0427a4/opentelemetry_api-1.26.0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_api-1.26.0-py3-none-any.whl.metadata (1.4 kB)\n",
+      "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation-fastapi>=0.41b0 from https://files.pythonhosted.org/packages/b8/96/905d575947342c4fd6781a28f6d7bc7f4f6670d45e3b1a85f8a06955c9ae/opentelemetry_instrumentation_fastapi-0.46b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation_fastapi-0.46b0-py3-none-any.whl.metadata (2.0 kB)\n",
+      "Collecting opentelemetry-instrumentation-asgi==0.46b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation-asgi==0.46b0 from https://files.pythonhosted.org/packages/47/8d/8955c7fbd949e3ea1c186c7422047f675bf4f7c8976afd2fdf713183318e/opentelemetry_instrumentation_asgi-0.46b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation_asgi-0.46b0-py3-none-any.whl.metadata (1.9 kB)\n",
+      "Collecting opentelemetry-instrumentation==0.46b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-instrumentation==0.46b0 from https://files.pythonhosted.org/packages/10/e5/d6fff0a6f6fbddf03c7fb48ab47925581c4f1a8268f9ad98e5ea4a8b90a5/opentelemetry_instrumentation-0.46b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_instrumentation-0.46b0-py3-none-any.whl.metadata (6.1 kB)\n",
+      "Requirement already satisfied: opentelemetry-semantic-conventions==0.46b0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.46b0)\n",
+      "Collecting opentelemetry-util-http==0.46b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
+      "  Obtaining dependency information for opentelemetry-util-http==0.46b0 from https://files.pythonhosted.org/packages/a2/7f/26d3d8880ea79adde8bb7bc306b25ca5134d6f6c3006ba464716405b4729/opentelemetry_util_http-0.46b0-py3-none-any.whl.metadata\n",
+      "  Downloading opentelemetry_util_http-0.46b0-py3-none-any.whl.metadata (2.4 kB)\n",
+      "Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb)\n",
+      "  Obtaining dependency information for monotonic>=1.5 from https://files.pythonhosted.org/packages/9a/67/7e8406a29b6c45be7af7740456f7f37025f0506ae2e05fb9009a53946860/monotonic-1.6-py2.py3-none-any.whl.metadata\n",
+      "  Downloading monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)\n",
+      "Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb)\n",
+      "  Obtaining dependency information for backoff>=1.10.0 from https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl.metadata\n",
+      "  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)\n",
+      "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from rich>=10.11.0->chromadb) (3.0.0)\n",
+      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from rich>=10.11.0->chromadb) (2.16.1)\n",
+      "Requirement already satisfied: huggingface_hub<1.0,>=0.16.4 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from tokenizers>=0.13.2->chromadb) (0.22.2)\n",
+      "Requirement already satisfied: click>=8.0.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from typer>=0.9.0->chromadb) (8.1.6)\n",
+      "Requirement already satisfied: shellingham>=1.3.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from typer>=0.9.0->chromadb) (1.5.4)\n",
+      "Requirement already satisfied: httptools>=0.5.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.6.1)\n",
+      "Requirement already satisfied: python-dotenv>=0.13 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (1.0.1)\n",
+      "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.19.0)\n",
+      "Requirement already satisfied: watchfiles>=0.13 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.21.0)\n",
+      "Requirement already satisfied: websockets>=10.4 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (12.0)\n",
+      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.3.2)\n",
+      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.3.0)\n",
+      "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\n",
+      "Requirement already satisfied: filelock in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.13.1)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2024.9.0)\n",
+      "Requirement already satisfied: zipp>=0.5 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from importlib-metadata<=7.1,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.16.2)\n",
+      "Requirement already satisfied: mdurl~=0.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->chromadb) (0.1.2)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from requests->kubernetes>=28.1.0->chromadb) (3.2.0)\n",
+      "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.14.1->chromadb)\n",
+      "  Obtaining dependency information for humanfriendly>=9.1 from https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl.metadata\n",
+      "  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)\n",
+      "Requirement already satisfied: mpmath>=0.19 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\n",
+      "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.5.1)\n",
+      "Downloading chromadb-0.5.12-py3-none-any.whl (602 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m602.6/602.6 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading chroma_hnswlib-0.7.6-cp311-cp311-macosx_11_0_arm64.whl (185 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m185.0/185.0 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading bcrypt-4.2.0-cp39-abi3-macosx_10_12_universal2.whl (472 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m472.4/472.4 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
+      "\u001b[?25hDownloading build-1.2.2.post1-py3-none-any.whl (22 kB)\n",
+      "Downloading httpx-0.27.2-py3-none-any.whl (76 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.0/78.0 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading mmh3-5.0.1-cp311-cp311-macosx_11_0_arm64.whl (38 kB)\n",
+      "Downloading onnxruntime-1.19.2-cp311-cp311-macosx_11_0_universal2.whl (16.8 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.8/16.8 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading opentelemetry_instrumentation_fastapi-0.46b0-py3-none-any.whl (11 kB)\n",
+      "Downloading opentelemetry_instrumentation-0.46b0-py3-none-any.whl (29 kB)\n",
+      "Downloading opentelemetry_instrumentation_asgi-0.46b0-py3-none-any.whl (14 kB)\n",
+      "Downloading opentelemetry_util_http-0.46b0-py3-none-any.whl (6.9 kB)\n",
+      "Downloading posthog-3.7.0-py2.py3-none-any.whl (54 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.4/54.4 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading tenacity-9.0.0-py3-none-any.whl (28 kB)\n",
+      "Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
+      "Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
+      "Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading flatbuffers-24.3.25-py2.py3-none-any.whl (26 kB)\n",
+      "Downloading pyproject_hooks-1.2.0-py3-none-any.whl (10 kB)\n",
+      "Downloading asgiref-3.8.1-py3-none-any.whl (23 kB)\n",
+      "Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hBuilding wheels for collected packages: pypika\n",
+      "  Building wheel for pypika (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for pypika: filename=PyPika-0.48.9-py2.py3-none-any.whl size=53723 sha256=8643461882832391b1d76edba76a526166c2d9df66a46f567985fb53216f60b6\n",
+      "  Stored in directory: /Users/emelidral/Library/Caches/pip/wheels/a3/01/bd/4c40ceb9d5354160cb186dcc153360f4ab7eb23e2b24daf96d\n",
+      "Successfully built pypika\n",
+      "Installing collected packages: pypika, monotonic, flatbuffers, tenacity, pyproject_hooks, opentelemetry-util-http, mmh3, humanfriendly, httpcore, chroma-hnswlib, bcrypt, backoff, asgiref, posthog, httpx, coloredlogs, build, opentelemetry-instrumentation, onnxruntime, opentelemetry-instrumentation-asgi, opentelemetry-instrumentation-fastapi, chromadb\n",
+      "  Attempting uninstall: tenacity\n",
+      "    Found existing installation: tenacity 8.2.2\n",
+      "    Uninstalling tenacity-8.2.2:\n",
+      "      Successfully uninstalled tenacity-8.2.2\n",
+      "  Attempting uninstall: httpcore\n",
+      "    Found existing installation: httpcore 0.17.3\n",
+      "    Uninstalling httpcore-0.17.3:\n",
+      "      Successfully uninstalled httpcore-0.17.3\n",
+      "  Attempting uninstall: httpx\n",
+      "    Found existing installation: httpx 0.24.1\n",
+      "    Uninstalling httpx-0.24.1:\n",
+      "      Successfully uninstalled httpx-0.24.1\n",
+      "Successfully installed asgiref-3.8.1 backoff-2.2.1 bcrypt-4.2.0 build-1.2.2.post1 chroma-hnswlib-0.7.6 chromadb-0.5.12 coloredlogs-15.0.1 flatbuffers-24.3.25 httpcore-1.0.6 httpx-0.27.2 humanfriendly-10.0 mmh3-5.0.1 monotonic-1.6 onnxruntime-1.19.2 opentelemetry-instrumentation-0.46b0 opentelemetry-instrumentation-asgi-0.46b0 opentelemetry-instrumentation-fastapi-0.46b0 opentelemetry-util-http-0.46b0 posthog-3.7.0 pypika-0.48.9 pyproject_hooks-1.2.0 tenacity-9.0.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "! pip install chromadb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5b8aec3-e00e-4919-b2a8-b19722311261",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "id": "547c43f3-e58f-450c-b80b-c396eb2655a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import openai \n",
+    "from openai import OpenAI\n",
+    "import pprint\n",
+    "import pandas as pd\n",
+    "import random\n",
+    "\n",
+    "import chromadb\n",
+    "from chromadb.utils import embedding_functions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c6fac99-e1db-48be-9554-88ecddac271e",
+   "metadata": {},
+   "source": [
+    "## Chunked data collection setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "9b93b470-9d32-4757-9d03-915992e2a7c3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collection db_collection created successfully\n",
+      "{'data': None,\n",
+      " 'documents': [],\n",
+      " 'embeddings': None,\n",
+      " 'ids': [],\n",
+      " 'included': ['metadatas', 'documents'],\n",
+      " 'metadatas': [],\n",
+      " 'uris': None}\n"
+     ]
+    }
+   ],
+   "source": [
+    "collection_name = \"db_collection\"\n",
+    "default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=\"all-MiniLM-L6-v2\")\n",
+    "\n",
+    "chroma_client = chromadb.PersistentClient(path=\"./chromadb/\")\n",
+    "# declare ChromaDB collection\n",
+    "collection = chroma_client.get_or_create_collection(\n",
+    "    name=collection_name,\n",
+    "    embedding_function=default_embedding_function\n",
+    "    )\n",
+    "\n",
+    "result = collection.get()\n",
+    "\n",
+    "print(f\"Collection {collection_name} created successfully\")\n",
+    "pprint.pprint(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "d547021f-9d4d-42cf-b580-abc6a1008cd1",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "def load_md_from_dir(dir_path):\n",
+    "    \"\"\"\n",
+    "    Loads Markdown (.md) files from the specified directory.\n",
+    "\n",
+    "    Args:\n",
+    "        dir_path (str): Path to the directory containing .md files.\n",
+    "\n",
+    "    Returns:\n",
+    "        List[dict]: A list of dictionaries with the text content of each .md file.\n",
+    "    \"\"\"\n",
+    "    md_files = [\n",
+    "        os.path.join(dir_path, filename) \n",
+    "        for filename in os.listdir(dir_path) \n",
+    "        if filename.endswith(\".md\")\n",
+    "    ]\n",
+    "    \n",
+    "    documents = []\n",
+    "    for file_path in md_files:\n",
+    "        with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "            documents.append({\"text\": file.read()})\n",
+    "    \n",
+    "    return documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "53f147f1-f8a2-4095-a840-2bacbc0aaf63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split_text(text, chunk_size=100, chunk_overlap=20):\n",
+    "    \"\"\"\n",
+    "    Splits the input text into overlapping chunks.\n",
+    "\n",
+    "    Args:\n",
+    "        text (str): The text to split.\n",
+    "        chunk_size (int): The size of each chunk. Default is 100.\n",
+    "        chunk_overlap (int): The number of overlapping characters between chunks. Default is 20.\n",
+    "\n",
+    "    Returns:\n",
+    "        List[str]: A list of text chunks.\n",
+    "    \"\"\"\n",
+    "    chunks = []\n",
+    "    text_length = len(text)\n",
+    "    \n",
+    "    for start in range(0, text_length, chunk_size - chunk_overlap):\n",
+    "        end = min(start + chunk_size, text_length)\n",
+    "        chunks.append(text[start:end])\n",
+    "    \n",
+    "    return chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "c0c9c8e2-0f2f-4fe0-aeee-68b0bb67cea8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " 4 files loaded\n",
+      "Split in to 270 chunks\n"
+     ]
+    }
+   ],
+   "source": [
+    "directory_path = \"./evidently_reference/\"\n",
+    "\n",
+    "# load documents from directory\n",
+    "md_files = load_md_from_dir(directory_path)\n",
+    "\n",
+    "print(f\" {len(md_files)} files loaded\")\n",
+    "\n",
+    "# Split text into chunks\n",
+    "chunked_files = [\n",
+    "    {\n",
+    "        'id': f\"{file_id}-{chunk_id}\",\n",
+    "        'text': chunk,\n",
+    "    }\n",
+    "    for file_id, file in enumerate(md_files)\n",
+    "    for chunk_id, chunk in enumerate(split_text(file[\"text\"], chunk_size=500, chunk_overlap=50))\n",
+    "]\n",
+    "\n",
+    "print(f\"Split in to {len(chunked_files)} chunks\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "7bf99fde-8aa7-4111-ad7e-eec59bd0c23e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collection db_collection has 270 documents\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Insert documents with embeddings into the ChromaDB collection\n",
+    "for chunk in chunked_files:\n",
+    "    collection.upsert(\n",
+    "            ids=chunk['id'],\n",
+    "            documents=chunk['text'],\n",
+    "    )\n",
+    "\n",
+    "result = collection.get()\n",
+    "\n",
+    "print(f\"Collection {collection_name} has {len(result['ids'])} documents\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "50ec8822-e2dc-4a01-bad3-44f1f123ed5c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Just in case we need to delete the collection\n",
+    "#list_collections = chroma_client.list_collections()\n",
+    "#print(list_collections)\n",
+    "\n",
+    "chroma_client.delete_collection(collection_name)\n",
+    "list_collections = chroma_client.list_collections()\n",
+    "print(list_collections)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ebaf34e6-d454-4d38-b08a-f57550b39e74",
+   "metadata": {},
+   "source": [
+    "## Dataset Generation chain of prompts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "ced14436-c4b3-4b25-8cc7-7cdb112eed66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]\n",
+    "client = OpenAI(\n",
+    "    api_key=os.environ.get(\"OPENAI_API_KEY\"),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04b0cf1e-5356-44e4-855b-5169a21260e2",
+   "metadata": {},
+   "source": [
+    "### Get a seed query"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe13a2c7-1c76-4d18-8bde-d1821078822f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "seed_query = \"How do I get Evidently data drift report for my data?\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "id": "d81f6108-79ef-4148-bc8e-b1050cc1637f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ods](../customization/options-for-statistical-tests.md).\\n{% endhint %}\\n\\n## Text Data \\n\\n![](../.gitbook/assets/reports/metric_column_drift_text-min.png)\\n\\nText content drift using a **domain classifier**. Evidently trains a binary classification model to discriminate between data from reference and current distributions. \\n\\nThe default for **small data with <= 1000 observations** detects drift if the ROC AUC of the drift detection classifier > possible ROC AUC of the random classifier at a 95th per',\n",
+       " 'score and compares it to the reference or against a defined condition. | **Required**:<br>N/A<br><br> **Optional**:<ul><li>`threshold_probas`(default for classification = None; default for probabilistic classification = 0.5)</li><li>`k`</li></ul> **Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/-20% or better than a dummy model.<br><br>**With reference**: if the F1 is over 20% higher or lower, the test fails.<br><br>**No reference**: if the F1 is lower than the F1 of the d',\n",
+       " '><li> Checks if the text begins with a specified combination. </li><li> Returns True/False for every input.</li></ul> Example use:<br> `BeginsWith(prefix=\"How\")`| **Required:**<br>`prefix`<br><br>**Optional:**<ul><li>`display_name`</li><li>`case_sensitive = True` or `False`</li></ul> |\\n| **EndsWith()** <ul><li> Checks if the text ends with a specified combination. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `EndsWith(suffix=\"Thank you.\")`| **Required:**<br>`suffix`<',\n",
+       " 'ositive Rate)</li><li>TNR (True Negative Rate)</li><li>FPR (False Positive Rate)</li><li>FNR (False Negative Rate)</li><li>ROC AUC Score (for probabilistic classification)</li><li>LogLoss (for probabilistic classification) </li></ul> | **Required:**:<br>n/a<br><br>**Optional:**<ul><li>`probas_threshold` (default for classification = None; default for probabilistic classification = 0.5)</li><li>`k` (default = None)</li></ul> |\\n| **ClassificationClassBalance()** <br><br> Calculates the number of o',\n",
+       " 'columns contain empty or infinite values (+-np.inf), these values will be filtered out when calculating distribution drift in the corresponding column.\\n\\nBy default, drift tests do not react to changes or increases in the number of empty values. Since the high number of nulls can be an important indicator, we recommend grouping the data drift tests (that check for distribution shift) with data integrity tests (that check for a share of nulls). You can choose from several null-related [tests](all-',\n",
+       " '. Prediction and target are required. Input features are optional.\\n\\n**Composition**:\\n* `RegressionQualityMetric()`\\n* `RegressionPredictedVsActualScatter()`\\n* `RegressionPredictedVsActualPlot()`\\n* `RegressionErrorPlot()`\\n* `RegressionAbsPercentageErrorPlot()`\\n* `RegressionErrorDistribution()`\\n* `RegressionErrorNormality()`\\n* `RegressionTopErrorMetric()`\\n* `RegressionErrorBiasTable()` for all or specified `columns`\\n\\n**Optional parameters**:\\n* `columns`\\n\\n</details>\\n\\n<details>\\n  \\n<summary>Classifica',\n",
+       " 'ders items that are present in training. \\n\\nFurther reading: [Castells, P., Vargas, S., & Wang, J. (2011). Novelty and Diversity Metrics for Recommender Systems: Choice, Discovery and Relevance](https://repositorio.uam.es/bitstream/handle/10486/666094/novelty_castells_DDR_2011.pdf)\\n\\n# Serendipity\\n\\n![](../.gitbook/assets/reports/metric_serendipity-min.png)\\n\\n**Evidently Metric**: `SerendipityMetric`\\n\\nRecommendation serendipity: this metric measures how unusual the relevant recommendations are in K,',\n",
+       " 'ems in positions 1, 2, and 10 are relevant, the formula will look as:\\n\\n$$\\nAP@10 = \\\\frac{Precision@1+Precision@2+Precision@10}{3}\\n$$\\n\\n* **Compute Mean Average Precision (MAP) at K**. Average the results across all users (or queries) in the dataset.\\n\\n$$\\n\\\\text{MAP@K} = \\\\frac{1}{U} \\\\sum_{u=1}^{U} \\\\text{AP@K}_u\\n$$\\n\\nWhere *U* is the total number of users or queries in the dataset, and *AP* is the average precision for a given list.\\n\\n**Range**: 0 to 1.\\n\\n**Interpretation**: Higher MAP at K values indica',\n",
+       " 'r><br>**With reference**: the test fails if the number of columns with missing values is higher than in reference.  <br>**No reference**: the test fails if the dataset contains columns with missing values.|\\n| **TestShareOfColumnsWithMissingValues()** | Dataset-level. <br><br> Tests the share of columns that contain missing values in the dataset against the reference or a defined condition.| **Required**:<br> N/A <br><br> **Optional**: <ul><li>`missing_values = [], replace = True/False` (default ',\n",
+       " 'talog.\\n\\n**Range**: 0 to 1, where 0 represents the perfect equality (recommended items are evenly distributed among users), and 1 is complete inequality (the recommendations are concentrated on a single item).\\n\\n**Interpretation**: the lower the value (usually preferable), the more equal the item distribution in recommendations. If the value is high, a few items are frequently recommended to many users while others are ignored.\\n\\nFurther reading: [Abdollahpouri, H., Mansoury, M., Burke, R., Mobashe']"
+      ]
+     },
+     "execution_count": 131,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Fixed size for the random list\n",
+    "sample_size = 10\n",
+    "\n",
+    "# Generate a random list with the fixed size from the existing list\n",
+    "random_chuncks = [item['text'] for item in random.sample(chunked_files, min(sample_size, len(chunked_files)))]\n",
+    "random_chuncks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "id": "90712120-3ac4-48d0-a749-9f8ef72f4247",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "system_prompt = \"You are an assisstant who generates questions based on provided context\"\n",
+    "number_of_questions = 10\n",
+    "user_prompt = \"\"\"\n",
+    "Generate {N} conceptual questions based on the provided context and can be answered from the information in the provided context.\n",
+    "Here is a context\n",
+    "<context>\n",
+    "    {context}\n",
+    "</context>\n",
+    "\n",
+    "Remain faithful to the underlying context. \n",
+    "Avoid providing any preamble!\n",
+    "Avoid providing any closing statement!\n",
+    "Please return only a list of coma separated generated questions in string format.\n",
+    "\"\"\"\n",
+    "\n",
+    "context = \"\\n\\n\".join(random_chuncks)\n",
+    "\n",
+    "formated_user_prompt = user_prompt.format(context=context, N=number_of_questions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "id": "7aa2632f-5bda-4395-81a2-77ccb4dd994b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"gpt-4o\",  # Updated to a valid model\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": formated_user_prompt}\n",
+    "    ],\n",
+    "    max_tokens=400,  # Limits the response length\n",
+    "    temperature=0.7,  # Controls randomness in the output\n",
+    "    n=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "id": "d01f1b79-7781-4e60-b6b0-71a31f860376",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_seed_queries = response.choices[0].message.content.strip().split(\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "id": "4ba9d23b-b502-4e6e-9535-6c672d6ec309",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['\"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?',\n",
+       " ' What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?',\n",
+       " ' What is the purpose of the HuggingFaceModel function?',\n",
+       " ' How does the HuggingFaceToxicityModel function detect hate speech?',\n",
+       " ' What condition causes the TestNumberOfDuplicatedRows to fail without a reference?',\n",
+       " ' What is measured by the TestShareOfDriftedColumns function?',\n",
+       " ' What are the required and optional parameters for the ScoreDistribution function?',\n",
+       " ' What is the role of the ColumnSummaryMetric in the DataQualityPreset?',\n",
+       " ' How does the drift detection method choose the appropriate test for each column?',\n",
+       " ' How is AP@K calculated in the context of relevant item positions?\"']"
+      ]
+     },
+     "execution_count": 118,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_seed_queries"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e8547f2-b3a7-4058-8175-b3872f318d1a",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true
+   },
+   "source": [
+    "### Get alternative questions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "id": "39df8c68-84cb-43af-aba3-63d1f10537ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#do not forget to write a prompt for seed query generation\n",
+    "system_prompt = \"You are a smart assistant who helps rephrase questions\" \n",
+    "\n",
+    "number_of_reformulations = 5\n",
+    "\n",
+    "seed_query = \"How do I get Evidently data drift report for my data?\"\n",
+    "\n",
+    "user_prompt = \"\"\"Write for me {number_of_reformulations} alternative questions quite similar to the question you got.\n",
+    "The question: {seed_query}\n",
+    "\n",
+    "Return a list of questions.\n",
+    "This should be only a list of string questions, separated by comma\n",
+    "\"\"\"\n",
+    "\n",
+    "formated_user_prompt = user_prompt.format(number_of_reformulations=number_of_reformulations, \n",
+    "                                          seed_query = generated_seed_query)\n",
+    "                         #seed_query=seed_query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "id": "983b4545-0511-473e-8797-7fbdf2d5ff54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Make a request to OpenAI to expand a seed question\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"gpt-4o\",  # Updated to a valid model\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": formated_user_prompt}\n",
+    "    ],\n",
+    "    max_tokens=400,  # Limits the response length\n",
+    "    temperature=0.7,  # Controls randomness in the output\n",
+    "    n=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "id": "9c2fe61b-5470-469a-949c-9e1a65c0f4e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generated Completion:\n",
+      "What criteria does the `TestShareOfColumnsWithMissingValues()` function use to identify failure without a reference dataset?, How does the absence of a reference affect the `TestShareOfColumnsWithMissingValues()` function's failure detection?, In what way does the `TestShareOfColumnsWithMissingValues()` function assess failure without having a reference?, How is failure determined by the `TestShareOfColumnsWithMissingValues()` function when a reference is not given?, What is the method used by the `TestShareOfColumnsWithMissingValues()` function to evaluate failure without a reference dataset?\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "['What criteria does the `TestShareOfColumnsWithMissingValues()` function use to identify failure without a reference dataset?',\n",
+       " \" How does the absence of a reference affect the `TestShareOfColumnsWithMissingValues()` function's failure detection?\",\n",
+       " ' In what way does the `TestShareOfColumnsWithMissingValues()` function assess failure without having a reference?',\n",
+       " ' How is failure determined by the `TestShareOfColumnsWithMissingValues()` function when a reference is not given?',\n",
+       " ' What is the method used by the `TestShareOfColumnsWithMissingValues()` function to evaluate failure without a reference dataset?']"
+      ]
+     },
+     "execution_count": 110,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "completion_text = response.choices[0].message.content\n",
+    "print(f\"Generated Completion:\\n{completion_text}\")\n",
+    "\n",
+    "queries = completion_text.strip().split(\",\")\n",
+    "queries"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df169dc5-acbd-46e7-b163-c5ebebb8ea0d",
+   "metadata": {},
+   "source": [
+    "### Find relevant chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "0333932a-8f6e-48e6-9f28-9f5c0406d091",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def query_collection(question, n_results = 3):\n",
+    "    \"\"\"\n",
+    "    Queries the collection with a given question and returns the relevant text chunks.\n",
+    "    \n",
+    "    Args:\n",
+    "        question (str): The query or question text to search for.\n",
+    "        n_results (int): Number of results to retrieve. Default is 3.\n",
+    "\n",
+    "    Returns:\n",
+    "        List[str]: A list of relevant text chunks.\n",
+    "    \"\"\"\n",
+    "    # Perform the query\n",
+    "    results = collection.query(\n",
+    "        query_texts=question,\n",
+    "        n_results=n_results,\n",
+    "        # include=['embeddings', 'documents', 'distances']\n",
+    "    )\n",
+    "\n",
+    "    # Extract relevant text chunks from the documents\n",
+    "    relevant_chunks = [\n",
+    "        chunk for document in results[\"documents\"] for chunk in document\n",
+    "    ]\n",
+    "    \n",
+    "    return relevant_chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "b827e30d-a7b2-406f-a139-5b7fdd3bab6c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['how to detect drift in ML embeddings](https://www.evidentlyai.com/blog/embedding-drift-detection).  \\n\\nAdditional links:  \\n\\n* [How to interpret data and prediction drift together? ](https://evidentlyai.com/blog/data-and-prediction-drift)  \\n\\n* [Do I need to monitor data drift if I can measure the ML model quality?](https://evidentlyai.com/blog/ml-monitoring-do-i-need-data-drift)  \\n\\n* [\"My data drifted. What\\'s next?\" How to handle ML model drift in production.](https://evidentlyai.com/blog/ml-monit',\n",
+       " 'arget). </li><li> Returns predicted probability for the “hate” label. </li><li> Scale: 0 to 1. </li></ul> | **Optional**: <ul><li>`toxic_label=\"hate\"` (default)</li><li> `display_name`</li></ul> |\\n\\n# Data Drift\\n\\n**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.\\n\\nTo modify the logic or select a different test,',\n",
+       " 'alculates the number and share of drifted features in the dataset. </li><li>Each feature is tested for drift individually using the default algorithm, unless a custom approach is specified.</li></ul>| **Required:**<br>n/a<br><br>**Optional:**<ul><li>`сolumns` (default=all)</li><li>`drift_share`(default for dataset drift = 0.5)</li> <li>`stattest`</li><li>`cat_stattest`</li><li>`num_stattest`</li><li>`per_column_stattest`</li><li>`stattest_threshold`</li><li>`cat_stattest_threshold`</li><li>`num_']"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "query_collection(seed_query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "id": "d549b9ab-1e3a-490e-a57e-669af72dbdb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#relevant_chunks = [query_collection(query) for query in queries]\n",
+    "relevant_chunks = [query_collection(query) for query in generated_seed_queries]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "id": "cfeb5073-f37c-4d24-85a7-bf2043dacb1e",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[['r><br>**With reference**: the test fails if the number of columns with missing values is higher than in reference.  <br>**No reference**: the test fails if the dataset contains columns with missing values.|\\n| **TestShareOfColumnsWithMissingValues()** | Dataset-level. <br><br> Tests the share of columns that contain missing values in the dataset against the reference or a defined condition.| **Required**:<br> N/A <br><br> **Optional**: <ul><li>`missing_values = [], replace = True/False` (default ',\n",
+       "  '**With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O',\n",
+       "  ' test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. <br><br> Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:<br>N/A<br><br>**Optional**:<ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul>**Test conditions** <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0.<br><br>**With reference**: the test fails if the share of'],\n",
+       " [\"the Test's defaults. You can see them in the tables below. The listed Preset parameters apply to the relevant individual Tests inside the Preset.\\n\\n<details>\\n \\n<summary>NoTargetPerformance Test Preset</summary>\\n\\nPreset name: `NoTargetPerformanceTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift(column_name=prediction)`\\n* `TestColumnShareOfMissingValues()` for `all` or `сolumns` if provided\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* \",\n",
+       "  'lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo',\n",
+       "  \"**: N/A |\\n\\n## Column Values\\n\\n| Test name  | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestColumnValueMin**(column_name='num-column') | Column-level. <br><br> Tests the minimum value of a given numerical column against reference or a defined condition. |  **Required**:<ul><li>`column_name`</li></ul> **Optional:** N/A <br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects not lower.<br><br>**With reference**: the test fails if the minimum \"],\n",
+       " ['tems by a chosen characteristic.\\n\\nThe visualization shows:\\n* The distribution of items in the training set for the defined `column_name` (with duplicates dropped). This represents the item catalog by this dimension. \\n* The distribution of the recommended items for the defined `column_name` in the current and reference (if available) datasets. \\n\\nThis visualization helps see the patterns in the model recommendations. In a simplified example, you might observe that the training data contains 3x com',\n",
+       "  'ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
+       "  'igate the sections. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Test or Test preset.  \\n* **Description**: plain text explanation. For Tests, we specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: available configurations. \\n  * Required parameters are necessary for calculations, e.g. a column name for a column-level test.\\n  * Optional parameters modify how the underlying metric is calculated, e.g. which statistical test or correlation method is use'],\n",
+       " ['l>| **Required:**<br>n/a<br><br>**Optional:**<ul><li>`display_name`</li></ul> |\\n| **HuggingFaceModel()** <br><br> Scores the text using the user-selected HuggingFace model.| See [docs](../customization/huggingface_descriptor.md) with some example models (classification by topic, emotion, etc.)|\\n| **HuggingFaceToxicityModel()** <ul><li> Detects hate speech using [HuggingFace Model](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). </li><li> Returns predicted probability fo',\n",
+       "  'xts (containing critical or pessimistic tone). Returns a label (NEGATIVE or POSITIVE) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **BiasLLMEval()** <br><br> Detects biased texts (containing prejudice for or against a person or group). Returns a label (BIAS or OK) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **ToxicityLLMEval()** <br><br> Detects toxic texts (containing harmful, offensive, or derogatory language). Returns a label (T',\n",
+       "  'arget). </li><li> Returns predicted probability for the “hate” label. </li><li> Scale: 0 to 1. </li></ul> | **Optional**: <ul><li>`toxic_label=\"hate\"` (default)</li><li> `display_name`</li></ul> |\\n\\n# Data Drift\\n\\n**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.\\n\\nTo modify the logic or select a different test,'],\n",
+       " ['*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. <br><br> Tests the number of duplicate rows against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/- 10% or none.<br><br>**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.<br><br>**No reference**: the te',\n",
+       "  '**With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O',\n",
+       "  ' in the reference.<br><br>**No reference**: the test fails if there is at least one duplicate row. |\\n| **TestNumberOfDuplicatedColumns()** | Dataset-level. <br><br> Tests the number of duplicate columns against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects =< or none.<br><br>**With reference**: the test fails if the number of duplicate columns is higher than in the reference.<b'],\n",
+       " ['lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo',\n",
+       "  'sition**: \\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestMostCommonValueShare()` for all or specified `columns`\\n* `TestNumberOfConstantColumns()`\\n* `TestNumberOfDuplicatedColumns()`\\n* `TestNumberOfDuplicatedRows()`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Drift Test Preset</summary>\\n\\nPreset name: `DataDriftTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift()` for all or specified `columns`\\n\\n**Optio',\n",
+       "  \"10%.<br><br>**With reference**: the test fails if the median value is different by more than 10%.<br><br>**No reference**: N/A |\\n| **TestColumnValueStd**(column_name='num-column')<br>| Column-level. <br><br> Tests the standard deviation of a given numerical column against reference or a defined condition. |   **Required**:<ul><li>`column_name`</li></ul> **Optional:**<br> N/A <br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10%.<br><br>**With reference**: the tes\"],\n",
+       " ['in the training dataset.<br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li><li>`column_name`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **ScoreDistribution()** <br><br> Computes the predicted score entropy. Visualizes the distribution of the scores at `k` (and all scores, if available).<br><br>Applies only when the `recommendations_type` is a `score`. | **Required**:<ul><li>`k`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **RecCasesTable()** <br><br> Shows the list of recomm',\n",
+       "  'Evidently Metric**: `ScoreDistribution`\\n\\nThis metric computes the predicted score entropy. It applies only when the `recommendations_type` is a score.\\n\\n**Implementation**:\\n* Apply softmax transformation for top-K scores for all users.\\n* Compute the KL divergence (relative entropy in [scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html)). \\n\\nThe visualization shows the distribution of the predicted scores at K (and all scores, if available). \\n\\n# Item Bias \\n\\n![](../',\n",
+       "  'icationProbDistribution()`- if probabilistic classification\\n* `ClassificationRocCurve()` - if probabilistic classification\\n* `ClassificationPRCurve()` - if probabilistic classification\\n* `ClassificationPRTable()` - if probabilistic classification\\n* `ClassificationQualityByFeatureTable()` for all or specified `columns`</li></ul>\\n\\n**Optional parameters**:\\n* `columns`\\n* `probas_threshold`\\n\\n</details>\\n\\n<details>\\n  \\n<summary>Text Overview Preset</summary>\\n\\n`TextOverviewPreset()` provides a summary fo'],\n",
+       " ['ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
+       "  '%}\\n\\n# Metric Presets\\n\\n**Defaults**: Presets use the default parameters for each Metric. You can see them in the tables below. \\n\\n<details>\\n\\n<summary>Data Quality Preset</summary>\\n\\n`DataQualityPreset` captures column and dataset summaries. Input columns are required. Prediction and target are optional.\\n\\n**Composition**:\\n* `DatasetSummaryMetric()`\\n* `ColumnSummaryMetric()` for `all` or specified `сolumns`\\n* `DatasetMissingValuesMetric()`\\n\\n**Optional parameters**:\\n* `columns`\\n\\n</details>\\n\\n<details>\\n',\n",
+       "  'lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo'],\n",
+       " ['In some tests and metrics, Evidently uses the default Data Drift Detection algorithm. It helps detect the distribution drift in the individual features, prediction, or target. This page describes how the **default** algorithm works.\\n\\n# How it works\\n\\nEvidently compares the distributions of the values in a given column (or columns) of the two datasets. You should pass these datasets as **reference** and **current**. Evidently applies several statistical tests and drift detection methods to detect ',\n",
+       "  'ct a different test, you should set [data drift parameters](../customization/options-for-statistical-tests.md). \\n\\n| Test name | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestNumberOfDriftedColumns()** | Dataset-level. <br><br> Compares the distribution of each column in the current dataset to the reference and tests the number of drifting features against a defined condition.| **Required**:<br>N/A<br><br>**Optional:**<ul><li>`сolumns`</li><li>`stattest`(default=',\n",
+       "  'tical tests and drift detection methods to detect if the distribution has changed significantly. It returns a \"drift detected\" or \"not detected\" result.\\n\\nThere is a default logic to choosing the appropriate drift test for each column. It is based on:\\n\\n* column type: categorical, numerical, text data or embeddings\\n* the number of observations in the reference dataset\\n* the number of unique values in the column (n\\\\_unique)\\n\\n## Tabular Data \\n\\n![](../.gitbook/assets/reports/metric_data_drift_table_2'],\n",
+       " ['at each relevant item position within the top K. To do that, we sum up precision at all values of K when the item is relevant (e.g., Precision @1, Precision@2..), and divide it by the total number of relevant items in K.\\n\\n$$\\n\\\\text{AP@K} = \\\\frac{1}{N} \\\\sum_{k=1}^{K} Precision(k) \\\\times rel(k)\\n$$\\n\\nWhere *N* is the total number of relevant items at K, and *rel(k)* is equal to 1 if the item is relevant, and is 0 otherwise.\\n\\nExample: if K = 10, and items in positions 1, 2, and 10 are relevant, the fo',\n",
+       "  ' 1 if any relevant item is included in K, or 0 otherwise.\\n* **Compute average hit rate**. The average of this metric is calculated across all users or queries.\\n\\n**Range**: 0 to 1, where 1 indicates that each user / query gets at least one relevant recommendation / retrieval.\\n\\n**Interpretation**: A higher Hit Rate indicates that a higher share of users / queries have relevant items in their lists. \\n\\n**Note**: the Hit Rate will typically increase for higher values of K (since there is a higher cha',\n",
+       "  'ems in positions 1, 2, and 10 are relevant, the formula will look as:\\n\\n$$\\nAP@10 = \\\\frac{Precision@1+Precision@2+Precision@10}{3}\\n$$\\n\\n* **Compute Mean Average Precision (MAP) at K**. Average the results across all users (or queries) in the dataset.\\n\\n$$\\n\\\\text{MAP@K} = \\\\frac{1}{U} \\\\sum_{u=1}^{U} \\\\text{AP@K}_u\\n$$\\n\\nWhere *U* is the total number of users or queries in the dataset, and *AP* is the average precision for a given list.\\n\\n**Range**: 0 to 1.\\n\\n**Interpretation**: Higher MAP at K values indica']]"
+      ]
+     },
+     "execution_count": 120,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "relevant_chunks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e24bdc4-fb5c-4df4-b00f-01903c4ed370",
+   "metadata": {},
+   "source": [
+    "### Baseline answer generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "ee992257-020d-461b-9b2f-928b93acb4c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Make a request to OpenAI to answer the generated question using the relevant context\n",
+    "\n",
+    "def generate_baseline_answer(query, relevant_chunks):\n",
+    "    system_prompt = \"You are a helpful assistant thet answer a given question directly withou any preamble\"\n",
+    "\n",
+    "    user_prompt = \"\"\"\n",
+    "    Your task is to answer the following query: \n",
+    "    <query>\n",
+    "    {query}\n",
+    "    </query>\n",
+    "    \n",
+    "    You have access to the following documents which are meant to provide context as you answer the query:\n",
+    "    <documents>\n",
+    "    {context}\n",
+    "    </documents>\n",
+    "    \n",
+    "    Please remain faithful to the underlying context, and deviate from it only if you haven't found the answer in the provided context. \n",
+    "    Avoid providing any preamble!\n",
+    "    Avoid providing any closing statement!\n",
+    "    Please return the answer only\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    context = \"\\n\\n\".join(relevant_chunks)\n",
+    "    formated_user_prompt = user_prompt.format(query=query, context=context)\n",
+    "\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=\"gpt-4o\",  # Updated to a valid model\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": formated_user_prompt}\n",
+    "        ],\n",
+    "        max_tokens=400,  # Limits the response length\n",
+    "        temperature=0.7,  # Controls randomness in the output\n",
+    "        n=1\n",
+    "    )\n",
+    "    \n",
+    "    completion_text = response.choices[0].message.content\n",
+    "    return completion_text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "id": "8db672b0-f63a-400b-b00f-22e96d02dbe5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "baseline_answers = [generate_baseline_answer(generated_seed_queries[i], relevant_chunks[i]) for i in range(min(len(generated_seed_queries), len(relevant_chunks)))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "id": "ae366d95-3438-4d6c-8030-8e8c666e0e17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_dataset = pd.DataFrame({\n",
+    "    'Query': generated_seed_queries,\n",
+    "    'Relevant chunks': relevant_chunks,\n",
+    "    'Baseline_answers': baseline_answers\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 128,
+   "id": "bdfc1029-34bb-4870-bcc2-0c32f56a0bc1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Query</th>\n",
+       "      <th>Relevant chunks</th>\n",
+       "      <th>Baseline_answers</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>\"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?</td>\n",
+       "      <td>[r&gt;&lt;br&gt;**With reference**: the test fails if the number of columns with missing values is higher than in reference.  &lt;br&gt;**No reference**: the test fails if the dataset contains columns with missing values.|\\n| **TestShareOfColumnsWithMissingValues()** | Dataset-level. &lt;br&gt;&lt;br&gt; Tests the share of columns that contain missing values in the dataset against the reference or a defined condition.| **Required**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt; **Optional**: &lt;ul&gt;&lt;li&gt;`missing_values = [], replace = True/False` (default , **With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. &lt;br&gt;&lt;br&gt;**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. &lt;br&gt;&lt;br&gt; Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:&lt;br&gt;N/A&lt;br&gt;&lt;br&gt;**O,  test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. &lt;br&gt;&lt;br&gt; Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:&lt;br&gt;N/A&lt;br&gt;&lt;br&gt;**Optional**:&lt;ul&gt;&lt;li&gt;`missing_values = [], replace = True/False` (default = default list)&lt;/li&gt;&lt;/ul&gt;**Test conditions** &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt;| Expects up to +10% or 0.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the share of]</td>\n",
+       "      <td>The TestShareOfColumnsWithMissingValues function determines that a dataset fails the test if the number of columns with missing values is higher than in the reference dataset.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?</td>\n",
+       "      <td>[the Test's defaults. You can see them in the tables below. The listed Preset parameters apply to the relevant individual Tests inside the Preset.\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;NoTargetPerformance Test Preset&lt;/summary&gt;\\n\\nPreset name: `NoTargetPerformanceTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift(column_name=prediction)`\\n* `TestColumnShareOfMissingValues()` for `all` or `сolumns` if provided\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* , lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;Data Quality Test Preset&lt;/summary&gt;\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo, **: N/A |\\n\\n## Column Values\\n\\n| Test name  | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestColumnValueMin**(column_name='num-column') | Column-level. &lt;br&gt;&lt;br&gt; Tests the minimum value of a given numerical column against reference or a defined condition. |  **Required**:&lt;ul&gt;&lt;li&gt;`column_name`&lt;/li&gt;&lt;/ul&gt; **Optional:** N/A &lt;br&gt;&lt;br&gt; **Test conditions**: &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt; | Expects not lower.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the minimum ]</td>\n",
+       "      <td>The optional parameters for the `TestShareOfColumnsWithMissingValues` function are `columns`.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>What is the purpose of the HuggingFaceModel function?</td>\n",
+       "      <td>[tems by a chosen characteristic.\\n\\nThe visualization shows:\\n* The distribution of items in the training set for the defined `column_name` (with duplicates dropped). This represents the item catalog by this dimension. \\n* The distribution of the recommended items for the defined `column_name` in the current and reference (if available) datasets. \\n\\nThis visualization helps see the patterns in the model recommendations. In a simplified example, you might observe that the training data contains 3x com, ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati, igate the sections. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Test or Test preset.  \\n* **Description**: plain text explanation. For Tests, we specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: available configurations. \\n  * Required parameters are necessary for calculations, e.g. a column name for a column-level test.\\n  * Optional parameters modify how the underlying metric is calculated, e.g. which statistical test or correlation method is use]</td>\n",
+       "      <td>The purpose of the HuggingFaceModel function is not specified in the provided documents.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>How does the HuggingFaceToxicityModel function detect hate speech?</td>\n",
+       "      <td>[l&gt;| **Required:**&lt;br&gt;n/a&lt;br&gt;&lt;br&gt;**Optional:**&lt;ul&gt;&lt;li&gt;`display_name`&lt;/li&gt;&lt;/ul&gt; |\\n| **HuggingFaceModel()** &lt;br&gt;&lt;br&gt; Scores the text using the user-selected HuggingFace model.| See [docs](../customization/huggingface_descriptor.md) with some example models (classification by topic, emotion, etc.)|\\n| **HuggingFaceToxicityModel()** &lt;ul&gt;&lt;li&gt; Detects hate speech using [HuggingFace Model](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). &lt;/li&gt;&lt;li&gt; Returns predicted probability fo, xts (containing critical or pessimistic tone). Returns a label (NEGATIVE or POSITIVE) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **BiasLLMEval()** &lt;br&gt;&lt;br&gt; Detects biased texts (containing prejudice for or against a person or group). Returns a label (BIAS or OK) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **ToxicityLLMEval()** &lt;br&gt;&lt;br&gt; Detects toxic texts (containing harmful, offensive, or derogatory language). Returns a label (T, arget). &lt;/li&gt;&lt;li&gt; Returns predicted probability for the “hate” label. &lt;/li&gt;&lt;li&gt; Scale: 0 to 1. &lt;/li&gt;&lt;/ul&gt; | **Optional**: &lt;ul&gt;&lt;li&gt;`toxic_label=\"hate\"` (default)&lt;/li&gt;&lt;li&gt; `display_name`&lt;/li&gt;&lt;/ul&gt; |\\n\\n# Data Drift\\n\\n**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.\\n\\nTo modify the logic or select a different test,]</td>\n",
+       "      <td>The HuggingFaceToxicityModel function detects hate speech using the HuggingFace model found at https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target. It returns a predicted probability for the \"hate\" label, with a scale from 0 to 1.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>What condition causes the TestNumberOfDuplicatedRows to fail without a reference?</td>\n",
+       "      <td>[*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. &lt;br&gt;&lt;br&gt; Tests the number of duplicate rows against reference or a defined condition. |**Required**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt; **Optional**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt;**Test conditions**: &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt;| Expects +/- 10% or none.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.&lt;br&gt;&lt;br&gt;**No reference**: the te, **With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. &lt;br&gt;&lt;br&gt;**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. &lt;br&gt;&lt;br&gt; Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:&lt;br&gt;N/A&lt;br&gt;&lt;br&gt;**O,  in the reference.&lt;br&gt;&lt;br&gt;**No reference**: the test fails if there is at least one duplicate row. |\\n| **TestNumberOfDuplicatedColumns()** | Dataset-level. &lt;br&gt;&lt;br&gt; Tests the number of duplicate columns against reference or a defined condition. |**Required**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt; **Optional**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt;**Test conditions**: &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt;| Expects =&lt; or none.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the number of duplicate columns is higher than in the reference.&lt;b]</td>\n",
+       "      <td>The test fails if there is at least one duplicate row.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>What is measured by the TestShareOfDriftedColumns function?</td>\n",
+       "      <td>[lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;Data Quality Test Preset&lt;/summary&gt;\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo, sition**: \\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestMostCommonValueShare()` for all or specified `columns`\\n* `TestNumberOfConstantColumns()`\\n* `TestNumberOfDuplicatedColumns()`\\n* `TestNumberOfDuplicatedRows()`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;Data Drift Test Preset&lt;/summary&gt;\\n\\nPreset name: `DataDriftTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift()` for all or specified `columns`\\n\\n**Optio, 10%.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the median value is different by more than 10%.&lt;br&gt;&lt;br&gt;**No reference**: N/A |\\n| **TestColumnValueStd**(column_name='num-column')&lt;br&gt;| Column-level. &lt;br&gt;&lt;br&gt; Tests the standard deviation of a given numerical column against reference or a defined condition. |   **Required**:&lt;ul&gt;&lt;li&gt;`column_name`&lt;/li&gt;&lt;/ul&gt; **Optional:**&lt;br&gt; N/A &lt;br&gt;&lt;br&gt; **Test conditions**: &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt; | Expects +/-10%.&lt;br&gt;&lt;br&gt;**With reference**: the tes]</td>\n",
+       "      <td>The `TestShareOfDriftedColumns` function measures the proportion of columns that have drifted between datasets.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>What are the required and optional parameters for the ScoreDistribution function?</td>\n",
+       "      <td>[in the training dataset.&lt;br&gt;&lt;br&gt;Requires a training dataset. | **Required**:&lt;ul&gt;&lt;li&gt;`k`&lt;/li&gt;&lt;li&gt;`column_name`&lt;/li&gt;&lt;/ul&gt;**Optional**:&lt;ul&gt;&lt;li&gt;-&lt;/li&gt;&lt;/ul&gt; |\\n| **ScoreDistribution()** &lt;br&gt;&lt;br&gt; Computes the predicted score entropy. Visualizes the distribution of the scores at `k` (and all scores, if available).&lt;br&gt;&lt;br&gt;Applies only when the `recommendations_type` is a `score`. | **Required**:&lt;ul&gt;&lt;li&gt;`k`&lt;/li&gt;&lt;/ul&gt;**Optional**:&lt;ul&gt;&lt;li&gt;-&lt;/li&gt;&lt;/ul&gt; |\\n| **RecCasesTable()** &lt;br&gt;&lt;br&gt; Shows the list of recomm, Evidently Metric**: `ScoreDistribution`\\n\\nThis metric computes the predicted score entropy. It applies only when the `recommendations_type` is a score.\\n\\n**Implementation**:\\n* Apply softmax transformation for top-K scores for all users.\\n* Compute the KL divergence (relative entropy in [scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html)). \\n\\nThe visualization shows the distribution of the predicted scores at K (and all scores, if available). \\n\\n# Item Bias \\n\\n![](../, icationProbDistribution()`- if probabilistic classification\\n* `ClassificationRocCurve()` - if probabilistic classification\\n* `ClassificationPRCurve()` - if probabilistic classification\\n* `ClassificationPRTable()` - if probabilistic classification\\n* `ClassificationQualityByFeatureTable()` for all or specified `columns`&lt;/li&gt;&lt;/ul&gt;\\n\\n**Optional parameters**:\\n* `columns`\\n* `probas_threshold`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n  \\n&lt;summary&gt;Text Overview Preset&lt;/summary&gt;\\n\\n`TextOverviewPreset()` provides a summary fo]</td>\n",
+       "      <td>**Required**: `k`  \\n**Optional**: None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>What is the role of the ColumnSummaryMetric in the DataQualityPreset?</td>\n",
+       "      <td>[ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati, %}\\n\\n# Metric Presets\\n\\n**Defaults**: Presets use the default parameters for each Metric. You can see them in the tables below. \\n\\n&lt;details&gt;\\n\\n&lt;summary&gt;Data Quality Preset&lt;/summary&gt;\\n\\n`DataQualityPreset` captures column and dataset summaries. Input columns are required. Prediction and target are optional.\\n\\n**Composition**:\\n* `DatasetSummaryMetric()`\\n* `ColumnSummaryMetric()` for `all` or specified `сolumns`\\n* `DatasetMissingValuesMetric()`\\n\\n**Optional parameters**:\\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n, lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;Data Quality Test Preset&lt;/summary&gt;\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo]</td>\n",
+       "      <td>The `ColumnSummaryMetric` in the `DataQualityPreset` is used to capture summaries for each column, either for all columns or specified columns.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>How does the drift detection method choose the appropriate test for each column?</td>\n",
+       "      <td>[In some tests and metrics, Evidently uses the default Data Drift Detection algorithm. It helps detect the distribution drift in the individual features, prediction, or target. This page describes how the **default** algorithm works.\\n\\n# How it works\\n\\nEvidently compares the distributions of the values in a given column (or columns) of the two datasets. You should pass these datasets as **reference** and **current**. Evidently applies several statistical tests and drift detection methods to detect , ct a different test, you should set [data drift parameters](../customization/options-for-statistical-tests.md). \\n\\n| Test name | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestNumberOfDriftedColumns()** | Dataset-level. &lt;br&gt;&lt;br&gt; Compares the distribution of each column in the current dataset to the reference and tests the number of drifting features against a defined condition.| **Required**:&lt;br&gt;N/A&lt;br&gt;&lt;br&gt;**Optional:**&lt;ul&gt;&lt;li&gt;`сolumns`&lt;/li&gt;&lt;li&gt;`stattest`(default=, tical tests and drift detection methods to detect if the distribution has changed significantly. It returns a \"drift detected\" or \"not detected\" result.\\n\\nThere is a default logic to choosing the appropriate drift test for each column. It is based on:\\n\\n* column type: categorical, numerical, text data or embeddings\\n* the number of observations in the reference dataset\\n* the number of unique values in the column (n\\_unique)\\n\\n## Tabular Data \\n\\n![](../.gitbook/assets/reports/metric_data_drift_table_2]</td>\n",
+       "      <td>The drift detection method chooses the appropriate test for each column based on the column type (categorical, numerical, text data, or embeddings), the number of observations in the reference dataset, and the number of unique values in the column.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>How is AP@K calculated in the context of relevant item positions?\"</td>\n",
+       "      <td>[at each relevant item position within the top K. To do that, we sum up precision at all values of K when the item is relevant (e.g., Precision @1, Precision@2..), and divide it by the total number of relevant items in K.\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nWhere *N* is the total number of relevant items at K, and *rel(k)* is equal to 1 if the item is relevant, and is 0 otherwise.\\n\\nExample: if K = 10, and items in positions 1, 2, and 10 are relevant, the fo,  1 if any relevant item is included in K, or 0 otherwise.\\n* **Compute average hit rate**. The average of this metric is calculated across all users or queries.\\n\\n**Range**: 0 to 1, where 1 indicates that each user / query gets at least one relevant recommendation / retrieval.\\n\\n**Interpretation**: A higher Hit Rate indicates that a higher share of users / queries have relevant items in their lists. \\n\\n**Note**: the Hit Rate will typically increase for higher values of K (since there is a higher cha, ems in positions 1, 2, and 10 are relevant, the formula will look as:\\n\\n$$\\nAP@10 = \\frac{Precision@1+Precision@2+Precision@10}{3}\\n$$\\n\\n* **Compute Mean Average Precision (MAP) at K**. Average the results across all users (or queries) in the dataset.\\n\\n$$\\n\\text{MAP@K} = \\frac{1}{U} \\sum_{u=1}^{U} \\text{AP@K}_u\\n$$\\n\\nWhere *U* is the total number of users or queries in the dataset, and *AP* is the average precision for a given list.\\n\\n**Range**: 0 to 1.\\n\\n**Interpretation**: Higher MAP at K values indica]</td>\n",
+       "      <td>AP@K is calculated by summing the precision at each position up to K where the item is relevant and dividing by the total number of relevant items within K. The formula is:\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nwhere *N* is the total number of relevant items in K, and *rel(k)* is 1 if the item at position k is relevant, otherwise 0.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                                                              Query  \\\n",
+       "0  \"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?   \n",
+       "1                     What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?   \n",
+       "2                                                             What is the purpose of the HuggingFaceModel function?   \n",
+       "3                                                How does the HuggingFaceToxicityModel function detect hate speech?   \n",
+       "4                                 What condition causes the TestNumberOfDuplicatedRows to fail without a reference?   \n",
+       "5                                                       What is measured by the TestShareOfDriftedColumns function?   \n",
+       "6                                 What are the required and optional parameters for the ScoreDistribution function?   \n",
+       "7                                             What is the role of the ColumnSummaryMetric in the DataQualityPreset?   \n",
+       "8                                  How does the drift detection method choose the appropriate test for each column?   \n",
+       "9                                                How is AP@K calculated in the context of relevant item positions?\"   \n",
+       "\n",
+       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Relevant chunks  \\\n",
+       "0                                                [r><br>**With reference**: the test fails if the number of columns with missing values is higher than in reference.  <br>**No reference**: the test fails if the dataset contains columns with missing values.|\\n| **TestShareOfColumnsWithMissingValues()** | Dataset-level. <br><br> Tests the share of columns that contain missing values in the dataset against the reference or a defined condition.| **Required**:<br> N/A <br><br> **Optional**: <ul><li>`missing_values = [], replace = True/False` (default , **With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O,  test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. <br><br> Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:<br>N/A<br><br>**Optional**:<ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul>**Test conditions** <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0.<br><br>**With reference**: the test fails if the share of]   \n",
+       "1              [the Test's defaults. You can see them in the tables below. The listed Preset parameters apply to the relevant individual Tests inside the Preset.\\n\\n<details>\\n \\n<summary>NoTargetPerformance Test Preset</summary>\\n\\nPreset name: `NoTargetPerformanceTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift(column_name=prediction)`\\n* `TestColumnShareOfMissingValues()` for `all` or `сolumns` if provided\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* , lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo, **: N/A |\\n\\n## Column Values\\n\\n| Test name  | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestColumnValueMin**(column_name='num-column') | Column-level. <br><br> Tests the minimum value of a given numerical column against reference or a defined condition. |  **Required**:<ul><li>`column_name`</li></ul> **Optional:** N/A <br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects not lower.<br><br>**With reference**: the test fails if the minimum ]   \n",
+       "2                             [tems by a chosen characteristic.\\n\\nThe visualization shows:\\n* The distribution of items in the training set for the defined `column_name` (with duplicates dropped). This represents the item catalog by this dimension. \\n* The distribution of the recommended items for the defined `column_name` in the current and reference (if available) datasets. \\n\\nThis visualization helps see the patterns in the model recommendations. In a simplified example, you might observe that the training data contains 3x com, ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati, igate the sections. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Test or Test preset.  \\n* **Description**: plain text explanation. For Tests, we specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: available configurations. \\n  * Required parameters are necessary for calculations, e.g. a column name for a column-level test.\\n  * Optional parameters modify how the underlying metric is calculated, e.g. which statistical test or correlation method is use]   \n",
+       "3                                         [l>| **Required:**<br>n/a<br><br>**Optional:**<ul><li>`display_name`</li></ul> |\\n| **HuggingFaceModel()** <br><br> Scores the text using the user-selected HuggingFace model.| See [docs](../customization/huggingface_descriptor.md) with some example models (classification by topic, emotion, etc.)|\\n| **HuggingFaceToxicityModel()** <ul><li> Detects hate speech using [HuggingFace Model](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). </li><li> Returns predicted probability fo, xts (containing critical or pessimistic tone). Returns a label (NEGATIVE or POSITIVE) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **BiasLLMEval()** <br><br> Detects biased texts (containing prejudice for or against a person or group). Returns a label (BIAS or OK) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **ToxicityLLMEval()** <br><br> Detects toxic texts (containing harmful, offensive, or derogatory language). Returns a label (T, arget). </li><li> Returns predicted probability for the “hate” label. </li><li> Scale: 0 to 1. </li></ul> | **Optional**: <ul><li>`toxic_label=\"hate\"` (default)</li><li> `display_name`</li></ul> |\\n\\n# Data Drift\\n\\n**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.\\n\\nTo modify the logic or select a different test,]   \n",
+       "4                                                [*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. <br><br> Tests the number of duplicate rows against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/- 10% or none.<br><br>**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.<br><br>**No reference**: the te, **With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O,  in the reference.<br><br>**No reference**: the test fails if there is at least one duplicate row. |\\n| **TestNumberOfDuplicatedColumns()** | Dataset-level. <br><br> Tests the number of duplicate columns against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects =< or none.<br><br>**With reference**: the test fails if the number of duplicate columns is higher than in the reference.<b]   \n",
+       "5          [lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo, sition**: \\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestMostCommonValueShare()` for all or specified `columns`\\n* `TestNumberOfConstantColumns()`\\n* `TestNumberOfDuplicatedColumns()`\\n* `TestNumberOfDuplicatedRows()`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Drift Test Preset</summary>\\n\\nPreset name: `DataDriftTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift()` for all or specified `columns`\\n\\n**Optio, 10%.<br><br>**With reference**: the test fails if the median value is different by more than 10%.<br><br>**No reference**: N/A |\\n| **TestColumnValueStd**(column_name='num-column')<br>| Column-level. <br><br> Tests the standard deviation of a given numerical column against reference or a defined condition. |   **Required**:<ul><li>`column_name`</li></ul> **Optional:**<br> N/A <br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10%.<br><br>**With reference**: the tes]   \n",
+       "6                     [in the training dataset.<br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li><li>`column_name`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **ScoreDistribution()** <br><br> Computes the predicted score entropy. Visualizes the distribution of the scores at `k` (and all scores, if available).<br><br>Applies only when the `recommendations_type` is a `score`. | **Required**:<ul><li>`k`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **RecCasesTable()** <br><br> Shows the list of recomm, Evidently Metric**: `ScoreDistribution`\\n\\nThis metric computes the predicted score entropy. It applies only when the `recommendations_type` is a score.\\n\\n**Implementation**:\\n* Apply softmax transformation for top-K scores for all users.\\n* Compute the KL divergence (relative entropy in [scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html)). \\n\\nThe visualization shows the distribution of the predicted scores at K (and all scores, if available). \\n\\n# Item Bias \\n\\n![](../, icationProbDistribution()`- if probabilistic classification\\n* `ClassificationRocCurve()` - if probabilistic classification\\n* `ClassificationPRCurve()` - if probabilistic classification\\n* `ClassificationPRTable()` - if probabilistic classification\\n* `ClassificationQualityByFeatureTable()` for all or specified `columns`</li></ul>\\n\\n**Optional parameters**:\\n* `columns`\\n* `probas_threshold`\\n\\n</details>\\n\\n<details>\\n  \\n<summary>Text Overview Preset</summary>\\n\\n`TextOverviewPreset()` provides a summary fo]   \n",
+       "7  [ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati, %}\\n\\n# Metric Presets\\n\\n**Defaults**: Presets use the default parameters for each Metric. You can see them in the tables below. \\n\\n<details>\\n\\n<summary>Data Quality Preset</summary>\\n\\n`DataQualityPreset` captures column and dataset summaries. Input columns are required. Prediction and target are optional.\\n\\n**Composition**:\\n* `DatasetSummaryMetric()`\\n* `ColumnSummaryMetric()` for `all` or specified `сolumns`\\n* `DatasetMissingValuesMetric()`\\n\\n**Optional parameters**:\\n* `columns`\\n\\n</details>\\n\\n<details>\\n, lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo]   \n",
+       "8                                 [In some tests and metrics, Evidently uses the default Data Drift Detection algorithm. It helps detect the distribution drift in the individual features, prediction, or target. This page describes how the **default** algorithm works.\\n\\n# How it works\\n\\nEvidently compares the distributions of the values in a given column (or columns) of the two datasets. You should pass these datasets as **reference** and **current**. Evidently applies several statistical tests and drift detection methods to detect , ct a different test, you should set [data drift parameters](../customization/options-for-statistical-tests.md). \\n\\n| Test name | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestNumberOfDriftedColumns()** | Dataset-level. <br><br> Compares the distribution of each column in the current dataset to the reference and tests the number of drifting features against a defined condition.| **Required**:<br>N/A<br><br>**Optional:**<ul><li>`сolumns`</li><li>`stattest`(default=, tical tests and drift detection methods to detect if the distribution has changed significantly. It returns a \"drift detected\" or \"not detected\" result.\\n\\nThere is a default logic to choosing the appropriate drift test for each column. It is based on:\\n\\n* column type: categorical, numerical, text data or embeddings\\n* the number of observations in the reference dataset\\n* the number of unique values in the column (n\\_unique)\\n\\n## Tabular Data \\n\\n![](../.gitbook/assets/reports/metric_data_drift_table_2]   \n",
+       "9                    [at each relevant item position within the top K. To do that, we sum up precision at all values of K when the item is relevant (e.g., Precision @1, Precision@2..), and divide it by the total number of relevant items in K.\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nWhere *N* is the total number of relevant items at K, and *rel(k)* is equal to 1 if the item is relevant, and is 0 otherwise.\\n\\nExample: if K = 10, and items in positions 1, 2, and 10 are relevant, the fo,  1 if any relevant item is included in K, or 0 otherwise.\\n* **Compute average hit rate**. The average of this metric is calculated across all users or queries.\\n\\n**Range**: 0 to 1, where 1 indicates that each user / query gets at least one relevant recommendation / retrieval.\\n\\n**Interpretation**: A higher Hit Rate indicates that a higher share of users / queries have relevant items in their lists. \\n\\n**Note**: the Hit Rate will typically increase for higher values of K (since there is a higher cha, ems in positions 1, 2, and 10 are relevant, the formula will look as:\\n\\n$$\\nAP@10 = \\frac{Precision@1+Precision@2+Precision@10}{3}\\n$$\\n\\n* **Compute Mean Average Precision (MAP) at K**. Average the results across all users (or queries) in the dataset.\\n\\n$$\\n\\text{MAP@K} = \\frac{1}{U} \\sum_{u=1}^{U} \\text{AP@K}_u\\n$$\\n\\nWhere *U* is the total number of users or queries in the dataset, and *AP* is the average precision for a given list.\\n\\n**Range**: 0 to 1.\\n\\n**Interpretation**: Higher MAP at K values indica]   \n",
+       "\n",
+       "                                                                                                                                                                                                                                                                                                                                                                             Baseline_answers  \n",
+       "0                                                                                                                                                                                                             The TestShareOfColumnsWithMissingValues function determines that a dataset fails the test if the number of columns with missing values is higher than in the reference dataset.  \n",
+       "1                                                                                                                                                                                                                                                                                               The optional parameters for the `TestShareOfColumnsWithMissingValues` function are `columns`.  \n",
+       "2                                                                                                                                                                                                                                                                                                    The purpose of the HuggingFaceModel function is not specified in the provided documents.  \n",
+       "3                                                                                                                                  The HuggingFaceToxicityModel function detects hate speech using the HuggingFace model found at https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target. It returns a predicted probability for the \"hate\" label, with a scale from 0 to 1.  \n",
+       "4                                                                                                                                                                                                                                                                                                                                      The test fails if there is at least one duplicate row.  \n",
+       "5                                                                                                                                                                                                                                                                             The `TestShareOfDriftedColumns` function measures the proportion of columns that have drifted between datasets.  \n",
+       "6                                                                                                                                                                                                                                                                                                                                                     **Required**: `k`  \\n**Optional**: None  \n",
+       "7                                                                                                                                                                                                                                             The `ColumnSummaryMetric` in the `DataQualityPreset` is used to capture summaries for each column, either for all columns or specified columns.  \n",
+       "8                                                                                                                                    The drift detection method chooses the appropriate test for each column based on the column type (categorical, numerical, text data, or embeddings), the number of observations in the reference dataset, and the number of unique values in the column.  \n",
+       "9  AP@K is calculated by summing the precision at each position up to K where the item is relevant and dividing by the total number of relevant items within K. The formula is:\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nwhere *N* is the total number of relevant items in K, and *rel(k)* is 1 if the item at position k is relevant, otherwise 0.  "
+      ]
+     },
+     "execution_count": 128,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 129,
+   "id": "3de32ca3-162f-4ed8-ba88-09a5b9572457",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option(\"display.max_colwidth\", None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 130,
+   "id": "db43a50d-4b1a-4b42-a529-67e85bef0f9a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Query</th>\n",
+       "      <th>Baseline_answers</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>\"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?</td>\n",
+       "      <td>The TestShareOfColumnsWithMissingValues function determines that a dataset fails the test if the number of columns with missing values is higher than in the reference dataset.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?</td>\n",
+       "      <td>The optional parameters for the `TestShareOfColumnsWithMissingValues` function are `columns`.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>What is the purpose of the HuggingFaceModel function?</td>\n",
+       "      <td>The purpose of the HuggingFaceModel function is not specified in the provided documents.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>How does the HuggingFaceToxicityModel function detect hate speech?</td>\n",
+       "      <td>The HuggingFaceToxicityModel function detects hate speech using the HuggingFace model found at https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target. It returns a predicted probability for the \"hate\" label, with a scale from 0 to 1.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>What condition causes the TestNumberOfDuplicatedRows to fail without a reference?</td>\n",
+       "      <td>The test fails if there is at least one duplicate row.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>What is measured by the TestShareOfDriftedColumns function?</td>\n",
+       "      <td>The `TestShareOfDriftedColumns` function measures the proportion of columns that have drifted between datasets.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>What are the required and optional parameters for the ScoreDistribution function?</td>\n",
+       "      <td>**Required**: `k`  \\n**Optional**: None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>What is the role of the ColumnSummaryMetric in the DataQualityPreset?</td>\n",
+       "      <td>The `ColumnSummaryMetric` in the `DataQualityPreset` is used to capture summaries for each column, either for all columns or specified columns.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>How does the drift detection method choose the appropriate test for each column?</td>\n",
+       "      <td>The drift detection method chooses the appropriate test for each column based on the column type (categorical, numerical, text data, or embeddings), the number of observations in the reference dataset, and the number of unique values in the column.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>How is AP@K calculated in the context of relevant item positions?\"</td>\n",
+       "      <td>AP@K is calculated by summing the precision at each position up to K where the item is relevant and dividing by the total number of relevant items within K. The formula is:\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nwhere *N* is the total number of relevant items in K, and *rel(k)* is 1 if the item at position k is relevant, otherwise 0.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                                                              Query  \\\n",
+       "0  \"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?   \n",
+       "1                     What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?   \n",
+       "2                                                             What is the purpose of the HuggingFaceModel function?   \n",
+       "3                                                How does the HuggingFaceToxicityModel function detect hate speech?   \n",
+       "4                                 What condition causes the TestNumberOfDuplicatedRows to fail without a reference?   \n",
+       "5                                                       What is measured by the TestShareOfDriftedColumns function?   \n",
+       "6                                 What are the required and optional parameters for the ScoreDistribution function?   \n",
+       "7                                             What is the role of the ColumnSummaryMetric in the DataQualityPreset?   \n",
+       "8                                  How does the drift detection method choose the appropriate test for each column?   \n",
+       "9                                                How is AP@K calculated in the context of relevant item positions?\"   \n",
+       "\n",
+       "                                                                                                                                                                                                                                                                                                                                                                             Baseline_answers  \n",
+       "0                                                                                                                                                                                                             The TestShareOfColumnsWithMissingValues function determines that a dataset fails the test if the number of columns with missing values is higher than in the reference dataset.  \n",
+       "1                                                                                                                                                                                                                                                                                               The optional parameters for the `TestShareOfColumnsWithMissingValues` function are `columns`.  \n",
+       "2                                                                                                                                                                                                                                                                                                    The purpose of the HuggingFaceModel function is not specified in the provided documents.  \n",
+       "3                                                                                                                                  The HuggingFaceToxicityModel function detects hate speech using the HuggingFace model found at https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target. It returns a predicted probability for the \"hate\" label, with a scale from 0 to 1.  \n",
+       "4                                                                                                                                                                                                                                                                                                                                      The test fails if there is at least one duplicate row.  \n",
+       "5                                                                                                                                                                                                                                                                             The `TestShareOfDriftedColumns` function measures the proportion of columns that have drifted between datasets.  \n",
+       "6                                                                                                                                                                                                                                                                                                                                                     **Required**: `k`  \\n**Optional**: None  \n",
+       "7                                                                                                                                                                                                                                             The `ColumnSummaryMetric` in the `DataQualityPreset` is used to capture summaries for each column, either for all columns or specified columns.  \n",
+       "8                                                                                                                                    The drift detection method chooses the appropriate test for each column based on the column type (categorical, numerical, text data, or embeddings), the number of observations in the reference dataset, and the number of unique values in the column.  \n",
+       "9  AP@K is calculated by summing the precision at each position up to K where the item is relevant and dividing by the total number of relevant items within K. The formula is:\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nwhere *N* is the total number of relevant items in K, and *rel(k)* is 1 if the item at position k is relevant, otherwise 0.  "
+      ]
+     },
+     "execution_count": 130,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_dataset[[\"Query\", \"Baseline_answers\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8530a6c0-6d4d-4c44-be73-f7d3c7d88e50",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 8c94d2a0925c8f1acd3c5f8347e87b9c74255825 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Wed, 9 Oct 2024 16:45:56 +0200
Subject: [PATCH 10/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evidently/dataset_generators/llm/generator.py b/src/evidently/dataset_generators/llm/generator.py
index 401b7857e4..19918b6340 100644
--- a/src/evidently/dataset_generators/llm/generator.py
+++ b/src/evidently/dataset_generators/llm/generator.py
@@ -13,7 +13,7 @@ def generate_dataset_from_docs(file_path: Path, num_questions: 2) -> pd.DataFram
     chunks = FileContextGenerator(path=file_path)
     generator = QuestionPairGenerator(
         chunks=chunks,
-        questions=PromptQuestionGenerator(prompt=SimpleQuestionPrompt()),
+        questions=PromptQuestionGenerator(system_promt=SimpleQuestionPrompt()),
         num_questions=num_questions,
         provider="openai",
         model="gpt-4o-mini",

From e537bede33ed5f55a56acc24790a103864944f28 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Wed, 9 Oct 2024 17:23:08 +0200
Subject: [PATCH 11/63] generate_dataset_from_docs

---
 .../dataset_generators/llm/chunks.py          | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 src/evidently/dataset_generators/llm/chunks.py

diff --git a/src/evidently/dataset_generators/llm/chunks.py b/src/evidently/dataset_generators/llm/chunks.py
new file mode 100644
index 0000000000..81236fd73e
--- /dev/null
+++ b/src/evidently/dataset_generators/llm/chunks.py
@@ -0,0 +1,40 @@
+import abc
+from abc import ABC
+from typing import Iterator
+from typing import List
+
+from llama_index.core.node_parser import SentenceSplitter
+
+from evidently.pydantic_utils import EvidentlyBaseModel
+
+LLMChunk = str
+
+
+class ChunkGenerator(EvidentlyBaseModel, ABC):
+    @abc.abstractmethod
+    def generate_chunks(self) -> Iterator[LLMChunk]:
+        raise NotImplementedError
+
+
+class FileContextGenerator(ChunkGenerator):
+    class Config:
+        type_alias = "asdfasdfasd"
+
+    path: str
+
+    def generate_chunks(self) -> Iterator[LLMChunk]:
+        with open(self.path) as f:
+            text = f.read()
+        splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
+        text_nodes = splitter.split_text(text)
+        yield from text_nodes
+
+
+class SimpleChunkGenerator(ChunkGenerator):
+    class Config:
+        type_alias = "asdfasdasdfafasd"
+
+    chunks: List[LLMChunk]
+
+    def generate_chunks(self) -> Iterator[LLMChunk]:
+        yield from self.chunks

From da7aa097d66c5b7497c9dd6de9d3a142ccfcba61 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Wed, 9 Oct 2024 17:57:57 +0200
Subject: [PATCH 12/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/aaa.py | 39 ++-------------------
 1 file changed, 3 insertions(+), 36 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index 2f8be9e463..a4597b1553 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -1,15 +1,15 @@
 import abc
 from abc import ABC
 from typing import ClassVar
-from typing import Iterator
 from typing import List
 from typing import Tuple
 
 import pandas as pd
-from llama_index.core.node_parser import SentenceSplitter
 
 from evidently.dataset_generators.base import DatasetGeneratorResult
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
+from evidently.dataset_generators.llm.chunks import ChunkGenerator
+from evidently.dataset_generators.llm.chunks import LLMChunk
 from evidently.pydantic_utils import EvidentlyBaseModel
 from evidently.utils.llm import BlockPromptTemplate
 from evidently.utils.llm import LLMMessage
@@ -17,39 +17,6 @@
 from evidently.utils.llm import PromptBlock
 from evidently.utils.llm import PromptTemplate
 
-LLMChunk = str
-
-
-class ChunkGenerator(EvidentlyBaseModel, ABC):
-    @abc.abstractmethod
-    def generate_chunks(self) -> Iterator[LLMChunk]:
-        raise NotImplementedError
-
-
-class FileContextGenerator(ChunkGenerator):
-    class Config:
-        type_alias = "asdfasdfasd"
-
-    path: str
-
-    def generate_chunks(self) -> Iterator[LLMChunk]:
-        with open(self.path) as f:
-            text = f.read()
-        splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
-        text_nodes = splitter.split_text(text)
-        yield from text_nodes
-
-
-class SimpleChunkGenerator(ChunkGenerator):
-    class Config:
-        type_alias = "asdfasdasdfafasd"
-
-    chunks: List[LLMChunk]
-
-    def generate_chunks(self) -> Iterator[LLMChunk]:
-        yield from self.chunks
-
-
 Question = str
 Answer = str
 GeneratedQuestion = Tuple[Question, Answer, LLMChunk]
@@ -63,7 +30,7 @@ def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQu
 
 class SimpleQuestionPrompt(BlockPromptTemplate):
     blocks: ClassVar = [
-        PromptBlock.simple("Please generate a question about this:"),
+        PromptBlock.simple("Please generate a question {} about this:"),
         PromptBlock.input("chunk").anchored(),
         PromptBlock.json_output(question="question text", answer="answer text"),
     ]

From 94452be8a1ac3e1dd640d8fd581036519b67517b Mon Sep 17 00:00:00 2001
From: Emeli Dral <emeli.dral@gmail.com>
Date: Wed, 9 Oct 2024 17:59:48 +0200
Subject: [PATCH 13/63] readable version

---
 .../llm/data_generation_for_RAG.ipynb         | 553 +++++++++++-------
 1 file changed, 328 insertions(+), 225 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
index d9bae54b0b..5f5b572832 100644
--- a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
+++ b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
@@ -255,7 +255,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 98,
+   "execution_count": 6,
    "id": "547c43f3-e58f-450c-b80b-c396eb2655a1",
    "metadata": {},
    "outputs": [],
@@ -281,9 +281,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 14,
    "id": "9b93b470-9d32-4757-9d03-915992e2a7c3",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -319,7 +321,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 15,
    "id": "d547021f-9d4d-42cf-b580-abc6a1008cd1",
    "metadata": {
     "scrolled": true
@@ -352,7 +354,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 16,
    "id": "53f147f1-f8a2-4095-a840-2bacbc0aaf63",
    "metadata": {},
    "outputs": [],
@@ -381,7 +383,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 17,
    "id": "c0c9c8e2-0f2f-4fe0-aeee-68b0bb67cea8",
    "metadata": {},
    "outputs": [
@@ -417,7 +419,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 18,
    "id": "7bf99fde-8aa7-4111-ad7e-eec59bd0c23e",
    "metadata": {},
    "outputs": [
@@ -444,26 +446,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 19,
    "id": "50ec8822-e2dc-4a01-bad3-44f1f123ed5c",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[]\n"
+      "[Collection(id=9d10a2e1-2a39-4ba6-863a-577069d1d2af, name=db_collection)]\n"
      ]
     }
    ],
    "source": [
     "#Just incase we need to delete collection\n",
-    "#list_collections = chroma_client.list_collections()\n",
-    "#print(list_collections)\n",
-    "\n",
-    "chroma_client.delete_collection(collection_name)\n",
     "list_collections = chroma_client.list_collections()\n",
-    "print(list_collections)"
+    "print(list_collections)\n",
+    "\n",
+    "#chroma_client.delete_collection(collection_name)\n",
+    "#list_collections = chroma_client.list_collections()\n",
+    "#print(list_collections)"
    ]
   },
   {
@@ -476,7 +480,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 20,
    "id": "ced14436-c4b3-4b25-8cc7-7cdb112eed66",
    "metadata": {},
    "outputs": [],
@@ -492,41 +496,37 @@
    "id": "04b0cf1e-5356-44e4-855b-5169a21260e2",
    "metadata": {},
    "source": [
-    "### Get a seed query"
+    "### Naive questions generation"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "fe13a2c7-1c76-4d18-8bde-d1821078822f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "seed_query = \"How do I get Evidently data drift report for my data?\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 21,
    "id": "d81f6108-79ef-4148-bc8e-b1050cc1637f",
-   "metadata": {},
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "scrolled": true
+   },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['ods](../customization/options-for-statistical-tests.md).\\n{% endhint %}\\n\\n## Text Data \\n\\n![](../.gitbook/assets/reports/metric_column_drift_text-min.png)\\n\\nText content drift using a **domain classifier**. Evidently trains a binary classification model to discriminate between data from reference and current distributions. \\n\\nThe default for **small data with <= 1000 observations** detects drift if the ROC AUC of the drift detection classifier > possible ROC AUC of the random classifier at a 95th per',\n",
-       " 'score and compares it to the reference or against a defined condition. | **Required**:<br>N/A<br><br> **Optional**:<ul><li>`threshold_probas`(default for classification = None; default for probabilistic classification = 0.5)</li><li>`k`</li></ul> **Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/-20% or better than a dummy model.<br><br>**With reference**: if the F1 is over 20% higher or lower, the test fails.<br><br>**No reference**: if the F1 is lower than the F1 of the d',\n",
-       " '><li> Checks if the text begins with a specified combination. </li><li> Returns True/False for every input.</li></ul> Example use:<br> `BeginsWith(prefix=\"How\")`| **Required:**<br>`prefix`<br><br>**Optional:**<ul><li>`display_name`</li><li>`case_sensitive = True` or `False`</li></ul> |\\n| **EndsWith()** <ul><li> Checks if the text ends with a specified combination. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `EndsWith(suffix=\"Thank you.\")`| **Required:**<br>`suffix`<',\n",
-       " 'ositive Rate)</li><li>TNR (True Negative Rate)</li><li>FPR (False Positive Rate)</li><li>FNR (False Negative Rate)</li><li>ROC AUC Score (for probabilistic classification)</li><li>LogLoss (for probabilistic classification) </li></ul> | **Required:**:<br>n/a<br><br>**Optional:**<ul><li>`probas_threshold` (default for classification = None; default for probabilistic classification = 0.5)</li><li>`k` (default = None)</li></ul> |\\n| **ClassificationClassBalance()** <br><br> Calculates the number of o',\n",
-       " 'columns contain empty or infinite values (+-np.inf), these values will be filtered out when calculating distribution drift in the corresponding column.\\n\\nBy default, drift tests do not react to changes or increases in the number of empty values. Since the high number of nulls can be an important indicator, we recommend grouping the data drift tests (that check for distribution shift) with data integrity tests (that check for a share of nulls). You can choose from several null-related [tests](all-',\n",
-       " '. Prediction and target are required. Input features are optional.\\n\\n**Composition**:\\n* `RegressionQualityMetric()`\\n* `RegressionPredictedVsActualScatter()`\\n* `RegressionPredictedVsActualPlot()`\\n* `RegressionErrorPlot()`\\n* `RegressionAbsPercentageErrorPlot()`\\n* `RegressionErrorDistribution()`\\n* `RegressionErrorNormality()`\\n* `RegressionTopErrorMetric()`\\n* `RegressionErrorBiasTable()` for all or specified `columns`\\n\\n**Optional parameters**:\\n* `columns`\\n\\n</details>\\n\\n<details>\\n  \\n<summary>Classifica',\n",
-       " 'ders items that are present in training. \\n\\nFurther reading: [Castells, P., Vargas, S., & Wang, J. (2011). Novelty and Diversity Metrics for Recommender Systems: Choice, Discovery and Relevance](https://repositorio.uam.es/bitstream/handle/10486/666094/novelty_castells_DDR_2011.pdf)\\n\\n# Serendipity\\n\\n![](../.gitbook/assets/reports/metric_serendipity-min.png)\\n\\n**Evidently Metric**: `SerendipityMetric`\\n\\nRecommendation serendipity: this metric measures how unusual the relevant recommendations are in K,',\n",
-       " 'ems in positions 1, 2, and 10 are relevant, the formula will look as:\\n\\n$$\\nAP@10 = \\\\frac{Precision@1+Precision@2+Precision@10}{3}\\n$$\\n\\n* **Compute Mean Average Precision (MAP) at K**. Average the results across all users (or queries) in the dataset.\\n\\n$$\\n\\\\text{MAP@K} = \\\\frac{1}{U} \\\\sum_{u=1}^{U} \\\\text{AP@K}_u\\n$$\\n\\nWhere *U* is the total number of users or queries in the dataset, and *AP* is the average precision for a given list.\\n\\n**Range**: 0 to 1.\\n\\n**Interpretation**: Higher MAP at K values indica',\n",
-       " 'r><br>**With reference**: the test fails if the number of columns with missing values is higher than in reference.  <br>**No reference**: the test fails if the dataset contains columns with missing values.|\\n| **TestShareOfColumnsWithMissingValues()** | Dataset-level. <br><br> Tests the share of columns that contain missing values in the dataset against the reference or a defined condition.| **Required**:<br> N/A <br><br> **Optional**: <ul><li>`missing_values = [], replace = True/False` (default ',\n",
-       " 'talog.\\n\\n**Range**: 0 to 1, where 0 represents the perfect equality (recommended items are evenly distributed among users), and 1 is complete inequality (the recommendations are concentrated on a single item).\\n\\n**Interpretation**: the lower the value (usually preferable), the more equal the item distribution in recommendations. If the value is high, a few items are frequently recommended to many users while others are ignored.\\n\\nFurther reading: [Abdollahpouri, H., Mansoury, M., Burke, R., Mobashe']"
+       "['ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
+       " 'r><br> | **Required**:<ul><li>`k`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **PopularityBias()** <br><br> Evaluates the popularity bias in recommendations by computing ARP (average recommendation popularity), Gini index, and coverage. <br><br>Requires a training dataset. | **Required**:<ul><li>`K`</li><li>`normalize_arp (default: False)` - whether to normalize ARP calculation by the most popular item in training</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **ItemBiasMetric()** <br><br> Visu',\n",
+       " 'th reference**: the test fails if the TNR is over 20% higher or lower.<br><br>**No reference**: the test fails if the TNR is lower than the TNR of the dummy model. |\\n| **TestFPR()** | Dataset-level. <br><br> Computes the False Positive Rate and compares it to the reference or against a defined condition. | **Required**:<br>N/A<br><br> **Optional:**<ul><li>`probas_threshold`(default for classification = None; default for probabilistic classification = 0.5)</li><li>`k` (default = None)</li></ul>**',\n",
+       " 'tems by a chosen characteristic.\\n\\nThe visualization shows:\\n* The distribution of items in the training set for the defined `column_name` (with duplicates dropped). This represents the item catalog by this dimension. \\n* The distribution of the recommended items for the defined `column_name` in the current and reference (if available) datasets. \\n\\nThis visualization helps see the patterns in the model recommendations. In a simplified example, you might observe that the training data contains 3x com',\n",
+       " '.|\\n| **TestGiniIndex(k=k)** | Dataset-level. <br><br> Computes the Gini Index at the top K recommendations and compares it to the reference or against a defined condition.<br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li></ul> **Optional**:<br>N/A<br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10% from reference.<br><br>**With reference**: if the Gini Index at the top K is over 10% higher or lower, the test fails.<br><br>**No reference**: Test',\n",
+       " ' a defined condition. | **Required**:<br>N/A<br><br> **Optional:**<ul><li>`probas_threshold`(default for classification = None; default for probabilistic classification = 0.5)</li><li>`k` (default = None)</li></ul>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/-20% or better than a dummy model.<br><br>**With reference**: the test fails if the FNR is over 20% higher or lower.<br><br>**No reference**: the test fails if the FNR is higher than the FNR of the dummy model. |\\n|',\n",
+       " ' test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. <br><br> Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:<br>N/A<br><br>**Optional**:<ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul>**Test conditions** <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0.<br><br>**With reference**: the test fails if the share of',\n",
+       " 'rcentage error in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorDistribution()** <br><br> Visualizes the distribution of the model error in a histogram. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorNormality()** <br><br> Visualizes the quantile-quantile plot (Q-Q plot) to estimate value normality. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionTopErrorMetric()** <br><br> Calculates the regression performance me',\n",
+       " \"\\n**Note**: Only a single top relevant item is considered in this metric, disregarding the position and relevance of other items in the list.\\n\\n# Diversity\\n\\n![](../.gitbook/assets/reports/metric_diversity-min.png)\\n\\n**Evidently Metric**: `DiversityMetric`\\n\\n**Recommendation diversity**: this metric measures the average intra-list diversity at K. It reflects the variety of items within the same user's recommendation list, averaged by all users. \\n\\n**Implemented method**:\\n* **Measure the difference bet\",\n",
+       " '*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. <br><br> Tests the number of duplicate rows against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/- 10% or none.<br><br>**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.<br><br>**No reference**: the te']"
       ]
      },
-     "execution_count": 131,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -542,7 +542,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 22,
    "id": "90712120-3ac4-48d0-a749-9f8ef72f4247",
    "metadata": {},
    "outputs": [],
@@ -569,10 +569,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 23,
    "id": "7aa2632f-5bda-4395-81a2-77ccb4dd994b",
-   "metadata": {},
-   "outputs": [],
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    }
+   ],
    "source": [
     "response = client.chat.completions.create(\n",
     "    model=\"gpt-4o\",  # Updated to a valid model\n",
@@ -588,42 +604,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 24,
    "id": "d01f1b79-7781-4e60-b6b0-71a31f860376",
    "metadata": {},
    "outputs": [],
    "source": [
-    "generated_seed_queries = response.choices[0].message.content.strip().split(\",\")"
+    "generated_queries = response.choices[0].message.content.strip().split(\",\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 25,
    "id": "4ba9d23b-b502-4e6e-9535-6c672d6ec309",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['\"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?',\n",
-       " ' What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?',\n",
-       " ' What is the purpose of the HuggingFaceModel function?',\n",
-       " ' How does the HuggingFaceToxicityModel function detect hate speech?',\n",
-       " ' What condition causes the TestNumberOfDuplicatedRows to fail without a reference?',\n",
-       " ' What is measured by the TestShareOfDriftedColumns function?',\n",
-       " ' What are the required and optional parameters for the ScoreDistribution function?',\n",
-       " ' What is the role of the ColumnSummaryMetric in the DataQualityPreset?',\n",
-       " ' How does the drift detection method choose the appropriate test for each column?',\n",
-       " ' How is AP@K calculated in the context of relevant item positions?\"']"
+       "['\"How is the \\'Name\\' of a Metric used in reading tables?',\n",
+       " \" What information does the 'Description' section provide in the context of Metrics?\",\n",
+       " \" What parameters are considered for the 'PopularityBias()' Metric?\",\n",
+       " \" How does the 'TestFPR()' function operate at the dataset level?\",\n",
+       " \" What is the role of the 'TestGiniIndex(k=k)' in evaluating dataset bias?\",\n",
+       " \" What visualization is provided by the 'RegressionErrorDistribution()'?\",\n",
+       " \" How does the 'RegressionErrorNormality()' assess value normality?\",\n",
+       " \" What is the primary focus of the 'DiversityMetric' in recommendation systems?\",\n",
+       " \" How are missing values tested in the 'TestShareOfRowsWithMissingValues()'?\",\n",
+       " ' How does the \\'TestNumberOfDuplicatedRows()\\' function evaluate dataset integrity?\"']"
       ]
      },
-     "execution_count": 118,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "generated_seed_queries"
+    "generated_queries"
    ]
   },
   {
@@ -633,12 +649,93 @@
     "jp-MarkdownHeadingCollapsed": true
    },
    "source": [
-    "### Get alternative questions"
+    "### [PLEASE IGNORE THE WHOLE BLOCK] Get alternative questions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "fe13a2c7-1c76-4d18-8bde-d1821078822f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# seed_query is not used so far\n",
+    "seed_query = \"How do I get Evidently data drift report for my data?\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "0d135ad3-5be4-45de-8039-c556770c32c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#random seed question generation\n",
+    "system_prompt = \"You are an assistant who generates questions based on provided context\"\n",
+    "user_prompt = \"\"\"\n",
+    "Generate a conceptual question that is based on the provided context and can be answered from the information in the provided context.\n",
+    "Here is a context\n",
+    "<context>\n",
+    "    {context}\n",
+    "</context>\n",
+    "\n",
+    "Remain faithful to the underlying context. \n",
+    "Avoid providing any preamble!\n",
+    "Avoid providing any closing statement!\n",
+    "Please return only a question\n",
+    "\"\"\"\n",
+    "\n",
+    "context = \"\\n\\n\".join(random_chuncks)\n",
+    "\n",
+    "formated_user_prompt = user_prompt.format(context=context, N=number_of_questions)\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"gpt-4o\",  # Updated to a valid model\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": system_prompt},\n",
+    "        {\"role\": \"user\", \"content\": formated_user_prompt}\n",
+    "    ],\n",
+    "    max_tokens=400,  # Limits the response length\n",
+    "    temperature=0.7,  # Controls randomness in the output\n",
+    "    n=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "0b751174-f5ab-4b2c-bde1-3e1c9bcc9d45",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_seed = response.choices[0].message.content.strip().split(\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "82c0618f-e790-4565-acad-e62edf4dfba3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['How does the `PopularityBias()` metric evaluate recommendation systems',\n",
+       " ' and what parameters are required to compute this metric?']"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generated_seed"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 29,
    "id": "39df8c68-84cb-43af-aba3-63d1f10537ac",
    "metadata": {},
    "outputs": [],
@@ -648,7 +745,7 @@
     "\n",
     "number_of_reformulations = 5\n",
     "\n",
-    "seed_query = \"How do I get Evidently data drift report for my data?\"\n",
+    "seed_query = generated_seed\n",
     "\n",
     "user_prompt = \"\"\"Write for me {number_of_reformulations} alternative questions quite similar to the question you got.\n",
     "The question: {seed_query}\n",
@@ -658,13 +755,12 @@
     "\"\"\"\n",
     "\n",
     "formated_user_prompt = user_prompt.format(number_of_reformulations=number_of_reformulations, \n",
-    "                                          seed_query = generated_seed_query)\n",
-    "                         #seed_query=seed_query)"
+    "                                          seed_query=seed_query)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 30,
    "id": "983b4545-0511-473e-8797-7fbdf2d5ff54",
    "metadata": {},
    "outputs": [],
@@ -685,7 +781,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 31,
    "id": "9c2fe61b-5470-469a-949c-9e1a65c0f4e0",
    "metadata": {},
    "outputs": [
@@ -694,20 +790,24 @@
      "output_type": "stream",
      "text": [
       "Generated Completion:\n",
-      "What criteria does the `TestShareOfColumnsWithMissingValues()` function use to identify failure without a reference dataset?, How does the absence of a reference affect the `TestShareOfColumnsWithMissingValues()` function's failure detection?, In what way does the `TestShareOfColumnsWithMissingValues()` function assess failure without having a reference?, How is failure determined by the `TestShareOfColumnsWithMissingValues()` function when a reference is not given?, What is the method used by the `TestShareOfColumnsWithMissingValues()` function to evaluate failure without a reference dataset?\n"
+      "['What parameters are needed to calculate the `PopularityBias()` metric in evaluating recommendation systems?', 'In what way does the `PopularityBias()` metric assess recommendation systems, and what are the necessary parameters?', 'Which parameters are essential for the `PopularityBias()` metric, and how does it evaluate recommendation systems?', 'How is the `PopularityBias()` metric used to evaluate recommendation systems, and what parameters does it need?', 'What is the role of the `PopularityBias()` metric in assessing recommendation systems, and which parameters are required for its computation?']\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "['What criteria does the `TestShareOfColumnsWithMissingValues()` function use to identify failure without a reference dataset?',\n",
-       " \" How does the absence of a reference affect the `TestShareOfColumnsWithMissingValues()` function's failure detection?\",\n",
-       " ' In what way does the `TestShareOfColumnsWithMissingValues()` function assess failure without having a reference?',\n",
-       " ' How is failure determined by the `TestShareOfColumnsWithMissingValues()` function when a reference is not given?',\n",
-       " ' What is the method used by the `TestShareOfColumnsWithMissingValues()` function to evaluate failure without a reference dataset?']"
+       "[\"['What parameters are needed to calculate the `PopularityBias()` metric in evaluating recommendation systems?'\",\n",
+       " \" 'In what way does the `PopularityBias()` metric assess recommendation systems\",\n",
+       " \" and what are the necessary parameters?'\",\n",
+       " \" 'Which parameters are essential for the `PopularityBias()` metric\",\n",
+       " \" and how does it evaluate recommendation systems?'\",\n",
+       " \" 'How is the `PopularityBias()` metric used to evaluate recommendation systems\",\n",
+       " \" and what parameters does it need?'\",\n",
+       " \" 'What is the role of the `PopularityBias()` metric in assessing recommendation systems\",\n",
+       " \" and which parameters are required for its computation?']\"]"
       ]
      },
-     "execution_count": 110,
+     "execution_count": 31,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -730,7 +830,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 32,
    "id": "0333932a-8f6e-48e6-9f28-9f5c0406d091",
    "metadata": {},
    "outputs": [],
@@ -763,19 +863,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 33,
    "id": "b827e30d-a7b2-406f-a139-5b7fdd3bab6c",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['how to detect drift in ML embeddings](https://www.evidentlyai.com/blog/embedding-drift-detection).  \\n\\nAdditional links:  \\n\\n* [How to interpret data and prediction drift together? ](https://evidentlyai.com/blog/data-and-prediction-drift)  \\n\\n* [Do I need to monitor data drift if I can measure the ML model quality?](https://evidentlyai.com/blog/ml-monitoring-do-i-need-data-drift)  \\n\\n* [\"My data drifted. What\\'s next?\" How to handle ML model drift in production.](https://evidentlyai.com/blog/ml-monit',\n",
-       " 'arget). </li><li> Returns predicted probability for the “hate” label. </li><li> Scale: 0 to 1. </li></ul> | **Optional**: <ul><li>`toxic_label=\"hate\"` (default)</li><li> `display_name`</li></ul> |\\n\\n# Data Drift\\n\\n**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.\\n\\nTo modify the logic or select a different test,',\n",
-       " 'alculates the number and share of drifted features in the dataset. </li><li>Each feature is tested for drift individually using the default algorithm, unless a custom approach is specified.</li></ul>| **Required:**<br>n/a<br><br>**Optional:**<ul><li>`сolumns` (default=all)</li><li>`drift_share`(default for dataset drift = 0.5)</li> <li>`stattest`</li><li>`cat_stattest`</li><li>`num_stattest`</li><li>`per_column_stattest`</li><li>`stattest_threshold`</li><li>`cat_stattest_threshold`</li><li>`num_']"
+       "[' times item *i* was rated in the training set (popularity of item *i*)\\n\\n**Range**: 0 to infinity \\n\\n**Interpretation**: the higher the value, the more popular on average the recommendations are in top-K.  \\n\\n**Note**: This metric is not normalized and depends on the number of recommendations in the training set.\\n\\nFurther reading: [Abdollahpouri, H., Mansoury, M., Burke, R., Mobasher, B., & Malthouse, E. (2021). User-centered Evaluation of Popularity Bias in Recommender Systems](https://dl.acm.org/',\n",
+       " 'bdollahpouri, H., Mansoury, M., Burke, R., Mobasher, B., & Malthouse, E. (2021). User-centered Evaluation of Popularity Bias in Recommender Systems](https://dl.acm.org/doi/fullHtml/10.1145/3450613.3456821)\\n\\n# Recommendation table\\n\\n![](../.gitbook/assets/reports/metric_recsys_table-min.png)\\n\\n**Evidently Metric**: `RecCasesTable`\\n\\nThis visual Metric shows the list of recommendations for the specified user IDs (`user_ids: List`). If you do not pass the list of IDs, Evidently will choose 5 random on',\n",
+       " 'reports/metric_popularity_bias-min.png)\\n\\n**Evidently Metric**: `PopularityBias`\\n\\nThe recommendation popularity bias is a tendency to favor a few popular items. This metric includes several measurements: ARP, Coverage and Gini index.\\n\\n## 1. Average Recommendation Popularity (ARP)\\n\\nARP reflects the average popularity of the items recommended to the users. \\n\\n**Implementation**.\\n* Compute the item popularity as the number of times each item was seen in training. \\n* Compute the average popularity for',\n",
+       " '---\\ndescription: List of Metrics, Descriptors and Metric Presets available in Evidently.\\n---\\n\\n<details>\\n\\n<summary>How to use this page</summary>\\n\\nThis is a reference page. It shows all the available Metrics, Descriptors and Presets. \\n  \\nYou can use the menu on the right to navigate the sections. We organize the Metrics by logical groups. Note that these groups do **not** match the Presets with a similar name. For example, there are more Data Quality Metrics than included in the `DataQualityPrese',\n",
+       " 'ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
+       " 'unique <= 2): proportion difference test for independent samples based on Z-score.\\n\\nAll tests use a 0.95 confidence level by default.  \\n\\nFor **larger data with \\\\> 1000 observations** in the reference dataset:\\n\\n* For numerical columns (n\\\\_unique \\\\> 5):[Wasserstein Distance](https://en.wikipedia.org/wiki/Wasserstein_metric).\\n* For categorical columns or numerical with n\\\\_unique <= 5):[Jensen--Shannon divergence](https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence).\\n\\nAll metrics use a t']"
       ]
      },
-     "execution_count": 71,
+     "execution_count": 33,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -786,18 +889,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": 36,
    "id": "d549b9ab-1e3a-490e-a57e-669af72dbdb6",
    "metadata": {},
    "outputs": [],
    "source": [
     "#relevant_chunks = [query_collection(query) for query in queries]\n",
-    "relevant_chunks = [query_collection(query) for query in generated_seed_queries]"
+    "relevant_chunks = [query_collection(query) for query in generated_queries]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 37,
    "id": "cfeb5073-f37c-4d24-85a7-bf2043dacb1e",
    "metadata": {
     "collapsed": true,
@@ -810,39 +913,39 @@
     {
      "data": {
       "text/plain": [
-       "[['r><br>**With reference**: the test fails if the number of columns with missing values is higher than in reference.  <br>**No reference**: the test fails if the dataset contains columns with missing values.|\\n| **TestShareOfColumnsWithMissingValues()** | Dataset-level. <br><br> Tests the share of columns that contain missing values in the dataset against the reference or a defined condition.| **Required**:<br> N/A <br><br> **Optional**: <ul><li>`missing_values = [], replace = True/False` (default ',\n",
-       "  '**With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O',\n",
-       "  ' test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. <br><br> Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:<br>N/A<br><br>**Optional**:<ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul>**Test conditions** <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0.<br><br>**With reference**: the test fails if the share of'],\n",
-       " [\"the Test's defaults. You can see them in the tables below. The listed Preset parameters apply to the relevant individual Tests inside the Preset.\\n\\n<details>\\n \\n<summary>NoTargetPerformance Test Preset</summary>\\n\\nPreset name: `NoTargetPerformanceTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift(column_name=prediction)`\\n* `TestColumnShareOfMissingValues()` for `all` or `сolumns` if provided\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* \",\n",
-       "  'lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo',\n",
-       "  \"**: N/A |\\n\\n## Column Values\\n\\n| Test name  | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestColumnValueMin**(column_name='num-column') | Column-level. <br><br> Tests the minimum value of a given numerical column against reference or a defined condition. |  **Required**:<ul><li>`column_name`</li></ul> **Optional:** N/A <br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects not lower.<br><br>**With reference**: the test fails if the minimum \"],\n",
-       " ['tems by a chosen characteristic.\\n\\nThe visualization shows:\\n* The distribution of items in the training set for the defined `column_name` (with duplicates dropped). This represents the item catalog by this dimension. \\n* The distribution of the recommended items for the defined `column_name` in the current and reference (if available) datasets. \\n\\nThis visualization helps see the patterns in the model recommendations. In a simplified example, you might observe that the training data contains 3x com',\n",
+       "[['ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
+       "  'igate the sections. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Test or Test preset.  \\n* **Description**: plain text explanation. For Tests, we specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: available configurations. \\n  * Required parameters are necessary for calculations, e.g. a column name for a column-level test.\\n  * Optional parameters modify how the underlying metric is calculated, e.g. which statistical test or correlation method is use',\n",
+       "  '---\\ndescription: List of Metrics, Descriptors and Metric Presets available in Evidently.\\n---\\n\\n<details>\\n\\n<summary>How to use this page</summary>\\n\\nThis is a reference page. It shows all the available Metrics, Descriptors and Presets. \\n  \\nYou can use the menu on the right to navigate the sections. We organize the Metrics by logical groups. Note that these groups do **not** match the Presets with a similar name. For example, there are more Data Quality Metrics than included in the `DataQualityPrese'],\n",
+       " ['---\\ndescription: List of Metrics, Descriptors and Metric Presets available in Evidently.\\n---\\n\\n<details>\\n\\n<summary>How to use this page</summary>\\n\\nThis is a reference page. It shows all the available Metrics, Descriptors and Presets. \\n  \\nYou can use the menu on the right to navigate the sections. We organize the Metrics by logical groups. Note that these groups do **not** match the Presets with a similar name. For example, there are more Data Quality Metrics than included in the `DataQualityPrese',\n",
        "  'ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
-       "  'igate the sections. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Test or Test preset.  \\n* **Description**: plain text explanation. For Tests, we specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: available configurations. \\n  * Required parameters are necessary for calculations, e.g. a column name for a column-level test.\\n  * Optional parameters modify how the underlying metric is calculated, e.g. which statistical test or correlation method is use'],\n",
-       " ['l>| **Required:**<br>n/a<br><br>**Optional:**<ul><li>`display_name`</li></ul> |\\n| **HuggingFaceModel()** <br><br> Scores the text using the user-selected HuggingFace model.| See [docs](../customization/huggingface_descriptor.md) with some example models (classification by topic, emotion, etc.)|\\n| **HuggingFaceToxicityModel()** <ul><li> Detects hate speech using [HuggingFace Model](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). </li><li> Returns predicted probability fo',\n",
-       "  'xts (containing critical or pessimistic tone). Returns a label (NEGATIVE or POSITIVE) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **BiasLLMEval()** <br><br> Detects biased texts (containing prejudice for or against a person or group). Returns a label (BIAS or OK) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **ToxicityLLMEval()** <br><br> Detects toxic texts (containing harmful, offensive, or derogatory language). Returns a label (T',\n",
-       "  'arget). </li><li> Returns predicted probability for the “hate” label. </li><li> Scale: 0 to 1. </li></ul> | **Optional**: <ul><li>`toxic_label=\"hate\"` (default)</li><li> `display_name`</li></ul> |\\n\\n# Data Drift\\n\\n**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.\\n\\nTo modify the logic or select a different test,'],\n",
+       "  '*Interpretation**: the higher the value, the more varied items are shown to each user (e.g. inside a single recommendation block).\\n\\n**Requirements**: You must pass the `item_features` list to point to numerical columns or embeddings that describe the recommended items. For example, these could be encoded genres that represent each movie. This makes it possible to compare the degree of similarity between different items. \\n\\n**Notes**: \\n* This metric does not consider relevance. A recommender syste'],\n",
+       " ['reports/metric_popularity_bias-min.png)\\n\\n**Evidently Metric**: `PopularityBias`\\n\\nThe recommendation popularity bias is a tendency to favor a few popular items. This metric includes several measurements: ARP, Coverage and Gini index.\\n\\n## 1. Average Recommendation Popularity (ARP)\\n\\nARP reflects the average popularity of the items recommended to the users. \\n\\n**Implementation**.\\n* Compute the item popularity as the number of times each item was seen in training. \\n* Compute the average popularity for',\n",
+       "  ' times item *i* was rated in the training set (popularity of item *i*)\\n\\n**Range**: 0 to infinity \\n\\n**Interpretation**: the higher the value, the more popular on average the recommendations are in top-K.  \\n\\n**Note**: This metric is not normalized and depends on the number of recommendations in the training set.\\n\\nFurther reading: [Abdollahpouri, H., Mansoury, M., Burke, R., Mobasher, B., & Malthouse, E. (2021). User-centered Evaluation of Popularity Bias in Recommender Systems](https://dl.acm.org/',\n",
+       "  'r><br> | **Required**:<ul><li>`k`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **PopularityBias()** <br><br> Evaluates the popularity bias in recommendations by computing ARP (average recommendation popularity), Gini index, and coverage. <br><br>Requires a training dataset. | **Required**:<ul><li>`K`</li><li>`normalize_arp (default: False)` - whether to normalize ARP calculation by the most popular item in training</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **ItemBiasMetric()** <br><br> Visu'],\n",
+       " ['th reference**: the test fails if the TNR is over 20% higher or lower.<br><br>**No reference**: the test fails if the TNR is lower than the TNR of the dummy model. |\\n| **TestFPR()** | Dataset-level. <br><br> Computes the False Positive Rate and compares it to the reference or against a defined condition. | **Required**:<br>N/A<br><br> **Optional:**<ul><li>`probas_threshold`(default for classification = None; default for probabilistic classification = 0.5)</li><li>`k` (default = None)</li></ul>**',\n",
+       "  'In some tests and metrics, Evidently uses the default Data Drift Detection algorithm. It helps detect the distribution drift in the individual features, prediction, or target. This page describes how the **default** algorithm works.\\n\\n# How it works\\n\\nEvidently compares the distributions of the values in a given column (or columns) of the two datasets. You should pass these datasets as **reference** and **current**. Evidently applies several statistical tests and drift detection methods to detect ',\n",
+       "  ' parameters**:\\n* `columns`\\n\\n</details>\\n\\n<details>\\n\\n<summary>Data Drift Preset</summary>\\n\\n`DataDriftPreset` evaluates the data distribution drift in all individual columns, and share of drifting columns in the dataset. Input columns are required. \\n\\n**Composition**:\\n* `DataDriftTable()` for all or specified `columns`\\n* `DatasetDriftMetric()` for all or specified `columns`\\n\\n**Optional parameters**:\\n* `columns`\\n* `stattest`\\n* `cat_stattest`\\n* `num_stattest`\\n* `per_column_stattest`\\n* `text_stattest`\\n'],\n",
+       " ['wer, the test fails.<br><br>**No reference**: Tests if Gini Index < 1. |\\n| **TestCoverage(k=k)** | Dataset-level. <br><br> Computes the Coverage at the top K recommendations and compares it to the reference or against a defined condition. <br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li></ul> **Optional**:<br>N/A<br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10% from reference.<br><br>**With reference**: if the Coverage at the top K is over 1',\n",
+       "  '.|\\n| **TestGiniIndex(k=k)** | Dataset-level. <br><br> Computes the Gini Index at the top K recommendations and compares it to the reference or against a defined condition.<br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li></ul> **Optional**:<br>N/A<br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10% from reference.<br><br>**With reference**: if the Gini Index at the top K is over 10% higher or lower, the test fails.<br><br>**No reference**: Test',\n",
+       "  'eference**: if the Coverage at the top K is over 10% higher or lower, the test fails.<br><br>**No reference**: Tests if Coverage > 0.|\\n\\n'],\n",
+       " ['ter plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionPredictedVsActualPlot()** <br><br> Visualizes predicted vs. actual values in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorPlot()** <br><br> Visualizes the model error (predicted - actual) in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionAbsPercentageErrorPlot()** <br><br> Visualizes the absolute percentage error in a line plot. | **Required:**<br>',\n",
+       "  'rcentage error in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorDistribution()** <br><br> Visualizes the distribution of the model error in a histogram. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorNormality()** <br><br> Visualizes the quantile-quantile plot (Q-Q plot) to estimate value normality. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionTopErrorMetric()** <br><br> Calculates the regression performance me',\n",
+       "  'all scores, if available). \\n\\n# Item Bias \\n\\n![](../.gitbook/assets/reports/metric_itembias_2-min.png)\\n\\n**Evidently Metric**: `ItemBiasMetric`\\n\\nThis visual metric shows the distribution of recommendations by a specified category or numerical value (`column_name`) compared to its distribution in the training set.\\n\\nThis helps compare the model recommendations against what could have been a random classifier that follows the observed distribution of items by a chosen characteristic.\\n\\nThe visualizatio'],\n",
+       " ['rcentage error in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorDistribution()** <br><br> Visualizes the distribution of the model error in a histogram. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorNormality()** <br><br> Visualizes the quantile-quantile plot (Q-Q plot) to estimate value normality. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionTopErrorMetric()** <br><br> Calculates the regression performance me',\n",
+       "  'her than the RMSE of the dummy model that predicts the optimal constant (mean of the target value). |\\n| **TestValueMeanError()**<br>| Dataset-level. <br><br> Computes the Mean Error (ME) and tests if it is near zero or compares it against a defined condition. | **Required**:<br>N/A<br><br> **Optional**:<br>N/A<br><br> **Test conditions** <ul><li>*standard parameters*</li></ul>| Expects the Mean Error to be near zero.<br><br>**With/without reference**: the test fails if the Mean Error is skewed a',\n",
+       "  'n and underestimation).</li></ul> |\\n| **RegressionErrorBiasTable()** <br><br> Plots the relationship between feature values and model quality per group (for top-X% error groups, as above). | **Required:**<br>n/a<br><br>**Optional:**<ul><li>`columns`(default = all categorical and numerical columns)</li><li>`top_error` (default=0.05; the metrics are calculated for top-5% predictions with overestimation and underestimation).</li></ul>|\\n\\n# Ranking and Recommendations \\n\\nAll metrics are dataset-level.'],\n",
+       " [\"\\n**Note**: Only a single top relevant item is considered in this metric, disregarding the position and relevance of other items in the list.\\n\\n# Diversity\\n\\n![](../.gitbook/assets/reports/metric_diversity-min.png)\\n\\n**Evidently Metric**: `DiversityMetric`\\n\\n**Recommendation diversity**: this metric measures the average intra-list diversity at K. It reflects the variety of items within the same user's recommendation list, averaged by all users. \\n\\n**Implemented method**:\\n* **Measure the difference bet\",\n",
+       "  \"\\n* **Intra-list diversity**. Calculate intra-list diversity for each user by averaging the Cosine Distance between each pair of items in the user's top-K list.\\n* **Overall diversity**. Calculate the overall diversity by averaging the intra-list diversity across all users.\\n\\n**Range**: The metric is based on Cosine distance, and can take values from 0 to 2. \\n**0:** identical recommendations in top-K.\\n**2:** very diverse recommendations in top-K.\\n\\n**Interpretation**: the higher the value, the more \",\n",
+       "  'ders items that are present in training. \\n\\nFurther reading: [Castells, P., Vargas, S., & Wang, J. (2011). Novelty and Diversity Metrics for Recommender Systems: Choice, Discovery and Relevance](https://repositorio.uam.es/bitstream/handle/10486/666094/novelty_castells_DDR_2011.pdf)\\n\\n# Serendipity\\n\\n![](../.gitbook/assets/reports/metric_serendipity-min.png)\\n\\n**Evidently Metric**: `SerendipityMetric`\\n\\nRecommendation serendipity: this metric measures how unusual the relevant recommendations are in K,'],\n",
+       " ['r> **Optional**: <ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul> **Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0. <br><br>**With reference**: the test fails if the share of missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains missing values.|\\n| **TestShareOfMissingValues()**| Dataset-level. <br><br> Tests the share of missing values in the dataset against ',\n",
+       "  ' test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. <br><br> Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:<br>N/A<br><br>**Optional**:<ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul>**Test conditions** <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0.<br><br>**With reference**: the test fails if the share of',\n",
+       "  '**With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O'],\n",
        " ['*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. <br><br> Tests the number of duplicate rows against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/- 10% or none.<br><br>**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.<br><br>**No reference**: the te',\n",
        "  '**With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O',\n",
-       "  ' in the reference.<br><br>**No reference**: the test fails if there is at least one duplicate row. |\\n| **TestNumberOfDuplicatedColumns()** | Dataset-level. <br><br> Tests the number of duplicate columns against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects =< or none.<br><br>**With reference**: the test fails if the number of duplicate columns is higher than in the reference.<b'],\n",
-       " ['lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo',\n",
-       "  'sition**: \\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestMostCommonValueShare()` for all or specified `columns`\\n* `TestNumberOfConstantColumns()`\\n* `TestNumberOfDuplicatedColumns()`\\n* `TestNumberOfDuplicatedRows()`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Drift Test Preset</summary>\\n\\nPreset name: `DataDriftTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift()` for all or specified `columns`\\n\\n**Optio',\n",
-       "  \"10%.<br><br>**With reference**: the test fails if the median value is different by more than 10%.<br><br>**No reference**: N/A |\\n| **TestColumnValueStd**(column_name='num-column')<br>| Column-level. <br><br> Tests the standard deviation of a given numerical column against reference or a defined condition. |   **Required**:<ul><li>`column_name`</li></ul> **Optional:**<br> N/A <br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10%.<br><br>**With reference**: the tes\"],\n",
-       " ['in the training dataset.<br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li><li>`column_name`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **ScoreDistribution()** <br><br> Computes the predicted score entropy. Visualizes the distribution of the scores at `k` (and all scores, if available).<br><br>Applies only when the `recommendations_type` is a `score`. | **Required**:<ul><li>`k`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **RecCasesTable()** <br><br> Shows the list of recomm',\n",
-       "  'Evidently Metric**: `ScoreDistribution`\\n\\nThis metric computes the predicted score entropy. It applies only when the `recommendations_type` is a score.\\n\\n**Implementation**:\\n* Apply softmax transformation for top-K scores for all users.\\n* Compute the KL divergence (relative entropy in [scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html)). \\n\\nThe visualization shows the distribution of the predicted scores at K (and all scores, if available). \\n\\n# Item Bias \\n\\n![](../',\n",
-       "  'icationProbDistribution()`- if probabilistic classification\\n* `ClassificationRocCurve()` - if probabilistic classification\\n* `ClassificationPRCurve()` - if probabilistic classification\\n* `ClassificationPRTable()` - if probabilistic classification\\n* `ClassificationQualityByFeatureTable()` for all or specified `columns`</li></ul>\\n\\n**Optional parameters**:\\n* `columns`\\n* `probas_threshold`\\n\\n</details>\\n\\n<details>\\n  \\n<summary>Text Overview Preset</summary>\\n\\n`TextOverviewPreset()` provides a summary fo'],\n",
-       " ['ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
-       "  '%}\\n\\n# Metric Presets\\n\\n**Defaults**: Presets use the default parameters for each Metric. You can see them in the tables below. \\n\\n<details>\\n\\n<summary>Data Quality Preset</summary>\\n\\n`DataQualityPreset` captures column and dataset summaries. Input columns are required. Prediction and target are optional.\\n\\n**Composition**:\\n* `DatasetSummaryMetric()`\\n* `ColumnSummaryMetric()` for `all` or specified `сolumns`\\n* `DatasetMissingValuesMetric()`\\n\\n**Optional parameters**:\\n* `columns`\\n\\n</details>\\n\\n<details>\\n',\n",
-       "  'lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo'],\n",
-       " ['In some tests and metrics, Evidently uses the default Data Drift Detection algorithm. It helps detect the distribution drift in the individual features, prediction, or target. This page describes how the **default** algorithm works.\\n\\n# How it works\\n\\nEvidently compares the distributions of the values in a given column (or columns) of the two datasets. You should pass these datasets as **reference** and **current**. Evidently applies several statistical tests and drift detection methods to detect ',\n",
-       "  'ct a different test, you should set [data drift parameters](../customization/options-for-statistical-tests.md). \\n\\n| Test name | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestNumberOfDriftedColumns()** | Dataset-level. <br><br> Compares the distribution of each column in the current dataset to the reference and tests the number of drifting features against a defined condition.| **Required**:<br>N/A<br><br>**Optional:**<ul><li>`сolumns`</li><li>`stattest`(default=',\n",
-       "  'tical tests and drift detection methods to detect if the distribution has changed significantly. It returns a \"drift detected\" or \"not detected\" result.\\n\\nThere is a default logic to choosing the appropriate drift test for each column. It is based on:\\n\\n* column type: categorical, numerical, text data or embeddings\\n* the number of observations in the reference dataset\\n* the number of unique values in the column (n\\\\_unique)\\n\\n## Tabular Data \\n\\n![](../.gitbook/assets/reports/metric_data_drift_table_2'],\n",
-       " ['at each relevant item position within the top K. To do that, we sum up precision at all values of K when the item is relevant (e.g., Precision @1, Precision@2..), and divide it by the total number of relevant items in K.\\n\\n$$\\n\\\\text{AP@K} = \\\\frac{1}{N} \\\\sum_{k=1}^{K} Precision(k) \\\\times rel(k)\\n$$\\n\\nWhere *N* is the total number of relevant items at K, and *rel(k)* is equal to 1 if the item is relevant, and is 0 otherwise.\\n\\nExample: if K = 10, and items in positions 1, 2, and 10 are relevant, the fo',\n",
-       "  ' 1 if any relevant item is included in K, or 0 otherwise.\\n* **Compute average hit rate**. The average of this metric is calculated across all users or queries.\\n\\n**Range**: 0 to 1, where 1 indicates that each user / query gets at least one relevant recommendation / retrieval.\\n\\n**Interpretation**: A higher Hit Rate indicates that a higher share of users / queries have relevant items in their lists. \\n\\n**Note**: the Hit Rate will typically increase for higher values of K (since there is a higher cha',\n",
-       "  'ems in positions 1, 2, and 10 are relevant, the formula will look as:\\n\\n$$\\nAP@10 = \\\\frac{Precision@1+Precision@2+Precision@10}{3}\\n$$\\n\\n* **Compute Mean Average Precision (MAP) at K**. Average the results across all users (or queries) in the dataset.\\n\\n$$\\n\\\\text{MAP@K} = \\\\frac{1}{U} \\\\sum_{u=1}^{U} \\\\text{AP@K}_u\\n$$\\n\\nWhere *U* is the total number of users or queries in the dataset, and *AP* is the average precision for a given list.\\n\\n**Range**: 0 to 1.\\n\\n**Interpretation**: Higher MAP at K values indica']]"
+       "  ' in the reference.<br><br>**No reference**: the test fails if there is at least one duplicate row. |\\n| **TestNumberOfDuplicatedColumns()** | Dataset-level. <br><br> Tests the number of duplicate columns against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects =< or none.<br><br>**With reference**: the test fails if the number of duplicate columns is higher than in the reference.<b']]"
       ]
      },
-     "execution_count": 120,
+     "execution_count": 37,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -861,7 +964,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": 38,
    "id": "ee992257-020d-461b-9b2f-928b93acb4c4",
    "metadata": {},
    "outputs": [],
@@ -908,23 +1011,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 40,
    "id": "8db672b0-f63a-400b-b00f-22e96d02dbe5",
    "metadata": {},
    "outputs": [],
    "source": [
-    "baseline_answers = [generate_baseline_answer(generated_seed_queries[i], relevant_chunks[i]) for i in range(min(len(generated_seed_queries), len(relevant_chunks)))]"
+    "baseline_answers = [generate_baseline_answer(generated_queries[i], relevant_chunks[i]) for i in range(min(len(generated_queries), len(relevant_chunks)))]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 41,
    "id": "ae366d95-3438-4d6c-8030-8e8c666e0e17",
    "metadata": {},
    "outputs": [],
    "source": [
     "generated_dataset = pd.DataFrame({\n",
-    "    'Query': generated_seed_queries,\n",
+    "    'Query': generated_queries,\n",
     "    'Relevant chunks': relevant_chunks,\n",
     "    'Baseline_answers': baseline_answers\n",
     "})"
@@ -932,7 +1035,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 42,
    "id": "bdfc1029-34bb-4870-bcc2-0c32f56a0bc1",
    "metadata": {},
    "outputs": [
@@ -965,107 +1068,107 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>\"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?</td>\n",
-       "      <td>[r&gt;&lt;br&gt;**With reference**: the test fails if the number of columns with missing values is higher than in reference.  &lt;br&gt;**No reference**: the test fails if the dataset contains columns with missing values.|\\n| **TestShareOfColumnsWithMissingValues()** | Dataset-level. &lt;br&gt;&lt;br&gt; Tests the share of columns that contain missing values in the dataset against the reference or a defined condition.| **Required**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt; **Optional**: &lt;ul&gt;&lt;li&gt;`missing_values = [], replace = True/False` (default , **With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. &lt;br&gt;&lt;br&gt;**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. &lt;br&gt;&lt;br&gt; Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:&lt;br&gt;N/A&lt;br&gt;&lt;br&gt;**O,  test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. &lt;br&gt;&lt;br&gt; Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:&lt;br&gt;N/A&lt;br&gt;&lt;br&gt;**Optional**:&lt;ul&gt;&lt;li&gt;`missing_values = [], replace = True/False` (default = default list)&lt;/li&gt;&lt;/ul&gt;**Test conditions** &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt;| Expects up to +10% or 0.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the share of]</td>\n",
-       "      <td>The TestShareOfColumnsWithMissingValues function determines that a dataset fails the test if the number of columns with missing values is higher than in the reference dataset.</td>\n",
+       "      <td>\"How is the 'Name' of a Metric used in reading...</td>\n",
+       "      <td>[ity Metrics than included in the `DataQuality...</td>\n",
+       "      <td>The 'Name' of a Metric is used to identify the...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?</td>\n",
-       "      <td>[the Test's defaults. You can see them in the tables below. The listed Preset parameters apply to the relevant individual Tests inside the Preset.\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;NoTargetPerformance Test Preset&lt;/summary&gt;\\n\\nPreset name: `NoTargetPerformanceTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift(column_name=prediction)`\\n* `TestColumnShareOfMissingValues()` for `all` or `сolumns` if provided\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* , lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;Data Quality Test Preset&lt;/summary&gt;\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo, **: N/A |\\n\\n## Column Values\\n\\n| Test name  | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestColumnValueMin**(column_name='num-column') | Column-level. &lt;br&gt;&lt;br&gt; Tests the minimum value of a given numerical column against reference or a defined condition. |  **Required**:&lt;ul&gt;&lt;li&gt;`column_name`&lt;/li&gt;&lt;/ul&gt; **Optional:** N/A &lt;br&gt;&lt;br&gt; **Test conditions**: &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt; | Expects not lower.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the minimum ]</td>\n",
-       "      <td>The optional parameters for the `TestShareOfColumnsWithMissingValues` function are `columns`.</td>\n",
+       "      <td>What information does the 'Description' secti...</td>\n",
+       "      <td>[---\\ndescription: List of Metrics, Descriptor...</td>\n",
+       "      <td>The 'Description' section provides a plain tex...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>What is the purpose of the HuggingFaceModel function?</td>\n",
-       "      <td>[tems by a chosen characteristic.\\n\\nThe visualization shows:\\n* The distribution of items in the training set for the defined `column_name` (with duplicates dropped). This represents the item catalog by this dimension. \\n* The distribution of the recommended items for the defined `column_name` in the current and reference (if available) datasets. \\n\\nThis visualization helps see the patterns in the model recommendations. In a simplified example, you might observe that the training data contains 3x com, ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati, igate the sections. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Test or Test preset.  \\n* **Description**: plain text explanation. For Tests, we specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: available configurations. \\n  * Required parameters are necessary for calculations, e.g. a column name for a column-level test.\\n  * Optional parameters modify how the underlying metric is calculated, e.g. which statistical test or correlation method is use]</td>\n",
-       "      <td>The purpose of the HuggingFaceModel function is not specified in the provided documents.</td>\n",
+       "      <td>What parameters are considered for the 'Popul...</td>\n",
+       "      <td>[reports/metric_popularity_bias-min.png)\\n\\n**...</td>\n",
+       "      <td>ARP, Coverage, and Gini index are the paramete...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>How does the HuggingFaceToxicityModel function detect hate speech?</td>\n",
-       "      <td>[l&gt;| **Required:**&lt;br&gt;n/a&lt;br&gt;&lt;br&gt;**Optional:**&lt;ul&gt;&lt;li&gt;`display_name`&lt;/li&gt;&lt;/ul&gt; |\\n| **HuggingFaceModel()** &lt;br&gt;&lt;br&gt; Scores the text using the user-selected HuggingFace model.| See [docs](../customization/huggingface_descriptor.md) with some example models (classification by topic, emotion, etc.)|\\n| **HuggingFaceToxicityModel()** &lt;ul&gt;&lt;li&gt; Detects hate speech using [HuggingFace Model](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). &lt;/li&gt;&lt;li&gt; Returns predicted probability fo, xts (containing critical or pessimistic tone). Returns a label (NEGATIVE or POSITIVE) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **BiasLLMEval()** &lt;br&gt;&lt;br&gt; Detects biased texts (containing prejudice for or against a person or group). Returns a label (BIAS or OK) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **ToxicityLLMEval()** &lt;br&gt;&lt;br&gt; Detects toxic texts (containing harmful, offensive, or derogatory language). Returns a label (T, arget). &lt;/li&gt;&lt;li&gt; Returns predicted probability for the “hate” label. &lt;/li&gt;&lt;li&gt; Scale: 0 to 1. &lt;/li&gt;&lt;/ul&gt; | **Optional**: &lt;ul&gt;&lt;li&gt;`toxic_label=\"hate\"` (default)&lt;/li&gt;&lt;li&gt; `display_name`&lt;/li&gt;&lt;/ul&gt; |\\n\\n# Data Drift\\n\\n**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.\\n\\nTo modify the logic or select a different test,]</td>\n",
-       "      <td>The HuggingFaceToxicityModel function detects hate speech using the HuggingFace model found at https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target. It returns a predicted probability for the \"hate\" label, with a scale from 0 to 1.</td>\n",
+       "      <td>How does the 'TestFPR()' function operate at ...</td>\n",
+       "      <td>[th reference**: the test fails if the TNR is ...</td>\n",
+       "      <td>The `TestFPR()` function operates at the datas...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>What condition causes the TestNumberOfDuplicatedRows to fail without a reference?</td>\n",
-       "      <td>[*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. &lt;br&gt;&lt;br&gt; Tests the number of duplicate rows against reference or a defined condition. |**Required**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt; **Optional**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt;**Test conditions**: &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt;| Expects +/- 10% or none.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.&lt;br&gt;&lt;br&gt;**No reference**: the te, **With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. &lt;br&gt;&lt;br&gt;**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. &lt;br&gt;&lt;br&gt; Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:&lt;br&gt;N/A&lt;br&gt;&lt;br&gt;**O,  in the reference.&lt;br&gt;&lt;br&gt;**No reference**: the test fails if there is at least one duplicate row. |\\n| **TestNumberOfDuplicatedColumns()** | Dataset-level. &lt;br&gt;&lt;br&gt; Tests the number of duplicate columns against reference or a defined condition. |**Required**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt; **Optional**:&lt;br&gt; N/A &lt;br&gt;&lt;br&gt;**Test conditions**: &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt;| Expects =&lt; or none.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the number of duplicate columns is higher than in the reference.&lt;b]</td>\n",
-       "      <td>The test fails if there is at least one duplicate row.</td>\n",
+       "      <td>What is the role of the 'TestGiniIndex(k=k)' ...</td>\n",
+       "      <td>[wer, the test fails.&lt;br&gt;&lt;br&gt;**No reference**:...</td>\n",
+       "      <td>The role of the 'TestGiniIndex(k=k)' in evalua...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>What is measured by the TestShareOfDriftedColumns function?</td>\n",
-       "      <td>[lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;Data Quality Test Preset&lt;/summary&gt;\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo, sition**: \\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestMostCommonValueShare()` for all or specified `columns`\\n* `TestNumberOfConstantColumns()`\\n* `TestNumberOfDuplicatedColumns()`\\n* `TestNumberOfDuplicatedRows()`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;Data Drift Test Preset&lt;/summary&gt;\\n\\nPreset name: `DataDriftTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift()` for all or specified `columns`\\n\\n**Optio, 10%.&lt;br&gt;&lt;br&gt;**With reference**: the test fails if the median value is different by more than 10%.&lt;br&gt;&lt;br&gt;**No reference**: N/A |\\n| **TestColumnValueStd**(column_name='num-column')&lt;br&gt;| Column-level. &lt;br&gt;&lt;br&gt; Tests the standard deviation of a given numerical column against reference or a defined condition. |   **Required**:&lt;ul&gt;&lt;li&gt;`column_name`&lt;/li&gt;&lt;/ul&gt; **Optional:**&lt;br&gt; N/A &lt;br&gt;&lt;br&gt; **Test conditions**: &lt;ul&gt;&lt;li&gt;*standard parameters*&lt;/li&gt;&lt;/ul&gt; | Expects +/-10%.&lt;br&gt;&lt;br&gt;**With reference**: the tes]</td>\n",
-       "      <td>The `TestShareOfDriftedColumns` function measures the proportion of columns that have drifted between datasets.</td>\n",
+       "      <td>What visualization is provided by the 'Regres...</td>\n",
+       "      <td>[ter plot. | **Required:**&lt;br&gt;n/a&lt;br&gt;&lt;br&gt;**Opt...</td>\n",
+       "      <td>Visualizes the distribution of the model error...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
-       "      <td>What are the required and optional parameters for the ScoreDistribution function?</td>\n",
-       "      <td>[in the training dataset.&lt;br&gt;&lt;br&gt;Requires a training dataset. | **Required**:&lt;ul&gt;&lt;li&gt;`k`&lt;/li&gt;&lt;li&gt;`column_name`&lt;/li&gt;&lt;/ul&gt;**Optional**:&lt;ul&gt;&lt;li&gt;-&lt;/li&gt;&lt;/ul&gt; |\\n| **ScoreDistribution()** &lt;br&gt;&lt;br&gt; Computes the predicted score entropy. Visualizes the distribution of the scores at `k` (and all scores, if available).&lt;br&gt;&lt;br&gt;Applies only when the `recommendations_type` is a `score`. | **Required**:&lt;ul&gt;&lt;li&gt;`k`&lt;/li&gt;&lt;/ul&gt;**Optional**:&lt;ul&gt;&lt;li&gt;-&lt;/li&gt;&lt;/ul&gt; |\\n| **RecCasesTable()** &lt;br&gt;&lt;br&gt; Shows the list of recomm, Evidently Metric**: `ScoreDistribution`\\n\\nThis metric computes the predicted score entropy. It applies only when the `recommendations_type` is a score.\\n\\n**Implementation**:\\n* Apply softmax transformation for top-K scores for all users.\\n* Compute the KL divergence (relative entropy in [scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html)). \\n\\nThe visualization shows the distribution of the predicted scores at K (and all scores, if available). \\n\\n# Item Bias \\n\\n![](../, icationProbDistribution()`- if probabilistic classification\\n* `ClassificationRocCurve()` - if probabilistic classification\\n* `ClassificationPRCurve()` - if probabilistic classification\\n* `ClassificationPRTable()` - if probabilistic classification\\n* `ClassificationQualityByFeatureTable()` for all or specified `columns`&lt;/li&gt;&lt;/ul&gt;\\n\\n**Optional parameters**:\\n* `columns`\\n* `probas_threshold`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n  \\n&lt;summary&gt;Text Overview Preset&lt;/summary&gt;\\n\\n`TextOverviewPreset()` provides a summary fo]</td>\n",
-       "      <td>**Required**: `k`  \\n**Optional**: None</td>\n",
+       "      <td>How does the 'RegressionErrorNormality()' ass...</td>\n",
+       "      <td>[rcentage error in a line plot. | **Required:*...</td>\n",
+       "      <td>RegressionErrorNormality() assesses value norm...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>What is the role of the ColumnSummaryMetric in the DataQualityPreset?</td>\n",
-       "      <td>[ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati, %}\\n\\n# Metric Presets\\n\\n**Defaults**: Presets use the default parameters for each Metric. You can see them in the tables below. \\n\\n&lt;details&gt;\\n\\n&lt;summary&gt;Data Quality Preset&lt;/summary&gt;\\n\\n`DataQualityPreset` captures column and dataset summaries. Input columns are required. Prediction and target are optional.\\n\\n**Composition**:\\n* `DatasetSummaryMetric()`\\n* `ColumnSummaryMetric()` for `all` or specified `сolumns`\\n* `DatasetMissingValuesMetric()`\\n\\n**Optional parameters**:\\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n, lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n&lt;/details&gt;\\n\\n&lt;details&gt;\\n \\n&lt;summary&gt;Data Quality Test Preset&lt;/summary&gt;\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo]</td>\n",
-       "      <td>The `ColumnSummaryMetric` in the `DataQualityPreset` is used to capture summaries for each column, either for all columns or specified columns.</td>\n",
+       "      <td>What is the primary focus of the 'DiversityMe...</td>\n",
+       "      <td>[\\n**Note**: Only a single top relevant item i...</td>\n",
+       "      <td>The primary focus of the 'DiversityMetric' in ...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>How does the drift detection method choose the appropriate test for each column?</td>\n",
-       "      <td>[In some tests and metrics, Evidently uses the default Data Drift Detection algorithm. It helps detect the distribution drift in the individual features, prediction, or target. This page describes how the **default** algorithm works.\\n\\n# How it works\\n\\nEvidently compares the distributions of the values in a given column (or columns) of the two datasets. You should pass these datasets as **reference** and **current**. Evidently applies several statistical tests and drift detection methods to detect , ct a different test, you should set [data drift parameters](../customization/options-for-statistical-tests.md). \\n\\n| Test name | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestNumberOfDriftedColumns()** | Dataset-level. &lt;br&gt;&lt;br&gt; Compares the distribution of each column in the current dataset to the reference and tests the number of drifting features against a defined condition.| **Required**:&lt;br&gt;N/A&lt;br&gt;&lt;br&gt;**Optional:**&lt;ul&gt;&lt;li&gt;`сolumns`&lt;/li&gt;&lt;li&gt;`stattest`(default=, tical tests and drift detection methods to detect if the distribution has changed significantly. It returns a \"drift detected\" or \"not detected\" result.\\n\\nThere is a default logic to choosing the appropriate drift test for each column. It is based on:\\n\\n* column type: categorical, numerical, text data or embeddings\\n* the number of observations in the reference dataset\\n* the number of unique values in the column (n\\_unique)\\n\\n## Tabular Data \\n\\n![](../.gitbook/assets/reports/metric_data_drift_table_2]</td>\n",
-       "      <td>The drift detection method chooses the appropriate test for each column based on the column type (categorical, numerical, text data, or embeddings), the number of observations in the reference dataset, and the number of unique values in the column.</td>\n",
+       "      <td>How are missing values tested in the 'TestSha...</td>\n",
+       "      <td>[r&gt; **Optional**: &lt;ul&gt;&lt;li&gt;`missing_values = []...</td>\n",
+       "      <td>The 'TestShareOfRowsWithMissingValues()' tests...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
-       "      <td>How is AP@K calculated in the context of relevant item positions?\"</td>\n",
-       "      <td>[at each relevant item position within the top K. To do that, we sum up precision at all values of K when the item is relevant (e.g., Precision @1, Precision@2..), and divide it by the total number of relevant items in K.\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nWhere *N* is the total number of relevant items at K, and *rel(k)* is equal to 1 if the item is relevant, and is 0 otherwise.\\n\\nExample: if K = 10, and items in positions 1, 2, and 10 are relevant, the fo,  1 if any relevant item is included in K, or 0 otherwise.\\n* **Compute average hit rate**. The average of this metric is calculated across all users or queries.\\n\\n**Range**: 0 to 1, where 1 indicates that each user / query gets at least one relevant recommendation / retrieval.\\n\\n**Interpretation**: A higher Hit Rate indicates that a higher share of users / queries have relevant items in their lists. \\n\\n**Note**: the Hit Rate will typically increase for higher values of K (since there is a higher cha, ems in positions 1, 2, and 10 are relevant, the formula will look as:\\n\\n$$\\nAP@10 = \\frac{Precision@1+Precision@2+Precision@10}{3}\\n$$\\n\\n* **Compute Mean Average Precision (MAP) at K**. Average the results across all users (or queries) in the dataset.\\n\\n$$\\n\\text{MAP@K} = \\frac{1}{U} \\sum_{u=1}^{U} \\text{AP@K}_u\\n$$\\n\\nWhere *U* is the total number of users or queries in the dataset, and *AP* is the average precision for a given list.\\n\\n**Range**: 0 to 1.\\n\\n**Interpretation**: Higher MAP at K values indica]</td>\n",
-       "      <td>AP@K is calculated by summing the precision at each position up to K where the item is relevant and dividing by the total number of relevant items within K. The formula is:\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nwhere *N* is the total number of relevant items in K, and *rel(k)* is 1 if the item at position k is relevant, otherwise 0.</td>\n",
+       "      <td>How does the 'TestNumberOfDuplicatedRows()' f...</td>\n",
+       "      <td>[*: the test fails if there is at least one em...</td>\n",
+       "      <td>The 'TestNumberOfDuplicatedRows()' function ev...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                                                                                              Query  \\\n",
-       "0  \"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?   \n",
-       "1                     What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?   \n",
-       "2                                                             What is the purpose of the HuggingFaceModel function?   \n",
-       "3                                                How does the HuggingFaceToxicityModel function detect hate speech?   \n",
-       "4                                 What condition causes the TestNumberOfDuplicatedRows to fail without a reference?   \n",
-       "5                                                       What is measured by the TestShareOfDriftedColumns function?   \n",
-       "6                                 What are the required and optional parameters for the ScoreDistribution function?   \n",
-       "7                                             What is the role of the ColumnSummaryMetric in the DataQualityPreset?   \n",
-       "8                                  How does the drift detection method choose the appropriate test for each column?   \n",
-       "9                                                How is AP@K calculated in the context of relevant item positions?\"   \n",
+       "                                               Query  \\\n",
+       "0  \"How is the 'Name' of a Metric used in reading...   \n",
+       "1   What information does the 'Description' secti...   \n",
+       "2   What parameters are considered for the 'Popul...   \n",
+       "3   How does the 'TestFPR()' function operate at ...   \n",
+       "4   What is the role of the 'TestGiniIndex(k=k)' ...   \n",
+       "5   What visualization is provided by the 'Regres...   \n",
+       "6   How does the 'RegressionErrorNormality()' ass...   \n",
+       "7   What is the primary focus of the 'DiversityMe...   \n",
+       "8   How are missing values tested in the 'TestSha...   \n",
+       "9   How does the 'TestNumberOfDuplicatedRows()' f...   \n",
        "\n",
-       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Relevant chunks  \\\n",
-       "0                                                [r><br>**With reference**: the test fails if the number of columns with missing values is higher than in reference.  <br>**No reference**: the test fails if the dataset contains columns with missing values.|\\n| **TestShareOfColumnsWithMissingValues()** | Dataset-level. <br><br> Tests the share of columns that contain missing values in the dataset against the reference or a defined condition.| **Required**:<br> N/A <br><br> **Optional**: <ul><li>`missing_values = [], replace = True/False` (default , **With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O,  test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. <br><br> Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:<br>N/A<br><br>**Optional**:<ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul>**Test conditions** <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0.<br><br>**With reference**: the test fails if the share of]   \n",
-       "1              [the Test's defaults. You can see them in the tables below. The listed Preset parameters apply to the relevant individual Tests inside the Preset.\\n\\n<details>\\n \\n<summary>NoTargetPerformance Test Preset</summary>\\n\\nPreset name: `NoTargetPerformanceTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift(column_name=prediction)`\\n* `TestColumnShareOfMissingValues()` for `all` or `сolumns` if provided\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* , lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo, **: N/A |\\n\\n## Column Values\\n\\n| Test name  | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestColumnValueMin**(column_name='num-column') | Column-level. <br><br> Tests the minimum value of a given numerical column against reference or a defined condition. |  **Required**:<ul><li>`column_name`</li></ul> **Optional:** N/A <br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects not lower.<br><br>**With reference**: the test fails if the minimum ]   \n",
-       "2                             [tems by a chosen characteristic.\\n\\nThe visualization shows:\\n* The distribution of items in the training set for the defined `column_name` (with duplicates dropped). This represents the item catalog by this dimension. \\n* The distribution of the recommended items for the defined `column_name` in the current and reference (if available) datasets. \\n\\nThis visualization helps see the patterns in the model recommendations. In a simplified example, you might observe that the training data contains 3x com, ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati, igate the sections. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Test or Test preset.  \\n* **Description**: plain text explanation. For Tests, we specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: available configurations. \\n  * Required parameters are necessary for calculations, e.g. a column name for a column-level test.\\n  * Optional parameters modify how the underlying metric is calculated, e.g. which statistical test or correlation method is use]   \n",
-       "3                                         [l>| **Required:**<br>n/a<br><br>**Optional:**<ul><li>`display_name`</li></ul> |\\n| **HuggingFaceModel()** <br><br> Scores the text using the user-selected HuggingFace model.| See [docs](../customization/huggingface_descriptor.md) with some example models (classification by topic, emotion, etc.)|\\n| **HuggingFaceToxicityModel()** <ul><li> Detects hate speech using [HuggingFace Model](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). </li><li> Returns predicted probability fo, xts (containing critical or pessimistic tone). Returns a label (NEGATIVE or POSITIVE) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **BiasLLMEval()** <br><br> Detects biased texts (containing prejudice for or against a person or group). Returns a label (BIAS or OK) or score.| See [docs](../customization/llm_as_a_judge.md) for parameters.|\\n| **ToxicityLLMEval()** <br><br> Detects toxic texts (containing harmful, offensive, or derogatory language). Returns a label (T, arget). </li><li> Returns predicted probability for the “hate” label. </li><li> Scale: 0 to 1. </li></ul> | **Optional**: <ul><li>`toxic_label=\"hate\"` (default)</li><li> `display_name`</li></ul> |\\n\\n# Data Drift\\n\\n**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.\\n\\nTo modify the logic or select a different test,]   \n",
-       "4                                                [*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. <br><br> Tests the number of duplicate rows against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/- 10% or none.<br><br>**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.<br><br>**No reference**: the te, **With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O,  in the reference.<br><br>**No reference**: the test fails if there is at least one duplicate row. |\\n| **TestNumberOfDuplicatedColumns()** | Dataset-level. <br><br> Tests the number of duplicate columns against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects =< or none.<br><br>**With reference**: the test fails if the number of duplicate columns is higher than in the reference.<b]   \n",
-       "5          [lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo, sition**: \\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestMostCommonValueShare()` for all or specified `columns`\\n* `TestNumberOfConstantColumns()`\\n* `TestNumberOfDuplicatedColumns()`\\n* `TestNumberOfDuplicatedRows()`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Drift Test Preset</summary>\\n\\nPreset name: `DataDriftTestPreset()`\\n\\n**Composition**: \\n* `TestShareOfDriftedColumns()`\\n* `TestColumnDrift()` for all or specified `columns`\\n\\n**Optio, 10%.<br><br>**With reference**: the test fails if the median value is different by more than 10%.<br><br>**No reference**: N/A |\\n| **TestColumnValueStd**(column_name='num-column')<br>| Column-level. <br><br> Tests the standard deviation of a given numerical column against reference or a defined condition. |   **Required**:<ul><li>`column_name`</li></ul> **Optional:**<br> N/A <br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10%.<br><br>**With reference**: the tes]   \n",
-       "6                     [in the training dataset.<br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li><li>`column_name`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **ScoreDistribution()** <br><br> Computes the predicted score entropy. Visualizes the distribution of the scores at `k` (and all scores, if available).<br><br>Applies only when the `recommendations_type` is a `score`. | **Required**:<ul><li>`k`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **RecCasesTable()** <br><br> Shows the list of recomm, Evidently Metric**: `ScoreDistribution`\\n\\nThis metric computes the predicted score entropy. It applies only when the `recommendations_type` is a score.\\n\\n**Implementation**:\\n* Apply softmax transformation for top-K scores for all users.\\n* Compute the KL divergence (relative entropy in [scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html)). \\n\\nThe visualization shows the distribution of the predicted scores at K (and all scores, if available). \\n\\n# Item Bias \\n\\n![](../, icationProbDistribution()`- if probabilistic classification\\n* `ClassificationRocCurve()` - if probabilistic classification\\n* `ClassificationPRCurve()` - if probabilistic classification\\n* `ClassificationPRTable()` - if probabilistic classification\\n* `ClassificationQualityByFeatureTable()` for all or specified `columns`</li></ul>\\n\\n**Optional parameters**:\\n* `columns`\\n* `probas_threshold`\\n\\n</details>\\n\\n<details>\\n  \\n<summary>Text Overview Preset</summary>\\n\\n`TextOverviewPreset()` provides a summary fo]   \n",
-       "7  [ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati, %}\\n\\n# Metric Presets\\n\\n**Defaults**: Presets use the default parameters for each Metric. You can see them in the tables below. \\n\\n<details>\\n\\n<summary>Data Quality Preset</summary>\\n\\n`DataQualityPreset` captures column and dataset summaries. Input columns are required. Prediction and target are optional.\\n\\n**Composition**:\\n* `DatasetSummaryMetric()`\\n* `ColumnSummaryMetric()` for `all` or specified `сolumns`\\n* `DatasetMissingValuesMetric()`\\n\\n**Optional parameters**:\\n* `columns`\\n\\n</details>\\n\\n<details>\\n, lumnsType()`\\n* `TestColumnShareOfMissingValues()` for all or specified `columns`\\n* `TestShareOfOutRangeValues()` for all numerical or specified `columns`\\n* `TestShareOfOutListValues()` for all categorical or specified  `columns`\\n* `TestMeanInNSigmas()` for all numerical or specified `columns`\\n\\n**Optional parameters**: \\n* `columns`\\n\\n</details>\\n\\n<details>\\n \\n<summary>Data Quality Test Preset</summary>\\n\\nPreset name: `DataQualityTestPreset()`\\n\\n**Composition**: \\n* `TestColumnShareOfMissingValues()` fo]   \n",
-       "8                                 [In some tests and metrics, Evidently uses the default Data Drift Detection algorithm. It helps detect the distribution drift in the individual features, prediction, or target. This page describes how the **default** algorithm works.\\n\\n# How it works\\n\\nEvidently compares the distributions of the values in a given column (or columns) of the two datasets. You should pass these datasets as **reference** and **current**. Evidently applies several statistical tests and drift detection methods to detect , ct a different test, you should set [data drift parameters](../customization/options-for-statistical-tests.md). \\n\\n| Test name | Description | Parameters | Default test conditions | \\n|---|---|---|---|\\n| **TestNumberOfDriftedColumns()** | Dataset-level. <br><br> Compares the distribution of each column in the current dataset to the reference and tests the number of drifting features against a defined condition.| **Required**:<br>N/A<br><br>**Optional:**<ul><li>`сolumns`</li><li>`stattest`(default=, tical tests and drift detection methods to detect if the distribution has changed significantly. It returns a \"drift detected\" or \"not detected\" result.\\n\\nThere is a default logic to choosing the appropriate drift test for each column. It is based on:\\n\\n* column type: categorical, numerical, text data or embeddings\\n* the number of observations in the reference dataset\\n* the number of unique values in the column (n\\_unique)\\n\\n## Tabular Data \\n\\n![](../.gitbook/assets/reports/metric_data_drift_table_2]   \n",
-       "9                    [at each relevant item position within the top K. To do that, we sum up precision at all values of K when the item is relevant (e.g., Precision @1, Precision@2..), and divide it by the total number of relevant items in K.\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nWhere *N* is the total number of relevant items at K, and *rel(k)* is equal to 1 if the item is relevant, and is 0 otherwise.\\n\\nExample: if K = 10, and items in positions 1, 2, and 10 are relevant, the fo,  1 if any relevant item is included in K, or 0 otherwise.\\n* **Compute average hit rate**. The average of this metric is calculated across all users or queries.\\n\\n**Range**: 0 to 1, where 1 indicates that each user / query gets at least one relevant recommendation / retrieval.\\n\\n**Interpretation**: A higher Hit Rate indicates that a higher share of users / queries have relevant items in their lists. \\n\\n**Note**: the Hit Rate will typically increase for higher values of K (since there is a higher cha, ems in positions 1, 2, and 10 are relevant, the formula will look as:\\n\\n$$\\nAP@10 = \\frac{Precision@1+Precision@2+Precision@10}{3}\\n$$\\n\\n* **Compute Mean Average Precision (MAP) at K**. Average the results across all users (or queries) in the dataset.\\n\\n$$\\n\\text{MAP@K} = \\frac{1}{U} \\sum_{u=1}^{U} \\text{AP@K}_u\\n$$\\n\\nWhere *U* is the total number of users or queries in the dataset, and *AP* is the average precision for a given list.\\n\\n**Range**: 0 to 1.\\n\\n**Interpretation**: Higher MAP at K values indica]   \n",
+       "                                     Relevant chunks  \\\n",
+       "0  [ity Metrics than included in the `DataQuality...   \n",
+       "1  [---\\ndescription: List of Metrics, Descriptor...   \n",
+       "2  [reports/metric_popularity_bias-min.png)\\n\\n**...   \n",
+       "3  [th reference**: the test fails if the TNR is ...   \n",
+       "4  [wer, the test fails.<br><br>**No reference**:...   \n",
+       "5  [ter plot. | **Required:**<br>n/a<br><br>**Opt...   \n",
+       "6  [rcentage error in a line plot. | **Required:*...   \n",
+       "7  [\\n**Note**: Only a single top relevant item i...   \n",
+       "8  [r> **Optional**: <ul><li>`missing_values = []...   \n",
+       "9  [*: the test fails if there is at least one em...   \n",
        "\n",
-       "                                                                                                                                                                                                                                                                                                                                                                             Baseline_answers  \n",
-       "0                                                                                                                                                                                                             The TestShareOfColumnsWithMissingValues function determines that a dataset fails the test if the number of columns with missing values is higher than in the reference dataset.  \n",
-       "1                                                                                                                                                                                                                                                                                               The optional parameters for the `TestShareOfColumnsWithMissingValues` function are `columns`.  \n",
-       "2                                                                                                                                                                                                                                                                                                    The purpose of the HuggingFaceModel function is not specified in the provided documents.  \n",
-       "3                                                                                                                                  The HuggingFaceToxicityModel function detects hate speech using the HuggingFace model found at https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target. It returns a predicted probability for the \"hate\" label, with a scale from 0 to 1.  \n",
-       "4                                                                                                                                                                                                                                                                                                                                      The test fails if there is at least one duplicate row.  \n",
-       "5                                                                                                                                                                                                                                                                             The `TestShareOfDriftedColumns` function measures the proportion of columns that have drifted between datasets.  \n",
-       "6                                                                                                                                                                                                                                                                                                                                                     **Required**: `k`  \\n**Optional**: None  \n",
-       "7                                                                                                                                                                                                                                             The `ColumnSummaryMetric` in the `DataQualityPreset` is used to capture summaries for each column, either for all columns or specified columns.  \n",
-       "8                                                                                                                                    The drift detection method chooses the appropriate test for each column based on the column type (categorical, numerical, text data, or embeddings), the number of observations in the reference dataset, and the number of unique values in the column.  \n",
-       "9  AP@K is calculated by summing the precision at each position up to K where the item is relevant and dividing by the total number of relevant items within K. The formula is:\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nwhere *N* is the total number of relevant items in K, and *rel(k)* is 1 if the item at position k is relevant, otherwise 0.  "
+       "                                    Baseline_answers  \n",
+       "0  The 'Name' of a Metric is used to identify the...  \n",
+       "1  The 'Description' section provides a plain tex...  \n",
+       "2  ARP, Coverage, and Gini index are the paramete...  \n",
+       "3  The `TestFPR()` function operates at the datas...  \n",
+       "4  The role of the 'TestGiniIndex(k=k)' in evalua...  \n",
+       "5  Visualizes the distribution of the model error...  \n",
+       "6  RegressionErrorNormality() assesses value norm...  \n",
+       "7  The primary focus of the 'DiversityMetric' in ...  \n",
+       "8  The 'TestShareOfRowsWithMissingValues()' tests...  \n",
+       "9  The 'TestNumberOfDuplicatedRows()' function ev...  "
       ]
      },
-     "execution_count": 128,
+     "execution_count": 42,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1076,7 +1179,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": 43,
    "id": "3de32ca3-162f-4ed8-ba88-09a5b9572457",
    "metadata": {},
    "outputs": [],
@@ -1086,7 +1189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 44,
    "id": "db43a50d-4b1a-4b42-a529-67e85bef0f9a",
    "metadata": {},
    "outputs": [
@@ -1118,85 +1221,85 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>\"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?</td>\n",
-       "      <td>The TestShareOfColumnsWithMissingValues function determines that a dataset fails the test if the number of columns with missing values is higher than in the reference dataset.</td>\n",
+       "      <td>\"How is the 'Name' of a Metric used in reading tables?</td>\n",
+       "      <td>The 'Name' of a Metric is used to identify the specific Metric being referenced.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?</td>\n",
-       "      <td>The optional parameters for the `TestShareOfColumnsWithMissingValues` function are `columns`.</td>\n",
+       "      <td>What information does the 'Description' section provide in the context of Metrics?</td>\n",
+       "      <td>The 'Description' section provides a plain text explanation of the Metric, specifying whether it applies to the whole dataset or individual columns.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>What is the purpose of the HuggingFaceModel function?</td>\n",
-       "      <td>The purpose of the HuggingFaceModel function is not specified in the provided documents.</td>\n",
+       "      <td>What parameters are considered for the 'PopularityBias()' Metric?</td>\n",
+       "      <td>ARP, Coverage, and Gini index are the parameters considered for the 'PopularityBias()' Metric.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>How does the HuggingFaceToxicityModel function detect hate speech?</td>\n",
-       "      <td>The HuggingFaceToxicityModel function detects hate speech using the HuggingFace model found at https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target. It returns a predicted probability for the \"hate\" label, with a scale from 0 to 1.</td>\n",
+       "      <td>How does the 'TestFPR()' function operate at the dataset level?</td>\n",
+       "      <td>The `TestFPR()` function operates at the dataset level by computing the False Positive Rate (FPR) and comparing it to a reference or against a defined condition.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>What condition causes the TestNumberOfDuplicatedRows to fail without a reference?</td>\n",
-       "      <td>The test fails if there is at least one duplicate row.</td>\n",
+       "      <td>What is the role of the 'TestGiniIndex(k=k)' in evaluating dataset bias?</td>\n",
+       "      <td>The role of the 'TestGiniIndex(k=k)' in evaluating dataset bias is to compute the Gini Index at the top K recommendations and compare it to a reference or a defined condition. If the Gini Index at the top K is over 10% higher or lower than the reference, the test fails. This helps in assessing the fairness and distribution of recommendations, indicating potential bias if the Gini Index significantly deviates from the reference.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>What is measured by the TestShareOfDriftedColumns function?</td>\n",
-       "      <td>The `TestShareOfDriftedColumns` function measures the proportion of columns that have drifted between datasets.</td>\n",
+       "      <td>What visualization is provided by the 'RegressionErrorDistribution()'?</td>\n",
+       "      <td>Visualizes the distribution of the model error in a histogram.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
-       "      <td>What are the required and optional parameters for the ScoreDistribution function?</td>\n",
-       "      <td>**Required**: `k`  \\n**Optional**: None</td>\n",
+       "      <td>How does the 'RegressionErrorNormality()' assess value normality?</td>\n",
+       "      <td>RegressionErrorNormality() assesses value normality by visualizing the quantile-quantile plot (Q-Q plot).</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>What is the role of the ColumnSummaryMetric in the DataQualityPreset?</td>\n",
-       "      <td>The `ColumnSummaryMetric` in the `DataQualityPreset` is used to capture summaries for each column, either for all columns or specified columns.</td>\n",
+       "      <td>What is the primary focus of the 'DiversityMetric' in recommendation systems?</td>\n",
+       "      <td>The primary focus of the 'DiversityMetric' in recommendation systems is to measure the average intra-list diversity at K, reflecting the variety of items within the same user's recommendation list, averaged by all users.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>How does the drift detection method choose the appropriate test for each column?</td>\n",
-       "      <td>The drift detection method chooses the appropriate test for each column based on the column type (categorical, numerical, text data, or embeddings), the number of observations in the reference dataset, and the number of unique values in the column.</td>\n",
+       "      <td>How are missing values tested in the 'TestShareOfRowsWithMissingValues()'?</td>\n",
+       "      <td>The 'TestShareOfRowsWithMissingValues()' tests the share of rows that contain missing values against a reference or a defined condition. With reference, the test fails if the share of rows with missing values is over 10% higher than in the reference. Without reference, the test fails if the dataset contains any rows with missing values.</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
-       "      <td>How is AP@K calculated in the context of relevant item positions?\"</td>\n",
-       "      <td>AP@K is calculated by summing the precision at each position up to K where the item is relevant and dividing by the total number of relevant items within K. The formula is:\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nwhere *N* is the total number of relevant items in K, and *rel(k)* is 1 if the item at position k is relevant, otherwise 0.</td>\n",
+       "      <td>How does the 'TestNumberOfDuplicatedRows()' function evaluate dataset integrity?\"</td>\n",
+       "      <td>The 'TestNumberOfDuplicatedRows()' function evaluates dataset integrity by testing the number of duplicate rows against a reference or a defined condition. If a reference is provided, the test fails if the share of duplicate rows is over 10% higher or lower than in the reference. If no reference is provided, the test fails if there is at least one duplicate row.</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                                                                                              Query  \\\n",
-       "0  \"How does the TestShareOfColumnsWithMissingValues function determine if a dataset fails the test with reference?   \n",
-       "1                     What optional parameters can be included in the TestShareOfColumnsWithMissingValues function?   \n",
-       "2                                                             What is the purpose of the HuggingFaceModel function?   \n",
-       "3                                                How does the HuggingFaceToxicityModel function detect hate speech?   \n",
-       "4                                 What condition causes the TestNumberOfDuplicatedRows to fail without a reference?   \n",
-       "5                                                       What is measured by the TestShareOfDriftedColumns function?   \n",
-       "6                                 What are the required and optional parameters for the ScoreDistribution function?   \n",
-       "7                                             What is the role of the ColumnSummaryMetric in the DataQualityPreset?   \n",
-       "8                                  How does the drift detection method choose the appropriate test for each column?   \n",
-       "9                                                How is AP@K calculated in the context of relevant item positions?\"   \n",
+       "                                                                                 Query  \\\n",
+       "0                               \"How is the 'Name' of a Metric used in reading tables?   \n",
+       "1   What information does the 'Description' section provide in the context of Metrics?   \n",
+       "2                    What parameters are considered for the 'PopularityBias()' Metric?   \n",
+       "3                      How does the 'TestFPR()' function operate at the dataset level?   \n",
+       "4             What is the role of the 'TestGiniIndex(k=k)' in evaluating dataset bias?   \n",
+       "5               What visualization is provided by the 'RegressionErrorDistribution()'?   \n",
+       "6                    How does the 'RegressionErrorNormality()' assess value normality?   \n",
+       "7        What is the primary focus of the 'DiversityMetric' in recommendation systems?   \n",
+       "8           How are missing values tested in the 'TestShareOfRowsWithMissingValues()'?   \n",
+       "9    How does the 'TestNumberOfDuplicatedRows()' function evaluate dataset integrity?\"   \n",
        "\n",
-       "                                                                                                                                                                                                                                                                                                                                                                             Baseline_answers  \n",
-       "0                                                                                                                                                                                                             The TestShareOfColumnsWithMissingValues function determines that a dataset fails the test if the number of columns with missing values is higher than in the reference dataset.  \n",
-       "1                                                                                                                                                                                                                                                                                               The optional parameters for the `TestShareOfColumnsWithMissingValues` function are `columns`.  \n",
-       "2                                                                                                                                                                                                                                                                                                    The purpose of the HuggingFaceModel function is not specified in the provided documents.  \n",
-       "3                                                                                                                                  The HuggingFaceToxicityModel function detects hate speech using the HuggingFace model found at https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target. It returns a predicted probability for the \"hate\" label, with a scale from 0 to 1.  \n",
-       "4                                                                                                                                                                                                                                                                                                                                      The test fails if there is at least one duplicate row.  \n",
-       "5                                                                                                                                                                                                                                                                             The `TestShareOfDriftedColumns` function measures the proportion of columns that have drifted between datasets.  \n",
-       "6                                                                                                                                                                                                                                                                                                                                                     **Required**: `k`  \\n**Optional**: None  \n",
-       "7                                                                                                                                                                                                                                             The `ColumnSummaryMetric` in the `DataQualityPreset` is used to capture summaries for each column, either for all columns or specified columns.  \n",
-       "8                                                                                                                                    The drift detection method chooses the appropriate test for each column based on the column type (categorical, numerical, text data, or embeddings), the number of observations in the reference dataset, and the number of unique values in the column.  \n",
-       "9  AP@K is calculated by summing the precision at each position up to K where the item is relevant and dividing by the total number of relevant items within K. The formula is:\\n\\n$$\\n\\text{AP@K} = \\frac{1}{N} \\sum_{k=1}^{K} Precision(k) \\times rel(k)\\n$$\\n\\nwhere *N* is the total number of relevant items in K, and *rel(k)* is 1 if the item at position k is relevant, otherwise 0.  "
+       "                                                                                                                                                                                                                                                                                                                                                                                                                                  Baseline_answers  \n",
+       "0                                                                                                                                                                                                                                                                                                                                                                 The 'Name' of a Metric is used to identify the specific Metric being referenced.  \n",
+       "1                                                                                                                                                                                                                                                                                             The 'Description' section provides a plain text explanation of the Metric, specifying whether it applies to the whole dataset or individual columns.  \n",
+       "2                                                                                                                                                                                                                                                                                                                                                   ARP, Coverage, and Gini index are the parameters considered for the 'PopularityBias()' Metric.  \n",
+       "3                                                                                                                                                                                                                                                                                The `TestFPR()` function operates at the dataset level by computing the False Positive Rate (FPR) and comparing it to a reference or against a defined condition.  \n",
+       "4  The role of the 'TestGiniIndex(k=k)' in evaluating dataset bias is to compute the Gini Index at the top K recommendations and compare it to a reference or a defined condition. If the Gini Index at the top K is over 10% higher or lower than the reference, the test fails. This helps in assessing the fairness and distribution of recommendations, indicating potential bias if the Gini Index significantly deviates from the reference.  \n",
+       "5                                                                                                                                                                                                                                                                                                                                                                                   Visualizes the distribution of the model error in a histogram.  \n",
+       "6                                                                                                                                                                                                                                                                                                                                        RegressionErrorNormality() assesses value normality by visualizing the quantile-quantile plot (Q-Q plot).  \n",
+       "7                                                                                                                                                                                                                     The primary focus of the 'DiversityMetric' in recommendation systems is to measure the average intra-list diversity at K, reflecting the variety of items within the same user's recommendation list, averaged by all users.  \n",
+       "8                                                                                               The 'TestShareOfRowsWithMissingValues()' tests the share of rows that contain missing values against a reference or a defined condition. With reference, the test fails if the share of rows with missing values is over 10% higher than in the reference. Without reference, the test fails if the dataset contains any rows with missing values.  \n",
+       "9                                                                     The 'TestNumberOfDuplicatedRows()' function evaluates dataset integrity by testing the number of duplicate rows against a reference or a defined condition. If a reference is provided, the test fails if the share of duplicate rows is over 10% higher or lower than in the reference. If no reference is provided, the test fails if there is at least one duplicate row.  "
       ]
      },
-     "execution_count": 130,
+     "execution_count": 44,
      "metadata": {},
      "output_type": "execute_result"
     }

From 24e1f8d51f02b25a4501323905a5d8b7269da097 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 18:44:52 +0200
Subject: [PATCH 14/63] wip

---
 examples/synth_data.py                        |  5 ++-
 src/evidently/dataset_generators/llm/aaa.py   | 33 ++++++++--------
 .../dataset_generators/llm/chunks.py          | 39 +++++++++++++------
 .../dataset_generators/llm/generator.py       |  2 +-
 4 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index b53d42dbca..4b678629e5 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,13 +1,14 @@
 import os
 
-from evidently.dataset_generators.llm.aaa import PromptQuestionGenerator, QuestionPairGenerator, SimpleChunkGenerator, SimpleQuestionPrompt
+from evidently.dataset_generators.llm.aaa import PromptQuestionGenerator, QuestionPairGenerator, SimpleQuestionPrompt
+from evidently.dataset_generators.llm.chunks import SimpleIndexExtractor
 from evidently.options.base import Options
 from evidently.ui.workspace import CloudWorkspace
 
 
 def main():
     generator = QuestionPairGenerator(
-        chunks=SimpleChunkGenerator(chunks=["I am a banana"]),
+        index=SimpleIndexExtractor(chunks=["I am a banana"]),
         questions=PromptQuestionGenerator(prompt=SimpleQuestionPrompt()),
         num_questions=2,
         provider="openai",
diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index a4597b1553..61a55c07a4 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -2,14 +2,15 @@
 from abc import ABC
 from typing import ClassVar
 from typing import List
+from typing import Sequence
 from typing import Tuple
 
 import pandas as pd
 
 from evidently.dataset_generators.base import DatasetGeneratorResult
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
-from evidently.dataset_generators.llm.chunks import ChunkGenerator
-from evidently.dataset_generators.llm.chunks import LLMChunk
+from evidently.dataset_generators.llm.chunks import Chunk
+from evidently.dataset_generators.llm.chunks import IndexExtractor
 from evidently.pydantic_utils import EvidentlyBaseModel
 from evidently.utils.llm import BlockPromptTemplate
 from evidently.utils.llm import LLMMessage
@@ -19,19 +20,19 @@
 
 Question = str
 Answer = str
-GeneratedQuestion = Tuple[Question, Answer, LLMChunk]
+GeneratedQuestion = Tuple[Question, Answer, Chunk]
 
 
 class QuestionGenerator(EvidentlyBaseModel, ABC):
     @abc.abstractmethod
-    def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQuestion:
+    def generate_questions(self, wrapper: LLMWrapper, chunks: Sequence[Chunk]) -> List[GeneratedQuestion]:
         raise NotImplementedError
 
 
 class SimpleQuestionPrompt(BlockPromptTemplate):
     blocks: ClassVar = [
-        PromptBlock.simple("Please generate a question {} about this:"),
-        PromptBlock.input("chunk").anchored(),
+        PromptBlock.simple("Please generate a {question_type} question about this:"),
+        PromptBlock.input("context").anchored(),
         PromptBlock.json_output(question="question text", answer="answer text"),
     ]
 
@@ -41,26 +42,26 @@ class Config:
         type_alias = "asdfasdasdfaaasdfdsfasfasd"
 
     prompt: PromptTemplate
+    question_type: str = "simple"
 
-    def generate_question(self, wrapper: LLMWrapper, chunk: LLMChunk) -> GeneratedQuestion:
-        rendered = self.prompt.render(chunk=chunk)
+    def generate_questions(self, wrapper: LLMWrapper, chunks: Sequence[Chunk]) -> GeneratedQuestion:
+        context = "\n\n".join(chunks)
+        rendered = self.prompt.render(context=context, question_type=self.question_type)
         result = wrapper.complete([LLMMessage.user(rendered)])
-        data = self.prompt.parse(result)
-        return data["question"], data["answer"], chunk
+        data = self.prompt.parse(result, keys=["question", "answer"])
+        return data["question"], data["answer"], context
 
 
 class QuestionPairGenerator(BaseLLMDatasetGenerator):
     class Config:
         type_alias = "asdfasdasdfaaasdfdsfasfasd"
 
-    chunks: ChunkGenerator
+    index: IndexExtractor
     questions: QuestionGenerator
     num_questions: int
 
     def generate(self) -> DatasetGeneratorResult:
-        qs: List[GeneratedQuestion] = []
-        for chunk in self.chunks.generate_chunks():
-            for i in range(self.num_questions):
-                qs.append(self.questions.generate_question(self.wrapper, chunk))
+        documents = self.index.extract_index()
+        qs = self.questions.generate_questions(self.wrapper, [chunk for chunk in documents.chunks])
 
-        return pd.DataFrame(qs, columns=["question", "answer", "context"])
+        return pd.DataFrame([qs], columns=["question", "answer", "context"])
diff --git a/src/evidently/dataset_generators/llm/chunks.py b/src/evidently/dataset_generators/llm/chunks.py
index 81236fd73e..0ccb60ac35 100644
--- a/src/evidently/dataset_generators/llm/chunks.py
+++ b/src/evidently/dataset_generators/llm/chunks.py
@@ -1,40 +1,55 @@
 import abc
+import dataclasses
 from abc import ABC
-from typing import Iterator
+from typing import Any
 from typing import List
+from typing import Optional
 
 from llama_index.core.node_parser import SentenceSplitter
 
 from evidently.pydantic_utils import EvidentlyBaseModel
 
-LLMChunk = str
+Chunk = str
 
 
-class ChunkGenerator(EvidentlyBaseModel, ABC):
+@dataclasses.dataclass
+class DocumentIndex:
+    chunks: List[Chunk]
+    embeddings: Optional[Any] = None
+
+    def get_embeddings(self):
+        if self.embeddings is not None:
+            self.embeddings = ...
+        return self.embeddings
+
+
+class IndexExtractor(EvidentlyBaseModel, ABC):
     @abc.abstractmethod
-    def generate_chunks(self) -> Iterator[LLMChunk]:
+    def extract_index(self) -> DocumentIndex:
         raise NotImplementedError
 
 
-class FileContextGenerator(ChunkGenerator):
+class IndexExtractorFromFile(IndexExtractor):
     class Config:
         type_alias = "asdfasdfasd"
 
     path: str
+    chunk_size: int = 512
+    chunk_overlap: int = 20
 
-    def generate_chunks(self) -> Iterator[LLMChunk]:
+    def extract_index(self) -> DocumentIndex:
         with open(self.path) as f:
             text = f.read()
-        splitter = SentenceSplitter(chunk_size=512, chunk_overlap=20)
+        splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
         text_nodes = splitter.split_text(text)
-        yield from text_nodes
+        return DocumentIndex(text_nodes)
 
 
-class SimpleChunkGenerator(ChunkGenerator):
+class SimpleIndexExtractor(IndexExtractor):
     class Config:
         type_alias = "asdfasdasdfafasd"
 
-    chunks: List[LLMChunk]
+    chunks: List[Chunk]
 
-    def generate_chunks(self) -> Iterator[LLMChunk]:
-        yield from self.chunks
+    def extract_index(self) -> DocumentIndex:
+        return DocumentIndex(self.chunks)
diff --git a/src/evidently/dataset_generators/llm/generator.py b/src/evidently/dataset_generators/llm/generator.py
index 19918b6340..ea511d24ea 100644
--- a/src/evidently/dataset_generators/llm/generator.py
+++ b/src/evidently/dataset_generators/llm/generator.py
@@ -12,7 +12,7 @@
 def generate_dataset_from_docs(file_path: Path, num_questions: 2) -> pd.DataFrame:
     chunks = FileContextGenerator(path=file_path)
     generator = QuestionPairGenerator(
-        chunks=chunks,
+        index=chunks,
         questions=PromptQuestionGenerator(system_promt=SimpleQuestionPrompt()),
         num_questions=num_questions,
         provider="openai",

From 7c4f536aefc1506e356717f08aaf183e8b000589 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 18:45:50 +0200
Subject: [PATCH 15/63] wip

---
 src/evidently/dataset_generators/llm/{chunks.py => index.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/evidently/dataset_generators/llm/{chunks.py => index.py} (100%)

diff --git a/src/evidently/dataset_generators/llm/chunks.py b/src/evidently/dataset_generators/llm/index.py
similarity index 100%
rename from src/evidently/dataset_generators/llm/chunks.py
rename to src/evidently/dataset_generators/llm/index.py

From c95b6acc89536accde500dd49febd25db2fac210 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 18:46:27 +0200
Subject: [PATCH 16/63] wip

---
 examples/synth_data.py                      | 2 +-
 src/evidently/dataset_generators/llm/aaa.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 4b678629e5..852cbeb215 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,7 +1,7 @@
 import os
 
 from evidently.dataset_generators.llm.aaa import PromptQuestionGenerator, QuestionPairGenerator, SimpleQuestionPrompt
-from evidently.dataset_generators.llm.chunks import SimpleIndexExtractor
+from evidently.dataset_generators.llm.index import SimpleIndexExtractor
 from evidently.options.base import Options
 from evidently.ui.workspace import CloudWorkspace
 
diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index 61a55c07a4..fa1803b0e0 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -9,8 +9,8 @@
 
 from evidently.dataset_generators.base import DatasetGeneratorResult
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
-from evidently.dataset_generators.llm.chunks import Chunk
-from evidently.dataset_generators.llm.chunks import IndexExtractor
+from evidently.dataset_generators.llm.index import Chunk
+from evidently.dataset_generators.llm.index import IndexExtractor
 from evidently.pydantic_utils import EvidentlyBaseModel
 from evidently.utils.llm import BlockPromptTemplate
 from evidently.utils.llm import LLMMessage

From 180b82254d7a93eb166faad1f0d9b128d2fc9d77 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Wed, 9 Oct 2024 19:40:47 +0200
Subject: [PATCH 17/63] generate_dataset_from_docs

---
 docs/book/input-data/column-mapping.md        |   2 +-
 requirements.min.txt                          |   1 +
 setup.py                                      |   1 +
 .../llm/data_generation_for_RAG.ipynb         | 390 +++++++-----------
 .../dataset_generators/llm/generator.py       |   6 +-
 src/evidently/dataset_generators/llm/index.py |  40 +-
 6 files changed, 189 insertions(+), 251 deletions(-)

diff --git a/docs/book/input-data/column-mapping.md b/docs/book/input-data/column-mapping.md
index 3d0ed84592..723eecce16 100644
--- a/docs/book/input-data/column-mapping.md
+++ b/docs/book/input-data/column-mapping.md
@@ -138,7 +138,7 @@ Here is an example of how you point to the defined list of columns that contain
 
 ```python
 column_mapping = ColumnMapping()
-column_mapping.embeddings = {'small_subset': embeddings_data.columns[:10]}
+column_mapping.collection = {'small_subset': embeddings_data.columns[:10]}
 ```
 
 {% hint style="info" %} 
diff --git a/requirements.min.txt b/requirements.min.txt
index f858d8e507..2a0bff5f00 100644
--- a/requirements.min.txt
+++ b/requirements.min.txt
@@ -31,3 +31,4 @@ openai==1.16.2
 evaluate==0.4.1
 transformers[torch]==4.39.3
 sentence-transformers==2.7.0
+chromadb==0.5.12
diff --git a/setup.py b/setup.py
index 46e9cd43aa..c22676a19e 100644
--- a/setup.py
+++ b/setup.py
@@ -76,6 +76,7 @@
         "deprecation>=2.1.0",
         "uuid6>=2024.7.10",
         "cryptography>=43.0.1",
+        "chromadb>=0.5.12",
     ],
     extras_require={
         "dev": [
diff --git a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
index 5f5b572832..202c6ca472 100644
--- a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
+++ b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
@@ -10,7 +10,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "id": "bf4855a8-0d91-4d88-8fa2-05d2eb2ddbad",
    "metadata": {
     "scrolled": true
@@ -20,229 +20,101 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Collecting chromadb\n",
-      "  Obtaining dependency information for chromadb from https://files.pythonhosted.org/packages/43/cd/a875ed1f61365c9fdb46ee2de0cbea1735a9575ff718886f7eb218d4ef45/chromadb-0.5.12-py3-none-any.whl.metadata\n",
-      "  Downloading chromadb-0.5.12-py3-none-any.whl.metadata (6.8 kB)\n",
-      "Collecting build>=1.0.3 (from chromadb)\n",
-      "  Obtaining dependency information for build>=1.0.3 from https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl.metadata\n",
-      "  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)\n",
-      "Requirement already satisfied: pydantic>=1.9 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.10.14)\n",
-      "Collecting chroma-hnswlib==0.7.6 (from chromadb)\n",
-      "  Obtaining dependency information for chroma-hnswlib==0.7.6 from https://files.pythonhosted.org/packages/0d/19/aa6f2139f1ff7ad23a690ebf2a511b2594ab359915d7979f76f3213e46c4/chroma_hnswlib-0.7.6-cp311-cp311-macosx_11_0_arm64.whl.metadata\n",
-      "  Downloading chroma_hnswlib-0.7.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (252 bytes)\n",
-      "Requirement already satisfied: fastapi>=0.95.2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (0.104.1)\n",
-      "Requirement already satisfied: uvicorn[standard]>=0.18.3 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (0.23.2)\n",
-      "Requirement already satisfied: numpy>=1.22.5 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.26.2)\n",
-      "Collecting posthog>=2.4.0 (from chromadb)\n",
-      "  Obtaining dependency information for posthog>=2.4.0 from https://files.pythonhosted.org/packages/c2/11/a8d4283b324cda992fbb72611c46c5c68f87902a10383dba1bde91660cc6/posthog-3.7.0-py2.py3-none-any.whl.metadata\n",
-      "  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)\n",
-      "Requirement already satisfied: typing-extensions>=4.5.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (4.8.0)\n",
-      "Collecting onnxruntime>=1.14.1 (from chromadb)\n",
-      "  Obtaining dependency information for onnxruntime>=1.14.1 from https://files.pythonhosted.org/packages/f0/ff/77bee5df55f034ee81d2e1bc58b2b8511b9c54f06ce6566cb562c5d95aa5/onnxruntime-1.19.2-cp311-cp311-macosx_11_0_universal2.whl.metadata\n",
-      "  Downloading onnxruntime-1.19.2-cp311-cp311-macosx_11_0_universal2.whl.metadata (4.5 kB)\n",
-      "Requirement already satisfied: opentelemetry-api>=1.2.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.25.0)\n",
-      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.25.0)\n",
-      "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation-fastapi>=0.41b0 from https://files.pythonhosted.org/packages/ee/50/745ab075a3041b7a5f29a579d2c28eaad54f64b4589d8f9fd364c62cf0f3/opentelemetry_instrumentation_fastapi-0.48b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation_fastapi-0.48b0-py3-none-any.whl.metadata (2.1 kB)\n",
-      "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.25.0)\n",
-      "Requirement already satisfied: tokenizers>=0.13.2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (0.15.2)\n",
-      "Collecting pypika>=0.48.9 (from chromadb)\n",
-      "  Downloading PyPika-0.48.9.tar.gz (67 kB)\n",
-      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[?25h  Installing build dependencies ... \u001b[?25ldone\n",
-      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
-      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
-      "\u001b[?25hRequirement already satisfied: tqdm>=4.65.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (4.66.1)\n",
-      "Requirement already satisfied: overrides>=7.3.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (7.4.0)\n",
-      "Requirement already satisfied: importlib-resources in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (6.1.3)\n",
-      "Requirement already satisfied: grpcio>=1.58.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (1.65.0)\n",
-      "Collecting bcrypt>=4.0.1 (from chromadb)\n",
-      "  Obtaining dependency information for bcrypt>=4.0.1 from https://files.pythonhosted.org/packages/96/86/8c6a84daed4dd878fbab094400c9174c43d9b838ace077a2f8ee8bc3ae12/bcrypt-4.2.0-cp39-abi3-macosx_10_12_universal2.whl.metadata\n",
-      "  Downloading bcrypt-4.2.0-cp39-abi3-macosx_10_12_universal2.whl.metadata (9.6 kB)\n",
-      "Requirement already satisfied: typer>=0.9.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (0.12.3)\n",
-      "Requirement already satisfied: kubernetes>=28.1.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (29.0.0)\n",
-      "Collecting tenacity>=8.2.3 (from chromadb)\n",
-      "  Obtaining dependency information for tenacity>=8.2.3 from https://files.pythonhosted.org/packages/b6/cb/b86984bed139586d01532a587464b5805f12e397594f19f931c4c2fbfa61/tenacity-9.0.0-py3-none-any.whl.metadata\n",
-      "  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)\n",
-      "Requirement already satisfied: PyYAML>=6.0.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (6.0.1)\n",
-      "Collecting mmh3>=4.0.1 (from chromadb)\n",
-      "  Obtaining dependency information for mmh3>=4.0.1 from https://files.pythonhosted.org/packages/13/f0/2d3daca276a4673f82af859e4b0b18befd4e6e54f1017ba48ea9735b2f1b/mmh3-5.0.1-cp311-cp311-macosx_11_0_arm64.whl.metadata\n",
-      "  Downloading mmh3-5.0.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (14 kB)\n",
-      "Requirement already satisfied: orjson>=3.9.12 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (3.10.1)\n",
-      "Collecting httpx>=0.27.0 (from chromadb)\n",
-      "  Obtaining dependency information for httpx>=0.27.0 from https://files.pythonhosted.org/packages/56/95/9377bcb415797e44274b51d46e3249eba641711cf3348050f76ee7b15ffc/httpx-0.27.2-py3-none-any.whl.metadata\n",
-      "  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)\n",
-      "Requirement already satisfied: rich>=10.11.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from chromadb) (13.5.2)\n",
-      "Requirement already satisfied: packaging>=19.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from build>=1.0.3->chromadb) (23.1)\n",
-      "Collecting pyproject_hooks (from build>=1.0.3->chromadb)\n",
-      "  Obtaining dependency information for pyproject_hooks from https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl.metadata\n",
-      "  Downloading pyproject_hooks-1.2.0-py3-none-any.whl.metadata (1.3 kB)\n",
-      "Requirement already satisfied: anyio<4.0.0,>=3.7.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from fastapi>=0.95.2->chromadb) (3.7.1)\n",
-      "Requirement already satisfied: starlette<0.28.0,>=0.27.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from fastapi>=0.95.2->chromadb) (0.27.0)\n",
-      "Requirement already satisfied: certifi in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (2024.7.4)\n",
-      "Collecting httpcore==1.* (from httpx>=0.27.0->chromadb)\n",
-      "  Obtaining dependency information for httpcore==1.* from https://files.pythonhosted.org/packages/06/89/b161908e2f51be56568184aeb4a880fd287178d176fd1c860d2217f41106/httpcore-1.0.6-py3-none-any.whl.metadata\n",
-      "  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)\n",
-      "Requirement already satisfied: idna in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (3.4)\n",
-      "Requirement already satisfied: sniffio in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (1.3.0)\n",
-      "Requirement already satisfied: h11<0.15,>=0.13 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.27.0->chromadb) (0.14.0)\n",
-      "Requirement already satisfied: six>=1.9.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\n",
-      "Requirement already satisfied: python-dateutil>=2.5.3 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.8.2)\n",
-      "Requirement already satisfied: google-auth>=1.0.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.25.1)\n",
-      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.6.1)\n",
-      "Requirement already satisfied: requests in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.32.1)\n",
-      "Requirement already satisfied: requests-oauthlib in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.3.1)\n",
-      "Requirement already satisfied: oauthlib>=3.2.2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\n",
-      "Requirement already satisfied: urllib3>=1.24.2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.0.4)\n",
-      "Collecting coloredlogs (from onnxruntime>=1.14.1->chromadb)\n",
-      "  Obtaining dependency information for coloredlogs from https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl.metadata\n",
-      "  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)\n",
-      "Collecting flatbuffers (from onnxruntime>=1.14.1->chromadb)\n",
-      "  Obtaining dependency information for flatbuffers from https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl.metadata\n",
-      "  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)\n",
-      "Requirement already satisfied: protobuf in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (4.25.1)\n",
-      "Requirement already satisfied: sympy in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (1.12)\n",
-      "Requirement already satisfied: deprecated>=1.2.6 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-api>=1.2.0->chromadb) (1.2.14)\n",
-      "Requirement already satisfied: importlib-metadata<=7.1,>=6.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-api>=1.2.0->chromadb) (6.8.0)\n",
-      "Requirement already satisfied: googleapis-common-protos~=1.52 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.61.0)\n",
-      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.25.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.25.0)\n",
-      "Requirement already satisfied: opentelemetry-proto==1.25.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.25.0)\n",
-      "Collecting opentelemetry-instrumentation-asgi==0.48b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation-asgi==0.48b0 from https://files.pythonhosted.org/packages/db/74/a0e0d38622856597dd8e630f2bd793760485eb165708e11b8be1696bbb5a/opentelemetry_instrumentation_asgi-0.48b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation_asgi-0.48b0-py3-none-any.whl.metadata (2.0 kB)\n",
-      "Collecting opentelemetry-instrumentation==0.48b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation==0.48b0 from https://files.pythonhosted.org/packages/0a/7f/405c41d4f359121376c9d5117dcf68149b8122d3f6c718996d037bd4d800/opentelemetry_instrumentation-0.48b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation-0.48b0-py3-none-any.whl.metadata (6.1 kB)\n",
-      "Collecting opentelemetry-semantic-conventions==0.48b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-semantic-conventions==0.48b0 from https://files.pythonhosted.org/packages/b7/7a/4f0063dbb0b6c971568291a8bc19a4ca70d3c185db2d956230dd67429dfc/opentelemetry_semantic_conventions-0.48b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_semantic_conventions-0.48b0-py3-none-any.whl.metadata (2.4 kB)\n",
-      "Collecting opentelemetry-util-http==0.48b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-util-http==0.48b0 from https://files.pythonhosted.org/packages/ad/2e/36097c0a4d0115b8c7e377c90bab7783ac183bc5cb4071308f8959454311/opentelemetry_util_http-0.48b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_util_http-0.48b0-py3-none-any.whl.metadata (2.5 kB)\n",
-      "Requirement already satisfied: setuptools>=16.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-instrumentation==0.48b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (65.5.1)\n",
-      "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-instrumentation==0.48b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.16.0)\n",
-      "Collecting asgiref~=3.0 (from opentelemetry-instrumentation-asgi==0.48b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for asgiref~=3.0 from https://files.pythonhosted.org/packages/39/e3/893e8757be2612e6c266d9bb58ad2e3651524b5b40cf56761e985a28b13e/asgiref-3.8.1-py3-none-any.whl.metadata\n",
-      "  Downloading asgiref-3.8.1-py3-none-any.whl.metadata (9.3 kB)\n",
-      "Collecting opentelemetry-api>=1.2.0 (from chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-api>=1.2.0 from https://files.pythonhosted.org/packages/fb/1f/737dcdbc9fea2fa96c1b392ae47275165a7c641663fbb08a8d252968eed2/opentelemetry_api-1.27.0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_api-1.27.0-py3-none-any.whl.metadata (1.4 kB)\n",
-      "INFO: pip is looking at multiple versions of opentelemetry-sdk to determine which version is compatible with other requirements. This could take a while.\n",
-      "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation-fastapi>=0.41b0 from https://files.pythonhosted.org/packages/a5/29/a97842d6dfa679bf0f3624ce1ea3458eb185befd536cafe580daa9ab68ae/opentelemetry_instrumentation_fastapi-0.47b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation_fastapi-0.47b0-py3-none-any.whl.metadata (2.1 kB)\n",
-      "Collecting opentelemetry-instrumentation-asgi==0.47b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation-asgi==0.47b0 from https://files.pythonhosted.org/packages/ba/d9/c74cb6d69589cc97d856cb3f427dfcef37ec16f9564586290c9c075d9020/opentelemetry_instrumentation_asgi-0.47b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation_asgi-0.47b0-py3-none-any.whl.metadata (2.0 kB)\n",
-      "Collecting opentelemetry-instrumentation==0.47b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation==0.47b0 from https://files.pythonhosted.org/packages/1f/6a/be31a84ddd13e9018fcca6885e4710f227eb0fd06eda1896da67287faa2e/opentelemetry_instrumentation-0.47b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation-0.47b0-py3-none-any.whl.metadata (6.1 kB)\n",
-      "Collecting opentelemetry-semantic-conventions==0.47b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-semantic-conventions==0.47b0 from https://files.pythonhosted.org/packages/00/c2/ca5cef8e4cd8eec5a95deed95ec3f6005e499fd9d17ca08731ced03a6921/opentelemetry_semantic_conventions-0.47b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_semantic_conventions-0.47b0-py3-none-any.whl.metadata (2.4 kB)\n",
-      "Collecting opentelemetry-util-http==0.47b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-util-http==0.47b0 from https://files.pythonhosted.org/packages/10/7e/98749e14a4e3f4db8bc016e6b42aba40e4d934baeb8767b8658a99d0dfac/opentelemetry_util_http-0.47b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_util_http-0.47b0-py3-none-any.whl.metadata (2.5 kB)\n",
-      "Collecting opentelemetry-api>=1.2.0 (from chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-api>=1.2.0 from https://files.pythonhosted.org/packages/e3/a7/6322d1d7a1fb926e8b99208c27730f21217da2f1e0e11dab48a78a0427a4/opentelemetry_api-1.26.0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_api-1.26.0-py3-none-any.whl.metadata (1.4 kB)\n",
-      "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation-fastapi>=0.41b0 from https://files.pythonhosted.org/packages/b8/96/905d575947342c4fd6781a28f6d7bc7f4f6670d45e3b1a85f8a06955c9ae/opentelemetry_instrumentation_fastapi-0.46b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation_fastapi-0.46b0-py3-none-any.whl.metadata (2.0 kB)\n",
-      "Collecting opentelemetry-instrumentation-asgi==0.46b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation-asgi==0.46b0 from https://files.pythonhosted.org/packages/47/8d/8955c7fbd949e3ea1c186c7422047f675bf4f7c8976afd2fdf713183318e/opentelemetry_instrumentation_asgi-0.46b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation_asgi-0.46b0-py3-none-any.whl.metadata (1.9 kB)\n",
-      "Collecting opentelemetry-instrumentation==0.46b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-instrumentation==0.46b0 from https://files.pythonhosted.org/packages/10/e5/d6fff0a6f6fbddf03c7fb48ab47925581c4f1a8268f9ad98e5ea4a8b90a5/opentelemetry_instrumentation-0.46b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_instrumentation-0.46b0-py3-none-any.whl.metadata (6.1 kB)\n",
-      "Requirement already satisfied: opentelemetry-semantic-conventions==0.46b0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.46b0)\n",
-      "Collecting opentelemetry-util-http==0.46b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n",
-      "  Obtaining dependency information for opentelemetry-util-http==0.46b0 from https://files.pythonhosted.org/packages/a2/7f/26d3d8880ea79adde8bb7bc306b25ca5134d6f6c3006ba464716405b4729/opentelemetry_util_http-0.46b0-py3-none-any.whl.metadata\n",
-      "  Downloading opentelemetry_util_http-0.46b0-py3-none-any.whl.metadata (2.4 kB)\n",
-      "Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb)\n",
-      "  Obtaining dependency information for monotonic>=1.5 from https://files.pythonhosted.org/packages/9a/67/7e8406a29b6c45be7af7740456f7f37025f0506ae2e05fb9009a53946860/monotonic-1.6-py2.py3-none-any.whl.metadata\n",
-      "  Downloading monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)\n",
-      "Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb)\n",
-      "  Obtaining dependency information for backoff>=1.10.0 from https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl.metadata\n",
-      "  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)\n",
-      "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from rich>=10.11.0->chromadb) (3.0.0)\n",
-      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from rich>=10.11.0->chromadb) (2.16.1)\n",
-      "Requirement already satisfied: huggingface_hub<1.0,>=0.16.4 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from tokenizers>=0.13.2->chromadb) (0.22.2)\n",
-      "Requirement already satisfied: click>=8.0.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from typer>=0.9.0->chromadb) (8.1.6)\n",
-      "Requirement already satisfied: shellingham>=1.3.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from typer>=0.9.0->chromadb) (1.5.4)\n",
-      "Requirement already satisfied: httptools>=0.5.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.6.1)\n",
-      "Requirement already satisfied: python-dotenv>=0.13 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (1.0.1)\n",
-      "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.19.0)\n",
-      "Requirement already satisfied: watchfiles>=0.13 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.21.0)\n",
-      "Requirement already satisfied: websockets>=10.4 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (12.0)\n",
-      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.3.2)\n",
-      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.3.0)\n",
-      "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\n",
-      "Requirement already satisfied: filelock in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.13.1)\n",
-      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2024.9.0)\n",
-      "Requirement already satisfied: zipp>=0.5 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from importlib-metadata<=7.1,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.16.2)\n",
-      "Requirement already satisfied: mdurl~=0.1 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->chromadb) (0.1.2)\n",
-      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from requests->kubernetes>=28.1.0->chromadb) (3.2.0)\n",
-      "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.14.1->chromadb)\n",
-      "  Obtaining dependency information for humanfriendly>=9.1 from https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl.metadata\n",
-      "  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)\n",
-      "Requirement already satisfied: mpmath>=0.19 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\n",
-      "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /opt/homebrew/Caskroom/miniconda/base/envs/py11/lib/python3.11/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.5.1)\n",
-      "Downloading chromadb-0.5.12-py3-none-any.whl (602 kB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m602.6/602.6 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-      "\u001b[?25hDownloading chroma_hnswlib-0.7.6-cp311-cp311-macosx_11_0_arm64.whl (185 kB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m185.0/185.0 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
-      "\u001b[?25hDownloading bcrypt-4.2.0-cp39-abi3-macosx_10_12_universal2.whl (472 kB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m472.4/472.4 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n",
-      "\u001b[?25hDownloading build-1.2.2.post1-py3-none-any.whl (22 kB)\n",
-      "Downloading httpx-0.27.2-py3-none-any.whl (76 kB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.0/78.0 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[?25hDownloading mmh3-5.0.1-cp311-cp311-macosx_11_0_arm64.whl (38 kB)\n",
-      "Downloading onnxruntime-1.19.2-cp311-cp311-macosx_11_0_universal2.whl (16.8 MB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.8/16.8 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
-      "\u001b[?25hDownloading opentelemetry_instrumentation_fastapi-0.46b0-py3-none-any.whl (11 kB)\n",
-      "Downloading opentelemetry_instrumentation-0.46b0-py3-none-any.whl (29 kB)\n",
-      "Downloading opentelemetry_instrumentation_asgi-0.46b0-py3-none-any.whl (14 kB)\n",
-      "Downloading opentelemetry_util_http-0.46b0-py3-none-any.whl (6.9 kB)\n",
-      "Downloading posthog-3.7.0-py2.py3-none-any.whl (54 kB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.4/54.4 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[?25hDownloading tenacity-9.0.0-py3-none-any.whl (28 kB)\n",
-      "Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
-      "Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
-      "Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[?25hDownloading flatbuffers-24.3.25-py2.py3-none-any.whl (26 kB)\n",
-      "Downloading pyproject_hooks-1.2.0-py3-none-any.whl (10 kB)\n",
-      "Downloading asgiref-3.8.1-py3-none-any.whl (23 kB)\n",
-      "Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
-      "\u001b[?25hBuilding wheels for collected packages: pypika\n",
-      "  Building wheel for pypika (pyproject.toml) ... \u001b[?25ldone\n",
-      "\u001b[?25h  Created wheel for pypika: filename=PyPika-0.48.9-py2.py3-none-any.whl size=53723 sha256=8643461882832391b1d76edba76a526166c2d9df66a46f567985fb53216f60b6\n",
-      "  Stored in directory: /Users/emelidral/Library/Caches/pip/wheels/a3/01/bd/4c40ceb9d5354160cb186dcc153360f4ab7eb23e2b24daf96d\n",
-      "Successfully built pypika\n",
-      "Installing collected packages: pypika, monotonic, flatbuffers, tenacity, pyproject_hooks, opentelemetry-util-http, mmh3, humanfriendly, httpcore, chroma-hnswlib, bcrypt, backoff, asgiref, posthog, httpx, coloredlogs, build, opentelemetry-instrumentation, onnxruntime, opentelemetry-instrumentation-asgi, opentelemetry-instrumentation-fastapi, chromadb\n",
-      "  Attempting uninstall: tenacity\n",
-      "    Found existing installation: tenacity 8.2.2\n",
-      "    Uninstalling tenacity-8.2.2:\n",
-      "      Successfully uninstalled tenacity-8.2.2\n",
-      "  Attempting uninstall: httpcore\n",
-      "    Found existing installation: httpcore 0.17.3\n",
-      "    Uninstalling httpcore-0.17.3:\n",
-      "      Successfully uninstalled httpcore-0.17.3\n",
-      "  Attempting uninstall: httpx\n",
-      "    Found existing installation: httpx 0.24.1\n",
-      "    Uninstalling httpx-0.24.1:\n",
-      "      Successfully uninstalled httpx-0.24.1\n",
-      "Successfully installed asgiref-3.8.1 backoff-2.2.1 bcrypt-4.2.0 build-1.2.2.post1 chroma-hnswlib-0.7.6 chromadb-0.5.12 coloredlogs-15.0.1 flatbuffers-24.3.25 httpcore-1.0.6 httpx-0.27.2 humanfriendly-10.0 mmh3-5.0.1 monotonic-1.6 onnxruntime-1.19.2 opentelemetry-instrumentation-0.46b0 opentelemetry-instrumentation-asgi-0.46b0 opentelemetry-instrumentation-fastapi-0.46b0 opentelemetry-util-http-0.46b0 posthog-3.7.0 pypika-0.48.9 pyproject_hooks-1.2.0 tenacity-9.0.0\n"
+      "Requirement already satisfied: chromadb in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (0.5.12)\n",
+      "Requirement already satisfied: build>=1.0.3 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.2.1)\n",
+      "Requirement already satisfied: pydantic>=1.9 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (2.9.2)\n",
+      "Requirement already satisfied: chroma-hnswlib==0.7.6 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.7.6)\n",
+      "Requirement already satisfied: fastapi>=0.95.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.115.0)\n",
+      "Requirement already satisfied: uvicorn>=0.18.3 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.31.0)\n",
+      "Requirement already satisfied: numpy>=1.22.5 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.25.2)\n",
+      "Requirement already satisfied: posthog>=2.4.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (3.6.6)\n",
+      "Requirement already satisfied: typing-extensions>=4.5.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (4.12.2)\n",
+      "Requirement already satisfied: onnxruntime>=1.14.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.19.2)\n",
+      "Requirement already satisfied: opentelemetry-api>=1.2.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.26.0)\n",
+      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.26.0)\n",
+      "Requirement already satisfied: opentelemetry-instrumentation-fastapi>=0.41b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.47b0)\n",
+      "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.26.0)\n",
+      "Requirement already satisfied: tokenizers>=0.13.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.20.0)\n",
+      "Requirement already satisfied: pypika>=0.48.9 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.48.9)\n",
+      "Requirement already satisfied: tqdm>=4.65.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (4.66.5)\n",
+      "Requirement already satisfied: overrides>=7.3.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (7.7.0)\n",
+      "Requirement already satisfied: importlib-resources in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (6.4.5)\n",
+      "Requirement already satisfied: grpcio>=1.58.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.66.2)\n",
+      "Requirement already satisfied: bcrypt>=4.0.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (4.2.0)\n",
+      "Requirement already satisfied: typer>=0.9.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.9.4)\n",
+      "Requirement already satisfied: kubernetes>=28.1.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (31.0.0)\n",
+      "Requirement already satisfied: tenacity>=8.2.3 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (8.5.0)\n",
+      "Requirement already satisfied: PyYAML>=6.0.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (6.0.2)\n",
+      "Requirement already satisfied: mmh3>=4.0.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (5.0.1)\n",
+      "Requirement already satisfied: orjson>=3.9.12 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (3.10.7)\n",
+      "Requirement already satisfied: httpx>=0.27.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.27.2)\n",
+      "Requirement already satisfied: rich>=10.11.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (13.9.1)\n",
+      "Requirement already satisfied: packaging>=19.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from build>=1.0.3->chromadb) (24.1)\n",
+      "Requirement already satisfied: pyproject_hooks in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from build>=1.0.3->chromadb) (1.1.0)\n",
+      "Requirement already satisfied: starlette<0.39.0,>=0.37.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from fastapi>=0.95.2->chromadb) (0.38.6)\n",
+      "Requirement already satisfied: anyio in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (4.6.0)\n",
+      "Requirement already satisfied: certifi in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (2024.8.30)\n",
+      "Requirement already satisfied: httpcore==1.* in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (1.0.6)\n",
+      "Requirement already satisfied: idna in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (3.10)\n",
+      "Requirement already satisfied: sniffio in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (1.3.1)\n",
+      "Requirement already satisfied: h11<0.15,>=0.13 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.27.0->chromadb) (0.14.0)\n",
+      "Requirement already satisfied: six>=1.9.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\n",
+      "Requirement already satisfied: python-dateutil>=2.5.3 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.8.2)\n",
+      "Requirement already satisfied: google-auth>=1.0.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.35.0)\n",
+      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.8.0)\n",
+      "Requirement already satisfied: requests in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.32.3)\n",
+      "Requirement already satisfied: requests-oauthlib in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.0.0)\n",
+      "Requirement already satisfied: oauthlib>=3.2.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\n",
+      "Requirement already satisfied: urllib3>=1.24.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.2.3)\n",
+      "Requirement already satisfied: durationpy>=0.7 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (0.9)\n",
+      "Requirement already satisfied: coloredlogs in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (15.0.1)\n",
+      "Requirement already satisfied: flatbuffers in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (24.3.25)\n",
+      "Requirement already satisfied: protobuf in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (4.25.5)\n",
+      "Requirement already satisfied: sympy in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (1.13.3)\n",
+      "Requirement already satisfied: deprecated>=1.2.6 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-api>=1.2.0->chromadb) (1.2.14)\n",
+      "Requirement already satisfied: importlib-metadata<=8.0.0,>=6.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-api>=1.2.0->chromadb) (8.0.0)\n",
+      "Requirement already satisfied: googleapis-common-protos~=1.52 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.65.0)\n",
+      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.26.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.26.0)\n",
+      "Requirement already satisfied: opentelemetry-proto==1.26.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.26.0)\n",
+      "Requirement already satisfied: opentelemetry-instrumentation-asgi==0.47b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.47b0)\n",
+      "Requirement already satisfied: opentelemetry-instrumentation==0.47b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.47b0)\n",
+      "Requirement already satisfied: opentelemetry-semantic-conventions==0.47b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.47b0)\n",
+      "Requirement already satisfied: opentelemetry-util-http==0.47b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.47b0)\n",
+      "Requirement already satisfied: setuptools>=16.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation==0.47b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (65.5.1)\n",
+      "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation==0.47b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.16.0)\n",
+      "Requirement already satisfied: asgiref~=3.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-asgi==0.47b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (3.8.1)\n",
+      "Requirement already satisfied: monotonic>=1.5 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from posthog>=2.4.0->chromadb) (1.6)\n",
+      "Requirement already satisfied: backoff>=1.10.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from posthog>=2.4.0->chromadb) (2.2.1)\n",
+      "Requirement already satisfied: annotated-types>=0.6.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from pydantic>=1.9->chromadb) (0.7.0)\n",
+      "Requirement already satisfied: pydantic-core==2.23.4 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from pydantic>=1.9->chromadb) (2.23.4)\n",
+      "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from rich>=10.11.0->chromadb) (3.0.0)\n",
+      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from rich>=10.11.0->chromadb) (2.18.0)\n",
+      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from tokenizers>=0.13.2->chromadb) (0.25.1)\n",
+      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from typer>=0.9.0->chromadb) (8.1.7)\n",
+      "Requirement already satisfied: httptools>=0.5.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.6.1)\n",
+      "Requirement already satisfied: python-dotenv>=0.13 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (1.0.1)\n",
+      "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.20.0)\n",
+      "Requirement already satisfied: watchfiles>=0.13 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.24.0)\n",
+      "Requirement already satisfied: websockets>=10.4 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (13.1)\n",
+      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.5.0)\n",
+      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.4.1)\n",
+      "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\n",
+      "Requirement already satisfied: filelock in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.12.4)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2024.9.0)\n",
+      "Requirement already satisfied: zipp>=0.5 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from importlib-metadata<=8.0.0,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.19.2)\n",
+      "Requirement already satisfied: mdurl~=0.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->chromadb) (0.1.2)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from requests->kubernetes>=28.1.0->chromadb) (3.3.2)\n",
+      "Requirement already satisfied: humanfriendly>=9.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from coloredlogs->onnxruntime>=1.14.1->chromadb) (10.0)\n",
+      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\n",
+      "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.6.1)\n",
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n"
      ]
     }
    ],
    "source": [
-    "! pip install chromadb"
+    "!pip install chromadb"
    ]
   },
   {
@@ -255,7 +127,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 15,
    "id": "547c43f3-e58f-450c-b80b-c396eb2655a1",
    "metadata": {},
    "outputs": [],
@@ -271,6 +143,37 @@
     "from chromadb.utils import embedding_functions"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "5f649ddb-af77-4961-8eb2-f69b7c6916db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chroma_client = chromadb.Client()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "5d8da19c-8d15-467f-90d0-dc7e02ed3aad",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<chromadb.api.client.Client at 0x2bc6254d0>"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chroma_client"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "2c6fac99-e1db-48be-9554-88ecddac271e",
@@ -281,12 +184,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 6,
    "id": "9b93b470-9d32-4757-9d03-915992e2a7c3",
    "metadata": {
     "scrolled": true
    },
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+      "  warnings.warn(\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -307,6 +218,7 @@
     "default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=\"all-MiniLM-L6-v2\")\n",
     "\n",
     "chroma_client = chromadb.PersistentClient(path=\"./chromadb/\")\n",
+    "\n",
     "# declare ChromaDB collection\n",
     "collection = chroma_client.get_or_create_collection(\n",
     "    name=collection_name,\n",
@@ -316,12 +228,12 @@
     "result = collection.get()\n",
     "\n",
     "print(f\"Collection {collection_name} created successfully\")\n",
-    "pprint.pprint(result)"
+    "pprint.pprint(result)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 12,
    "id": "d547021f-9d4d-42cf-b580-abc6a1008cd1",
    "metadata": {
     "scrolled": true
@@ -354,7 +266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 13,
    "id": "53f147f1-f8a2-4095-a840-2bacbc0aaf63",
    "metadata": {},
    "outputs": [],
@@ -383,7 +295,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 24,
    "id": "c0c9c8e2-0f2f-4fe0-aeee-68b0bb67cea8",
    "metadata": {},
    "outputs": [
@@ -391,13 +303,13 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      " 4 files loaded\n",
-      "Split in to 270 chunks\n"
+      " 1 files loaded\n",
+      "Split in to 8 chunks\n"
      ]
     }
    ],
    "source": [
-    "directory_path = \"./evidently_reference/\"\n",
+    "directory_path = \"../../../../evidently_reference/\"\n",
     "\n",
     "# load documents from directory\n",
     "md_files = load_md_from_dir(directory_path)\n",
@@ -419,15 +331,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 10,
    "id": "7bf99fde-8aa7-4111-ad7e-eec59bd0c23e",
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Collection db_collection has 270 documents\n"
+     "ename": "NameError",
+     "evalue": "name 'chunked_files' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[10], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# insert documents with embeddings to collection ChromaDB\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m \u001b[43mchunked_files\u001b[49m:\n\u001b[1;32m      3\u001b[0m     collection\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m      4\u001b[0m             ids\u001b[38;5;241m=\u001b[39mchunk[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m      5\u001b[0m             documents\u001b[38;5;241m=\u001b[39mchunk[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m      6\u001b[0m     )\n\u001b[1;32m      8\u001b[0m result \u001b[38;5;241m=\u001b[39m collection\u001b[38;5;241m.\u001b[39mget()\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'chunked_files' is not defined"
      ]
     }
    ],
@@ -446,7 +362,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 11,
    "id": "50ec8822-e2dc-4a01-bad3-44f1f123ed5c",
    "metadata": {
     "scrolled": true
@@ -456,7 +372,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[Collection(id=9d10a2e1-2a39-4ba6-863a-577069d1d2af, name=db_collection)]\n"
+      "[Collection(id=639fbee3-87d9-447e-a134-f2ad8596b07c, name=db_collection)]\n"
      ]
     }
    ],
@@ -1333,7 +1249,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.11.9"
   }
  },
  "nbformat": 4,
diff --git a/src/evidently/dataset_generators/llm/generator.py b/src/evidently/dataset_generators/llm/generator.py
index ea511d24ea..e04e451c90 100644
--- a/src/evidently/dataset_generators/llm/generator.py
+++ b/src/evidently/dataset_generators/llm/generator.py
@@ -2,17 +2,17 @@
 
 import pandas as pd
 
-from evidently.dataset_generators.llm.aaa import FileContextGenerator
 from evidently.dataset_generators.llm.aaa import PromptQuestionGenerator
 from evidently.dataset_generators.llm.aaa import QuestionPairGenerator
 from evidently.dataset_generators.llm.aaa import SimpleQuestionPrompt
+from evidently.dataset_generators.llm.index import IndexExtractorFromFile
 from evidently.options.base import Options
 
 
 def generate_dataset_from_docs(file_path: Path, num_questions: 2) -> pd.DataFrame:
-    chunks = FileContextGenerator(path=file_path)
+    documents = IndexExtractorFromFile(path=file_path)
     generator = QuestionPairGenerator(
-        index=chunks,
+        index=documents,
         questions=PromptQuestionGenerator(system_promt=SimpleQuestionPrompt()),
         num_questions=num_questions,
         provider="openai",
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 0ccb60ac35..9f6e58e874 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -1,10 +1,14 @@
 import abc
 import dataclasses
 from abc import ABC
-from typing import Any
+from pathlib import Path
 from typing import List
 from typing import Optional
 
+import chromadb
+from chromadb import ClientAPI
+from chromadb.types import Collection
+from chromadb.utils import embedding_functions
 from llama_index.core.node_parser import SentenceSplitter
 
 from evidently.pydantic_utils import EvidentlyBaseModel
@@ -14,13 +18,29 @@
 
 @dataclasses.dataclass
 class DocumentIndex:
+    name: str
     chunks: List[Chunk]
-    embeddings: Optional[Any] = None
-
-    def get_embeddings(self):
-        if self.embeddings is not None:
-            self.embeddings = ...
-        return self.embeddings
+    collection: Collection = None
+    chroma_client: Optional[ClientAPI] = None
+
+    def get_collection(self):
+        if self.collection is None:
+            default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
+                model_name="all-MiniLM-L6-v2",
+            )
+            self.chroma_client = chromadb.Client()
+            collection = self.chroma_client.get_or_create_collection(
+                name=self.name,
+                embedding_function=default_embedding_function,
+            )
+            # insert documents with embeddings to collection ChromaDB
+            for i, chunk in enumerate(self.chunks):
+                collection.upsert(
+                    ids=str(i),
+                    documents=chunk,
+                )
+            self.collection = collection
+        return self.collection
 
 
 class IndexExtractor(EvidentlyBaseModel, ABC):
@@ -31,9 +51,9 @@ def extract_index(self) -> DocumentIndex:
 
 class IndexExtractorFromFile(IndexExtractor):
     class Config:
-        type_alias = "asdfasdfasd"
+        type_alias = "IndexExtractorFromFile"
 
-    path: str
+    path: Path
     chunk_size: int = 512
     chunk_overlap: int = 20
 
@@ -42,7 +62,7 @@ def extract_index(self) -> DocumentIndex:
             text = f.read()
         splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
         text_nodes = splitter.split_text(text)
-        return DocumentIndex(text_nodes)
+        return DocumentIndex(self.path.name, chunks=text_nodes)
 
 
 class SimpleIndexExtractor(IndexExtractor):

From 426a1fad481da44329c4adb45ceb51d6de37142e Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 19:47:06 +0200
Subject: [PATCH 18/63] prompts

---
 src/evidently/dataset_generators/llm/aaa.py   | 66 +++++++----------
 .../dataset_generators/llm/prompts.py         | 55 ++++++++++++++
 src/evidently/utils/llm.py                    | 72 ++++++++++++++++---
 3 files changed, 145 insertions(+), 48 deletions(-)
 create mode 100644 src/evidently/dataset_generators/llm/prompts.py

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index fa1803b0e0..4ea40391f8 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -1,6 +1,3 @@
-import abc
-from abc import ABC
-from typing import ClassVar
 from typing import List
 from typing import Sequence
 from typing import Tuple
@@ -11,57 +8,46 @@
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
 from evidently.dataset_generators.llm.index import Chunk
 from evidently.dataset_generators.llm.index import IndexExtractor
-from evidently.pydantic_utils import EvidentlyBaseModel
-from evidently.utils.llm import BlockPromptTemplate
+from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
+from evidently.dataset_generators.llm.prompts import QuestionGenerationPrompt
 from evidently.utils.llm import LLMMessage
-from evidently.utils.llm import LLMWrapper
-from evidently.utils.llm import PromptBlock
-from evidently.utils.llm import PromptTemplate
 
 Question = str
 Answer = str
 GeneratedQuestion = Tuple[Question, Answer, Chunk]
 
 
-class QuestionGenerator(EvidentlyBaseModel, ABC):
-    @abc.abstractmethod
-    def generate_questions(self, wrapper: LLMWrapper, chunks: Sequence[Chunk]) -> List[GeneratedQuestion]:
-        raise NotImplementedError
-
-
-class SimpleQuestionPrompt(BlockPromptTemplate):
-    blocks: ClassVar = [
-        PromptBlock.simple("Please generate a {question_type} question about this:"),
-        PromptBlock.input("context").anchored(),
-        PromptBlock.json_output(question="question text", answer="answer text"),
-    ]
-
-
-class PromptQuestionGenerator(QuestionGenerator):
-    class Config:
-        type_alias = "asdfasdasdfaaasdfdsfasfasd"
-
-    prompt: PromptTemplate
-    question_type: str = "simple"
-
-    def generate_questions(self, wrapper: LLMWrapper, chunks: Sequence[Chunk]) -> GeneratedQuestion:
-        context = "\n\n".join(chunks)
-        rendered = self.prompt.render(context=context, question_type=self.question_type)
-        result = wrapper.complete([LLMMessage.user(rendered)])
-        data = self.prompt.parse(result, keys=["question", "answer"])
-        return data["question"], data["answer"], context
-
-
 class QuestionPairGenerator(BaseLLMDatasetGenerator):
     class Config:
         type_alias = "asdfasdasdfaaasdfdsfasfasd"
 
     index: IndexExtractor
-    questions: QuestionGenerator
     num_questions: int
+    prompt: QuestionGenerationPrompt
+    system_prompt: str = "You are an assisstant who generates questions based on provided context"
+    answer_prompt: BaselineAnswerPrompt
+    answer_system_prompt: str = "You are a helpful assistant thet answer a given question directly without any preamble"
 
     def generate(self) -> DatasetGeneratorResult:
         documents = self.index.extract_index()
-        qs = self.questions.generate_questions(self.wrapper, [chunk for chunk in documents.chunks])
+        questions: List[Question] = self.generate_questions([chunk for chunk in documents.chunks])
+        relevant_chunks = [[c] for c in documents.chunks]  # fixme
+        answers = self.generate_answers(questions, relevant_chunks)
+        return pd.DataFrame({"questions": questions, "answers": answers, "context": relevant_chunks})
 
-        return pd.DataFrame([qs], columns=["question", "answer", "context"])
+    def generate_questions(self, chunks: Sequence[Chunk]) -> List[Question]:
+        context = "\n\n".join(chunks)
+        rendered = self.prompt.render(context=context)
+        result = self.wrapper.complete([LLMMessage.system(self.system_prompt), LLMMessage.user(rendered)])
+        data = self.prompt.parse(result, keys=["questions"])
+        return data["questions"]
+
+    def generate_answers(self, questions: List[Question], relevent_chunks: List[List[Chunk]]):
+        answers = []
+        system = LLMMessage.system(self.answer_system_prompt)
+        for question, chunks in zip(questions, relevent_chunks):
+            answer = self.wrapper.complete(
+                [system, LLMMessage.user(self.answer_prompt.render(question=question, context="\n".join(chunks)))]
+            )
+            answers.append(answer)
+        return answers
diff --git a/src/evidently/dataset_generators/llm/prompts.py b/src/evidently/dataset_generators/llm/prompts.py
new file mode 100644
index 0000000000..d721b58104
--- /dev/null
+++ b/src/evidently/dataset_generators/llm/prompts.py
@@ -0,0 +1,55 @@
+from typing import ClassVar
+
+from evidently.utils.llm import BlockPromptTemplate
+from evidently.utils.llm import PromptBlock
+
+
+class SimpleQuestionPrompt(BlockPromptTemplate):
+    blocks: ClassVar = [
+        "Please generate a {question_type} question about this:",
+        PromptBlock.input("context").anchored(),
+        PromptBlock.json_output(question="question text", answer="answer text"),
+    ]
+    question_type: str = "simple"
+
+
+class QuestionGenerationPrompt(BlockPromptTemplate):
+    pass
+
+
+class NaiveQuestionsPrompt(QuestionGenerationPrompt):
+    blocks: ClassVar = [
+        "Generate {number} conceptual questions based on the provided context and "
+        "can be answered from the information in the provided context.\n"
+        "Here is a context",
+        PromptBlock.input("context").anchored(),
+        "Remain faithful to the above context.\n"
+        "Avoid providing any preamble!\n"
+        "Avoid providing any closing statement!",
+        PromptBlock.string_list_output("questions"),
+    ]
+    number: int
+
+
+class ReformulateQuestionPrompt(QuestionGenerationPrompt):
+    blocks: ClassVar = [
+        """Write for me {number} alternative questions quite similar to the question you got.
+The question:""",
+        PromptBlock.input("context").anchored(),
+        PromptBlock.string_list_output("questions"),
+    ]
+    number: int
+
+
+class BaselineAnswerPrompt(BlockPromptTemplate):
+    blocks: ClassVar = [
+        "Your task is to answer the following query:",
+        PromptBlock.input("question").anchored(),
+        "You have access to the following documents which are meant to provide context as you answer the query:",
+        PromptBlock.input("context").anchored(),
+        """Please remain faithful to the underlying context,
+and deviate from it only if you haven't found the answer in the provided context.
+Avoid providing any preamble!
+Avoid providing any closing statement!""",
+        PromptBlock.string_output("answer"),
+    ]
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index e6873a0a80..57c792d46a 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -30,6 +30,10 @@ class LLMMessage:
     def user(cls, message: str):
         return LLMMessage("user", message)
 
+    @classmethod
+    def system(cls, message: str):
+        return LLMMessage("system", message)
+
 
 LLMResponse = Dict[str, Any]
 
@@ -132,7 +136,17 @@ class PromptBlock(EvidentlyBaseModel):
     class Config:
         alias_required = False  # fixme
 
-    def render(self) -> str:
+    def render(self):
+        # )))
+        result = self._render()
+        for field in self.__fields__:
+            placeholder = f"{{{field}}}"
+            if placeholder in result:
+                result = result.replace(placeholder, getattr(self, field))
+        return result
+
+    @abstractmethod
+    def _render(self) -> str:
         raise NotImplementedError
 
     @classmethod
@@ -147,6 +161,14 @@ def input(cls, placeholder_name: str = "input"):
     def json_output(cls, **fields: Union[str, Tuple[str, str]]):
         return JsonOutputFormatBlock(fields=fields)
 
+    @classmethod
+    def string_list_output(cls, of_what: str):
+        return StringListFormatBlock(of_what=of_what)
+
+    @classmethod
+    def string_output(cls, what: str):
+        return StringFormatBlock(what=what)
+
     def anchored(self, start: str = "__start__", end: str = "__end__"):
         return Anchor(start=start, block=self, end=end)
 
@@ -156,18 +178,19 @@ class Anchor(PromptBlock):
     block: PromptBlock
     end: str
 
-    def render(self) -> str:
+    def _render(self) -> str:
         return f"{self.start}\n{self.block.render()}\n{self.end}"
 
 
 class SimpleBlock(PromptBlock):
     value: str
 
-    def render(self) -> str:
+    def _render(self) -> str:
         return self.value
 
 
-class OutputFormatBlock(PromptBlock):
+class OutputFormatBlock(PromptBlock, ABC):
+    @abstractmethod
     def parse_response(self, response: str) -> Dict[str, str]:
         raise NotImplementedError
 
@@ -175,7 +198,7 @@ def parse_response(self, response: str) -> Dict[str, str]:
 class JsonOutputFormatBlock(OutputFormatBlock):
     fields: Dict[str, Union[Tuple[str, str], str]]
 
-    def render(self) -> str:
+    def _render(self) -> str:
         values = []
         example_rows = []
         for field, descr in self.fields.items():
@@ -196,6 +219,27 @@ def parse_response(self, response: str) -> Dict[str, str]:
             raise LLMResponseParseError(f"Failed to parse response '{response}' as json") from e
 
 
+class StringListFormatBlock(OutputFormatBlock):
+    of_what: str
+
+    def _render(self) -> str:
+        return f"""Return a list of {self.of_what}.
+This should be only a list of string {self.of_what}, separated by comma"""
+
+    def parse_response(self, response: str) -> Dict[str, str]:
+        return {self.of_what: response.split(",")}
+
+
+class StringFormatBlock(OutputFormatBlock):
+    what: str
+
+    def _render(self) -> str:
+        return f"""Return {self.what} only."""
+
+    def parse_response(self, response: str) -> Dict[str, str]:
+        return {self.what: response}
+
+
 class PromptTemplate(EvidentlyBaseModel):
     class Config:
         alias_required = False  # fixme
@@ -215,7 +259,7 @@ def render(self, **values: str):
     def get_template(self) -> str:
         return "\n".join(block.render() for block in self.get_blocks())
 
-    def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, str]:
+    def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, Any]:
         output = next((b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None)
         if output is None:
             return {"": response}
@@ -225,11 +269,23 @@ def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, st
         return parsed
 
 
+AnyBlock = Union[str, PromptBlock, Callable]
+
+
 class BlockPromptTemplate(PromptTemplate):
-    blocks: ClassVar[List[PromptBlock]]
+    blocks: ClassVar[List[AnyBlock]]
 
     def get_blocks(self) -> Sequence[PromptBlock]:
-        return self.blocks
+        return [self._to_block(b) for b in self.blocks]
+
+    def _to_block(self, block: AnyBlock) -> PromptBlock:
+        if isinstance(block, PromptBlock):
+            return block
+        if isinstance(block, str):
+            return PromptBlock.simple(block)
+        if callable(block):
+            return PromptBlock.func(block)
+        raise NotImplementedError(f"Cannot create promt block from {block}")
 
 
 # class BinaryClassificationPromtTemplate(PromptTemplate):

From 73699935187f3c808325cc43f1b61398330b5bf4 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Wed, 9 Oct 2024 19:47:30 +0200
Subject: [PATCH 19/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/index.py | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 9f6e58e874..27de66ab41 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -1,5 +1,4 @@
 import abc
-import dataclasses
 from abc import ABC
 from pathlib import Path
 from typing import List
@@ -16,7 +15,6 @@
 Chunk = str
 
 
-@dataclasses.dataclass
 class DocumentIndex:
     name: str
     chunks: List[Chunk]
@@ -42,6 +40,28 @@ def get_collection(self):
             self.collection = collection
         return self.collection
 
+    def find_relevant_chunks(self, question: str, n_results=3) -> List[Chunk]:
+        """
+        Queries the collection with a given question and returns the relevant text chunks.
+
+        Args:
+            question (str): The query or question text to search for.
+            n_results (int): Number of results to retrieve. Default is 3.
+
+        Returns:
+            List[Chunk]: A list of relevant text chunks.
+        """
+        # Perform the query
+        results = self.collection.query(
+            query_texts=question,
+            n_results=n_results,
+        )
+
+        # Extract relevant text chunks from the documents
+        relevant_chunks = [chunk for document in results["documents"] for chunk in document]
+
+        return relevant_chunks
+
 
 class IndexExtractor(EvidentlyBaseModel, ABC):
     @abc.abstractmethod

From bbbfd55b83e3e84c75c979b0b28aa609638a7f99 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Wed, 9 Oct 2024 19:55:17 +0200
Subject: [PATCH 20/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/index.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 27de66ab41..994481eb22 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -1,5 +1,6 @@
 import abc
 from abc import ABC
+from dataclasses import dataclass
 from pathlib import Path
 from typing import List
 from typing import Optional
@@ -15,6 +16,7 @@
 Chunk = str
 
 
+@dataclass
 class DocumentIndex:
     name: str
     chunks: List[Chunk]

From 996aab767ecd4ed450bfea2199d09beac32f27f2 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 20:10:34 +0200
Subject: [PATCH 21/63] fixes

---
 examples/synth_data.py                        | 17 +++++++----
 src/evidently/dataset_generators/llm/aaa.py   | 29 +++++++++++--------
 .../llm/data_generation_for_RAG.ipynb         | 12 ++++----
 src/evidently/dataset_generators/llm/index.py |  2 +-
 .../dataset_generators/llm/prompts.py         |  1 -
 src/evidently/utils/llm.py                    |  4 +--
 6 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 852cbeb215..e7368b79e0 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,22 +1,29 @@
 import os
 
-from evidently.dataset_generators.llm.aaa import PromptQuestionGenerator, QuestionPairGenerator, SimpleQuestionPrompt
+from evidently.dataset_generators.llm.aaa import QuestionPairGenerator
 from evidently.dataset_generators.llm.index import SimpleIndexExtractor
+from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt, NaiveQuestionsPrompt
 from evidently.options.base import Options
 from evidently.ui.workspace import CloudWorkspace
 
 
 def main():
     generator = QuestionPairGenerator(
-        index=SimpleIndexExtractor(chunks=["I am a banana"]),
-        questions=PromptQuestionGenerator(prompt=SimpleQuestionPrompt()),
-        num_questions=2,
+        index=SimpleIndexExtractor(chunks=["I am a banana", "My spoon is too big"]),
+        questions=NaiveQuestionsPrompt(),
+        answers=BaselineAnswerPrompt(),
         provider="openai",
         model="gpt-4o-mini",
+        num_questions=5,
         options=Options.from_any_options(None)
     )
+    print(generator.questions.get_template())
     generated = generator.generate()
-    print(generated)
+    for _, a in generated.iterrows():
+        print("Q", a["questions"])
+        print("A", a["answers"])
+        print("C", a["context"])
+        print()
 
     # client = CloudWorkspace(token=os.environ["EVIDENTLY_TOKEN"], url="https://app.evidently.dev")
     #
diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index 4ea40391f8..629b079766 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -23,31 +23,36 @@ class Config:
 
     index: IndexExtractor
     num_questions: int
-    prompt: QuestionGenerationPrompt
-    system_prompt: str = "You are an assisstant who generates questions based on provided context"
-    answer_prompt: BaselineAnswerPrompt
+    questions: QuestionGenerationPrompt
+    questions_system_prompt: str = "You are an assisstant who generates questions based on provided context"
+    answers: BaselineAnswerPrompt
     answer_system_prompt: str = "You are a helpful assistant thet answer a given question directly without any preamble"
 
     def generate(self) -> DatasetGeneratorResult:
         documents = self.index.extract_index()
-        questions: List[Question] = self.generate_questions([chunk for chunk in documents.chunks])
-        relevant_chunks = [[c] for c in documents.chunks]  # fixme
+        chunk_sets = [documents.chunks]
+        questions: List[Question] = self.generate_questions(chunk_sets)
+        relevant_chunks = [documents.chunks for _ in questions]  # fixme
         answers = self.generate_answers(questions, relevant_chunks)
+        print(len(questions), len(answers), len(relevant_chunks))
         return pd.DataFrame({"questions": questions, "answers": answers, "context": relevant_chunks})
 
-    def generate_questions(self, chunks: Sequence[Chunk]) -> List[Question]:
-        context = "\n\n".join(chunks)
-        rendered = self.prompt.render(context=context)
-        result = self.wrapper.complete([LLMMessage.system(self.system_prompt), LLMMessage.user(rendered)])
-        data = self.prompt.parse(result, keys=["questions"])
-        return data["questions"]
+    def generate_questions(self, chunk_sets: Sequence[List[Chunk]]) -> List[Question]:
+        questions = []
+        for chunks in chunk_sets:
+            context = "\n\n".join(chunks)
+            rendered = self.questions.render(context=context, number=self.num_questions)
+            result = self.wrapper.complete([LLMMessage.system(self.questions_system_prompt), LLMMessage.user(rendered)])
+            data = self.questions.parse(result, keys=["questions"])
+            questions.extend(data["questions"])
+        return questions
 
     def generate_answers(self, questions: List[Question], relevent_chunks: List[List[Chunk]]):
         answers = []
         system = LLMMessage.system(self.answer_system_prompt)
         for question, chunks in zip(questions, relevent_chunks):
             answer = self.wrapper.complete(
-                [system, LLMMessage.user(self.answer_prompt.render(question=question, context="\n".join(chunks)))]
+                [system, LLMMessage.user(self.answers.render(question=question, context="\n".join(chunks)))]
             )
             answers.append(answer)
         return answers
diff --git a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
index 202c6ca472..4af120045b 100644
--- a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
+++ b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
@@ -108,8 +108,8 @@
       "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\n",
       "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.6.1)\n",
       "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n"
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.2\u001B[0m\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpython3.11 -m pip install --upgrade pip\u001B[0m\n"
      ]
     }
    ],
@@ -340,10 +340,10 @@
      "evalue": "name 'chunked_files' is not defined",
      "output_type": "error",
      "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[10], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# insert documents with embeddings to collection ChromaDB\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m \u001b[43mchunked_files\u001b[49m:\n\u001b[1;32m      3\u001b[0m     collection\u001b[38;5;241m.\u001b[39mupsert(\n\u001b[1;32m      4\u001b[0m             ids\u001b[38;5;241m=\u001b[39mchunk[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m      5\u001b[0m             documents\u001b[38;5;241m=\u001b[39mchunk[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m      6\u001b[0m     )\n\u001b[1;32m      8\u001b[0m result \u001b[38;5;241m=\u001b[39m collection\u001b[38;5;241m.\u001b[39mget()\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'chunked_files' is not defined"
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mNameError\u001B[0m                                 Traceback (most recent call last)",
+      "Cell \u001B[0;32mIn[10], line 2\u001B[0m\n\u001B[1;32m      1\u001B[0m \u001B[38;5;66;03m# insert documents with embeddings to collection ChromaDB\u001B[39;00m\n\u001B[0;32m----> 2\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m chunk \u001B[38;5;129;01min\u001B[39;00m \u001B[43mchunked_files\u001B[49m:\n\u001B[1;32m      3\u001B[0m     collection\u001B[38;5;241m.\u001B[39mupsert(\n\u001B[1;32m      4\u001B[0m             ids\u001B[38;5;241m=\u001B[39mchunk[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mid\u001B[39m\u001B[38;5;124m'\u001B[39m],\n\u001B[1;32m      5\u001B[0m             documents\u001B[38;5;241m=\u001B[39mchunk[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtext\u001B[39m\u001B[38;5;124m'\u001B[39m],\n\u001B[1;32m      6\u001B[0m     )\n\u001B[1;32m      8\u001B[0m result \u001B[38;5;241m=\u001B[39m collection\u001B[38;5;241m.\u001B[39mget()\n",
+      "\u001B[0;31mNameError\u001B[0m: name 'chunked_files' is not defined"
      ]
     }
    ],
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 9f6e58e874..1b8401b23a 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -72,4 +72,4 @@ class Config:
     chunks: List[Chunk]
 
     def extract_index(self) -> DocumentIndex:
-        return DocumentIndex(self.chunks)
+        return DocumentIndex("", chunks=self.chunks)
diff --git a/src/evidently/dataset_generators/llm/prompts.py b/src/evidently/dataset_generators/llm/prompts.py
index d721b58104..9ec704a900 100644
--- a/src/evidently/dataset_generators/llm/prompts.py
+++ b/src/evidently/dataset_generators/llm/prompts.py
@@ -28,7 +28,6 @@ class NaiveQuestionsPrompt(QuestionGenerationPrompt):
         "Avoid providing any closing statement!",
         PromptBlock.string_list_output("questions"),
     ]
-    number: int
 
 
 class ReformulateQuestionPrompt(QuestionGenerationPrompt):
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index 57c792d46a..581f53d6e2 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -224,7 +224,7 @@ class StringListFormatBlock(OutputFormatBlock):
 
     def _render(self) -> str:
         return f"""Return a list of {self.of_what}.
-This should be only a list of string {self.of_what}, separated by comma"""
+This should be only a list of string {self.of_what} separated by commas."""
 
     def parse_response(self, response: str) -> Dict[str, str]:
         return {self.of_what: response.split(",")}
@@ -253,7 +253,7 @@ def iterate(self, values: Sequence[Dict[str, str]]) -> Iterator[str]:
         for vals in values:
             yield template.format(**vals)
 
-    def render(self, **values: str):
+    def render(self, **values):
         return self.get_template().format(**values)
 
     def get_template(self) -> str:

From 3dee2c30e0fbcd4661b6bb77b4d0a315ce55fd12 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 9 Oct 2024 20:17:24 +0200
Subject: [PATCH 22/63] fixes

---
 src/evidently/dataset_generators/llm/generator.py | 7 ++++---
 src/evidently/dataset_generators/llm/index.py     | 7 ++-----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/generator.py b/src/evidently/dataset_generators/llm/generator.py
index e04e451c90..df9205b19d 100644
--- a/src/evidently/dataset_generators/llm/generator.py
+++ b/src/evidently/dataset_generators/llm/generator.py
@@ -2,10 +2,10 @@
 
 import pandas as pd
 
-from evidently.dataset_generators.llm.aaa import PromptQuestionGenerator
 from evidently.dataset_generators.llm.aaa import QuestionPairGenerator
-from evidently.dataset_generators.llm.aaa import SimpleQuestionPrompt
 from evidently.dataset_generators.llm.index import IndexExtractorFromFile
+from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
+from evidently.dataset_generators.llm.prompts import NaiveQuestionsPrompt
 from evidently.options.base import Options
 
 
@@ -13,7 +13,8 @@ def generate_dataset_from_docs(file_path: Path, num_questions: 2) -> pd.DataFram
     documents = IndexExtractorFromFile(path=file_path)
     generator = QuestionPairGenerator(
         index=documents,
-        questions=PromptQuestionGenerator(system_promt=SimpleQuestionPrompt()),
+        questions=NaiveQuestionsPrompt(),
+        answers=BaselineAnswerPrompt(),
         num_questions=num_questions,
         provider="openai",
         model="gpt-4o-mini",
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 06384e1f1f..8f28149aa8 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -3,10 +3,8 @@
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List
-from typing import Optional
 
 import chromadb
-from chromadb import ClientAPI
 from chromadb.types import Collection
 from chromadb.utils import embedding_functions
 from llama_index.core.node_parser import SentenceSplitter
@@ -21,15 +19,14 @@ class DocumentIndex:
     name: str
     chunks: List[Chunk]
     collection: Collection = None
-    chroma_client: Optional[ClientAPI] = None
 
     def get_collection(self):
         if self.collection is None:
             default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
                 model_name="all-MiniLM-L6-v2",
             )
-            self.chroma_client = chromadb.Client()
-            collection = self.chroma_client.get_or_create_collection(
+            chroma_client = chromadb.Client()
+            collection = chroma_client.get_or_create_collection(
                 name=self.name,
                 embedding_function=default_embedding_function,
             )

From 761946ea3890e5bdcea5271993e7b2c67c410ae5 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Wed, 9 Oct 2024 20:41:05 +0200
Subject: [PATCH 23/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/index.py | 39 +++++++++++++++++--
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 8f28149aa8..56e6c1d13b 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -1,4 +1,5 @@
 import abc
+import os
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
@@ -68,6 +69,12 @@ def extract_index(self) -> DocumentIndex:
         raise NotImplementedError
 
 
+@dataclass
+class Document:
+    id: str
+    content: str
+
+
 class IndexExtractorFromFile(IndexExtractor):
     class Config:
         type_alias = "IndexExtractorFromFile"
@@ -76,11 +83,37 @@ class Config:
     chunk_size: int = 512
     chunk_overlap: int = 20
 
+    def load_md_from_dir(self, path: Path) -> List[Document]:
+        """
+        Loads Markdown (.md) files from the specified directory.
+
+        Args:
+            path (str): Path to the directory containing .md files.
+
+        Returns:
+            List[dict]: A list of dictionaries with the text content of each .md file.
+        """
+        documents = []
+
+        if os.path.isfile(path):
+            with open(path, "r", encoding="utf-8") as file:
+                documents.append(Document(id=file.name, content=file.read()))
+            return documents
+
+        for filename in os.listdir(path):
+            file_path = os.path.join(path, filename)
+            with open(file_path, "r", encoding="utf-8") as file:
+                documents.append(Document(id=file.name, content=file.read()))
+
+        return documents
+
     def extract_index(self) -> DocumentIndex:
-        with open(self.path) as f:
-            text = f.read()
+        documents = self.load_md_from_dir(self.path)
         splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
-        text_nodes = splitter.split_text(text)
+        text_nodes = []
+        for document in documents:
+            text_nodes.extend(splitter.split_text(document.content))
+
         return DocumentIndex(self.path.name, chunks=text_nodes)
 
 

From e36c8ced93c85be54353616c29afdc63c51b69df Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Thu, 10 Oct 2024 10:57:22 +0200
Subject: [PATCH 24/63] chunks count

---
 src/evidently/dataset_generators/llm/aaa.py   | 21 +++++++++++++------
 src/evidently/dataset_generators/llm/index.py |  4 ++--
 src/evidently/utils/llm.py                    |  4 ++--
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index 629b079766..548430b625 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -1,3 +1,4 @@
+import random
 from typing import List
 from typing import Sequence
 from typing import Tuple
@@ -7,6 +8,7 @@
 from evidently.dataset_generators.base import DatasetGeneratorResult
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
 from evidently.dataset_generators.llm.index import Chunk
+from evidently.dataset_generators.llm.index import DocumentIndex
 from evidently.dataset_generators.llm.index import IndexExtractor
 from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
 from evidently.dataset_generators.llm.prompts import QuestionGenerationPrompt
@@ -15,6 +17,7 @@
 Question = str
 Answer = str
 GeneratedQuestion = Tuple[Question, Answer, Chunk]
+ChunkSet = List[Chunk]
 
 
 class QuestionPairGenerator(BaseLLMDatasetGenerator):
@@ -30,18 +33,24 @@ class Config:
 
     def generate(self) -> DatasetGeneratorResult:
         documents = self.index.extract_index()
-        chunk_sets = [documents.chunks]
-        questions: List[Question] = self.generate_questions(chunk_sets)
-        relevant_chunks = [documents.chunks for _ in questions]  # fixme
+        chunk_set_count, chuns_in_set_count, questions_per_chunkset = self.get_chunks_and_question_count()
+        chunk_sets = self.generate_chunksets(documents, chunk_set_count, chuns_in_set_count)
+        questions: List[Question] = self.generate_questions(chunk_sets, questions_per_chunkset)
+        relevant_chunks = [documents.find_relevant_chunks(q) for q in questions]
         answers = self.generate_answers(questions, relevant_chunks)
-        print(len(questions), len(answers), len(relevant_chunks))
         return pd.DataFrame({"questions": questions, "answers": answers, "context": relevant_chunks})
 
-    def generate_questions(self, chunk_sets: Sequence[List[Chunk]]) -> List[Question]:
+    def get_chunks_and_question_count(self) -> Tuple[int, int, int]:
+        return 1, 1, self.num_questions
+
+    def generate_chunksets(self, documents: DocumentIndex, count: int, chunks_per_set: int) -> List[ChunkSet]:
+        return [[random.choice(documents.chunks) for _ in range(chunks_per_set)] for _ in range(count)]
+
+    def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_chunkset: int) -> List[Question]:
         questions = []
         for chunks in chunk_sets:
             context = "\n\n".join(chunks)
-            rendered = self.questions.render(context=context, number=self.num_questions)
+            rendered = self.questions.render(context=context, number=questions_per_chunkset)
             result = self.wrapper.complete([LLMMessage.system(self.questions_system_prompt), LLMMessage.user(rendered)])
             data = self.questions.parse(result, keys=["questions"])
             questions.extend(data["questions"])
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 8f28149aa8..78598ed161 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -51,7 +51,7 @@ def find_relevant_chunks(self, question: str, n_results=3) -> List[Chunk]:
             List[Chunk]: A list of relevant text chunks.
         """
         # Perform the query
-        results = self.collection.query(
+        results = self.get_collection().query(
             query_texts=question,
             n_results=n_results,
         )
@@ -91,4 +91,4 @@ class Config:
     chunks: List[Chunk]
 
     def extract_index(self) -> DocumentIndex:
-        return DocumentIndex("", chunks=self.chunks)
+        return DocumentIndex("inmemory", chunks=self.chunks)
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index 581f53d6e2..d272485957 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -224,10 +224,10 @@ class StringListFormatBlock(OutputFormatBlock):
 
     def _render(self) -> str:
         return f"""Return a list of {self.of_what}.
-This should be only a list of string {self.of_what} separated by commas."""
+This should be only a list of string {self.of_what}, each one on a new line"""
 
     def parse_response(self, response: str) -> Dict[str, str]:
-        return {self.of_what: response.split(",")}
+        return {self.of_what: response.split("\n")}
 
 
 class StringFormatBlock(OutputFormatBlock):

From 8051121cde22b1dc249597c333d07bce6b32c3f4 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Thu, 10 Oct 2024 14:48:19 +0200
Subject: [PATCH 25/63] async

---
 src/evidently/dataset_generators/llm/aaa.py   | 33 +++----
 src/evidently/dataset_generators/llm/index.py |  4 +-
 src/evidently/utils/llm.py                    | 88 +++++++++++++++++--
 3 files changed, 100 insertions(+), 25 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index 548430b625..b3353174ac 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -47,21 +47,24 @@ def generate_chunksets(self, documents: DocumentIndex, count: int, chunks_per_se
         return [[random.choice(documents.chunks) for _ in range(chunks_per_set)] for _ in range(count)]
 
     def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_chunkset: int) -> List[Question]:
-        questions = []
-        for chunks in chunk_sets:
-            context = "\n\n".join(chunks)
-            rendered = self.questions.render(context=context, number=questions_per_chunkset)
-            result = self.wrapper.complete([LLMMessage.system(self.questions_system_prompt), LLMMessage.user(rendered)])
-            data = self.questions.parse(result, keys=["questions"])
-            questions.extend(data["questions"])
-        return questions
+        system = LLMMessage.system(self.questions_system_prompt)
+        llm_responses = self.wrapper.batch_complete_sync(
+            [
+                [
+                    system,
+                    LLMMessage.user(self.questions.render(context="\n\n".join(chunks), number=questions_per_chunkset)),
+                ]
+                for chunks in chunk_sets
+            ]
+        )
+        questions = [self.questions.parse(response, keys=["questions"])["questions"] for response in llm_responses]
+        return [q for qs in questions for q in qs]
 
-    def generate_answers(self, questions: List[Question], relevent_chunks: List[List[Chunk]]):
-        answers = []
+    def generate_answers(self, questions: List[Question], relevent_chunks: List[List[Chunk]]) -> List[str]:
         system = LLMMessage.system(self.answer_system_prompt)
-        for question, chunks in zip(questions, relevent_chunks):
-            answer = self.wrapper.complete(
+        return self.wrapper.batch_complete_sync(
+            [
                 [system, LLMMessage.user(self.answers.render(question=question, context="\n".join(chunks)))]
-            )
-            answers.append(answer)
-        return answers
+                for question, chunks in zip(questions, relevent_chunks)
+            ]
+        )
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index f1aaefba3b..d962015b3c 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -40,7 +40,7 @@ def get_collection(self):
             self.collection = collection
         return self.collection
 
-    def find_relevant_chunks(self, question: str, n_results=3) -> List[Chunk]:
+    def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]:
         """
         Queries the collection with a given question and returns the relevant text chunks.
 
@@ -54,7 +54,7 @@ def find_relevant_chunks(self, question: str, n_results=3) -> List[Chunk]:
         # Perform the query
         results = self.get_collection().query(
             query_texts=question,
-            n_results=n_results,
+            n_results=min(n_results, len(self.chunks)),
         )
 
         # Extract relevant text chunks from the documents
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index d272485957..e6216bf966 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -1,7 +1,12 @@
+import asyncio
 import dataclasses
+import datetime
 import json
 from abc import ABC
 from abc import abstractmethod
+from asyncio import Lock
+from asyncio import Semaphore
+from asyncio import sleep
 from typing import Any
 from typing import Callable
 from typing import ClassVar
@@ -19,6 +24,7 @@
 from evidently.options.base import Options
 from evidently.options.option import Option
 from evidently.pydantic_utils import EvidentlyBaseModel
+from evidently.ui.base import sync_api
 
 
 @dataclasses.dataclass
@@ -50,13 +56,63 @@ class LLMRequestError(EvidentlyLLMError):
     pass
 
 
+class RateLimiter:
+    def __init__(self, rate: Optional[int], interval: datetime.timedelta):
+        self.rate = rate
+        self.interval = interval
+        self.enters = []
+        self.lock = Lock()
+
+    async def __aenter__(self):
+        if self.rate is None:
+            return
+        while True:
+            async with self.lock:
+                await self._clean()
+                if len(self.enters) < self.rate:
+                    self.enters.append(datetime.datetime.now())
+                    break
+            await sleep(0.1)
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+    async def _clean(self):
+        now = datetime.datetime.now()
+        self.enters = [e for e in self.enters if now - e < self.interval]
+
+
 class LLMWrapper(ABC):
     __used_options__: ClassVar[List[Type[Option]]] = []
 
     @abstractmethod
-    def complete(self, messages: List[LLMMessage]) -> str:
+    async def complete(self, messages: List[LLMMessage]) -> str:
         raise NotImplementedError
 
+    async def batch_complete(
+        self, messages_batch: List[List[LLMMessage]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None
+    ) -> List[str]:
+        if batch_size is None:
+            batch_size = self.get_batch_size()
+        if rpm_limit is None:
+            rpm_limit = self.get_rpm_limit()
+        rate_limiter = RateLimiter(rate=rpm_limit, interval=datetime.timedelta(minutes=1))
+        semaphore = Semaphore(batch_size)
+
+        async def work(messages: List[LLMMessage]) -> str:
+            async with semaphore, rate_limiter:
+                return await self.complete(messages)
+
+        return await asyncio.gather(*[work(msgs) for msgs in messages_batch])
+
+    batch_complete_sync = sync_api(batch_complete)
+
+    def get_batch_size(self) -> int:
+        return 100
+
+    def get_rpm_limit(self) -> Optional[int]:
+        return None
+
     def get_used_options(self) -> List[Type[Option]]:
         return self.__used_options__
 
@@ -87,12 +143,13 @@ def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) ->
 
 class OpenAIKey(Option):
     api_key: Optional[SecretStr] = None
+    rpm_limit: int = 500
 
     def __init__(self, api_key: Optional[str] = None):
         self.api_key = SecretStr(api_key) if api_key is not None else None
         super().__init__()
 
-    def get_value(self) -> Optional[str]:
+    def get_api_key(self) -> Optional[str]:
         if self.api_key is None:
             return None
         return self.api_key.get_secret_value()
@@ -103,23 +160,38 @@ class OpenAIWrapper(LLMWrapper):
     __used_options__: ClassVar = [OpenAIKey]
 
     def __init__(self, model: str, options: Options):
-        import openai
-
         self.model = model
-        self.client = openai.OpenAI(api_key=options.get(OpenAIKey).get_value())
+        self.options = options.get(OpenAIKey)
+        self._clients = {}
 
-    def complete(self, messages: List[LLMMessage]) -> str:
+    @property
+    def client(self):
+        import openai
+
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError as e:
+            raise RuntimeError("Cannot access OpenAIWrapper client without loop") from e
+        loop_id = id(loop)
+        if loop_id not in self._clients:
+            self._clients[loop_id] = openai.AsyncOpenAI(api_key=self.options.get_api_key())
+        return self._clients[loop_id]
+
+    async def complete(self, messages: List[LLMMessage]) -> str:
         import openai
 
         messages = [{"role": msg.role, "content": msg.content} for msg in messages]
         try:
-            response = self.client.chat.completions.create(model=self.model, messages=messages)  # type: ignore[arg-type]
+            response = await self.client.chat.completions.create(model=self.model, messages=messages)  # type: ignore[arg-type]
         except openai.OpenAIError as e:
             raise LLMRequestError("Failed to call OpenAI complete API") from e
         content = response.choices[0].message.content
         assert content is not None  # todo: better error
         return content
 
+    def get_rpm_limit(self) -> Optional[int]:
+        return self.options.rpm_limit
+
 
 @llm_provider("litellm", None)
 class LiteLLMWrapper(LLMWrapper):
@@ -224,7 +296,7 @@ class StringListFormatBlock(OutputFormatBlock):
 
     def _render(self) -> str:
         return f"""Return a list of {self.of_what}.
-This should be only a list of string {self.of_what}, each one on a new line"""
+This should be only a list of string {self.of_what}, each one on a new line with no enumeration"""
 
     def parse_response(self, response: str) -> Dict[str, str]:
         return {self.of_what: response.split("\n")}

From 37257bc892afcae8cbb2fad902122956cca17c5f Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 14:55:51 +0200
Subject: [PATCH 26/63] generate_dataset_from_docs

---
 .../dataset_generators/llm/generator.py       |  12 +-
 src/evidently/dataset_generators/llm/index.py | 190 +++++++++++-------
 src/evidently/tests/dataset_generator.py      |  45 +++++
 3 files changed, 176 insertions(+), 71 deletions(-)
 create mode 100644 src/evidently/tests/dataset_generator.py

diff --git a/src/evidently/dataset_generators/llm/generator.py b/src/evidently/dataset_generators/llm/generator.py
index df9205b19d..1781ec258c 100644
--- a/src/evidently/dataset_generators/llm/generator.py
+++ b/src/evidently/dataset_generators/llm/generator.py
@@ -10,9 +10,8 @@
 
 
 def generate_dataset_from_docs(file_path: Path, num_questions: 2) -> pd.DataFrame:
-    documents = IndexExtractorFromFile(path=file_path)
     generator = QuestionPairGenerator(
-        index=documents,
+        inpute=IndexExtractorFromFile(path=file_path),
         questions=NaiveQuestionsPrompt(),
         answers=BaselineAnswerPrompt(),
         num_questions=num_questions,
@@ -22,3 +21,12 @@ def generate_dataset_from_docs(file_path: Path, num_questions: 2) -> pd.DataFram
     )
     generated = generator.generate()
     return generated
+
+
+# def generate_dataset_from_chunks(file_path, num_questions: 2) -> pd.DataFrame:
+# RAG
+# kb = KnowledgeBase.from_path(...)
+# generated = QuestionPairGenerator(inpute=kb).generate()
+
+# # From scratch
+# generated = QuestionPairGeneratorFromSamples(inpute=["What is Evidently?"]).generate()
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index d962015b3c..1d7ba73462 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -1,25 +1,82 @@
-import abc
 import os
-from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List
+from typing import Optional
 
 import chromadb
 from chromadb.types import Collection
 from chromadb.utils import embedding_functions
 from llama_index.core.node_parser import SentenceSplitter
 
-from evidently.pydantic_utils import EvidentlyBaseModel
-
 Chunk = str
+DEFAULT_CHUNK_SIZE = 512
+DEFAULT_CHUNK_OVERLAP = 20
 
 
 @dataclass
-class DocumentIndex:
+class Document:
+    id: str
+    content: str
+
+
+def load_md_from_dir(path: Path) -> List[Document]:
+    """
+    Loads Markdown (.md) files from the specified directory.
+
+    Args:
+        path (str): Path to the directory containing .md files.
+
+    Returns:
+        List[dict]: A list of dictionaries with the text content of each .md file.
+    """
+    documents = []
+
+    if os.path.isfile(path):
+        with open(path, "r", encoding="utf-8") as file:
+            documents.append(Document(id=file.name, content=file.read()))
+        return documents
+
+    for filename in os.listdir(path):
+        file_path = os.path.join(path, filename)
+        with open(file_path, "r", encoding="utf-8") as file:
+            documents.append(Document(id=file.name, content=file.read()))
+
+    return documents
+
+
+class KnowledgeBase:
     name: str
     chunks: List[Chunk]
-    collection: Collection = None
+    collection: Optional[Collection] = None
+
+    def __init__(self, name: str, chunks: List[str], collection: Optional["Collection"] = None):
+        self.name = name
+        self.chunks = chunks
+        self.collection = collection
+
+    @staticmethod
+    def from_files(
+        path: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
+    ) -> "KnowledgeBase":
+        file_path = Path(path)
+        # extractor = IndexExtractorFromFile(path=file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        # documents = extractor.load_md_from_dir()
+        documents = load_md_from_dir(path=file_path)
+        splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        text_nodes = []
+
+        for document in documents:
+            text_nodes.extend(splitter.split_text(document.content))
+
+        document_index = KnowledgeBase(name=file_path.name, chunks=text_nodes)
+        document_index.get_collection()
+        return document_index
+
+    @staticmethod
+    def from_chunks(self, chunks: List[str]):
+        document_index = KnowledgeBase("kb_from_chunks", chunks=chunks)
+        return document_index
 
     def get_collection(self):
         if self.collection is None:
@@ -59,69 +116,64 @@ def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]
 
         # Extract relevant text chunks from the documents
         relevant_chunks = [chunk for document in results["documents"] for chunk in document]
-
         return relevant_chunks
 
 
-class IndexExtractor(EvidentlyBaseModel, ABC):
-    @abc.abstractmethod
-    def extract_index(self) -> DocumentIndex:
-        raise NotImplementedError
-
-
-@dataclass
-class Document:
-    id: str
-    content: str
-
-
-class IndexExtractorFromFile(IndexExtractor):
-    class Config:
-        type_alias = "IndexExtractorFromFile"
-
-    path: Path
-    chunk_size: int = 512
-    chunk_overlap: int = 20
-
-    def load_md_from_dir(self, path: Path) -> List[Document]:
-        """
-        Loads Markdown (.md) files from the specified directory.
-
-        Args:
-            path (str): Path to the directory containing .md files.
-
-        Returns:
-            List[dict]: A list of dictionaries with the text content of each .md file.
-        """
-        documents = []
-
-        if os.path.isfile(path):
-            with open(path, "r", encoding="utf-8") as file:
-                documents.append(Document(id=file.name, content=file.read()))
-            return documents
-
-        for filename in os.listdir(path):
-            file_path = os.path.join(path, filename)
-            with open(file_path, "r", encoding="utf-8") as file:
-                documents.append(Document(id=file.name, content=file.read()))
-
-        return documents
-
-    def extract_index(self) -> DocumentIndex:
-        documents = self.load_md_from_dir(self.path)
-        splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
-        text_nodes = []
-        for document in documents:
-            text_nodes.extend(splitter.split_text(document.content))
-
-        return DocumentIndex(self.path.name, chunks=text_nodes)
-
-
-class SimpleIndexExtractor(IndexExtractor):
-    class Config:
-        type_alias = "asdfasdasdfafasd"
-
-    chunks: List[Chunk]
-
-    def extract_index(self) -> DocumentIndex:
-        return DocumentIndex("inmemory", chunks=self.chunks)
+# class IndexExtractor(EvidentlyBaseModel, ABC):
+#     @abc.abstractmethod
+#     def extract_index(self) -> KnowledgeBase:
+#         raise NotImplementedError
+
+
+# class IndexExtractorFromFile(IndexExtractor):
+#     class Config:
+#         type_alias = "IndexExtractorFromFile"
+#
+#     path: Path
+#     chunk_size: int = 512
+#     chunk_overlap: int = 20
+#
+#     def load_md_from_dir(self) -> List[Document]:
+#         """
+#         Loads Markdown (.md) files from the specified directory.
+#
+#         Args:
+#             path (str): Path to the directory containing .md files.
+#
+#         Returns:
+#             List[dict]: A list of dictionaries with the text content of each .md file.
+#         """
+#         documents = []
+#
+#         if os.path.isfile(self.path):
+#             with open(self.path, "r", encoding="utf-8") as file:
+#                 documents.append(Document(id=file.name, content=file.read()))
+#             return documents
+#
+#         for filename in os.listdir(self.path):
+#             file_path = os.path.join(self.path, filename)
+#             with open(file_path, "r", encoding="utf-8") as file:
+#                 documents.append(Document(id=file.name, content=file.read()))
+#
+#         return documents
+#
+#     def extract_index(self) -> KnowledgeBase:
+#         documents = self.load_md_from_dir()
+#         splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+#         text_nodes = []
+#         for document in documents:
+#             text_nodes.extend(splitter.split_text(document.content))
+#
+#         return KnowledgeBase(self.path.name, chunks=text_nodes)
+
+#         return DocumentIndex(self.path.name, chunks=text_nodes)
+#
+#
+# class SimpleIndexExtractor(IndexExtractor):
+#     class Config:
+#         type_alias = "asdfasdasdfafasd"
+#
+#     chunks: List[Chunk]
+#
+#     def extract_index(self) -> DocumentIndex:
+#         return DocumentIndex("inmemory", chunks=self.chunks)
diff --git a/src/evidently/tests/dataset_generator.py b/src/evidently/tests/dataset_generator.py
new file mode 100644
index 0000000000..d9447e874a
--- /dev/null
+++ b/src/evidently/tests/dataset_generator.py
@@ -0,0 +1,45 @@
+from evidently.dataset_generators.llm.index import KnowledgeBase
+
+tmpdir = "/tmp/"
+
+
+def get_content():
+    content = """Much that once was is lost, for none now live who remember it.
+                It began with the forging of the Great Rings. Three were given to the Elves, immortal, wisest and
+                fairest of all beings. Seven to the Dwarf-Lords, great miners and craftsmen of the mountain halls.
+                And nine, nine rings were gifted to the race of Men, who above all else desire power. For within these
+                rings was bound the strength and the will to govern each race. But they were all of them deceived,
+                for another ring was made. Deep in the land of Mordor, in the Fires of Mount Doom, the Dark Lord Sauron
+                 forged a master ring, and into this ring he poured his cruelty, his malice and his will to dominate all
+                  life.
+                One ring to rule them all.
+                One by one, the free lands of Middle-Earth fell to the power of the Ring, but there were some who
+                 resisted. A last alliance of men and elves marched against the armies of Mordor, and on the very
+                 slopes of Mount Doom, they fought for the freedom of Middle-Earth. Victory was near, but the power of
+                 the ring could not be undone. It was in this moment, when all hope had faded, that Isildur, son of the
+                  king, took up his father’s sword.
+                Sauron, enemy of the free peoples of Middle-Earth, was defeated. The Ring passed to Isildur, who had
+                this one chance to destroy evil forever, but the hearts of men are easily corrupted. And the ring of
+                power has a will of its own. It betrayed Isildur, to his death.
+                And some things that should not have been forgotten were lost. History became legend. Legend became
+                myth. And for two and a half thousand years, the ring passed out of all knowledge. Until, when chance
+                came, it ensnared another bearer.
+                It came to the creature Gollum, who took it deep into the tunnels of the Misty Mountains. And there it
+                consumed him. The ring gave to Gollum unnatural long life. For five hundred years it poisoned his mind,
+                 and in the gloom of Gollum’s cave, it waited. Darkness crept back into the forests of the world.
+                 Rumor grew of a shadow in the East, whispers of a nameless fear, and the Ring of Power perceived its
+                  time had come. It abandoned Gollum, but then something happened that the Ring did not intend.
+                  It was picked up by the most unlikely creature imaginable: a hobbit, Bilbo Baggins, of the Shire.
+                For the time will soon come when hobbits will shape the fortunes of all."""
+    return content
+
+
+def test_knowledge_base():
+    file_path = tmpdir + "KnowledgeBase.md"
+    content = get_content()
+    with open(file_path, "w") as f:
+        f.write(content)
+    knowledge_base = KnowledgeBase.from_files(file_path)
+    assert len(knowledge_base.chunks) == 16
+    relevant_chunks = knowledge_base.find_relevant_chunks("Who is Sauron?", 3)
+    assert len(relevant_chunks) == 3

From bd4ba861d728af261b352d037d8427ac99936f6a Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 15:03:26 +0200
Subject: [PATCH 27/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/index.py | 16 ++++++++--------
 src/evidently/tests/dataset_generator.py      |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 1d7ba73462..dd23dc3861 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -45,7 +45,7 @@ def load_md_from_dir(path: Path) -> List[Document]:
     return documents
 
 
-class KnowledgeBase:
+class DataCollection:
     name: str
     chunks: List[Chunk]
     collection: Optional[Collection] = None
@@ -55,10 +55,10 @@ def __init__(self, name: str, chunks: List[str], collection: Optional["Collectio
         self.chunks = chunks
         self.collection = collection
 
-    @staticmethod
+    @classmethod
     def from_files(
-        path: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
-    ) -> "KnowledgeBase":
+        cls, path: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
+    ) -> "DataCollection":
         file_path = Path(path)
         # extractor = IndexExtractorFromFile(path=file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         # documents = extractor.load_md_from_dir()
@@ -69,13 +69,13 @@ def from_files(
         for document in documents:
             text_nodes.extend(splitter.split_text(document.content))
 
-        document_index = KnowledgeBase(name=file_path.name, chunks=text_nodes)
+        document_index = cls(name=file_path.name, chunks=text_nodes)
         document_index.get_collection()
         return document_index
 
-    @staticmethod
-    def from_chunks(self, chunks: List[str]):
-        document_index = KnowledgeBase("kb_from_chunks", chunks=chunks)
+    @classmethod
+    def from_chunks(cls, chunks: List[str]):
+        document_index = cls("kb_from_chunks", chunks=chunks)
         return document_index
 
     def get_collection(self):
diff --git a/src/evidently/tests/dataset_generator.py b/src/evidently/tests/dataset_generator.py
index d9447e874a..7ec4297b9a 100644
--- a/src/evidently/tests/dataset_generator.py
+++ b/src/evidently/tests/dataset_generator.py
@@ -1,4 +1,4 @@
-from evidently.dataset_generators.llm.index import KnowledgeBase
+from evidently.dataset_generators.llm.index import DataCollection
 
 tmpdir = "/tmp/"
 
@@ -39,7 +39,7 @@ def test_knowledge_base():
     content = get_content()
     with open(file_path, "w") as f:
         f.write(content)
-    knowledge_base = KnowledgeBase.from_files(file_path)
+    knowledge_base = DataCollection.from_files(file_path)
     assert len(knowledge_base.chunks) == 16
     relevant_chunks = knowledge_base.find_relevant_chunks("Who is Sauron?", 3)
     assert len(relevant_chunks) == 3

From 446b2750f847dd8e587976bc0eec5b469547dbac Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 15:24:26 +0200
Subject: [PATCH 28/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/aaa.py | 15 ++++++++-------
 src/evidently/tests/dataset_generator.py    |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index b3353174ac..282bea261e 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -8,8 +8,9 @@
 from evidently.dataset_generators.base import DatasetGeneratorResult
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
 from evidently.dataset_generators.llm.index import Chunk
-from evidently.dataset_generators.llm.index import DocumentIndex
-from evidently.dataset_generators.llm.index import IndexExtractor
+from evidently.dataset_generators.llm.index import DataCollection
+
+# from evidently.dataset_generators.llm.index import IndexExtractor
 from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
 from evidently.dataset_generators.llm.prompts import QuestionGenerationPrompt
 from evidently.utils.llm import LLMMessage
@@ -20,11 +21,11 @@
 ChunkSet = List[Chunk]
 
 
-class QuestionPairGenerator(BaseLLMDatasetGenerator):
+class DatasetFromDocs(BaseLLMDatasetGenerator):
     class Config:
-        type_alias = "asdfasdasdfaaasdfdsfasfasd"
+        type_alias = "DatasetFromDocs"
 
-    index: IndexExtractor
+    data_collection: DataCollection
     num_questions: int
     questions: QuestionGenerationPrompt
     questions_system_prompt: str = "You are an assisstant who generates questions based on provided context"
@@ -32,7 +33,7 @@ class Config:
     answer_system_prompt: str = "You are a helpful assistant thet answer a given question directly without any preamble"
 
     def generate(self) -> DatasetGeneratorResult:
-        documents = self.index.extract_index()
+        documents = self.data_collection
         chunk_set_count, chuns_in_set_count, questions_per_chunkset = self.get_chunks_and_question_count()
         chunk_sets = self.generate_chunksets(documents, chunk_set_count, chuns_in_set_count)
         questions: List[Question] = self.generate_questions(chunk_sets, questions_per_chunkset)
@@ -43,7 +44,7 @@ def generate(self) -> DatasetGeneratorResult:
     def get_chunks_and_question_count(self) -> Tuple[int, int, int]:
         return 1, 1, self.num_questions
 
-    def generate_chunksets(self, documents: DocumentIndex, count: int, chunks_per_set: int) -> List[ChunkSet]:
+    def generate_chunksets(self, documents: DataCollection, count: int, chunks_per_set: int) -> List[ChunkSet]:
         return [[random.choice(documents.chunks) for _ in range(chunks_per_set)] for _ in range(count)]
 
     def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_chunkset: int) -> List[Question]:
diff --git a/src/evidently/tests/dataset_generator.py b/src/evidently/tests/dataset_generator.py
index 7ec4297b9a..c1b94fc537 100644
--- a/src/evidently/tests/dataset_generator.py
+++ b/src/evidently/tests/dataset_generator.py
@@ -39,7 +39,7 @@ def test_knowledge_base():
     content = get_content()
     with open(file_path, "w") as f:
         f.write(content)
-    knowledge_base = DataCollection.from_files(file_path)
+    knowledge_base = DataCollection.from_files(file_path, chunk_size=50, chunk_overlap=20)
     assert len(knowledge_base.chunks) == 16
     relevant_chunks = knowledge_base.find_relevant_chunks("Who is Sauron?", 3)
     assert len(relevant_chunks) == 3

From d37473b68c34d8e499667d550e46b6fb96ac87a7 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 15:40:24 +0200
Subject: [PATCH 29/63] generate_dataset_from_docs

---
 examples/synth_data.py                        |  8 ++--
 src/evidently/dataset_generators/llm/aaa.py   |  5 ++-
 .../dataset_generators/llm/generator.py       | 32 -------------
 src/evidently/tests/dataset_generator.py      | 45 -------------------
 tests/dataset_generator/__init__.py           |  0
 5 files changed, 7 insertions(+), 83 deletions(-)
 delete mode 100644 src/evidently/dataset_generators/llm/generator.py
 delete mode 100644 src/evidently/tests/dataset_generator.py
 create mode 100644 tests/dataset_generator/__init__.py

diff --git a/examples/synth_data.py b/examples/synth_data.py
index e7368b79e0..2875a84319 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,15 +1,15 @@
 import os
 
-from evidently.dataset_generators.llm.aaa import QuestionPairGenerator
-from evidently.dataset_generators.llm.index import SimpleIndexExtractor
+from evidently.dataset_generators.llm.aaa import DatasetFromDocs
+from evidently.dataset_generators.llm.index import DataCollection
 from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt, NaiveQuestionsPrompt
 from evidently.options.base import Options
 from evidently.ui.workspace import CloudWorkspace
 
 
 def main():
-    generator = QuestionPairGenerator(
-        index=SimpleIndexExtractor(chunks=["I am a banana", "My spoon is too big"]),
+    generator = DatasetFromDocs(
+        data_collection=DataCollection.from_chunks(chunks=["I am a banana", "My spoon is too big"]),
         questions=NaiveQuestionsPrompt(),
         answers=BaselineAnswerPrompt(),
         provider="openai",
diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index 282bea261e..bb94015d04 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -24,6 +24,7 @@
 class DatasetFromDocs(BaseLLMDatasetGenerator):
     class Config:
         type_alias = "DatasetFromDocs"
+        arbitrary_types_allowed = True
 
     data_collection: DataCollection
     num_questions: int
@@ -34,8 +35,8 @@ class Config:
 
     def generate(self) -> DatasetGeneratorResult:
         documents = self.data_collection
-        chunk_set_count, chuns_in_set_count, questions_per_chunkset = self.get_chunks_and_question_count()
-        chunk_sets = self.generate_chunksets(documents, chunk_set_count, chuns_in_set_count)
+        chunk_set_count, chunks_in_set_count, questions_per_chunkset = self.get_chunks_and_question_count()
+        chunk_sets = self.generate_chunksets(documents, chunk_set_count, chunks_in_set_count)
         questions: List[Question] = self.generate_questions(chunk_sets, questions_per_chunkset)
         relevant_chunks = [documents.find_relevant_chunks(q) for q in questions]
         answers = self.generate_answers(questions, relevant_chunks)
diff --git a/src/evidently/dataset_generators/llm/generator.py b/src/evidently/dataset_generators/llm/generator.py
deleted file mode 100644
index 1781ec258c..0000000000
--- a/src/evidently/dataset_generators/llm/generator.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from pathlib import Path
-
-import pandas as pd
-
-from evidently.dataset_generators.llm.aaa import QuestionPairGenerator
-from evidently.dataset_generators.llm.index import IndexExtractorFromFile
-from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
-from evidently.dataset_generators.llm.prompts import NaiveQuestionsPrompt
-from evidently.options.base import Options
-
-
-def generate_dataset_from_docs(file_path: Path, num_questions: 2) -> pd.DataFrame:
-    generator = QuestionPairGenerator(
-        inpute=IndexExtractorFromFile(path=file_path),
-        questions=NaiveQuestionsPrompt(),
-        answers=BaselineAnswerPrompt(),
-        num_questions=num_questions,
-        provider="openai",
-        model="gpt-4o-mini",
-        options=Options.from_any_options(None),
-    )
-    generated = generator.generate()
-    return generated
-
-
-# def generate_dataset_from_chunks(file_path, num_questions: 2) -> pd.DataFrame:
-# RAG
-# kb = KnowledgeBase.from_path(...)
-# generated = QuestionPairGenerator(inpute=kb).generate()
-
-# # From scratch
-# generated = QuestionPairGeneratorFromSamples(inpute=["What is Evidently?"]).generate()
diff --git a/src/evidently/tests/dataset_generator.py b/src/evidently/tests/dataset_generator.py
deleted file mode 100644
index c1b94fc537..0000000000
--- a/src/evidently/tests/dataset_generator.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from evidently.dataset_generators.llm.index import DataCollection
-
-tmpdir = "/tmp/"
-
-
-def get_content():
-    content = """Much that once was is lost, for none now live who remember it.
-                It began with the forging of the Great Rings. Three were given to the Elves, immortal, wisest and
-                fairest of all beings. Seven to the Dwarf-Lords, great miners and craftsmen of the mountain halls.
-                And nine, nine rings were gifted to the race of Men, who above all else desire power. For within these
-                rings was bound the strength and the will to govern each race. But they were all of them deceived,
-                for another ring was made. Deep in the land of Mordor, in the Fires of Mount Doom, the Dark Lord Sauron
-                 forged a master ring, and into this ring he poured his cruelty, his malice and his will to dominate all
-                  life.
-                One ring to rule them all.
-                One by one, the free lands of Middle-Earth fell to the power of the Ring, but there were some who
-                 resisted. A last alliance of men and elves marched against the armies of Mordor, and on the very
-                 slopes of Mount Doom, they fought for the freedom of Middle-Earth. Victory was near, but the power of
-                 the ring could not be undone. It was in this moment, when all hope had faded, that Isildur, son of the
-                  king, took up his father’s sword.
-                Sauron, enemy of the free peoples of Middle-Earth, was defeated. The Ring passed to Isildur, who had
-                this one chance to destroy evil forever, but the hearts of men are easily corrupted. And the ring of
-                power has a will of its own. It betrayed Isildur, to his death.
-                And some things that should not have been forgotten were lost. History became legend. Legend became
-                myth. And for two and a half thousand years, the ring passed out of all knowledge. Until, when chance
-                came, it ensnared another bearer.
-                It came to the creature Gollum, who took it deep into the tunnels of the Misty Mountains. And there it
-                consumed him. The ring gave to Gollum unnatural long life. For five hundred years it poisoned his mind,
-                 and in the gloom of Gollum’s cave, it waited. Darkness crept back into the forests of the world.
-                 Rumor grew of a shadow in the East, whispers of a nameless fear, and the Ring of Power perceived its
-                  time had come. It abandoned Gollum, but then something happened that the Ring did not intend.
-                  It was picked up by the most unlikely creature imaginable: a hobbit, Bilbo Baggins, of the Shire.
-                For the time will soon come when hobbits will shape the fortunes of all."""
-    return content
-
-
-def test_knowledge_base():
-    file_path = tmpdir + "KnowledgeBase.md"
-    content = get_content()
-    with open(file_path, "w") as f:
-        f.write(content)
-    knowledge_base = DataCollection.from_files(file_path, chunk_size=50, chunk_overlap=20)
-    assert len(knowledge_base.chunks) == 16
-    relevant_chunks = knowledge_base.find_relevant_chunks("Who is Sauron?", 3)
-    assert len(relevant_chunks) == 3
diff --git a/tests/dataset_generator/__init__.py b/tests/dataset_generator/__init__.py
new file mode 100644
index 0000000000..e69de29bb2

From 983b040b77baa0bc9986e212b15f0a6fe9104ab9 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 15:41:59 +0200
Subject: [PATCH 30/63] generate_dataset_from_docs

---
 src/evidently/dataset_generators/llm/aaa.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index bb94015d04..149da025ec 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -9,8 +9,6 @@
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
 from evidently.dataset_generators.llm.index import Chunk
 from evidently.dataset_generators.llm.index import DataCollection
-
-# from evidently.dataset_generators.llm.index import IndexExtractor
 from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
 from evidently.dataset_generators.llm.prompts import QuestionGenerationPrompt
 from evidently.utils.llm import LLMMessage
@@ -29,9 +27,9 @@ class Config:
     data_collection: DataCollection
     num_questions: int
     questions: QuestionGenerationPrompt
-    questions_system_prompt: str = "You are an assisstant who generates questions based on provided context"
+    questions_system_prompt: str = "You are an assistant who generates questions based on provided context"
     answers: BaselineAnswerPrompt
-    answer_system_prompt: str = "You are a helpful assistant thet answer a given question directly without any preamble"
+    answer_system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble"
 
     def generate(self) -> DatasetGeneratorResult:
         documents = self.data_collection
@@ -62,11 +60,11 @@ def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_ch
         questions = [self.questions.parse(response, keys=["questions"])["questions"] for response in llm_responses]
         return [q for qs in questions for q in qs]
 
-    def generate_answers(self, questions: List[Question], relevent_chunks: List[List[Chunk]]) -> List[str]:
+    def generate_answers(self, questions: List[Question], relevant_chunks: List[List[Chunk]]) -> List[str]:
         system = LLMMessage.system(self.answer_system_prompt)
         return self.wrapper.batch_complete_sync(
             [
                 [system, LLMMessage.user(self.answers.render(question=question, context="\n".join(chunks)))]
-                for question, chunks in zip(questions, relevent_chunks)
+                for question, chunks in zip(questions, relevant_chunks)
             ]
         )

From 8eb5017abda64231328abb48c001eae15deb0cf6 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Thu, 10 Oct 2024 17:48:25 +0200
Subject: [PATCH 31/63] WIP

---
 examples/synth_data.py                        |  24 ++-
 src/evidently/dataset_generators/llm/aaa.py   |  16 +-
 src/evidently/dataset_generators/llm/index.py | 145 +++++-------------
 3 files changed, 66 insertions(+), 119 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 2875a84319..8a89e2d6ca 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,22 +1,36 @@
+import json
 import os
 
-from evidently.dataset_generators.llm.aaa import DatasetFromDocs
-from evidently.dataset_generators.llm.index import DataCollection
+from evidently.dataset_generators.llm.aaa import QADatasetGenerator
+from evidently.dataset_generators.llm.index import DataCollection, DataCollectionProvider
 from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt, NaiveQuestionsPrompt
 from evidently.options.base import Options
 from evidently.ui.workspace import CloudWorkspace
 
 
 def main():
-    generator = DatasetFromDocs(
-        data_collection=DataCollection.from_chunks(chunks=["I am a banana", "My spoon is too big"]),
-        questions=NaiveQuestionsPrompt(),
+    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
+    DataCollectionProvider.from_cloud_tmp_file()
+    generator = QADatasetGenerator(
+        data_collection=data,
+        provider="openai",
+        model="gpt-4o-mini",
+        num_questions=5,
+        options=Options.from_any_options(None)
+    )
+
+    generator = QAScratchDatasetGenerator(
+        task="I need questions about kek",
+        exapmles=["What is kek"],
+        questions=[NaiveQuestionsPrompt(), PIIQuestions()],
         answers=BaselineAnswerPrompt(),
         provider="openai",
         model="gpt-4o-mini",
         num_questions=5,
         options=Options.from_any_options(None)
     )
+
+    json.dumps(generator.dict())
     print(generator.questions.get_template())
     generated = generator.generate()
     for _, a in generated.iterrows():
diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index 149da025ec..b2c8fa90bf 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -9,7 +9,9 @@
 from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
 from evidently.dataset_generators.llm.index import Chunk
 from evidently.dataset_generators.llm.index import DataCollection
+from evidently.dataset_generators.llm.index import DataCollectionProvider
 from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
+from evidently.dataset_generators.llm.prompts import NaiveQuestionsPrompt
 from evidently.dataset_generators.llm.prompts import QuestionGenerationPrompt
 from evidently.utils.llm import LLMMessage
 
@@ -19,20 +21,20 @@
 ChunkSet = List[Chunk]
 
 
-class DatasetFromDocs(BaseLLMDatasetGenerator):
+class QADatasetGenerator(BaseLLMDatasetGenerator):
     class Config:
         type_alias = "DatasetFromDocs"
         arbitrary_types_allowed = True
 
-    data_collection: DataCollection
+    data_collection: DataCollectionProvider
     num_questions: int
-    questions: QuestionGenerationPrompt
-    questions_system_prompt: str = "You are an assistant who generates questions based on provided context"
-    answers: BaselineAnswerPrompt
-    answer_system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble"
+    questions: List[QuestionGenerationPrompt] = [NaiveQuestionsPrompt()]
+    # questions_system_prompt: str = "You are an assistant who generates questions based on provided context"
+    answers: BaselineAnswerPrompt = BaselineAnswerPrompt()
+    # answer_system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble"
 
     def generate(self) -> DatasetGeneratorResult:
-        documents = self.data_collection
+        documents = self.data_collection.get_data_collection()
         chunk_set_count, chunks_in_set_count, questions_per_chunkset = self.get_chunks_and_question_count()
         chunk_sets = self.generate_chunksets(documents, chunk_set_count, chunks_in_set_count)
         questions: List[Question] = self.generate_questions(chunk_sets, questions_per_chunkset)
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index dd23dc3861..bc34530f55 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -1,5 +1,5 @@
+import glob
 import os
-from dataclasses import dataclass
 from pathlib import Path
 from typing import List
 from typing import Optional
@@ -9,40 +9,45 @@
 from chromadb.utils import embedding_functions
 from llama_index.core.node_parser import SentenceSplitter
 
+from evidently.pydantic_utils import EvidentlyBaseModel
+
 Chunk = str
 DEFAULT_CHUNK_SIZE = 512
 DEFAULT_CHUNK_OVERLAP = 20
 
 
-@dataclass
-class Document:
-    id: str
-    content: str
+class DataCollectionProvider(EvidentlyBaseModel):
+    class Config:
+        require_alias = False  # fixme
+
+    chunk_size: int = DEFAULT_CHUNK_SIZE
+    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
 
+    def get_data_collection(self):
+        raise NotImplementedError
 
-def load_md_from_dir(path: Path) -> List[Document]:
-    """
-    Loads Markdown (.md) files from the specified directory.
+    @classmethod
+    def from_files(
+        cls, path: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
+    ) -> "DataCollectionProvider":
+        return FileDataCollectionProvider(path=path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
-    Args:
-        path (str): Path to the directory containing .md files.
 
-    Returns:
-        List[dict]: A list of dictionaries with the text content of each .md file.
-    """
-    documents = []
+class FileDataCollectionProvider(DataCollectionProvider):
+    path: str
 
-    if os.path.isfile(path):
-        with open(path, "r", encoding="utf-8") as file:
-            documents.append(Document(id=file.name, content=file.read()))
-        return documents
+    def get_data_collection(self):
+        splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+        text_nodes = []
+        file_path = Path(self.path)
+        paths = [self.path] if file_path.is_file() else glob.glob(os.path.join(self.path, "*"))
 
-    for filename in os.listdir(path):
-        file_path = os.path.join(path, filename)
-        with open(file_path, "r", encoding="utf-8") as file:
-            documents.append(Document(id=file.name, content=file.read()))
+        for filename in paths:
+            text_nodes.extend(splitter.split_text(Path(filename).read_text()))
 
-    return documents
+        data_collection = DataCollection(name=file_path.name, chunks=text_nodes)
+        data_collection.init_collection()
+        return data_collection
 
 
 class DataCollection:
@@ -55,30 +60,18 @@ def __init__(self, name: str, chunks: List[str], collection: Optional["Collectio
         self.chunks = chunks
         self.collection = collection
 
-    @classmethod
-    def from_files(
-        cls, path: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
-    ) -> "DataCollection":
-        file_path = Path(path)
-        # extractor = IndexExtractorFromFile(path=file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-        # documents = extractor.load_md_from_dir()
-        documents = load_md_from_dir(path=file_path)
-        splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-        text_nodes = []
-
-        for document in documents:
-            text_nodes.extend(splitter.split_text(document.content))
-
-        document_index = cls(name=file_path.name, chunks=text_nodes)
-        document_index.get_collection()
-        return document_index
-
     @classmethod
     def from_chunks(cls, chunks: List[str]):
         document_index = cls("kb_from_chunks", chunks=chunks)
         return document_index
 
-    def get_collection(self):
+    @classmethod
+    def from_files(
+        cls, path: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
+    ) -> "DataCollectionProvider":
+        return FileDataCollectionProvider(path=path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+    def init_collection(self):
         if self.collection is None:
             default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
                 model_name="all-MiniLM-L6-v2",
@@ -88,14 +81,12 @@ def get_collection(self):
                 name=self.name,
                 embedding_function=default_embedding_function,
             )
-            # insert documents with embeddings to collection ChromaDB
             for i, chunk in enumerate(self.chunks):
                 collection.upsert(
                     ids=str(i),
                     documents=chunk,
                 )
             self.collection = collection
-        return self.collection
 
     def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]:
         """
@@ -108,72 +99,12 @@ def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]
         Returns:
             List[Chunk]: A list of relevant text chunks.
         """
-        # Perform the query
-        results = self.get_collection().query(
+        if self.collection is None:
+            raise ValueError("Collection is not initialized")
+        results = self.collection.query(
             query_texts=question,
             n_results=min(n_results, len(self.chunks)),
         )
 
-        # Extract relevant text chunks from the documents
         relevant_chunks = [chunk for document in results["documents"] for chunk in document]
         return relevant_chunks
-
-
-# class IndexExtractor(EvidentlyBaseModel, ABC):
-#     @abc.abstractmethod
-#     def extract_index(self) -> KnowledgeBase:
-#         raise NotImplementedError
-
-
-# class IndexExtractorFromFile(IndexExtractor):
-#     class Config:
-#         type_alias = "IndexExtractorFromFile"
-#
-#     path: Path
-#     chunk_size: int = 512
-#     chunk_overlap: int = 20
-#
-#     def load_md_from_dir(self) -> List[Document]:
-#         """
-#         Loads Markdown (.md) files from the specified directory.
-#
-#         Args:
-#             path (str): Path to the directory containing .md files.
-#
-#         Returns:
-#             List[dict]: A list of dictionaries with the text content of each .md file.
-#         """
-#         documents = []
-#
-#         if os.path.isfile(self.path):
-#             with open(self.path, "r", encoding="utf-8") as file:
-#                 documents.append(Document(id=file.name, content=file.read()))
-#             return documents
-#
-#         for filename in os.listdir(self.path):
-#             file_path = os.path.join(self.path, filename)
-#             with open(file_path, "r", encoding="utf-8") as file:
-#                 documents.append(Document(id=file.name, content=file.read()))
-#
-#         return documents
-#
-#     def extract_index(self) -> KnowledgeBase:
-#         documents = self.load_md_from_dir()
-#         splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
-#         text_nodes = []
-#         for document in documents:
-#             text_nodes.extend(splitter.split_text(document.content))
-#
-#         return KnowledgeBase(self.path.name, chunks=text_nodes)
-
-#         return DocumentIndex(self.path.name, chunks=text_nodes)
-#
-#
-# class SimpleIndexExtractor(IndexExtractor):
-#     class Config:
-#         type_alias = "asdfasdasdfafasd"
-#
-#     chunks: List[Chunk]
-#
-#     def extract_index(self) -> DocumentIndex:
-#         return DocumentIndex("inmemory", chunks=self.chunks)

From 466d77b7e115bee9b8318668661eab323cf47aeb Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Thu, 10 Oct 2024 18:23:48 +0200
Subject: [PATCH 32/63] fix

---
 examples/synth_data.py                        | 32 ++++++++++--------
 src/evidently/dataset_generators/base.py      |  1 +
 src/evidently/dataset_generators/llm/aaa.py   | 33 ++++++++++++++-----
 src/evidently/dataset_generators/llm/index.py | 26 ++++++++-------
 .../dataset_generators/llm/prompts.py         | 15 +++++++--
 5 files changed, 70 insertions(+), 37 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 8a89e2d6ca..b77a1fcb79 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,16 +1,15 @@
 import json
 import os
 
-from evidently.dataset_generators.llm.aaa import QADatasetGenerator
+from evidently.dataset_generators.llm.aaa import QADatasetFromSeedGenerator, QADatasetGenerator
 from evidently.dataset_generators.llm.index import DataCollection, DataCollectionProvider
-from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt, NaiveQuestionsPrompt
+from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt, NaiveQuestionsFromContext
 from evidently.options.base import Options
 from evidently.ui.workspace import CloudWorkspace
 
 
 def main():
     data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
-    DataCollectionProvider.from_cloud_tmp_file()
     generator = QADatasetGenerator(
         data_collection=data,
         provider="openai",
@@ -18,25 +17,32 @@ def main():
         num_questions=5,
         options=Options.from_any_options(None)
     )
+    # print(generator.questions.get_template())
+    # json.dumps(generator.dict())
+    generated = generator.generate()
+    for _, a in generated.iterrows():
+        print("Q", a["questions"])
+        if "answers" in a:
+            print("A", a["answers"])
+        if "context" in a:
+            print("C", a["context"])
+        print()
 
-    generator = QAScratchDatasetGenerator(
-        task="I need questions about kek",
-        exapmles=["What is kek"],
-        questions=[NaiveQuestionsPrompt(), PIIQuestions()],
-        answers=BaselineAnswerPrompt(),
+    generator = QADatasetFromSeedGenerator(
+        seed_question="What is 'kek'?",
+        num_questions=5,
         provider="openai",
         model="gpt-4o-mini",
-        num_questions=5,
         options=Options.from_any_options(None)
     )
 
-    json.dumps(generator.dict())
-    print(generator.questions.get_template())
     generated = generator.generate()
     for _, a in generated.iterrows():
         print("Q", a["questions"])
-        print("A", a["answers"])
-        print("C", a["context"])
+        if "answers" in a:
+            print("A", a["answers"])
+        if "context" in a:
+            print("C", a["context"])
         print()
 
     # client = CloudWorkspace(token=os.environ["EVIDENTLY_TOKEN"], url="https://app.evidently.dev")
diff --git a/src/evidently/dataset_generators/base.py b/src/evidently/dataset_generators/base.py
index 9dcd5094d0..a13b067de7 100644
--- a/src/evidently/dataset_generators/base.py
+++ b/src/evidently/dataset_generators/base.py
@@ -13,6 +13,7 @@ class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
     class Config:
         type_alias = "evidently:dataset_generator:BaseDatasetGenerator"
         is_base_type = True
+        alias_required = False  # fixme
 
     options: Options
 
diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/aaa.py
index b2c8fa90bf..b0968afbf9 100644
--- a/src/evidently/dataset_generators/llm/aaa.py
+++ b/src/evidently/dataset_generators/llm/aaa.py
@@ -11,8 +11,9 @@
 from evidently.dataset_generators.llm.index import DataCollection
 from evidently.dataset_generators.llm.index import DataCollectionProvider
 from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
-from evidently.dataset_generators.llm.prompts import NaiveQuestionsPrompt
-from evidently.dataset_generators.llm.prompts import QuestionGenerationPrompt
+from evidently.dataset_generators.llm.prompts import NaiveQuestionsFromContext
+from evidently.dataset_generators.llm.prompts import QuestionsFromContext
+from evidently.dataset_generators.llm.prompts import QuestionsFromSeed
 from evidently.utils.llm import LLMMessage
 
 Question = str
@@ -22,16 +23,12 @@
 
 
 class QADatasetGenerator(BaseLLMDatasetGenerator):
-    class Config:
-        type_alias = "DatasetFromDocs"
-        arbitrary_types_allowed = True
-
     data_collection: DataCollectionProvider
     num_questions: int
-    questions: List[QuestionGenerationPrompt] = [NaiveQuestionsPrompt()]
-    # questions_system_prompt: str = "You are an assistant who generates questions based on provided context"
+    questions: QuestionsFromContext = NaiveQuestionsFromContext()
+    questions_system_prompt: str = "You are an assistant who generates questions based on provided context"
     answers: BaselineAnswerPrompt = BaselineAnswerPrompt()
-    # answer_system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble"
+    answer_system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble"
 
     def generate(self) -> DatasetGeneratorResult:
         documents = self.data_collection.get_data_collection()
@@ -70,3 +67,21 @@ def generate_answers(self, questions: List[Question], relevant_chunks: List[List
                 for question, chunks in zip(questions, relevant_chunks)
             ]
         )
+
+
+class QADatasetFromSeedGenerator(BaseLLMDatasetGenerator):
+    seed_question: str
+    num_questions: int
+    prompt: QuestionsFromSeed = QuestionsFromSeed()
+    system_prompt: str = "You are a smart assistant who helps repharase questions"
+
+    def generate(self) -> DatasetGeneratorResult:
+        response = self.wrapper.batch_complete_sync(
+            [
+                [
+                    LLMMessage.system(self.system_prompt),
+                    LLMMessage.user(self.prompt.render(number=self.num_questions, seed_question=self.seed_question)),
+                ]
+            ]
+        )
+        return pd.DataFrame({"questions": self.prompt.parse(response[0], keys=["questions"])["questions"]})
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index bc34530f55..578684455b 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -18,7 +18,7 @@
 
 class DataCollectionProvider(EvidentlyBaseModel):
     class Config:
-        require_alias = False  # fixme
+        alias_required = False  # fixme
 
     chunk_size: int = DEFAULT_CHUNK_SIZE
     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
@@ -32,6 +32,19 @@ def from_files(
     ) -> "DataCollectionProvider":
         return FileDataCollectionProvider(path=path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
+    @classmethod
+    def from_chunks(cls, chunks: List[str]):
+        return ChunksDataCollectionProvider(chunks=chunks)
+
+
+class ChunksDataCollectionProvider(DataCollectionProvider):
+    chunks: List[Chunk]
+
+    def get_data_collection(self):
+        dc = DataCollection(name="chunks", chunks=self.chunks)
+        dc.init_collection()
+        return dc
+
 
 class FileDataCollectionProvider(DataCollectionProvider):
     path: str
@@ -60,17 +73,6 @@ def __init__(self, name: str, chunks: List[str], collection: Optional["Collectio
         self.chunks = chunks
         self.collection = collection
 
-    @classmethod
-    def from_chunks(cls, chunks: List[str]):
-        document_index = cls("kb_from_chunks", chunks=chunks)
-        return document_index
-
-    @classmethod
-    def from_files(
-        cls, path: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
-    ) -> "DataCollectionProvider":
-        return FileDataCollectionProvider(path=path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-
     def init_collection(self):
         if self.collection is None:
             default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
diff --git a/src/evidently/dataset_generators/llm/prompts.py b/src/evidently/dataset_generators/llm/prompts.py
index 9ec704a900..1f1c449edf 100644
--- a/src/evidently/dataset_generators/llm/prompts.py
+++ b/src/evidently/dataset_generators/llm/prompts.py
@@ -13,11 +13,20 @@ class SimpleQuestionPrompt(BlockPromptTemplate):
     question_type: str = "simple"
 
 
-class QuestionGenerationPrompt(BlockPromptTemplate):
+class QuestionsFromSeed(BlockPromptTemplate):
+    blocks: ClassVar = [
+        """Write for me {number} alternative questions quite similar to the question you got.
+        The question: """,
+        PromptBlock.input("seed_question").anchored(),
+        PromptBlock.string_list_output("questions"),
+    ]
+
+
+class QuestionsFromContext(BlockPromptTemplate):
     pass
 
 
-class NaiveQuestionsPrompt(QuestionGenerationPrompt):
+class NaiveQuestionsFromContext(QuestionsFromContext):
     blocks: ClassVar = [
         "Generate {number} conceptual questions based on the provided context and "
         "can be answered from the information in the provided context.\n"
@@ -30,7 +39,7 @@ class NaiveQuestionsPrompt(QuestionGenerationPrompt):
     ]
 
 
-class ReformulateQuestionPrompt(QuestionGenerationPrompt):
+class ReformulateQuestionPrompt(QuestionsFromContext):
     blocks: ClassVar = [
         """Write for me {number} alternative questions quite similar to the question you got.
 The question:""",

From 27166aec38403c795359b4e2ff262d32fc9b2f91 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Thu, 10 Oct 2024 18:25:59 +0200
Subject: [PATCH 33/63] rename

---
 src/evidently/dataset_generators/llm/{aaa.py => questions.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/evidently/dataset_generators/llm/{aaa.py => questions.py} (100%)

diff --git a/src/evidently/dataset_generators/llm/aaa.py b/src/evidently/dataset_generators/llm/questions.py
similarity index 100%
rename from src/evidently/dataset_generators/llm/aaa.py
rename to src/evidently/dataset_generators/llm/questions.py

From dbf890795da6d590825f8a6dd7e41699dae32c78 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Thu, 10 Oct 2024 18:42:33 +0200
Subject: [PATCH 34/63] fix import

---
 examples/synth_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index b77a1fcb79..7222b1ad12 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,7 +1,7 @@
 import json
 import os
 
-from evidently.dataset_generators.llm.aaa import QADatasetFromSeedGenerator, QADatasetGenerator
+from evidently.dataset_generators.llm.questions import QADatasetFromSeedGenerator, QADatasetGenerator
 from evidently.dataset_generators.llm.index import DataCollection, DataCollectionProvider
 from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt, NaiveQuestionsFromContext
 from evidently.options.base import Options

From 48cc46961e20121504517ceb2d54317d6eb69540 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Thu, 10 Oct 2024 22:42:19 +0200
Subject: [PATCH 35/63] move system prompts into user

---
 .../dataset_generators/llm/prompts.py         | 34 +++++++++++++++++--
 .../dataset_generators/llm/questions.py       | 20 ++---------
 src/evidently/utils/llm.py                    | 12 +++++++
 3 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/prompts.py b/src/evidently/dataset_generators/llm/prompts.py
index 1f1c449edf..c6fad71381 100644
--- a/src/evidently/dataset_generators/llm/prompts.py
+++ b/src/evidently/dataset_generators/llm/prompts.py
@@ -1,7 +1,9 @@
 from typing import ClassVar
+from typing import List
 
 from evidently.utils.llm import BlockPromptTemplate
 from evidently.utils.llm import PromptBlock
+from evidently.utils.llm import WithSystemPrompt
 
 
 class SimpleQuestionPrompt(BlockPromptTemplate):
@@ -22,8 +24,8 @@ class QuestionsFromSeed(BlockPromptTemplate):
     ]
 
 
-class QuestionsFromContext(BlockPromptTemplate):
-    pass
+class QuestionsFromContext(WithSystemPrompt, BlockPromptTemplate):
+    system_prompt: str = "You are an assistant who generates questions based on provided context"
 
 
 class NaiveQuestionsFromContext(QuestionsFromContext):
@@ -38,6 +40,8 @@ class NaiveQuestionsFromContext(QuestionsFromContext):
         PromptBlock.string_list_output("questions"),
     ]
 
+    def generate_questions_from_context(self, context: str, number: int) -> List[str]: ...
+
 
 class ReformulateQuestionPrompt(QuestionsFromContext):
     blocks: ClassVar = [
@@ -47,9 +51,10 @@ class ReformulateQuestionPrompt(QuestionsFromContext):
         PromptBlock.string_list_output("questions"),
     ]
     number: int
+    system_prompt: str = "You are a smart assistant who helps repharase questions"
 
 
-class BaselineAnswerPrompt(BlockPromptTemplate):
+class BaselineAnswerPrompt(WithSystemPrompt, BlockPromptTemplate):
     blocks: ClassVar = [
         "Your task is to answer the following query:",
         PromptBlock.input("question").anchored(),
@@ -61,3 +66,26 @@ class BaselineAnswerPrompt(BlockPromptTemplate):
 Avoid providing any closing statement!""",
         PromptBlock.string_output("answer"),
     ]
+    system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble"
+
+
+#
+# def llm_call(*prompt):
+#     def a(f):
+#         pass
+#     return a
+#
+# class GenerateQuestionFromContextBase(BlockPromptTemplate):
+#     # @llm_call()
+#     # def generate_questions_from_context(self, context: str, number: int) -> List[str]:
+#     #     ...
+#
+# class KekImopl(GenerateQuestionFromContextBase):
+#     blocks: ClassVar = ["Generate {number} conceptual questions based on the provided context and "
+#         "can be answered from the information in the provided context.\n"
+#         "Here is a context",
+#         PromptBlock.input("context").anchored(),
+#         "Remain faithful to the above context.\n"
+#         "Avoid providing any preamble!\n"
+#         "Avoid providing any closing statement!",
+#         PromptBlock.string_list_output("questions"),]
diff --git a/src/evidently/dataset_generators/llm/questions.py b/src/evidently/dataset_generators/llm/questions.py
index b0968afbf9..0f844a7a34 100644
--- a/src/evidently/dataset_generators/llm/questions.py
+++ b/src/evidently/dataset_generators/llm/questions.py
@@ -14,7 +14,6 @@
 from evidently.dataset_generators.llm.prompts import NaiveQuestionsFromContext
 from evidently.dataset_generators.llm.prompts import QuestionsFromContext
 from evidently.dataset_generators.llm.prompts import QuestionsFromSeed
-from evidently.utils.llm import LLMMessage
 
 Question = str
 Answer = str
@@ -26,9 +25,7 @@ class QADatasetGenerator(BaseLLMDatasetGenerator):
     data_collection: DataCollectionProvider
     num_questions: int
     questions: QuestionsFromContext = NaiveQuestionsFromContext()
-    questions_system_prompt: str = "You are an assistant who generates questions based on provided context"
     answers: BaselineAnswerPrompt = BaselineAnswerPrompt()
-    answer_system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble"
 
     def generate(self) -> DatasetGeneratorResult:
         documents = self.data_collection.get_data_collection()
@@ -46,13 +43,9 @@ def generate_chunksets(self, documents: DataCollection, count: int, chunks_per_s
         return [[random.choice(documents.chunks) for _ in range(chunks_per_set)] for _ in range(count)]
 
     def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_chunkset: int) -> List[Question]:
-        system = LLMMessage.system(self.questions_system_prompt)
         llm_responses = self.wrapper.batch_complete_sync(
             [
-                [
-                    system,
-                    LLMMessage.user(self.questions.render(context="\n\n".join(chunks), number=questions_per_chunkset)),
-                ]
+                self.questions.get_messages(context="\n\n".join(chunks), number=questions_per_chunkset)
                 for chunks in chunk_sets
             ]
         )
@@ -60,10 +53,9 @@ def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_ch
         return [q for qs in questions for q in qs]
 
     def generate_answers(self, questions: List[Question], relevant_chunks: List[List[Chunk]]) -> List[str]:
-        system = LLMMessage.system(self.answer_system_prompt)
         return self.wrapper.batch_complete_sync(
             [
-                [system, LLMMessage.user(self.answers.render(question=question, context="\n".join(chunks)))]
+                self.answers.get_messages(question=question, context="\n".join(chunks))
                 for question, chunks in zip(questions, relevant_chunks)
             ]
         )
@@ -73,15 +65,9 @@ class QADatasetFromSeedGenerator(BaseLLMDatasetGenerator):
     seed_question: str
     num_questions: int
     prompt: QuestionsFromSeed = QuestionsFromSeed()
-    system_prompt: str = "You are a smart assistant who helps repharase questions"
 
     def generate(self) -> DatasetGeneratorResult:
         response = self.wrapper.batch_complete_sync(
-            [
-                [
-                    LLMMessage.system(self.system_prompt),
-                    LLMMessage.user(self.prompt.render(number=self.num_questions, seed_question=self.seed_question)),
-                ]
-            ]
+            [self.prompt.get_messages(number=self.num_questions, seed_question=self.seed_question)]
         )
         return pd.DataFrame({"questions": self.prompt.parse(response[0], keys=["questions"])["questions"]})
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index e6216bf966..3ec726e242 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -340,6 +340,18 @@ def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, An
             raise LLMResponseParseError(f"Keys {keys} are required but got {list(parsed.keys())}")
         return parsed
 
+    def get_messages(self, **values) -> List[LLMMessage]:
+        return [LLMMessage.user(self.render(**values))]
+
+
+class WithSystemPrompt(PromptTemplate):
+    system_prompt: str
+
+    def get_messages(self, **values) -> List[LLMMessage]:
+        msgs = super().get_messages(**values)
+        msgs.insert(0, LLMMessage.system(self.system_prompt))
+        return msgs
+
 
 AnyBlock = Union[str, PromptBlock, Callable]
 

From 7b025729ab8262b4f10c027df1571ff3a5bb5a6d Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 18:25:18 +0200
Subject: [PATCH 36/63] generate_dataset_from_docs

---
 examples/synth_data.py                        | 12 +++++++++++-
 src/evidently/dataset_generators/llm/index.py | 16 +++++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 7222b1ad12..9f6b31050f 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -8,6 +8,9 @@
 from evidently.ui.workspace import CloudWorkspace
 
 
+def generate_from_file(file_path: str):
+    file_path = "./docs/book/reference"
+    data = DataCollectionProvider.from_files(file_path)
 def main():
     data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
     generator = QADatasetGenerator(
@@ -17,6 +20,8 @@ def main():
         num_questions=5,
         options=Options.from_any_options(None)
     )
+    generated = generator.generate()
+    return generated
     # print(generator.questions.get_template())
     # json.dumps(generator.dict())
     generated = generator.generate()
@@ -31,6 +36,11 @@ def main():
     generator = QADatasetFromSeedGenerator(
         seed_question="What is 'kek'?",
         num_questions=5,
+
+def generate_from_chunk():
+    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
+    generator = QADatasetGenerator(
+        data_collection=data,
         provider="openai",
         model="gpt-4o-mini",
         options=Options.from_any_options(None)
@@ -51,4 +61,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    generate_from_chunk()
\ No newline at end of file
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 578684455b..86061bf1ba 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -5,6 +5,7 @@
 from typing import Optional
 
 import chromadb
+import PyPDF2
 from chromadb.types import Collection
 from chromadb.utils import embedding_functions
 from llama_index.core.node_parser import SentenceSplitter
@@ -16,6 +17,19 @@
 DEFAULT_CHUNK_OVERLAP = 20
 
 
+def read_text(filename: Path) -> str:
+    if Path(filename).suffix.lower() == ".pdf":
+        with open(filename, "rb") as file:
+            reader = PyPDF2.PdfReader(file)
+            text = ""
+            for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+                text += page.extract_text()
+            return text
+    else:
+        return Path(filename).read_text()
+
+
 class DataCollectionProvider(EvidentlyBaseModel):
     class Config:
         alias_required = False  # fixme
@@ -56,7 +70,7 @@ def get_data_collection(self):
         paths = [self.path] if file_path.is_file() else glob.glob(os.path.join(self.path, "*"))
 
         for filename in paths:
-            text_nodes.extend(splitter.split_text(Path(filename).read_text()))
+            text_nodes.extend(splitter.split_text(read_text()))
 
         data_collection = DataCollection(name=file_path.name, chunks=text_nodes)
         data_collection.init_collection()

From c5aed45541a1ead3c3ed89e532bf07bc7e7af93e Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 18:30:05 +0200
Subject: [PATCH 37/63] generate_dataset_from_docs

---
 examples/synth_data.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 9f6b31050f..7f5535efa6 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -11,6 +11,8 @@
 def generate_from_file(file_path: str):
     file_path = "./docs/book/reference"
     data = DataCollectionProvider.from_files(file_path)
+
+
 def main():
     data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
     generator = QADatasetGenerator(
@@ -20,8 +22,6 @@ def main():
         num_questions=5,
         options=Options.from_any_options(None)
     )
-    generated = generator.generate()
-    return generated
     # print(generator.questions.get_template())
     # json.dumps(generator.dict())
     generated = generator.generate()
@@ -36,6 +36,19 @@ def main():
     generator = QADatasetFromSeedGenerator(
         seed_question="What is 'kek'?",
         num_questions=5,
+        provider="openai",
+        model="gpt-4o-mini",
+        options=Options.from_any_options(None)
+    )
+
+    generated = generator.generate()
+    for _, a in generated.iterrows():
+        print("Q", a["questions"])
+        if "answers" in a:
+            print("A", a["answers"])
+        if "context" in a:
+            print("C", a["context"])
+        print()
 
 def generate_from_chunk():
     data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
@@ -61,4 +74,5 @@ def generate_from_chunk():
 
 
 if __name__ == '__main__':
-    generate_from_chunk()
\ No newline at end of file
+    main()
+    # generate_from_chunk()
\ No newline at end of file

From efdc72f89a64bd495cb632449eef9d195d67b96f Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 23:13:31 +0200
Subject: [PATCH 38/63] generate_dataset_from_docs

---
 examples/synth_data.py                        | 42 +++++++------------
 src/evidently/dataset_generators/llm/index.py | 29 +++++++------
 2 files changed, 32 insertions(+), 39 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 7f5535efa6..05b259bb4c 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,20 +1,12 @@
-import json
-import os
-
 from evidently.dataset_generators.llm.questions import QADatasetFromSeedGenerator, QADatasetGenerator
-from evidently.dataset_generators.llm.index import DataCollection, DataCollectionProvider
-from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt, NaiveQuestionsFromContext
+from evidently.dataset_generators.llm.index import DataCollectionProvider
 from evidently.options.base import Options
-from evidently.ui.workspace import CloudWorkspace
-
 
-def generate_from_file(file_path: str):
-    file_path = "./docs/book/reference"
-    data = DataCollectionProvider.from_files(file_path)
 
+def generate_from_file():
+    file_path = "../cloud_quickstart_tracing.pdf"
+    data = DataCollectionProvider.from_files(file_path, chunk_size=50, chunk_overlap=20)
 
-def main():
-    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
     generator = QADatasetGenerator(
         data_collection=data,
         provider="openai",
@@ -22,8 +14,6 @@ def main():
         num_questions=5,
         options=Options.from_any_options(None)
     )
-    # print(generator.questions.get_template())
-    # json.dumps(generator.dict())
     generated = generator.generate()
     for _, a in generated.iterrows():
         print("Q", a["questions"])
@@ -33,11 +23,14 @@ def main():
             print("C", a["context"])
         print()
 
-    generator = QADatasetFromSeedGenerator(
-        seed_question="What is 'kek'?",
-        num_questions=5,
+
+def main():
+    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
+    generator = QADatasetGenerator(
+        data_collection=data,
         provider="openai",
         model="gpt-4o-mini",
+        num_questions=5,
         options=Options.from_any_options(None)
     )
 
@@ -50,10 +43,9 @@ def main():
             print("C", a["context"])
         print()
 
-def generate_from_chunk():
-    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
-    generator = QADatasetGenerator(
-        data_collection=data,
+    generator = QADatasetFromSeedGenerator(
+        seed_question="What is 'kek'?",
+        num_questions=5,
         provider="openai",
         model="gpt-4o-mini",
         options=Options.from_any_options(None)
@@ -68,11 +60,7 @@ def generate_from_chunk():
             print("C", a["context"])
         print()
 
-    # client = CloudWorkspace(token=os.environ["EVIDENTLY_TOKEN"], url="https://app.evidently.dev")
-    #
-    # client.add_dataset(generated, "synth data", project_id="019270f6-6dda-7516-854b-aea2d84a4671")
-
 
 if __name__ == '__main__':
-    main()
-    # generate_from_chunk()
\ No newline at end of file
+    # main()
+    generate_from_file()
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/dataset_generators/llm/index.py
index 86061bf1ba..4349e41312 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/dataset_generators/llm/index.py
@@ -1,14 +1,15 @@
 import glob
 import os
+import warnings
 from pathlib import Path
 from typing import List
 from typing import Optional
 
 import chromadb
-import PyPDF2
 from chromadb.types import Collection
 from chromadb.utils import embedding_functions
 from llama_index.core.node_parser import SentenceSplitter
+from pypdf import PdfReader
 
 from evidently.pydantic_utils import EvidentlyBaseModel
 
@@ -16,16 +17,19 @@
 DEFAULT_CHUNK_SIZE = 512
 DEFAULT_CHUNK_OVERLAP = 20
 
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings("ignore", category=FutureWarning)
 
-def read_text(filename: Path) -> str:
-    if Path(filename).suffix.lower() == ".pdf":
-        with open(filename, "rb") as file:
-            reader = PyPDF2.PdfReader(file)
-            text = ""
-            for page_num in range(len(reader.pages)):
-                page = reader.pages[page_num]
-                text += page.extract_text()
-            return text
+
+def read_text(filename: str) -> str:
+    file_path = Path(filename)
+    if file_path.suffix.lower() == ".pdf":
+        reader = PdfReader(file_path)
+        text = ""
+        for page_num in range(len(reader.pages)):
+            page = reader.pages[page_num]
+            text += page.extract_text()
+        return text
     else:
         return Path(filename).read_text()
 
@@ -37,7 +41,7 @@ class Config:
     chunk_size: int = DEFAULT_CHUNK_SIZE
     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
 
-    def get_data_collection(self):
+    def get_data_collection(self) -> "DataCollection":
         raise NotImplementedError
 
     @classmethod
@@ -70,7 +74,8 @@ def get_data_collection(self):
         paths = [self.path] if file_path.is_file() else glob.glob(os.path.join(self.path, "*"))
 
         for filename in paths:
-            text_nodes.extend(splitter.split_text(read_text()))
+            nodes = splitter.split_text(read_text(filename))
+            text_nodes.extend(nodes)
 
         data_collection = DataCollection(name=file_path.name, chunks=text_nodes)
         data_collection.init_collection()

From 468d42ea7dab7deb3d83b1c49b25ad9b465eaad7 Mon Sep 17 00:00:00 2001
From: Svetlana Popova <svetleo@evidentlyai.com>
Date: Thu, 10 Oct 2024 23:13:53 +0200
Subject: [PATCH 39/63] generate_dataset_from_docs

---
 examples/synth_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index 05b259bb4c..edb0cb78a9 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -62,5 +62,5 @@ def main():
 
 
 if __name__ == '__main__':
-    # main()
-    generate_from_file()
+    main()
+    # generate_from_file()

From cb1dd09199fb70fcbe55b313e92a4fb3962d01f0 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 14:38:18 +0200
Subject: [PATCH 40/63] prompt function signature

---
 .../dataset_generators/llm/prompts.py         |  32 ++----
 .../dataset_generators/llm/questions.py       |  24 ++--
 src/evidently/utils/llm.py                    | 105 +++++++++++++++---
 3 files changed, 108 insertions(+), 53 deletions(-)

diff --git a/src/evidently/dataset_generators/llm/prompts.py b/src/evidently/dataset_generators/llm/prompts.py
index c6fad71381..7b8eab287a 100644
--- a/src/evidently/dataset_generators/llm/prompts.py
+++ b/src/evidently/dataset_generators/llm/prompts.py
@@ -4,6 +4,7 @@
 from evidently.utils.llm import BlockPromptTemplate
 from evidently.utils.llm import PromptBlock
 from evidently.utils.llm import WithSystemPrompt
+from evidently.utils.llm import llm_call
 
 
 class SimpleQuestionPrompt(BlockPromptTemplate):
@@ -23,10 +24,16 @@ class QuestionsFromSeed(BlockPromptTemplate):
         PromptBlock.string_list_output("questions"),
     ]
 
+    @llm_call
+    def generate(self, seed_question: str, number: int) -> List[str]: ...
+
 
 class QuestionsFromContext(WithSystemPrompt, BlockPromptTemplate):
     system_prompt: str = "You are an assistant who generates questions based on provided context"
 
+    @llm_call
+    def generate_questions(self, context: str, number: int) -> List[str]: ...
+
 
 class NaiveQuestionsFromContext(QuestionsFromContext):
     blocks: ClassVar = [
@@ -40,8 +47,6 @@ class NaiveQuestionsFromContext(QuestionsFromContext):
         PromptBlock.string_list_output("questions"),
     ]
 
-    def generate_questions_from_context(self, context: str, number: int) -> List[str]: ...
-
 
 class ReformulateQuestionPrompt(QuestionsFromContext):
     blocks: ClassVar = [
@@ -68,24 +73,5 @@ class BaselineAnswerPrompt(WithSystemPrompt, BlockPromptTemplate):
     ]
     system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble"
 
-
-#
-# def llm_call(*prompt):
-#     def a(f):
-#         pass
-#     return a
-#
-# class GenerateQuestionFromContextBase(BlockPromptTemplate):
-#     # @llm_call()
-#     # def generate_questions_from_context(self, context: str, number: int) -> List[str]:
-#     #     ...
-#
-# class KekImopl(GenerateQuestionFromContextBase):
-#     blocks: ClassVar = ["Generate {number} conceptual questions based on the provided context and "
-#         "can be answered from the information in the provided context.\n"
-#         "Here is a context",
-#         PromptBlock.input("context").anchored(),
-#         "Remain faithful to the above context.\n"
-#         "Avoid providing any preamble!\n"
-#         "Avoid providing any closing statement!",
-#         PromptBlock.string_list_output("questions"),]
+    @llm_call
+    def generate_answers(self, question: str, context: str): ...
diff --git a/src/evidently/dataset_generators/llm/questions.py b/src/evidently/dataset_generators/llm/questions.py
index 0f844a7a34..c170e35953 100644
--- a/src/evidently/dataset_generators/llm/questions.py
+++ b/src/evidently/dataset_generators/llm/questions.py
@@ -43,21 +43,16 @@ def generate_chunksets(self, documents: DataCollection, count: int, chunks_per_s
         return [[random.choice(documents.chunks) for _ in range(chunks_per_set)] for _ in range(count)]
 
     def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_chunkset: int) -> List[Question]:
-        llm_responses = self.wrapper.batch_complete_sync(
-            [
-                self.questions.get_messages(context="\n\n".join(chunks), number=questions_per_chunkset)
-                for chunks in chunk_sets
-            ]
+        questions = self.wrapper.run_batch_sync(
+            self.questions.generate_questions(context="\n\n".join(chunks), number=questions_per_chunkset)
+            for chunks in chunk_sets
         )
-        questions = [self.questions.parse(response, keys=["questions"])["questions"] for response in llm_responses]
         return [q for qs in questions for q in qs]
 
     def generate_answers(self, questions: List[Question], relevant_chunks: List[List[Chunk]]) -> List[str]:
-        return self.wrapper.batch_complete_sync(
-            [
-                self.answers.get_messages(question=question, context="\n".join(chunks))
-                for question, chunks in zip(questions, relevant_chunks)
-            ]
+        return self.wrapper.run_batch_sync(
+            self.answers.generate_answers(question=question, context="\n".join(chunks))
+            for question, chunks in zip(questions, relevant_chunks)
         )
 
 
@@ -67,7 +62,8 @@ class QADatasetFromSeedGenerator(BaseLLMDatasetGenerator):
     prompt: QuestionsFromSeed = QuestionsFromSeed()
 
     def generate(self) -> DatasetGeneratorResult:
-        response = self.wrapper.batch_complete_sync(
-            [self.prompt.get_messages(number=self.num_questions, seed_question=self.seed_question)]
+        response = self.wrapper.run_sync(
+            self.prompt.generate(number=self.num_questions, seed_question=self.seed_question)
         )
-        return pd.DataFrame({"questions": self.prompt.parse(response[0], keys=["questions"])["questions"]})
+
+        return pd.DataFrame({"questions": response})
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index 3ec726e242..a0b911c923 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -1,22 +1,26 @@
 import asyncio
 import dataclasses
 import datetime
+import inspect
 import json
 from abc import ABC
 from abc import abstractmethod
 from asyncio import Lock
 from asyncio import Semaphore
 from asyncio import sleep
+from functools import wraps
 from typing import Any
 from typing import Callable
 from typing import ClassVar
 from typing import Dict
+from typing import Generic
 from typing import Iterator
 from typing import List
 from typing import Optional
 from typing import Sequence
 from typing import Tuple
 from typing import Type
+from typing import TypeVar
 from typing import Union
 
 from evidently._pydantic_compat import SecretStr
@@ -82,6 +86,17 @@ async def _clean(self):
         self.enters = [e for e in self.enters if now - e < self.interval]
 
 
+TResult = TypeVar("TResult")
+
+
+@dataclasses.dataclass
+class LLMRequest(Generic[TResult]):
+    messages: List[LLMMessage]
+    response_parser: Callable[[str], TResult]
+    response_type: Type[TResult]
+    retries: int = 1
+
+
 class LLMWrapper(ABC):
     __used_options__: ClassVar[List[Type[Option]]] = []
 
@@ -89,7 +104,7 @@ class LLMWrapper(ABC):
     async def complete(self, messages: List[LLMMessage]) -> str:
         raise NotImplementedError
 
-    async def batch_complete(
+    async def complete_batch(
         self, messages_batch: List[List[LLMMessage]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None
     ) -> List[str]:
         if batch_size is None:
@@ -105,7 +120,33 @@ async def work(messages: List[LLMMessage]) -> str:
 
         return await asyncio.gather(*[work(msgs) for msgs in messages_batch])
 
-    batch_complete_sync = sync_api(batch_complete)
+    async def run(self, request: LLMRequest[TResult]) -> TResult:
+        num_retries = request.retries
+        error = None
+        while num_retries >= 0:
+            num_retries -= 1
+            try:
+                response = await self.complete(request.messages)
+                return request.response_parser(response)
+            except Exception as e:
+                error = e
+        raise error
+
+    async def run_batch(
+        self, requests: Sequence[LLMRequest[TResult]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None
+    ) -> List[TResult]:
+        if batch_size is None:
+            batch_size = self.get_batch_size()
+        if rpm_limit is None:
+            rpm_limit = self.get_rpm_limit()
+        rate_limiter = RateLimiter(rate=rpm_limit, interval=datetime.timedelta(minutes=1))
+        semaphore = Semaphore(batch_size)
+
+        async def work(request: LLMRequest[TResult]) -> TResult:
+            async with semaphore, rate_limiter:
+                return await self.run(request)
+
+        return await asyncio.gather(*[work(r) for r in requests])
 
     def get_batch_size(self) -> int:
         return 100
@@ -116,6 +157,10 @@ def get_rpm_limit(self) -> Optional[int]:
     def get_used_options(self) -> List[Type[Option]]:
         return self.__used_options__
 
+    complete_batch_sync = sync_api(complete_batch)
+    run_sync = sync_api(run)
+    run_batch_sync = sync_api(run_batch)
+
 
 LLMProvider = str
 LLMModel = str
@@ -261,13 +306,18 @@ def _render(self) -> str:
         return self.value
 
 
-class OutputFormatBlock(PromptBlock, ABC):
+class OutputFormatBlock(PromptBlock, ABC, Generic[TResult]):
     @abstractmethod
-    def parse_response(self, response: str) -> Dict[str, str]:
+    def parse_response(self, response: str) -> TResult:
         raise NotImplementedError
 
 
-class JsonOutputFormatBlock(OutputFormatBlock):
+class NoopOutputFormat(OutputFormatBlock[str]):
+    def parse_response(self, response: str) -> str:
+        return response
+
+
+class JsonOutputFormatBlock(OutputFormatBlock[Dict[str, Any]]):
     fields: Dict[str, Union[Tuple[str, str], str]]
 
     def _render(self) -> str:
@@ -284,38 +334,59 @@ def _render(self) -> str:
         example_rows_str = "\n".join(example_rows)
         return f"Return {', '.join(values)} formatted as json without formatting as follows:\n{{{{\n{example_rows_str}\n}}}}"
 
-    def parse_response(self, response: str) -> Dict[str, str]:
+    def parse_response(self, response: str) -> Dict[str, Any]:
         try:
             return json.loads(response)
         except json.JSONDecodeError as e:
             raise LLMResponseParseError(f"Failed to parse response '{response}' as json") from e
 
 
-class StringListFormatBlock(OutputFormatBlock):
+class StringListFormatBlock(OutputFormatBlock[List[str]]):
     of_what: str
 
     def _render(self) -> str:
         return f"""Return a list of {self.of_what}.
 This should be only a list of string {self.of_what}, each one on a new line with no enumeration"""
 
-    def parse_response(self, response: str) -> Dict[str, str]:
-        return {self.of_what: response.split("\n")}
+    def parse_response(self, response: str) -> List[str]:
+        return response.split("\n")
 
 
-class StringFormatBlock(OutputFormatBlock):
+class StringFormatBlock(OutputFormatBlock[str]):
     what: str
 
     def _render(self) -> str:
         return f"""Return {self.what} only."""
 
-    def parse_response(self, response: str) -> Dict[str, str]:
-        return {self.what: response}
+    def parse_response(self, response: str) -> str:
+        return response
+
+
+def llm_call(f: Callable) -> Callable[..., LLMRequest]:
+    sig = inspect.getfullargspec(f)
+    response_type = sig.annotations.get("return", str)
+
+    @wraps(f)
+    def inner(self: PromptTemplate, *args, **kwargs):
+        kwargs = inspect.getcallargs(f, *args, **kwargs, self=self)
+        del kwargs["self"]
+        # output_format = self.get_output_format()
+        # todo: validate response_type against output_format.response_type
+        # todo: validate sig.annotations against self.list_placeholders
+
+        # todo: validate kwargs against sig.annotations
+        # todo: define response parser with validation against response_type
+
+        return LLMRequest(messages=self.get_messages(**kwargs), response_parser=self.parse, response_type=response_type)
+
+    return inner
 
 
 class PromptTemplate(EvidentlyBaseModel):
     class Config:
         alias_required = False  # fixme
 
+    # __run_func__ : ClassVar[Callable]
     @abstractmethod
     def get_blocks(self) -> Sequence[PromptBlock]:
         raise NotImplementedError
@@ -331,10 +402,12 @@ def render(self, **values):
     def get_template(self) -> str:
         return "\n".join(block.render() for block in self.get_blocks())
 
-    def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, Any]:
+    def get_output_format(self) -> OutputFormatBlock:
         output = next((b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None)
-        if output is None:
-            return {"": response}
+        return output if output is not None else NoopOutputFormat()
+
+    def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, Any]:
+        output = self.get_output_format()
         parsed = output.parse_response(response)
         if keys is not None and set(keys) != set(parsed.keys()):
             raise LLMResponseParseError(f"Keys {keys} are required but got {list(parsed.keys())}")
@@ -344,7 +417,7 @@ def get_messages(self, **values) -> List[LLMMessage]:
         return [LLMMessage.user(self.render(**values))]
 
 
-class WithSystemPrompt(PromptTemplate):
+class WithSystemPrompt(PromptTemplate, ABC):
     system_prompt: str
 
     def get_messages(self, **values) -> List[LLMMessage]:

From 4100c75d123c2fca8acb5fef38517aea1a29ded2 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 15:11:54 +0200
Subject: [PATCH 41/63] function signature validation

---
 src/evidently/utils/llm.py | 50 +++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index a0b911c923..d76a9e0a01 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -3,6 +3,7 @@
 import datetime
 import inspect
 import json
+import re
 from abc import ABC
 from abc import abstractmethod
 from asyncio import Lock
@@ -23,6 +24,8 @@
 from typing import TypeVar
 from typing import Union
 
+import typing_inspect
+
 from evidently._pydantic_compat import SecretStr
 from evidently.errors import EvidentlyError
 from evidently.options.base import Options
@@ -370,18 +373,39 @@ def llm_call(f: Callable) -> Callable[..., LLMRequest]:
     def inner(self: PromptTemplate, *args, **kwargs):
         kwargs = inspect.getcallargs(f, *args, **kwargs, self=self)
         del kwargs["self"]
-        # output_format = self.get_output_format()
-        # todo: validate response_type against output_format.response_type
-        # todo: validate sig.annotations against self.list_placeholders
+        template = self.get_template()
+        placeholders = self.list_placeholders(template)
+        if set(placeholders) != set(kwargs.keys()):
+            raise TypeError(
+                f"{f} arg signature ({list(kwargs)}) does not correspond to placeholders in prompt ({placeholders})"
+            )
+
+        output_format = self.get_output_format()
+        prompt_response_type = _get_genric_arg(output_format)
+        if prompt_response_type != response_type:
+            raise TypeError(
+                f"{f} response type ({response_type}) does not correspond to prompt output type {prompt_response_type}"
+            )
 
         # todo: validate kwargs against sig.annotations
         # todo: define response parser with validation against response_type
 
-        return LLMRequest(messages=self.get_messages(**kwargs), response_parser=self.parse, response_type=response_type)
+        return LLMRequest(
+            messages=self.get_messages(kwargs, template=template),
+            response_parser=self.parse,
+            response_type=response_type,
+        )
 
     return inner
 
 
+def _get_genric_arg(cls):
+    return typing_inspect.get_args(next(b for b in cls.__orig_bases__ if typing_inspect.is_generic_type(b)))[0]
+
+
+placeholders_re = re.compile(r"\{([a-zA-Z0-9_]+)}")
+
+
 class PromptTemplate(EvidentlyBaseModel):
     class Config:
         alias_required = False  # fixme
@@ -394,14 +418,18 @@ def get_blocks(self) -> Sequence[PromptBlock]:
     def iterate(self, values: Sequence[Dict[str, str]]) -> Iterator[str]:
         template = self.get_template()
         for vals in values:
-            yield template.format(**vals)
+            yield self.render(vals, template)
 
-    def render(self, **values):
-        return self.get_template().format(**values)
+    def render(self, values: dict, template: Optional[str] = None):
+        return (template or self.get_template()).format(**values)
 
     def get_template(self) -> str:
         return "\n".join(block.render() for block in self.get_blocks())
 
+    def list_placeholders(self, template: Optional[str] = None):
+        template = template or self.get_template()
+        return list(placeholders_re.findall(template))
+
     def get_output_format(self) -> OutputFormatBlock:
         output = next((b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None)
         return output if output is not None else NoopOutputFormat()
@@ -413,15 +441,15 @@ def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, An
             raise LLMResponseParseError(f"Keys {keys} are required but got {list(parsed.keys())}")
         return parsed
 
-    def get_messages(self, **values) -> List[LLMMessage]:
-        return [LLMMessage.user(self.render(**values))]
+    def get_messages(self, values, template: Optional[str] = None) -> List[LLMMessage]:
+        return [LLMMessage.user(self.render(values, template))]
 
 
 class WithSystemPrompt(PromptTemplate, ABC):
     system_prompt: str
 
-    def get_messages(self, **values) -> List[LLMMessage]:
-        msgs = super().get_messages(**values)
+    def get_messages(self, values, template: Optional[str] = None) -> List[LLMMessage]:
+        msgs = super().get_messages(values, template)
         msgs.insert(0, LLMMessage.system(self.system_prompt))
         return msgs
 

From 42c58ce4aead2d96ef6cc033fc77a69986fd1426 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 18:23:03 +0200
Subject: [PATCH 42/63] requirements

---
 .github/workflows/main.yml | 4 ++--
 requirements.min.txt       | 2 +-
 setup.py                   | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e92b54ddfe..92f99b555b 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -115,7 +115,7 @@ jobs:
       - name: Install minimal dependencies
         run: pip install -r requirements.min.txt
       - name: Install package
-        run: pip install -e .[dev,spark,fsspec]
+        run: pip install -e .[dev,spark,fsspec,llm]
       - name: Run pip-audit
         run: pip-audit --ignore-vuln PYSEC-2024-48 --ignore-vuln GHSA-jw8x-6495-233v --ignore-vuln GHSA-4hq2-rpgc-r8r7
       - name: Run Tests
@@ -155,7 +155,7 @@ jobs:
         uses: ./.github/share-actions/get-bikes-dataset-cached
 
       - name: Install package
-        run: pip install -e .[dev,spark,fsspec]
+        run: pip install -e .[dev,spark,fsspec,llm]
       - name: Run Tests
         run: python -m pytest --durations=50
 
diff --git a/requirements.min.txt b/requirements.min.txt
index 2a0bff5f00..fa00bfd7fc 100644
--- a/requirements.min.txt
+++ b/requirements.min.txt
@@ -3,7 +3,7 @@ plotly==5.10.0
 statsmodels==0.12.2
 scikit-learn==1.0.1
 pandas[parquet]==1.3.5
-numpy==1.22.0
+numpy==1.22.5
 nltk==3.6.7
 scipy==1.10.0
 requests==2.32.0
diff --git a/setup.py b/setup.py
index c22676a19e..369b661f2f 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@
         "statsmodels>=0.12.2",
         "scikit-learn>=1.0.1",
         "pandas[parquet]>=1.3.5",
-        "numpy>=1.22.0,<2.1",
+        "numpy>=1.22.5,<2.1",
         "nltk>=3.6.7",
         "scipy>=1.10.0",
         "requests>=2.32.0",
@@ -76,7 +76,6 @@
         "deprecation>=2.1.0",
         "uuid6>=2024.7.10",
         "cryptography>=43.0.1",
-        "chromadb>=0.5.12",
     ],
     extras_require={
         "dev": [
@@ -103,6 +102,7 @@
             "evaluate>=0.4.1",
             "transformers[torch]>=4.39.3",
             "sentence-transformers>=2.7.0",
+            "chromadb>=0.5.12",
         ],
         "spark": ["pyspark>=3.4.0"],
         "fsspec": [

From cdcadd55e28ab469aa05490f46abdfde03381f2d Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 18:25:30 +0200
Subject: [PATCH 43/63] requirements

---
 .github/workflows/main.yml | 2 +-
 setup.py                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 92f99b555b..0dd15a80a0 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -173,7 +173,7 @@ jobs:
           cache: "pip"
           cache-dependency-path: setup.py
       - name: Install dependencies
-        run: pip install -e ".[dev]"
+        run: pip install -e .
       - name: Install wheel
         run: pip install wheel
       - name: Build package
diff --git a/setup.py b/setup.py
index 369b661f2f..5d45970721 100644
--- a/setup.py
+++ b/setup.py
@@ -92,7 +92,7 @@
             "types-python-dateutil==2.8.19",
             "types-ujson>=5.4.0",
             "pillow==10.3.0",
-            "httpx==0.24.1",
+            "httpx==0.27.0",
             "ruff==0.3.7",
             "pre-commit==3.5.0",
             "pytest-asyncio==0.23.7",

From 64533cda819e51bc56949e36e0ec6e3a0036f33b Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 18:30:59 +0200
Subject: [PATCH 44/63] requirements

---
 requirements.min.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.min.txt b/requirements.min.txt
index fa00bfd7fc..e6d0247e8d 100644
--- a/requirements.min.txt
+++ b/requirements.min.txt
@@ -3,7 +3,7 @@ plotly==5.10.0
 statsmodels==0.12.2
 scikit-learn==1.0.1
 pandas[parquet]==1.3.5
-numpy==1.22.5
+numpy==1.23.0
 nltk==3.6.7
 scipy==1.10.0
 requests==2.32.0

From 9e61435d6fae498ebc7c6094381a3d1ca9419b57 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 18:33:22 +0200
Subject: [PATCH 45/63] requirements

---
 requirements.min.txt | 2 +-
 setup.py             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.min.txt b/requirements.min.txt
index e6d0247e8d..0ff30dbbb9 100644
--- a/requirements.min.txt
+++ b/requirements.min.txt
@@ -13,7 +13,7 @@ litestar==2.8.3
 typing-inspect==0.9.0
 uvicorn==0.22.0
 watchdog==3.0.0
-typer==0.3
+typer==0.9
 rich==13
 iterative-telemetry==0.0.5
 dynaconf==3.2.4
diff --git a/setup.py b/setup.py
index 5d45970721..18c6da6702 100644
--- a/setup.py
+++ b/setup.py
@@ -65,7 +65,7 @@
         "typing-inspect>=0.9.0",
         "uvicorn[standard]>=0.22.0",
         "watchdog>=3.0.0",
-        "typer>=0.3",
+        "typer>=0.9",
         "rich>=13",
         "iterative-telemetry>=0.0.5",
         "dynaconf>=3.2.4",

From dbf543fc92707e8949a7a94be6a5051dada03a19 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 19:22:26 +0200
Subject: [PATCH 46/63] requirements

---
 requirements.min.txt | 3 ++-
 setup.py             | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/requirements.min.txt b/requirements.min.txt
index 0ff30dbbb9..08ad4b5a80 100644
--- a/requirements.min.txt
+++ b/requirements.min.txt
@@ -31,4 +31,5 @@ openai==1.16.2
 evaluate==0.4.1
 transformers[torch]==4.39.3
 sentence-transformers==2.7.0
-chromadb==0.5.12
+chromadb==0.4.0
+llama-index==0.10.18
diff --git a/setup.py b/setup.py
index 18c6da6702..816f3cc695 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@
         "statsmodels>=0.12.2",
         "scikit-learn>=1.0.1",
         "pandas[parquet]>=1.3.5",
-        "numpy>=1.22.5,<2.1",
+        "numpy>=1.22.0,<2.1",
         "nltk>=3.6.7",
         "scipy>=1.10.0",
         "requests>=2.32.0",
@@ -65,7 +65,7 @@
         "typing-inspect>=0.9.0",
         "uvicorn[standard]>=0.22.0",
         "watchdog>=3.0.0",
-        "typer>=0.9",
+        "typer>=0.3",
         "rich>=13",
         "iterative-telemetry>=0.0.5",
         "dynaconf>=3.2.4",
@@ -102,7 +102,8 @@
             "evaluate>=0.4.1",
             "transformers[torch]>=4.39.3",
             "sentence-transformers>=2.7.0",
-            "chromadb>=0.5.12",
+            "chromadb>=0.4.0",
+            "llama-index>=0.10.18",
         ],
         "spark": ["pyspark>=3.4.0"],
         "fsspec": [

From f697955bbcb587f76e6c615e67f068c64e62cd85 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 19:58:03 +0200
Subject: [PATCH 47/63] requirements

---
 requirements.min.txt | 6 +++---
 setup.py             | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.min.txt b/requirements.min.txt
index 08ad4b5a80..bdd1175572 100644
--- a/requirements.min.txt
+++ b/requirements.min.txt
@@ -3,7 +3,7 @@ plotly==5.10.0
 statsmodels==0.12.2
 scikit-learn==1.0.1
 pandas[parquet]==1.3.5
-numpy==1.23.0
+numpy==1.22.0
 nltk==3.6.7
 scipy==1.10.0
 requests==2.32.0
@@ -13,7 +13,7 @@ litestar==2.8.3
 typing-inspect==0.9.0
 uvicorn==0.22.0
 watchdog==3.0.0
-typer==0.9
+typer==0.3
 rich==13
 iterative-telemetry==0.0.5
 dynaconf==3.2.4
@@ -32,4 +32,4 @@ evaluate==0.4.1
 transformers[torch]==4.39.3
 sentence-transformers==2.7.0
 chromadb==0.4.0
-llama-index==0.10.18
+llama-index==0.9.48
diff --git a/setup.py b/setup.py
index 816f3cc695..7a1505cd7c 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@
             "transformers[torch]>=4.39.3",
             "sentence-transformers>=2.7.0",
             "chromadb>=0.4.0",
-            "llama-index>=0.10.18",
+            "llama-index>=0.9.48",
         ],
         "spark": ["pyspark>=3.4.0"],
         "fsspec": [

From cecf4c27dbd00867c162ebe115a504ecca3d5231 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 20:07:58 +0200
Subject: [PATCH 48/63] lil cleanup

---
 docs/book/input-data/column-mapping.md        |    2 +-
 requirements.min.txt                          |    2 +-
 setup.py                                      |    2 +-
 .../llm/data_generation_for_RAG.ipynb         | 1257 -----------------
 4 files changed, 3 insertions(+), 1260 deletions(-)
 delete mode 100644 src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb

diff --git a/docs/book/input-data/column-mapping.md b/docs/book/input-data/column-mapping.md
index 723eecce16..3d0ed84592 100644
--- a/docs/book/input-data/column-mapping.md
+++ b/docs/book/input-data/column-mapping.md
@@ -138,7 +138,7 @@ Here is an example of how you point to the defined list of columns that contain
 
 ```python
 column_mapping = ColumnMapping()
-column_mapping.collection = {'small_subset': embeddings_data.columns[:10]}
+column_mapping.embeddings = {'small_subset': embeddings_data.columns[:10]}
 ```
 
 {% hint style="info" %} 
diff --git a/requirements.min.txt b/requirements.min.txt
index bdd1175572..6d96eff823 100644
--- a/requirements.min.txt
+++ b/requirements.min.txt
@@ -32,4 +32,4 @@ evaluate==0.4.1
 transformers[torch]==4.39.3
 sentence-transformers==2.7.0
 chromadb==0.4.0
-llama-index==0.9.48
+llama-index==0.8.42
diff --git a/setup.py b/setup.py
index 7a1505cd7c..1f892b61b2 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,7 @@
             "transformers[torch]>=4.39.3",
             "sentence-transformers>=2.7.0",
             "chromadb>=0.4.0",
-            "llama-index>=0.9.48",
+            "llama-index>=0.8.42",
         ],
         "spark": ["pyspark>=3.4.0"],
         "fsspec": [
diff --git a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb b/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
deleted file mode 100644
index 4af120045b..0000000000
--- a/src/evidently/dataset_generators/llm/data_generation_for_RAG.ipynb
+++ /dev/null
@@ -1,1257 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "697ef555-f62c-424f-90da-bec9fbdace28",
-   "metadata": {},
-   "source": [
-    "## Extra Dependencies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "bf4855a8-0d91-4d88-8fa2-05d2eb2ddbad",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: chromadb in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (0.5.12)\n",
-      "Requirement already satisfied: build>=1.0.3 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.2.1)\n",
-      "Requirement already satisfied: pydantic>=1.9 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (2.9.2)\n",
-      "Requirement already satisfied: chroma-hnswlib==0.7.6 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.7.6)\n",
-      "Requirement already satisfied: fastapi>=0.95.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.115.0)\n",
-      "Requirement already satisfied: uvicorn>=0.18.3 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.31.0)\n",
-      "Requirement already satisfied: numpy>=1.22.5 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.25.2)\n",
-      "Requirement already satisfied: posthog>=2.4.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (3.6.6)\n",
-      "Requirement already satisfied: typing-extensions>=4.5.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (4.12.2)\n",
-      "Requirement already satisfied: onnxruntime>=1.14.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.19.2)\n",
-      "Requirement already satisfied: opentelemetry-api>=1.2.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.26.0)\n",
-      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.26.0)\n",
-      "Requirement already satisfied: opentelemetry-instrumentation-fastapi>=0.41b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.47b0)\n",
-      "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.26.0)\n",
-      "Requirement already satisfied: tokenizers>=0.13.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.20.0)\n",
-      "Requirement already satisfied: pypika>=0.48.9 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.48.9)\n",
-      "Requirement already satisfied: tqdm>=4.65.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (4.66.5)\n",
-      "Requirement already satisfied: overrides>=7.3.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (7.7.0)\n",
-      "Requirement already satisfied: importlib-resources in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (6.4.5)\n",
-      "Requirement already satisfied: grpcio>=1.58.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (1.66.2)\n",
-      "Requirement already satisfied: bcrypt>=4.0.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (4.2.0)\n",
-      "Requirement already satisfied: typer>=0.9.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.9.4)\n",
-      "Requirement already satisfied: kubernetes>=28.1.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (31.0.0)\n",
-      "Requirement already satisfied: tenacity>=8.2.3 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (8.5.0)\n",
-      "Requirement already satisfied: PyYAML>=6.0.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (6.0.2)\n",
-      "Requirement already satisfied: mmh3>=4.0.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (5.0.1)\n",
-      "Requirement already satisfied: orjson>=3.9.12 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (3.10.7)\n",
-      "Requirement already satisfied: httpx>=0.27.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (0.27.2)\n",
-      "Requirement already satisfied: rich>=10.11.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from chromadb) (13.9.1)\n",
-      "Requirement already satisfied: packaging>=19.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from build>=1.0.3->chromadb) (24.1)\n",
-      "Requirement already satisfied: pyproject_hooks in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from build>=1.0.3->chromadb) (1.1.0)\n",
-      "Requirement already satisfied: starlette<0.39.0,>=0.37.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from fastapi>=0.95.2->chromadb) (0.38.6)\n",
-      "Requirement already satisfied: anyio in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (4.6.0)\n",
-      "Requirement already satisfied: certifi in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (2024.8.30)\n",
-      "Requirement already satisfied: httpcore==1.* in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (1.0.6)\n",
-      "Requirement already satisfied: idna in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (3.10)\n",
-      "Requirement already satisfied: sniffio in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpx>=0.27.0->chromadb) (1.3.1)\n",
-      "Requirement already satisfied: h11<0.15,>=0.13 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.27.0->chromadb) (0.14.0)\n",
-      "Requirement already satisfied: six>=1.9.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\n",
-      "Requirement already satisfied: python-dateutil>=2.5.3 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.8.2)\n",
-      "Requirement already satisfied: google-auth>=1.0.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.35.0)\n",
-      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (1.8.0)\n",
-      "Requirement already satisfied: requests in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.32.3)\n",
-      "Requirement already satisfied: requests-oauthlib in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.0.0)\n",
-      "Requirement already satisfied: oauthlib>=3.2.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\n",
-      "Requirement already satisfied: urllib3>=1.24.2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (2.2.3)\n",
-      "Requirement already satisfied: durationpy>=0.7 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from kubernetes>=28.1.0->chromadb) (0.9)\n",
-      "Requirement already satisfied: coloredlogs in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (15.0.1)\n",
-      "Requirement already satisfied: flatbuffers in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (24.3.25)\n",
-      "Requirement already satisfied: protobuf in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (4.25.5)\n",
-      "Requirement already satisfied: sympy in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from onnxruntime>=1.14.1->chromadb) (1.13.3)\n",
-      "Requirement already satisfied: deprecated>=1.2.6 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-api>=1.2.0->chromadb) (1.2.14)\n",
-      "Requirement already satisfied: importlib-metadata<=8.0.0,>=6.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-api>=1.2.0->chromadb) (8.0.0)\n",
-      "Requirement already satisfied: googleapis-common-protos~=1.52 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.65.0)\n",
-      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.26.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.26.0)\n",
-      "Requirement already satisfied: opentelemetry-proto==1.26.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.26.0)\n",
-      "Requirement already satisfied: opentelemetry-instrumentation-asgi==0.47b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.47b0)\n",
-      "Requirement already satisfied: opentelemetry-instrumentation==0.47b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.47b0)\n",
-      "Requirement already satisfied: opentelemetry-semantic-conventions==0.47b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.47b0)\n",
-      "Requirement already satisfied: opentelemetry-util-http==0.47b0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.47b0)\n",
-      "Requirement already satisfied: setuptools>=16.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation==0.47b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (65.5.1)\n",
-      "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation==0.47b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.16.0)\n",
-      "Requirement already satisfied: asgiref~=3.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from opentelemetry-instrumentation-asgi==0.47b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (3.8.1)\n",
-      "Requirement already satisfied: monotonic>=1.5 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from posthog>=2.4.0->chromadb) (1.6)\n",
-      "Requirement already satisfied: backoff>=1.10.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from posthog>=2.4.0->chromadb) (2.2.1)\n",
-      "Requirement already satisfied: annotated-types>=0.6.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from pydantic>=1.9->chromadb) (0.7.0)\n",
-      "Requirement already satisfied: pydantic-core==2.23.4 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from pydantic>=1.9->chromadb) (2.23.4)\n",
-      "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from rich>=10.11.0->chromadb) (3.0.0)\n",
-      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from rich>=10.11.0->chromadb) (2.18.0)\n",
-      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from tokenizers>=0.13.2->chromadb) (0.25.1)\n",
-      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from typer>=0.9.0->chromadb) (8.1.7)\n",
-      "Requirement already satisfied: httptools>=0.5.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.6.1)\n",
-      "Requirement already satisfied: python-dotenv>=0.13 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (1.0.1)\n",
-      "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.20.0)\n",
-      "Requirement already satisfied: watchfiles>=0.13 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.24.0)\n",
-      "Requirement already satisfied: websockets>=10.4 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (13.1)\n",
-      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.5.0)\n",
-      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.4.1)\n",
-      "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\n",
-      "Requirement already satisfied: filelock in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.12.4)\n",
-      "Requirement already satisfied: fsspec>=2023.5.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2024.9.0)\n",
-      "Requirement already satisfied: zipp>=0.5 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from importlib-metadata<=8.0.0,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.19.2)\n",
-      "Requirement already satisfied: mdurl~=0.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->chromadb) (0.1.2)\n",
-      "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from requests->kubernetes>=28.1.0->chromadb) (3.3.2)\n",
-      "Requirement already satisfied: humanfriendly>=9.1 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from coloredlogs->onnxruntime>=1.14.1->chromadb) (10.0)\n",
-      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\n",
-      "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.6.1)\n",
-      "\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.2\u001B[0m\n",
-      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpython3.11 -m pip install --upgrade pip\u001B[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "!pip install chromadb"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c5b8aec3-e00e-4919-b2a8-b19722311261",
-   "metadata": {},
-   "source": [
-    "## Imports"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "547c43f3-e58f-450c-b80b-c396eb2655a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import openai \n",
-    "from openai import OpenAI\n",
-    "import pprint\n",
-    "import pandas as pd\n",
-    "import random\n",
-    "\n",
-    "import chromadb\n",
-    "from chromadb.utils import embedding_functions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "5f649ddb-af77-4961-8eb2-f69b7c6916db",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chroma_client = chromadb.Client()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "5d8da19c-8d15-467f-90d0-dc7e02ed3aad",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<chromadb.api.client.Client at 0x2bc6254d0>"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "chroma_client"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2c6fac99-e1db-48be-9554-88ecddac271e",
-   "metadata": {},
-   "source": [
-    "## Chunked data collection setup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "9b93b470-9d32-4757-9d03-915992e2a7c3",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/svetlanapopova/.pyenv/versions/3.11.9/envs/env/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Collection db_collection created successfully\n",
-      "{'data': None,\n",
-      " 'documents': [],\n",
-      " 'embeddings': None,\n",
-      " 'ids': [],\n",
-      " 'included': ['metadatas', 'documents'],\n",
-      " 'metadatas': [],\n",
-      " 'uris': None}\n"
-     ]
-    }
-   ],
-   "source": [
-    "collection_name = \"db_collection\"\n",
-    "default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=\"all-MiniLM-L6-v2\")\n",
-    "\n",
-    "chroma_client = chromadb.PersistentClient(path=\"./chromadb/\")\n",
-    "\n",
-    "# declare ChromaDB collection\n",
-    "collection = chroma_client.get_or_create_collection(\n",
-    "    name=collection_name,\n",
-    "    embedding_function=default_embedding_function\n",
-    "    )\n",
-    "\n",
-    "result = collection.get()\n",
-    "\n",
-    "print(f\"Collection {collection_name} created successfully\")\n",
-    "pprint.pprint(result)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "d547021f-9d4d-42cf-b580-abc6a1008cd1",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "def load_md_from_dir(dir_path):\n",
-    "    \"\"\"\n",
-    "    Loads Markdown (.md) files from the specified directory.\n",
-    "\n",
-    "    Args:\n",
-    "        dir_path (str): Path to the directory containing .md files.\n",
-    "\n",
-    "    Returns:\n",
-    "        List[dict]: A list of dictionaries with the text content of each .md file.\n",
-    "    \"\"\"\n",
-    "    md_files = [\n",
-    "        os.path.join(dir_path, filename) \n",
-    "        for filename in os.listdir(dir_path) \n",
-    "        if filename.endswith(\".md\")\n",
-    "    ]\n",
-    "    \n",
-    "    documents = []\n",
-    "    for file_path in md_files:\n",
-    "        with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
-    "            documents.append({\"text\": file.read()})\n",
-    "    \n",
-    "    return documents"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "53f147f1-f8a2-4095-a840-2bacbc0aaf63",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def split_text(text, chunk_size=100, chunk_overlap=20):\n",
-    "    \"\"\"\n",
-    "    Splits the input text into overlapping chunks.\n",
-    "\n",
-    "    Args:\n",
-    "        text (str): The text to split.\n",
-    "        chunk_size (int): The size of each chunk. Default is 100.\n",
-    "        chunk_overlap (int): The number of overlapping characters between chunks. Default is 20.\n",
-    "\n",
-    "    Returns:\n",
-    "        List[str]: A list of text chunks.\n",
-    "    \"\"\"\n",
-    "    chunks = []\n",
-    "    text_length = len(text)\n",
-    "    \n",
-    "    for start in range(0, text_length, chunk_size - chunk_overlap):\n",
-    "        end = min(start + chunk_size, text_length)\n",
-    "        chunks.append(text[start:end])\n",
-    "    \n",
-    "    return chunks"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "c0c9c8e2-0f2f-4fe0-aeee-68b0bb67cea8",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      " 1 files loaded\n",
-      "Split in to 8 chunks\n"
-     ]
-    }
-   ],
-   "source": [
-    "directory_path = \"../../../../evidently_reference/\"\n",
-    "\n",
-    "# load documents from directory\n",
-    "md_files = load_md_from_dir(directory_path)\n",
-    "\n",
-    "print(f\" {len(md_files)} files loaded\")\n",
-    "\n",
-    "# Split text into chunks\n",
-    "chunked_files = [\n",
-    "    {\n",
-    "        'id': f\"{file_id}-{chunk_id}\",\n",
-    "        'text': chunk,\n",
-    "    }\n",
-    "    for file_id, file in enumerate(md_files)\n",
-    "    for chunk_id, chunk in enumerate(split_text(file[\"text\"], chunk_size=500, chunk_overlap=50))\n",
-    "]\n",
-    "\n",
-    "print(f\"Split in to {len(chunked_files)} chunks\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "7bf99fde-8aa7-4111-ad7e-eec59bd0c23e",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'chunked_files' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
-      "\u001B[0;31mNameError\u001B[0m                                 Traceback (most recent call last)",
-      "Cell \u001B[0;32mIn[10], line 2\u001B[0m\n\u001B[1;32m      1\u001B[0m \u001B[38;5;66;03m# insert documents with embeddings to collection ChromaDB\u001B[39;00m\n\u001B[0;32m----> 2\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m chunk \u001B[38;5;129;01min\u001B[39;00m \u001B[43mchunked_files\u001B[49m:\n\u001B[1;32m      3\u001B[0m     collection\u001B[38;5;241m.\u001B[39mupsert(\n\u001B[1;32m      4\u001B[0m             ids\u001B[38;5;241m=\u001B[39mchunk[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mid\u001B[39m\u001B[38;5;124m'\u001B[39m],\n\u001B[1;32m      5\u001B[0m             documents\u001B[38;5;241m=\u001B[39mchunk[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtext\u001B[39m\u001B[38;5;124m'\u001B[39m],\n\u001B[1;32m      6\u001B[0m     )\n\u001B[1;32m      8\u001B[0m result \u001B[38;5;241m=\u001B[39m collection\u001B[38;5;241m.\u001B[39mget()\n",
-      "\u001B[0;31mNameError\u001B[0m: name 'chunked_files' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "# insert documents with embeddings to collection ChromaDB\n",
-    "for chunk in chunked_files:\n",
-    "    collection.upsert(\n",
-    "            ids=chunk['id'],\n",
-    "            documents=chunk['text'],\n",
-    "    )\n",
-    "\n",
-    "result = collection.get()\n",
-    "\n",
-    "print(f\"Collection {collection_name} has {len(result['ids'])} documents\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "50ec8822-e2dc-4a01-bad3-44f1f123ed5c",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[Collection(id=639fbee3-87d9-447e-a134-f2ad8596b07c, name=db_collection)]\n"
-     ]
-    }
-   ],
-   "source": [
-    "#Just incase we need to delete collection\n",
-    "list_collections = chroma_client.list_collections()\n",
-    "print(list_collections)\n",
-    "\n",
-    "#chroma_client.delete_collection(collection_name)\n",
-    "#list_collections = chroma_client.list_collections()\n",
-    "#print(list_collections)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ebaf34e6-d454-4d38-b08a-f57550b39e74",
-   "metadata": {},
-   "source": [
-    "## Dataset Generation chain of promts"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "ced14436-c4b3-4b25-8cc7-7cdb112eed66",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]\n",
-    "client = OpenAI(\n",
-    "    api_key=os.environ.get(\"OPENAI_API_KEY\"),\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "04b0cf1e-5356-44e4-855b-5169a21260e2",
-   "metadata": {},
-   "source": [
-    "### Naive questions generation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "d81f6108-79ef-4148-bc8e-b1050cc1637f",
-   "metadata": {
-    "collapsed": true,
-    "jupyter": {
-     "outputs_hidden": true
-    },
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
-       " 'r><br> | **Required**:<ul><li>`k`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **PopularityBias()** <br><br> Evaluates the popularity bias in recommendations by computing ARP (average recommendation popularity), Gini index, and coverage. <br><br>Requires a training dataset. | **Required**:<ul><li>`K`</li><li>`normalize_arp (default: False)` - whether to normalize ARP calculation by the most popular item in training</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **ItemBiasMetric()** <br><br> Visu',\n",
-       " 'th reference**: the test fails if the TNR is over 20% higher or lower.<br><br>**No reference**: the test fails if the TNR is lower than the TNR of the dummy model. |\\n| **TestFPR()** | Dataset-level. <br><br> Computes the False Positive Rate and compares it to the reference or against a defined condition. | **Required**:<br>N/A<br><br> **Optional:**<ul><li>`probas_threshold`(default for classification = None; default for probabilistic classification = 0.5)</li><li>`k` (default = None)</li></ul>**',\n",
-       " 'tems by a chosen characteristic.\\n\\nThe visualization shows:\\n* The distribution of items in the training set for the defined `column_name` (with duplicates dropped). This represents the item catalog by this dimension. \\n* The distribution of the recommended items for the defined `column_name` in the current and reference (if available) datasets. \\n\\nThis visualization helps see the patterns in the model recommendations. In a simplified example, you might observe that the training data contains 3x com',\n",
-       " '.|\\n| **TestGiniIndex(k=k)** | Dataset-level. <br><br> Computes the Gini Index at the top K recommendations and compares it to the reference or against a defined condition.<br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li></ul> **Optional**:<br>N/A<br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10% from reference.<br><br>**With reference**: if the Gini Index at the top K is over 10% higher or lower, the test fails.<br><br>**No reference**: Test',\n",
-       " ' a defined condition. | **Required**:<br>N/A<br><br> **Optional:**<ul><li>`probas_threshold`(default for classification = None; default for probabilistic classification = 0.5)</li><li>`k` (default = None)</li></ul>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/-20% or better than a dummy model.<br><br>**With reference**: the test fails if the FNR is over 20% higher or lower.<br><br>**No reference**: the test fails if the FNR is higher than the FNR of the dummy model. |\\n|',\n",
-       " ' test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. <br><br> Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:<br>N/A<br><br>**Optional**:<ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul>**Test conditions** <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0.<br><br>**With reference**: the test fails if the share of',\n",
-       " 'rcentage error in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorDistribution()** <br><br> Visualizes the distribution of the model error in a histogram. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorNormality()** <br><br> Visualizes the quantile-quantile plot (Q-Q plot) to estimate value normality. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionTopErrorMetric()** <br><br> Calculates the regression performance me',\n",
-       " \"\\n**Note**: Only a single top relevant item is considered in this metric, disregarding the position and relevance of other items in the list.\\n\\n# Diversity\\n\\n![](../.gitbook/assets/reports/metric_diversity-min.png)\\n\\n**Evidently Metric**: `DiversityMetric`\\n\\n**Recommendation diversity**: this metric measures the average intra-list diversity at K. It reflects the variety of items within the same user's recommendation list, averaged by all users. \\n\\n**Implemented method**:\\n* **Measure the difference bet\",\n",
-       " '*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. <br><br> Tests the number of duplicate rows against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/- 10% or none.<br><br>**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.<br><br>**No reference**: the te']"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Fixed size for the random list\n",
-    "sample_size = 10\n",
-    "\n",
-    "# Generate a random list with the fixed size from the existing list\n",
-    "random_chuncks = [item['text'] for item in random.sample(chunked_files, min(sample_size, len(chunked_files)))]\n",
-    "random_chuncks"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "90712120-3ac4-48d0-a749-9f8ef72f4247",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "system_prompt = \"You are an assisstant who generates questions based on provided context\"\n",
-    "number_of_questions = 10\n",
-    "user_prompt = \"\"\"\n",
-    "Generate {N} conceptual questions based on the provided context and can be answered from the information in the provided context.\n",
-    "Here is a context\n",
-    "<context>\n",
-    "    {context}\n",
-    "</context>\n",
-    "\n",
-    "Remain faithful to the underlying context. \n",
-    "Avoid providing any preamble!\n",
-    "Avoid providing any closing statement!\n",
-    "Please return only a list of coma separated generated questions in string format.\n",
-    "\"\"\"\n",
-    "\n",
-    "context = \"\\n\\n\".join(random_chuncks)\n",
-    "\n",
-    "formated_user_prompt = user_prompt.format(context=context, N=number_of_questions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "7aa2632f-5bda-4395-81a2-77ccb4dd994b",
-   "metadata": {
-    "collapsed": true,
-    "jupyter": {
-     "outputs_hidden": true
-    }
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
-     ]
-    }
-   ],
-   "source": [
-    "response = client.chat.completions.create(\n",
-    "    model=\"gpt-4o\",  # Updated to a valid model\n",
-    "    messages=[\n",
-    "        {\"role\": \"system\", \"content\": system_prompt},\n",
-    "        {\"role\": \"user\", \"content\": formated_user_prompt}\n",
-    "    ],\n",
-    "    max_tokens=400,  # Limits the response length\n",
-    "    temperature=0.7,  # Controls randomness in the output\n",
-    "    n=1\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "d01f1b79-7781-4e60-b6b0-71a31f860376",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_queries = response.choices[0].message.content.strip().split(\",\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "4ba9d23b-b502-4e6e-9535-6c672d6ec309",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['\"How is the \\'Name\\' of a Metric used in reading tables?',\n",
-       " \" What information does the 'Description' section provide in the context of Metrics?\",\n",
-       " \" What parameters are considered for the 'PopularityBias()' Metric?\",\n",
-       " \" How does the 'TestFPR()' function operate at the dataset level?\",\n",
-       " \" What is the role of the 'TestGiniIndex(k=k)' in evaluating dataset bias?\",\n",
-       " \" What visualization is provided by the 'RegressionErrorDistribution()'?\",\n",
-       " \" How does the 'RegressionErrorNormality()' assess value normality?\",\n",
-       " \" What is the primary focus of the 'DiversityMetric' in recommendation systems?\",\n",
-       " \" How are missing values tested in the 'TestShareOfRowsWithMissingValues()'?\",\n",
-       " ' How does the \\'TestNumberOfDuplicatedRows()\\' function evaluate dataset integrity?\"']"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "generated_queries"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3e8547f2-b3a7-4058-8175-b3872f318d1a",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true
-   },
-   "source": [
-    "### [PLEASE IGNORE THE WHOLE BLOCK] Get alternative questions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "fe13a2c7-1c76-4d18-8bde-d1821078822f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#it is not used so far\n",
-    "seed_query = \"How do I get Evidently data drift report for my data?\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "0d135ad3-5be4-45de-8039-c556770c32c1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#random seed question generation\n",
-    "system_prompt = \"You are an assisstant who generates questions based on provided context\"\n",
-    "user_prompt = \"\"\"\n",
-    "Generate a conceptual question based on the provided context and can be answered from the information in the provided context.\n",
-    "Here is a context\n",
-    "<context>\n",
-    "    {context}\n",
-    "</context>\n",
-    "\n",
-    "Remain faithful to the underlying context. \n",
-    "Avoid providing any preamble!\n",
-    "Avoid providing any closing statement!\n",
-    "Please return only a question\n",
-    "\"\"\"\n",
-    "\n",
-    "context = \"\\n\\n\".join(random_chuncks)\n",
-    "\n",
-    "formated_user_prompt = user_prompt.format(context=context, N=number_of_questions)\n",
-    "\n",
-    "response = client.chat.completions.create(\n",
-    "    model=\"gpt-4o\",  # Updated to a valid model\n",
-    "    messages=[\n",
-    "        {\"role\": \"system\", \"content\": system_prompt},\n",
-    "        {\"role\": \"user\", \"content\": formated_user_prompt}\n",
-    "    ],\n",
-    "    max_tokens=400,  # Limits the response length\n",
-    "    temperature=0.7,  # Controls randomness in the output\n",
-    "    n=1\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "0b751174-f5ab-4b2c-bde1-3e1c9bcc9d45",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_seed = response.choices[0].message.content.strip().split(\",\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "82c0618f-e790-4565-acad-e62edf4dfba3",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['How does the `PopularityBias()` metric evaluate recommendation systems',\n",
-       " ' and what parameters are required to compute this metric?']"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "generated_seed"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "39df8c68-84cb-43af-aba3-63d1f10537ac",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#do not forget to write a prompt for seed query generation\n",
-    "system_prompt = \"You are a smart assistant who helps rephrase questions\" \n",
-    "\n",
-    "number_of_reformulations = 5\n",
-    "\n",
-    "seed_query = generated_seed\n",
-    "\n",
-    "user_prompt = \"\"\"Write for me {number_of_reformulations} alternative questions quite similar to the question you got.\n",
-    "The question: {seed_query}\n",
-    "\n",
-    "Return a list of questions.\n",
-    "This should be only a list of string questions, separated by comma\n",
-    "\"\"\"\n",
-    "\n",
-    "formated_user_prompt = user_prompt.format(number_of_reformulations=number_of_reformulations, \n",
-    "                                          seed_query=seed_query)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "983b4545-0511-473e-8797-7fbdf2d5ff54",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Make a request to the OpenAI to expand a seed question\n",
-    "\n",
-    "response = client.chat.completions.create(\n",
-    "    model=\"gpt-4o\",  # Updated to a valid model\n",
-    "    messages=[\n",
-    "        {\"role\": \"system\", \"content\": system_prompt},\n",
-    "        {\"role\": \"user\", \"content\": formated_user_prompt}\n",
-    "    ],\n",
-    "    max_tokens=400,  # Limits the response length\n",
-    "    temperature=0.7,  # Controls randomness in the output\n",
-    "    n=1\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "9c2fe61b-5470-469a-949c-9e1a65c0f4e0",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Generated Completion:\n",
-      "['What parameters are needed to calculate the `PopularityBias()` metric in evaluating recommendation systems?', 'In what way does the `PopularityBias()` metric assess recommendation systems, and what are the necessary parameters?', 'Which parameters are essential for the `PopularityBias()` metric, and how does it evaluate recommendation systems?', 'How is the `PopularityBias()` metric used to evaluate recommendation systems, and what parameters does it need?', 'What is the role of the `PopularityBias()` metric in assessing recommendation systems, and which parameters are required for its computation?']\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "[\"['What parameters are needed to calculate the `PopularityBias()` metric in evaluating recommendation systems?'\",\n",
-       " \" 'In what way does the `PopularityBias()` metric assess recommendation systems\",\n",
-       " \" and what are the necessary parameters?'\",\n",
-       " \" 'Which parameters are essential for the `PopularityBias()` metric\",\n",
-       " \" and how does it evaluate recommendation systems?'\",\n",
-       " \" 'How is the `PopularityBias()` metric used to evaluate recommendation systems\",\n",
-       " \" and what parameters does it need?'\",\n",
-       " \" 'What is the role of the `PopularityBias()` metric in assessing recommendation systems\",\n",
-       " \" and which parameters are required for its computation?']\"]"
-      ]
-     },
-     "execution_count": 31,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "completion_text = response.choices[0].message.content\n",
-    "print(f\"Generated Completion:\\n{completion_text}\")\n",
-    "\n",
-    "queries = completion_text.strip().split(\",\")\n",
-    "queries"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "df169dc5-acbd-46e7-b163-c5ebebb8ea0d",
-   "metadata": {},
-   "source": [
-    "### Find relevant chuncks"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "0333932a-8f6e-48e6-9f28-9f5c0406d091",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def query_collection(question, n_results = 3):\n",
-    "    \"\"\"\n",
-    "    Queries the collection with a given question and returns the relevant text chunks.\n",
-    "    \n",
-    "    Args:\n",
-    "        question (str): The query or question text to search for.\n",
-    "        n_results (int): Number of results to retrieve. Default is 3.\n",
-    "\n",
-    "    Returns:\n",
-    "        List[str]: A list of relevant text chunks.\n",
-    "    \"\"\"\n",
-    "    # Perform the query\n",
-    "    results = collection.query(\n",
-    "        query_texts=question,\n",
-    "        n_results=n_results,\n",
-    "        # include=['embeddings', 'documents', 'distances']\n",
-    "    )\n",
-    "\n",
-    "    # Extract relevant text chunks from the documents\n",
-    "    relevant_chunks = [\n",
-    "        chunk for document in results[\"documents\"] for chunk in document\n",
-    "    ]\n",
-    "    \n",
-    "    return relevant_chunks"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "b827e30d-a7b2-406f-a139-5b7fdd3bab6c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[' times item *i* was rated in the training set (popularity of item *i*)\\n\\n**Range**: 0 to infinity \\n\\n**Interpretation**: the higher the value, the more popular on average the recommendations are in top-K.  \\n\\n**Note**: This metric is not normalized and depends on the number of recommendations in the training set.\\n\\nFurther reading: [Abdollahpouri, H., Mansoury, M., Burke, R., Mobasher, B., & Malthouse, E. (2021). User-centered Evaluation of Popularity Bias in Recommender Systems](https://dl.acm.org/',\n",
-       " 'bdollahpouri, H., Mansoury, M., Burke, R., Mobasher, B., & Malthouse, E. (2021). User-centered Evaluation of Popularity Bias in Recommender Systems](https://dl.acm.org/doi/fullHtml/10.1145/3450613.3456821)\\n\\n# Recommendation table\\n\\n![](../.gitbook/assets/reports/metric_recsys_table-min.png)\\n\\n**Evidently Metric**: `RecCasesTable`\\n\\nThis visual Metric shows the list of recommendations for the specified user IDs (`user_ids: List`). If you do not pass the list of IDs, Evidently will choose 5 random on',\n",
-       " 'reports/metric_popularity_bias-min.png)\\n\\n**Evidently Metric**: `PopularityBias`\\n\\nThe recommendation popularity bias is a tendency to favor a few popular items. This metric includes several measurements: ARP, Coverage and Gini index.\\n\\n## 1. Average Recommendation Popularity (ARP)\\n\\nARP reflects the average popularity of the items recommended to the users. \\n\\n**Implementation**.\\n* Compute the item popularity as the number of times each item was seen in training. \\n* Compute the average popularity for',\n",
-       " '---\\ndescription: List of Metrics, Descriptors and Metric Presets available in Evidently.\\n---\\n\\n<details>\\n\\n<summary>How to use this page</summary>\\n\\nThis is a reference page. It shows all the available Metrics, Descriptors and Presets. \\n  \\nYou can use the menu on the right to navigate the sections. We organize the Metrics by logical groups. Note that these groups do **not** match the Presets with a similar name. For example, there are more Data Quality Metrics than included in the `DataQualityPrese',\n",
-       " 'ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
-       " 'unique <= 2): proportion difference test for independent samples based on Z-score.\\n\\nAll tests use a 0.95 confidence level by default.  \\n\\nFor **larger data with \\\\> 1000 observations** in the reference dataset:\\n\\n* For numerical columns (n\\\\_unique \\\\> 5):[Wasserstein Distance](https://en.wikipedia.org/wiki/Wasserstein_metric).\\n* For categorical columns or numerical with n\\\\_unique <= 5):[Jensen--Shannon divergence](https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence).\\n\\nAll metrics use a t']"
-      ]
-     },
-     "execution_count": 33,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "query_collection(seed_query)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "id": "d549b9ab-1e3a-490e-a57e-669af72dbdb6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#relevant_chunks = [query_collection(query) for query in queries]\n",
-    "relevant_chunks = [query_collection(query) for query in generated_queries]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "id": "cfeb5073-f37c-4d24-85a7-bf2043dacb1e",
-   "metadata": {
-    "collapsed": true,
-    "jupyter": {
-     "outputs_hidden": true
-    },
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[['ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
-       "  'igate the sections. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Test or Test preset.  \\n* **Description**: plain text explanation. For Tests, we specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: available configurations. \\n  * Required parameters are necessary for calculations, e.g. a column name for a column-level test.\\n  * Optional parameters modify how the underlying metric is calculated, e.g. which statistical test or correlation method is use',\n",
-       "  '---\\ndescription: List of Metrics, Descriptors and Metric Presets available in Evidently.\\n---\\n\\n<details>\\n\\n<summary>How to use this page</summary>\\n\\nThis is a reference page. It shows all the available Metrics, Descriptors and Presets. \\n  \\nYou can use the menu on the right to navigate the sections. We organize the Metrics by logical groups. Note that these groups do **not** match the Presets with a similar name. For example, there are more Data Quality Metrics than included in the `DataQualityPrese'],\n",
-       " ['---\\ndescription: List of Metrics, Descriptors and Metric Presets available in Evidently.\\n---\\n\\n<details>\\n\\n<summary>How to use this page</summary>\\n\\nThis is a reference page. It shows all the available Metrics, Descriptors and Presets. \\n  \\nYou can use the menu on the right to navigate the sections. We organize the Metrics by logical groups. Note that these groups do **not** match the Presets with a similar name. For example, there are more Data Quality Metrics than included in the `DataQualityPrese',\n",
-       "  'ity Metrics than included in the `DataQualityPreset`. \\n\\n# How to read the tables\\n\\n* **Name**: the name of the Metric.  \\n* **Description**: plain text explanation. For Metrics, we also specify whether it applies to the whole dataset or individual columns.\\n* **Parameters**: required and optional parameters for the Metric or Preset. We also specify the defaults that apply if you do not pass a custom parameter.\\n\\n**Metric visualizations**. Each Metric includes a default render. To see the visualizati',\n",
-       "  '*Interpretation**: the higher the value, the more varied items are shown to each user (e.g. inside a single recommendation block).\\n\\n**Requirements**: You must pass the `item_features` list to point to numerical columns or embeddings that describe the recommended items. For example, these could be encoded genres that represent each movie. This makes it possible to compare the degree of similarity between different items. \\n\\n**Notes**: \\n* This metric does not consider relevance. A recommender syste'],\n",
-       " ['reports/metric_popularity_bias-min.png)\\n\\n**Evidently Metric**: `PopularityBias`\\n\\nThe recommendation popularity bias is a tendency to favor a few popular items. This metric includes several measurements: ARP, Coverage and Gini index.\\n\\n## 1. Average Recommendation Popularity (ARP)\\n\\nARP reflects the average popularity of the items recommended to the users. \\n\\n**Implementation**.\\n* Compute the item popularity as the number of times each item was seen in training. \\n* Compute the average popularity for',\n",
-       "  ' times item *i* was rated in the training set (popularity of item *i*)\\n\\n**Range**: 0 to infinity \\n\\n**Interpretation**: the higher the value, the more popular on average the recommendations are in top-K.  \\n\\n**Note**: This metric is not normalized and depends on the number of recommendations in the training set.\\n\\nFurther reading: [Abdollahpouri, H., Mansoury, M., Burke, R., Mobasher, B., & Malthouse, E. (2021). User-centered Evaluation of Popularity Bias in Recommender Systems](https://dl.acm.org/',\n",
-       "  'r><br> | **Required**:<ul><li>`k`</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **PopularityBias()** <br><br> Evaluates the popularity bias in recommendations by computing ARP (average recommendation popularity), Gini index, and coverage. <br><br>Requires a training dataset. | **Required**:<ul><li>`K`</li><li>`normalize_arp (default: False)` - whether to normalize ARP calculation by the most popular item in training</li></ul>**Optional**:<ul><li>-</li></ul> |\\n| **ItemBiasMetric()** <br><br> Visu'],\n",
-       " ['th reference**: the test fails if the TNR is over 20% higher or lower.<br><br>**No reference**: the test fails if the TNR is lower than the TNR of the dummy model. |\\n| **TestFPR()** | Dataset-level. <br><br> Computes the False Positive Rate and compares it to the reference or against a defined condition. | **Required**:<br>N/A<br><br> **Optional:**<ul><li>`probas_threshold`(default for classification = None; default for probabilistic classification = 0.5)</li><li>`k` (default = None)</li></ul>**',\n",
-       "  'In some tests and metrics, Evidently uses the default Data Drift Detection algorithm. It helps detect the distribution drift in the individual features, prediction, or target. This page describes how the **default** algorithm works.\\n\\n# How it works\\n\\nEvidently compares the distributions of the values in a given column (or columns) of the two datasets. You should pass these datasets as **reference** and **current**. Evidently applies several statistical tests and drift detection methods to detect ',\n",
-       "  ' parameters**:\\n* `columns`\\n\\n</details>\\n\\n<details>\\n\\n<summary>Data Drift Preset</summary>\\n\\n`DataDriftPreset` evaluates the data distribution drift in all individual columns, and share of drifting columns in the dataset. Input columns are required. \\n\\n**Composition**:\\n* `DataDriftTable()` for all or specified `columns`\\n* `DatasetDriftMetric()` for all or specified `columns`\\n\\n**Optional parameters**:\\n* `columns`\\n* `stattest`\\n* `cat_stattest`\\n* `num_stattest`\\n* `per_column_stattest`\\n* `text_stattest`\\n'],\n",
-       " ['wer, the test fails.<br><br>**No reference**: Tests if Gini Index < 1. |\\n| **TestCoverage(k=k)** | Dataset-level. <br><br> Computes the Coverage at the top K recommendations and compares it to the reference or against a defined condition. <br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li></ul> **Optional**:<br>N/A<br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10% from reference.<br><br>**With reference**: if the Coverage at the top K is over 1',\n",
-       "  '.|\\n| **TestGiniIndex(k=k)** | Dataset-level. <br><br> Computes the Gini Index at the top K recommendations and compares it to the reference or against a defined condition.<br><br>Requires a training dataset. | **Required**:<ul><li>`k`</li></ul> **Optional**:<br>N/A<br><br> **Test conditions**: <ul><li>*standard parameters*</li></ul> | Expects +/-10% from reference.<br><br>**With reference**: if the Gini Index at the top K is over 10% higher or lower, the test fails.<br><br>**No reference**: Test',\n",
-       "  'eference**: if the Coverage at the top K is over 10% higher or lower, the test fails.<br><br>**No reference**: Tests if Coverage > 0.|\\n\\n'],\n",
-       " ['ter plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionPredictedVsActualPlot()** <br><br> Visualizes predicted vs. actual values in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorPlot()** <br><br> Visualizes the model error (predicted - actual) in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionAbsPercentageErrorPlot()** <br><br> Visualizes the absolute percentage error in a line plot. | **Required:**<br>',\n",
-       "  'rcentage error in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorDistribution()** <br><br> Visualizes the distribution of the model error in a histogram. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorNormality()** <br><br> Visualizes the quantile-quantile plot (Q-Q plot) to estimate value normality. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionTopErrorMetric()** <br><br> Calculates the regression performance me',\n",
-       "  'all scores, if available). \\n\\n# Item Bias \\n\\n![](../.gitbook/assets/reports/metric_itembias_2-min.png)\\n\\n**Evidently Metric**: `ItemBiasMetric`\\n\\nThis visual metric shows the distribution of recommendations by a specified category or numerical value (`column_name`) compared to its distribution in the training set.\\n\\nThis helps compare the model recommendations against what could have been a random classifier that follows the observed distribution of items by a chosen characteristic.\\n\\nThe visualizatio'],\n",
-       " ['rcentage error in a line plot. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorDistribution()** <br><br> Visualizes the distribution of the model error in a histogram. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionErrorNormality()** <br><br> Visualizes the quantile-quantile plot (Q-Q plot) to estimate value normality. | **Required:**<br>n/a<br><br>**Optional:**<br>n/a |\\n| **RegressionTopErrorMetric()** <br><br> Calculates the regression performance me',\n",
-       "  'her than the RMSE of the dummy model that predicts the optimal constant (mean of the target value). |\\n| **TestValueMeanError()**<br>| Dataset-level. <br><br> Computes the Mean Error (ME) and tests if it is near zero or compares it against a defined condition. | **Required**:<br>N/A<br><br> **Optional**:<br>N/A<br><br> **Test conditions** <ul><li>*standard parameters*</li></ul>| Expects the Mean Error to be near zero.<br><br>**With/without reference**: the test fails if the Mean Error is skewed a',\n",
-       "  'n and underestimation).</li></ul> |\\n| **RegressionErrorBiasTable()** <br><br> Plots the relationship between feature values and model quality per group (for top-X% error groups, as above). | **Required:**<br>n/a<br><br>**Optional:**<ul><li>`columns`(default = all categorical and numerical columns)</li><li>`top_error` (default=0.05; the metrics are calculated for top-5% predictions with overestimation and underestimation).</li></ul>|\\n\\n# Ranking and Recommendations \\n\\nAll metrics are dataset-level.'],\n",
-       " [\"\\n**Note**: Only a single top relevant item is considered in this metric, disregarding the position and relevance of other items in the list.\\n\\n# Diversity\\n\\n![](../.gitbook/assets/reports/metric_diversity-min.png)\\n\\n**Evidently Metric**: `DiversityMetric`\\n\\n**Recommendation diversity**: this metric measures the average intra-list diversity at K. It reflects the variety of items within the same user's recommendation list, averaged by all users. \\n\\n**Implemented method**:\\n* **Measure the difference bet\",\n",
-       "  \"\\n* **Intra-list diversity**. Calculate intra-list diversity for each user by averaging the Cosine Distance between each pair of items in the user's top-K list.\\n* **Overall diversity**. Calculate the overall diversity by averaging the intra-list diversity across all users.\\n\\n**Range**: The metric is based on Cosine distance, and can take values from 0 to 2. \\n**0:** identical recommendations in top-K.\\n**2:** very diverse recommendations in top-K.\\n\\n**Interpretation**: the higher the value, the more \",\n",
-       "  'ders items that are present in training. \\n\\nFurther reading: [Castells, P., Vargas, S., & Wang, J. (2011). Novelty and Diversity Metrics for Recommender Systems: Choice, Discovery and Relevance](https://repositorio.uam.es/bitstream/handle/10486/666094/novelty_castells_DDR_2011.pdf)\\n\\n# Serendipity\\n\\n![](../.gitbook/assets/reports/metric_serendipity-min.png)\\n\\n**Evidently Metric**: `SerendipityMetric`\\n\\nRecommendation serendipity: this metric measures how unusual the relevant recommendations are in K,'],\n",
-       " ['r> **Optional**: <ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul> **Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0. <br><br>**With reference**: the test fails if the share of missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains missing values.|\\n| **TestShareOfMissingValues()**| Dataset-level. <br><br> Tests the share of missing values in the dataset against ',\n",
-       "  ' test fails if the dataset contains rows with missing values.|\\n| **TestShareOfRowsWithMissingValues()** | Dataset-level. <br><br> Tests the share of rows that contain missing values against the reference or a defined condition. | **Required**:<br>N/A<br><br>**Optional**:<ul><li>`missing_values = [], replace = True/False` (default = default list)</li></ul>**Test conditions** <ul><li>*standard parameters*</li></ul>| Expects up to +10% or 0.<br><br>**With reference**: the test fails if the share of',\n",
-       "  '**With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O'],\n",
-       " ['*: the test fails if there is at least one empty column.|\\n| **TestNumberOfDuplicatedRows()** | Dataset-level. <br><br> Tests the number of duplicate rows against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects +/- 10% or none.<br><br>**With reference**: the test fails if the share of duplicate rows is over 10% higher or lower than in the reference.<br><br>**No reference**: the te',\n",
-       "  '**With reference**: the test fails if the share of rows with missing values is over 10% higher than in reference. <br><br>**No reference**: the test fails if the dataset contains rows with missing values.|\\n| **TestNumberOfDifferentMissingValues()**| Dataset-level. <br><br> Tests the number of differently encoded missing values in the dataset against the reference or a defined condition. Detects 4 types of missing values by default and/or values from a user list. | **Required**:<br>N/A<br><br>**O',\n",
-       "  ' in the reference.<br><br>**No reference**: the test fails if there is at least one duplicate row. |\\n| **TestNumberOfDuplicatedColumns()** | Dataset-level. <br><br> Tests the number of duplicate columns against reference or a defined condition. |**Required**:<br> N/A <br><br> **Optional**:<br> N/A <br><br>**Test conditions**: <ul><li>*standard parameters*</li></ul>| Expects =< or none.<br><br>**With reference**: the test fails if the number of duplicate columns is higher than in the reference.<b']]"
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "relevant_chunks"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5e24bdc4-fb5c-4df4-b00f-01903c4ed370",
-   "metadata": {},
-   "source": [
-    "### Baseline answer generation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "id": "ee992257-020d-461b-9b2f-928b93acb4c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Make a request to the OpenAI to answer generated question with relevant context\n",
-    "\n",
-    "def generate_baseline_answer(query, relevant_chunks):\n",
-    "    system_prompt = \"You are a helpful assistant thet answer a given question directly withou any preamble\"\n",
-    "\n",
-    "    user_prompt = \"\"\"\n",
-    "    Your task is to answer the following query: \n",
-    "    <query>\n",
-    "    {query}\n",
-    "    </query>\n",
-    "    \n",
-    "    You have access to the following documents which are meant to provide context as you answer the query:\n",
-    "    <documents>\n",
-    "    {context}\n",
-    "    </documents>\n",
-    "    \n",
-    "    Please remain faithful to the underlying context, and deviate from it only if you haven't found the answer in the provided context. \n",
-    "    Avoid providing any preamble!\n",
-    "    Avoid providing any closing statement!\n",
-    "    Please return the answer only\n",
-    "    \"\"\"\n",
-    "    \n",
-    "    context = \"\\n\\n\".join(relevant_chunks)\n",
-    "    formated_user_prompt = user_prompt.format(query=query, context=context)\n",
-    "\n",
-    "    response = client.chat.completions.create(\n",
-    "        model=\"gpt-4o\",  # Updated to a valid model\n",
-    "        messages=[\n",
-    "            {\"role\": \"system\", \"content\": system_prompt},\n",
-    "            {\"role\": \"user\", \"content\": formated_user_prompt}\n",
-    "        ],\n",
-    "        max_tokens=400,  # Limits the response length\n",
-    "        temperature=0.7,  # Controls randomness in the output\n",
-    "        n=1\n",
-    "    )\n",
-    "    \n",
-    "    completion_text = response.choices[0].message.content\n",
-    "    return completion_text"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "id": "8db672b0-f63a-400b-b00f-22e96d02dbe5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "baseline_answers = [generate_baseline_answer(generated_queries[i], relevant_chunks[i]) for i in range(min(len(generated_queries), len(relevant_chunks)))]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "id": "ae366d95-3438-4d6c-8030-8e8c666e0e17",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_dataset = pd.DataFrame({\n",
-    "    'Query': generated_queries,\n",
-    "    'Relevant chunks': relevant_chunks,\n",
-    "    'Baseline_answers': baseline_answers\n",
-    "})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "id": "bdfc1029-34bb-4870-bcc2-0c32f56a0bc1",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Query</th>\n",
-       "      <th>Relevant chunks</th>\n",
-       "      <th>Baseline_answers</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>\"How is the 'Name' of a Metric used in reading...</td>\n",
-       "      <td>[ity Metrics than included in the `DataQuality...</td>\n",
-       "      <td>The 'Name' of a Metric is used to identify the...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>What information does the 'Description' secti...</td>\n",
-       "      <td>[---\\ndescription: List of Metrics, Descriptor...</td>\n",
-       "      <td>The 'Description' section provides a plain tex...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>What parameters are considered for the 'Popul...</td>\n",
-       "      <td>[reports/metric_popularity_bias-min.png)\\n\\n**...</td>\n",
-       "      <td>ARP, Coverage, and Gini index are the paramete...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>How does the 'TestFPR()' function operate at ...</td>\n",
-       "      <td>[th reference**: the test fails if the TNR is ...</td>\n",
-       "      <td>The `TestFPR()` function operates at the datas...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>What is the role of the 'TestGiniIndex(k=k)' ...</td>\n",
-       "      <td>[wer, the test fails.&lt;br&gt;&lt;br&gt;**No reference**:...</td>\n",
-       "      <td>The role of the 'TestGiniIndex(k=k)' in evalua...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>What visualization is provided by the 'Regres...</td>\n",
-       "      <td>[ter plot. | **Required:**&lt;br&gt;n/a&lt;br&gt;&lt;br&gt;**Opt...</td>\n",
-       "      <td>Visualizes the distribution of the model error...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>How does the 'RegressionErrorNormality()' ass...</td>\n",
-       "      <td>[rcentage error in a line plot. | **Required:*...</td>\n",
-       "      <td>RegressionErrorNormality() assesses value norm...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>What is the primary focus of the 'DiversityMe...</td>\n",
-       "      <td>[\\n**Note**: Only a single top relevant item i...</td>\n",
-       "      <td>The primary focus of the 'DiversityMetric' in ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>How are missing values tested in the 'TestSha...</td>\n",
-       "      <td>[r&gt; **Optional**: &lt;ul&gt;&lt;li&gt;`missing_values = []...</td>\n",
-       "      <td>The 'TestShareOfRowsWithMissingValues()' tests...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>How does the 'TestNumberOfDuplicatedRows()' f...</td>\n",
-       "      <td>[*: the test fails if there is at least one em...</td>\n",
-       "      <td>The 'TestNumberOfDuplicatedRows()' function ev...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                               Query  \\\n",
-       "0  \"How is the 'Name' of a Metric used in reading...   \n",
-       "1   What information does the 'Description' secti...   \n",
-       "2   What parameters are considered for the 'Popul...   \n",
-       "3   How does the 'TestFPR()' function operate at ...   \n",
-       "4   What is the role of the 'TestGiniIndex(k=k)' ...   \n",
-       "5   What visualization is provided by the 'Regres...   \n",
-       "6   How does the 'RegressionErrorNormality()' ass...   \n",
-       "7   What is the primary focus of the 'DiversityMe...   \n",
-       "8   How are missing values tested in the 'TestSha...   \n",
-       "9   How does the 'TestNumberOfDuplicatedRows()' f...   \n",
-       "\n",
-       "                                     Relevant chunks  \\\n",
-       "0  [ity Metrics than included in the `DataQuality...   \n",
-       "1  [---\\ndescription: List of Metrics, Descriptor...   \n",
-       "2  [reports/metric_popularity_bias-min.png)\\n\\n**...   \n",
-       "3  [th reference**: the test fails if the TNR is ...   \n",
-       "4  [wer, the test fails.<br><br>**No reference**:...   \n",
-       "5  [ter plot. | **Required:**<br>n/a<br><br>**Opt...   \n",
-       "6  [rcentage error in a line plot. | **Required:*...   \n",
-       "7  [\\n**Note**: Only a single top relevant item i...   \n",
-       "8  [r> **Optional**: <ul><li>`missing_values = []...   \n",
-       "9  [*: the test fails if there is at least one em...   \n",
-       "\n",
-       "                                    Baseline_answers  \n",
-       "0  The 'Name' of a Metric is used to identify the...  \n",
-       "1  The 'Description' section provides a plain tex...  \n",
-       "2  ARP, Coverage, and Gini index are the paramete...  \n",
-       "3  The `TestFPR()` function operates at the datas...  \n",
-       "4  The role of the 'TestGiniIndex(k=k)' in evalua...  \n",
-       "5  Visualizes the distribution of the model error...  \n",
-       "6  RegressionErrorNormality() assesses value norm...  \n",
-       "7  The primary focus of the 'DiversityMetric' in ...  \n",
-       "8  The 'TestShareOfRowsWithMissingValues()' tests...  \n",
-       "9  The 'TestNumberOfDuplicatedRows()' function ev...  "
-      ]
-     },
-     "execution_count": 42,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "generated_dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "3de32ca3-162f-4ed8-ba88-09a5b9572457",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.set_option(\"display.max_colwidth\", None)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "id": "db43a50d-4b1a-4b42-a529-67e85bef0f9a",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Query</th>\n",
-       "      <th>Baseline_answers</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>\"How is the 'Name' of a Metric used in reading tables?</td>\n",
-       "      <td>The 'Name' of a Metric is used to identify the specific Metric being referenced.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>What information does the 'Description' section provide in the context of Metrics?</td>\n",
-       "      <td>The 'Description' section provides a plain text explanation of the Metric, specifying whether it applies to the whole dataset or individual columns.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>What parameters are considered for the 'PopularityBias()' Metric?</td>\n",
-       "      <td>ARP, Coverage, and Gini index are the parameters considered for the 'PopularityBias()' Metric.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>How does the 'TestFPR()' function operate at the dataset level?</td>\n",
-       "      <td>The `TestFPR()` function operates at the dataset level by computing the False Positive Rate (FPR) and comparing it to a reference or against a defined condition.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>What is the role of the 'TestGiniIndex(k=k)' in evaluating dataset bias?</td>\n",
-       "      <td>The role of the 'TestGiniIndex(k=k)' in evaluating dataset bias is to compute the Gini Index at the top K recommendations and compare it to a reference or a defined condition. If the Gini Index at the top K is over 10% higher or lower than the reference, the test fails. This helps in assessing the fairness and distribution of recommendations, indicating potential bias if the Gini Index significantly deviates from the reference.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>What visualization is provided by the 'RegressionErrorDistribution()'?</td>\n",
-       "      <td>Visualizes the distribution of the model error in a histogram.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>How does the 'RegressionErrorNormality()' assess value normality?</td>\n",
-       "      <td>RegressionErrorNormality() assesses value normality by visualizing the quantile-quantile plot (Q-Q plot).</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>What is the primary focus of the 'DiversityMetric' in recommendation systems?</td>\n",
-       "      <td>The primary focus of the 'DiversityMetric' in recommendation systems is to measure the average intra-list diversity at K, reflecting the variety of items within the same user's recommendation list, averaged by all users.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>How are missing values tested in the 'TestShareOfRowsWithMissingValues()'?</td>\n",
-       "      <td>The 'TestShareOfRowsWithMissingValues()' tests the share of rows that contain missing values against a reference or a defined condition. With reference, the test fails if the share of rows with missing values is over 10% higher than in the reference. Without reference, the test fails if the dataset contains any rows with missing values.</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>How does the 'TestNumberOfDuplicatedRows()' function evaluate dataset integrity?\"</td>\n",
-       "      <td>The 'TestNumberOfDuplicatedRows()' function evaluates dataset integrity by testing the number of duplicate rows against a reference or a defined condition. If a reference is provided, the test fails if the share of duplicate rows is over 10% higher or lower than in the reference. If no reference is provided, the test fails if there is at least one duplicate row.</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                                                 Query  \\\n",
-       "0                               \"How is the 'Name' of a Metric used in reading tables?   \n",
-       "1   What information does the 'Description' section provide in the context of Metrics?   \n",
-       "2                    What parameters are considered for the 'PopularityBias()' Metric?   \n",
-       "3                      How does the 'TestFPR()' function operate at the dataset level?   \n",
-       "4             What is the role of the 'TestGiniIndex(k=k)' in evaluating dataset bias?   \n",
-       "5               What visualization is provided by the 'RegressionErrorDistribution()'?   \n",
-       "6                    How does the 'RegressionErrorNormality()' assess value normality?   \n",
-       "7        What is the primary focus of the 'DiversityMetric' in recommendation systems?   \n",
-       "8           How are missing values tested in the 'TestShareOfRowsWithMissingValues()'?   \n",
-       "9    How does the 'TestNumberOfDuplicatedRows()' function evaluate dataset integrity?\"   \n",
-       "\n",
-       "                                                                                                                                                                                                                                                                                                                                                                                                                                  Baseline_answers  \n",
-       "0                                                                                                                                                                                                                                                                                                                                                                 The 'Name' of a Metric is used to identify the specific Metric being referenced.  \n",
-       "1                                                                                                                                                                                                                                                                                             The 'Description' section provides a plain text explanation of the Metric, specifying whether it applies to the whole dataset or individual columns.  \n",
-       "2                                                                                                                                                                                                                                                                                                                                                   ARP, Coverage, and Gini index are the parameters considered for the 'PopularityBias()' Metric.  \n",
-       "3                                                                                                                                                                                                                                                                                The `TestFPR()` function operates at the dataset level by computing the False Positive Rate (FPR) and comparing it to a reference or against a defined condition.  \n",
-       "4  The role of the 'TestGiniIndex(k=k)' in evaluating dataset bias is to compute the Gini Index at the top K recommendations and compare it to a reference or a defined condition. If the Gini Index at the top K is over 10% higher or lower than the reference, the test fails. This helps in assessing the fairness and distribution of recommendations, indicating potential bias if the Gini Index significantly deviates from the reference.  \n",
-       "5                                                                                                                                                                                                                                                                                                                                                                                   Visualizes the distribution of the model error in a histogram.  \n",
-       "6                                                                                                                                                                                                                                                                                                                                        RegressionErrorNormality() assesses value normality by visualizing the quantile-quantile plot (Q-Q plot).  \n",
-       "7                                                                                                                                                                                                                     The primary focus of the 'DiversityMetric' in recommendation systems is to measure the average intra-list diversity at K, reflecting the variety of items within the same user's recommendation list, averaged by all users.  \n",
-       "8                                                                                               The 'TestShareOfRowsWithMissingValues()' tests the share of rows that contain missing values against a reference or a defined condition. With reference, the test fails if the share of rows with missing values is over 10% higher than in the reference. Without reference, the test fails if the dataset contains any rows with missing values.  \n",
-       "9                                                                     The 'TestNumberOfDuplicatedRows()' function evaluates dataset integrity by testing the number of duplicate rows against a reference or a defined condition. If a reference is provided, the test fails if the share of duplicate rows is over 10% higher or lower than in the reference. If no reference is provided, the test fails if there is at least one duplicate row.  "
-      ]
-     },
-     "execution_count": 44,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "generated_dataset[[\"Query\", \"Baseline_answers\"]]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8530a6c0-6d4d-4c44-be73-f7d3c7d88e50",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

From f22f1f964c68e67386d95b1d35a4aafda378fb9c Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Fri, 11 Oct 2024 22:44:51 +0200
Subject: [PATCH 49/63] mypy

---
 setup.cfg                                |  6 ++
 src/evidently/dataset_generators/base.py |  3 +-
 src/evidently/utils/llm.py               | 71 +++++-------------------
 3 files changed, 23 insertions(+), 57 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 7f9f43d785..caad3b9970 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -106,6 +106,12 @@ ignore_missing_imports = True
 [mypy-litellm.*]
 ignore_missing_imports = True
 
+[mypy-chromadb.*]
+ignore_missing_imports = True
+
+[mypy-llama_index.*]
+ignore_missing_imports = True
+
 [tool:pytest]
 testpaths=tests
 python_classes=*Test
diff --git a/src/evidently/dataset_generators/base.py b/src/evidently/dataset_generators/base.py
index a13b067de7..61fe38ab6f 100644
--- a/src/evidently/dataset_generators/base.py
+++ b/src/evidently/dataset_generators/base.py
@@ -1,12 +1,13 @@
 from abc import ABC
 from abc import abstractmethod
+from typing import TypeAlias
 
 import pandas as pd
 
 from evidently.options.base import Options
 from evidently.pydantic_utils import EvidentlyBaseModel
 
-DatasetGeneratorResult = pd.DataFrame
+DatasetGeneratorResult: TypeAlias = pd.DataFrame
 
 
 class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index d76a9e0a01..36663a64c2 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -67,7 +67,7 @@ class RateLimiter:
     def __init__(self, rate: Optional[int], interval: datetime.timedelta):
         self.rate = rate
         self.interval = interval
-        self.enters = []
+        self.enters: List[datetime.datetime] = []
         self.lock = Lock()
 
     async def __aenter__(self):
@@ -208,9 +208,11 @@ class OpenAIWrapper(LLMWrapper):
     __used_options__: ClassVar = [OpenAIKey]
 
     def __init__(self, model: str, options: Options):
+        import openai
+
         self.model = model
         self.options = options.get(OpenAIKey)
-        self._clients = {}
+        self._clients: Dict[int, openai.AsyncOpenAI] = {}
 
     @property
     def client(self):
@@ -246,7 +248,7 @@ class LiteLLMWrapper(LLMWrapper):
     def __init__(self, model: str):
         self.model = model
 
-    def complete(self, messages: List[LLMMessage]) -> str:
+    async def complete(self, messages: List[LLMMessage]) -> str:
         from litellm import completion
 
         return completion(model=self.model, messages=messages).choices[0].message.content
@@ -316,6 +318,9 @@ def parse_response(self, response: str) -> TResult:
 
 
 class NoopOutputFormat(OutputFormatBlock[str]):
+    def _render(self) -> str:
+        return ""
+
     def parse_response(self, response: str) -> str:
         return response
 
@@ -381,7 +386,7 @@ def inner(self: PromptTemplate, *args, **kwargs):
             )
 
         output_format = self.get_output_format()
-        prompt_response_type = _get_genric_arg(output_format)
+        prompt_response_type = _get_genric_arg(output_format.__class__)
         if prompt_response_type != response_type:
             raise TypeError(
                 f"{f} response type ({response_type}) does not correspond to prompt output type {prompt_response_type}"
@@ -399,7 +404,7 @@ def inner(self: PromptTemplate, *args, **kwargs):
     return inner
 
 
-def _get_genric_arg(cls):
+def _get_genric_arg(cls: Type):
     return typing_inspect.get_args(next(b for b in cls.__orig_bases__ if typing_inspect.is_generic_type(b)))[0]
 
 
@@ -431,7 +436,9 @@ def list_placeholders(self, template: Optional[str] = None):
         return list(placeholders_re.findall(template))
 
     def get_output_format(self) -> OutputFormatBlock:
-        output = next((b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None)
+        output: Optional[OutputFormatBlock] = next(
+            (b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None
+        )
         return output if output is not None else NoopOutputFormat()
 
     def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, Any]:
@@ -468,54 +475,6 @@ def _to_block(self, block: AnyBlock) -> PromptBlock:
             return block
         if isinstance(block, str):
             return PromptBlock.simple(block)
-        if callable(block):
-            return PromptBlock.func(block)
+        # if callable(block):  todo
+        #     return PromptBlock.func(block)
         raise NotImplementedError(f"Cannot create promt block from {block}")
-
-
-# class BinaryClassificationPromtTemplate(PromptTemplate):
-#     def get_blocks(self) -> Sequence[PromptBlock]:
-#         fields = {}
-#         if self.include_category:
-#             cat = f"{self.target_category} or {self.non_target_category}"
-#             if self.uncertainty == Uncertainty.UNKNOWN:
-#                 cat += " or UNKNOWN"
-#             fields["category"] = (cat, self.output_column)
-#         if self.include_score:
-#             fields["score"] = ("<score here>", self.output_score_column)
-#         if self.include_reasoning:
-#             fields["reasoning"] = ('"<reasoning here>"', self.output_reasoning_column)
-#         return [
-#             PromptBlock.simple(self.criteria),
-#             PromptBlock.simple(
-#                 f"Classify text between {self.anchor_start} and {self.anchor_end} "
-#                 f"into two categories: {self.target_category} and {self.non_target_category}."
-#             ),
-#             PromptBlock.input().anchored(self.anchor_start, self.anchor_end),
-#             PromptBlock.func(self._instructions),
-#             JsonOutputFormatBlock(fields=fields),
-#         ]
-#
-#     criteria: str = ""
-#     instructions_template: str = (
-#         "Use the following categories for classification:\n{__categories__}\n{__scoring__}\nThink step by step."
-#     )
-#     anchor_start: str = "___text_starts_here___"
-#     anchor_end: str = "___text_ends_here___"
-#
-#     placeholders: Dict[str, str] = {}
-#     target_category: str
-#     non_target_category: str
-#
-#     uncertainty: Uncertainty = Uncertainty.UNKNOWN
-#
-#     include_category: bool = True
-#     include_reasoning: bool = False
-#     include_score: bool = False
-#     score_range: Tuple[float, float] = (0.0, 1.0)
-#
-#     output_column: str = "category"
-#     output_reasoning_column: str = "reasoning"
-#     output_score_column: str = "score"
-#
-#     pre_messages: List[LLMMessage] = Field(default_factory=list)

From d3f91228ffec22d23947fe41a0173580de761236 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 18:24:53 +0400
Subject: [PATCH 50/63] move, add splitter

---
 .../dataset_generators/__init__.py            |   0
 .../dataset_generators/base.py                |   0
 .../dataset_generators/llm/__init__.py        |   0
 .../dataset_generators/llm/base.py            |   2 +-
 .../dataset_generators/llm/index.py           |  18 +--
 .../dataset_generators/llm/prompts.py         |   0
 .../dataset_generators/llm/questions.py       |  18 +--
 .../dataset_generators/llm/splitter.py        | 125 ++++++++++++++++++
 8 files changed, 145 insertions(+), 18 deletions(-)
 rename src/evidently/{ => experimental}/dataset_generators/__init__.py (100%)
 rename src/evidently/{ => experimental}/dataset_generators/base.py (100%)
 rename src/evidently/{ => experimental}/dataset_generators/llm/__init__.py (100%)
 rename src/evidently/{ => experimental}/dataset_generators/llm/base.py (89%)
 rename src/evidently/{ => experimental}/dataset_generators/llm/index.py (87%)
 rename src/evidently/{ => experimental}/dataset_generators/llm/prompts.py (100%)
 rename src/evidently/{ => experimental}/dataset_generators/llm/questions.py (77%)
 create mode 100644 src/evidently/experimental/dataset_generators/llm/splitter.py

diff --git a/src/evidently/dataset_generators/__init__.py b/src/evidently/experimental/dataset_generators/__init__.py
similarity index 100%
rename from src/evidently/dataset_generators/__init__.py
rename to src/evidently/experimental/dataset_generators/__init__.py
diff --git a/src/evidently/dataset_generators/base.py b/src/evidently/experimental/dataset_generators/base.py
similarity index 100%
rename from src/evidently/dataset_generators/base.py
rename to src/evidently/experimental/dataset_generators/base.py
diff --git a/src/evidently/dataset_generators/llm/__init__.py b/src/evidently/experimental/dataset_generators/llm/__init__.py
similarity index 100%
rename from src/evidently/dataset_generators/llm/__init__.py
rename to src/evidently/experimental/dataset_generators/llm/__init__.py
diff --git a/src/evidently/dataset_generators/llm/base.py b/src/evidently/experimental/dataset_generators/llm/base.py
similarity index 89%
rename from src/evidently/dataset_generators/llm/base.py
rename to src/evidently/experimental/dataset_generators/llm/base.py
index 8be9f7aad6..09bfaa411a 100644
--- a/src/evidently/dataset_generators/llm/base.py
+++ b/src/evidently/experimental/dataset_generators/llm/base.py
@@ -1,7 +1,7 @@
 from typing import Optional
 
 from evidently._pydantic_compat import PrivateAttr
-from evidently.dataset_generators.base import BaseDatasetGenerator
+from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
 from evidently.options.base import Options
 from evidently.utils.llm import LLMWrapper
 from evidently.utils.llm import get_llm_wrapper
diff --git a/src/evidently/dataset_generators/llm/index.py b/src/evidently/experimental/dataset_generators/llm/index.py
similarity index 87%
rename from src/evidently/dataset_generators/llm/index.py
rename to src/evidently/experimental/dataset_generators/llm/index.py
index 4349e41312..7b0bb705b7 100644
--- a/src/evidently/dataset_generators/llm/index.py
+++ b/src/evidently/experimental/dataset_generators/llm/index.py
@@ -8,9 +8,9 @@
 import chromadb
 from chromadb.types import Collection
 from chromadb.utils import embedding_functions
-from llama_index.core.node_parser import SentenceSplitter
-from pypdf import PdfReader
 
+from evidently.experimental.dataset_generators.llm.splitter import AnySplitter
+from evidently.experimental.dataset_generators.llm.splitter import Splitter
 from evidently.pydantic_utils import EvidentlyBaseModel
 
 Chunk = str
@@ -24,6 +24,10 @@
 def read_text(filename: str) -> str:
     file_path = Path(filename)
     if file_path.suffix.lower() == ".pdf":
+        try:
+            from pypdf import PdfReader
+        except ImportError as e:
+            raise ImportError("Please install pypdf to extract context from .pdf files") from e
         reader = PdfReader(file_path)
         text = ""
         for page_num in range(len(reader.pages)):
@@ -40,6 +44,7 @@ class Config:
 
     chunk_size: int = DEFAULT_CHUNK_SIZE
     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
+    splitter: AnySplitter = "llama_index"
 
     def get_data_collection(self) -> "DataCollection":
         raise NotImplementedError
@@ -68,16 +73,13 @@ class FileDataCollectionProvider(DataCollectionProvider):
     path: str
 
     def get_data_collection(self):
-        splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
-        text_nodes = []
         file_path = Path(self.path)
         paths = [self.path] if file_path.is_file() else glob.glob(os.path.join(self.path, "*"))
 
-        for filename in paths:
-            nodes = splitter.split_text(read_text(filename))
-            text_nodes.extend(nodes)
+        splitter = Splitter.from_any(self.splitter, self.chunk_size, self.chunk_overlap)
+        chunks = list(splitter.split([read_text(p) for p in paths]))
 
-        data_collection = DataCollection(name=file_path.name, chunks=text_nodes)
+        data_collection = DataCollection(name=file_path.name, chunks=chunks)
         data_collection.init_collection()
         return data_collection
 
diff --git a/src/evidently/dataset_generators/llm/prompts.py b/src/evidently/experimental/dataset_generators/llm/prompts.py
similarity index 100%
rename from src/evidently/dataset_generators/llm/prompts.py
rename to src/evidently/experimental/dataset_generators/llm/prompts.py
diff --git a/src/evidently/dataset_generators/llm/questions.py b/src/evidently/experimental/dataset_generators/llm/questions.py
similarity index 77%
rename from src/evidently/dataset_generators/llm/questions.py
rename to src/evidently/experimental/dataset_generators/llm/questions.py
index c170e35953..7d18b55ed1 100644
--- a/src/evidently/dataset_generators/llm/questions.py
+++ b/src/evidently/experimental/dataset_generators/llm/questions.py
@@ -5,15 +5,15 @@
 
 import pandas as pd
 
-from evidently.dataset_generators.base import DatasetGeneratorResult
-from evidently.dataset_generators.llm.base import BaseLLMDatasetGenerator
-from evidently.dataset_generators.llm.index import Chunk
-from evidently.dataset_generators.llm.index import DataCollection
-from evidently.dataset_generators.llm.index import DataCollectionProvider
-from evidently.dataset_generators.llm.prompts import BaselineAnswerPrompt
-from evidently.dataset_generators.llm.prompts import NaiveQuestionsFromContext
-from evidently.dataset_generators.llm.prompts import QuestionsFromContext
-from evidently.dataset_generators.llm.prompts import QuestionsFromSeed
+from evidently.experimental.dataset_generators.base import DatasetGeneratorResult
+from evidently.experimental.dataset_generators.llm.base import BaseLLMDatasetGenerator
+from evidently.experimental.dataset_generators.llm.index import Chunk
+from evidently.experimental.dataset_generators.llm.index import DataCollection
+from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
+from evidently.experimental.dataset_generators.llm.prompts import BaselineAnswerPrompt
+from evidently.experimental.dataset_generators.llm.prompts import NaiveQuestionsFromContext
+from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromContext
+from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromSeed
 
 Question = str
 Answer = str
diff --git a/src/evidently/experimental/dataset_generators/llm/splitter.py b/src/evidently/experimental/dataset_generators/llm/splitter.py
new file mode 100644
index 0000000000..d810956b52
--- /dev/null
+++ b/src/evidently/experimental/dataset_generators/llm/splitter.py
@@ -0,0 +1,125 @@
+import re
+from abc import ABC
+from abc import abstractmethod
+from enum import Enum
+from typing import ClassVar
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Union
+
+from evidently._pydantic_compat import PrivateAttr
+from evidently.pydantic_utils import EvidentlyBaseModel
+
+
+class TextSource:
+    @classmethod
+    def from_any(cls, text_source: "AnyTextSource"):
+        if isinstance(text_source, TextSource):
+            return text_source
+        if isinstance(text_source, str):
+            return StrSource(text_source)
+        raise NotImplementedError(f"Cannot create TextSource from {text_source.__class__.__name__}")
+
+    @abstractmethod
+    def get_text(self) -> str:
+        raise NotImplementedError
+
+
+class StrSource(TextSource):
+    def __init__(self, value: str):
+        self.value = value
+
+    def get_text(self) -> str:
+        return self.value
+
+
+AnyTextSource = Union[str, bytes, TextSource]
+
+Chunk = str
+Split = str
+
+
+class Splitters(str, Enum):
+    Simple = "simple"
+    LlamaIndex = "llama_index"
+
+
+AnySplitter = Union[str, Splitters, "Splitter"]
+
+
+class Splitter(EvidentlyBaseModel, ABC):
+    class Config:
+        is_base_type = True
+        alias_required = False  # fixme
+
+    chunk_size: int
+    chunk_overlap: int
+
+    def split(self, texts: Union[AnyTextSource, List[AnyTextSource]]) -> Sequence[Chunk]:
+        if not isinstance(texts, list):
+            texts = [texts]
+
+        for text in texts:
+            yield from self.split_text(TextSource.from_any(text))
+
+    @abstractmethod
+    def split_text(self, text: TextSource) -> Sequence[Chunk]:
+        raise NotImplementedError
+
+    @classmethod
+    def from_any(cls, splitter: AnySplitter, chunk_size: int, chunk_overlap: int, **kwargs):
+        if isinstance(splitter, Splitter):
+            return splitter
+        if isinstance(splitter, str):
+            splitter = Splitters(splitter)
+        if isinstance(splitter, Splitters):
+            if splitter == Splitters.Simple:
+                return SimpleSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+            if splitter == Splitters.LlamaIndex:
+                return LlamaIndexSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
+            raise ValueError(f"Unknown splitter {splitter}")
+        raise NotImplementedError(f"Cannot create splitter from {splitter.__class__.__name__}")
+
+
+class SimpleSplitter(Splitter):
+    split_re: ClassVar = re.compile(r"([^,.;。？！]+[,.;。？！]?)")
+
+    def split_text(self, text: TextSource) -> Sequence[Chunk]:
+        current_splits = []
+        current_size = 0
+        for split in self.split_re.split(text.get_text()):
+            split_size = len(split)
+            if len(current_splits) > 0 and current_size + split_size > self.chunk_size:
+                yield "".join(current_splits)
+                while current_size > self.chunk_overlap and len(current_splits) > 0:
+                    last, *current_splits = current_splits
+                    last_size = len(last)
+                    current_size -= last_size
+            current_size += split_size
+            current_splits.append(split)
+        if current_size > 0:
+            yield "".join(current_splits)
+
+
+class LlamaIndexSplitter(Splitter):
+    separator: str = " "
+    paragraph_separator: Optional[str] = None
+    _splitter = PrivateAttr(None)
+
+    @property
+    def splitter(self):
+        if self._splitter is None:
+            from llama_index.core.node_parser import SentenceSplitter
+            from llama_index.core.node_parser.text.sentence import DEFAULT_PARAGRAPH_SEP
+
+            self._splitter = SentenceSplitter(
+                chunk_size=self.chunk_size,
+                chunk_overlap=self.chunk_overlap,
+                separator=self.separator,
+                paragraph_separator=self.paragraph_separator or DEFAULT_PARAGRAPH_SEP,
+            )
+        return self._splitter
+
+    def split_text(self, text: TextSource) -> Sequence[Chunk]:
+        yield from self.splitter.split_text(text.get_text())

From a7934337854afece36d55d327f72ce91aeeeb8e8 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 18:28:03 +0400
Subject: [PATCH 51/63] fix example and deps

---
 examples/synth_data.py                                     | 4 ++--
 requirements.min.txt                                       | 1 -
 setup.py                                                   | 1 -
 src/evidently/experimental/dataset_generators/llm/index.py | 4 ----
 4 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/examples/synth_data.py b/examples/synth_data.py
index edb0cb78a9..d0463cda00 100644
--- a/examples/synth_data.py
+++ b/examples/synth_data.py
@@ -1,5 +1,5 @@
-from evidently.dataset_generators.llm.questions import QADatasetFromSeedGenerator, QADatasetGenerator
-from evidently.dataset_generators.llm.index import DataCollectionProvider
+from evidently.experimental.dataset_generators.llm.questions import QADatasetFromSeedGenerator, QADatasetGenerator
+from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
 from evidently.options.base import Options
 
 
diff --git a/requirements.min.txt b/requirements.min.txt
index 6d96eff823..e7a5d12f28 100644
--- a/requirements.min.txt
+++ b/requirements.min.txt
@@ -32,4 +32,3 @@ evaluate==0.4.1
 transformers[torch]==4.39.3
 sentence-transformers==2.7.0
 chromadb==0.4.0
-llama-index==0.8.42
diff --git a/setup.py b/setup.py
index 1f892b61b2..df67f1a052 100644
--- a/setup.py
+++ b/setup.py
@@ -103,7 +103,6 @@
             "transformers[torch]>=4.39.3",
             "sentence-transformers>=2.7.0",
             "chromadb>=0.4.0",
-            "llama-index>=0.8.42",
         ],
         "spark": ["pyspark>=3.4.0"],
         "fsspec": [
diff --git a/src/evidently/experimental/dataset_generators/llm/index.py b/src/evidently/experimental/dataset_generators/llm/index.py
index 7b0bb705b7..cb632ccec1 100644
--- a/src/evidently/experimental/dataset_generators/llm/index.py
+++ b/src/evidently/experimental/dataset_generators/llm/index.py
@@ -1,6 +1,5 @@
 import glob
 import os
-import warnings
 from pathlib import Path
 from typing import List
 from typing import Optional
@@ -17,9 +16,6 @@
 DEFAULT_CHUNK_SIZE = 512
 DEFAULT_CHUNK_OVERLAP = 20
 
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-warnings.filterwarnings("ignore", category=FutureWarning)
-
 
 def read_text(filename: str) -> str:
     file_path = Path(filename)

From fc320eeee9e0427aff83c23e82503e5c00c78b48 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 18:45:13 +0400
Subject: [PATCH 52/63] rename

---
 examples/{synth_data.py => data_generators.py}   |  2 +-
 .../experimental/dataset_generators/llm/index.py | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)
 rename examples/{synth_data.py => data_generators.py} (97%)

diff --git a/examples/synth_data.py b/examples/data_generators.py
similarity index 97%
rename from examples/synth_data.py
rename to examples/data_generators.py
index d0463cda00..1cd8ef87e1 100644
--- a/examples/synth_data.py
+++ b/examples/data_generators.py
@@ -5,7 +5,7 @@
 
 def generate_from_file():
     file_path = "../cloud_quickstart_tracing.pdf"
-    data = DataCollectionProvider.from_files(file_path, chunk_size=50, chunk_overlap=20)
+    data = DataCollectionProvider.from_files(file_path, chunk_size=50, chunk_overlap=20, splitter="simple")
 
     generator = QADatasetGenerator(
         data_collection=data,
diff --git a/src/evidently/experimental/dataset_generators/llm/index.py b/src/evidently/experimental/dataset_generators/llm/index.py
index cb632ccec1..c249588d8e 100644
--- a/src/evidently/experimental/dataset_generators/llm/index.py
+++ b/src/evidently/experimental/dataset_generators/llm/index.py
@@ -47,9 +47,15 @@ def get_data_collection(self) -> "DataCollection":
 
     @classmethod
     def from_files(
-        cls, path: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
+        cls,
+        path: str,
+        chunk_size: int = DEFAULT_CHUNK_SIZE,
+        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+        splitter: AnySplitter = "llama_index",
     ) -> "DataCollectionProvider":
-        return FileDataCollectionProvider(path=path, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        return FileDataCollectionProvider(
+            path=path, chunk_size=chunk_size, chunk_overlap=chunk_overlap, splitter=splitter
+        )
 
     @classmethod
     def from_chunks(cls, chunks: List[str]):
@@ -92,6 +98,12 @@ def __init__(self, name: str, chunks: List[str], collection: Optional["Collectio
 
     def init_collection(self):
         if self.collection is None:
+            # fixme: huggingface/tokenizers warns about clean_up_tokenization_spaces
+            import warnings
+
+            os.environ["TOKENIZERS_PARALLELISM"] = "false"
+            warnings.filterwarnings("ignore", category=FutureWarning)
+
             default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
                 model_name="all-MiniLM-L6-v2",
             )

From fa1563c79f28beaf44f9019c7b88c92d5a0f54c3 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 18:48:41 +0400
Subject: [PATCH 53/63] lint

---
 setup.cfg                                                    | 5 ++++-
 .../experimental/dataset_generators/llm/splitter.py          | 2 +-
 src/evidently/utils/llm.py                                   | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index caad3b9970..231d1f6f6c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -109,7 +109,10 @@ ignore_missing_imports = True
 [mypy-chromadb.*]
 ignore_missing_imports = True
 
-[mypy-llama-index.*]
+[mypy-llama_index.*]
+ignore_missing_imports = True
+
+[mypy-pypdf.*]
 ignore_missing_imports = True
 
 [tool:pytest]
diff --git a/src/evidently/experimental/dataset_generators/llm/splitter.py b/src/evidently/experimental/dataset_generators/llm/splitter.py
index d810956b52..a0c7041e8d 100644
--- a/src/evidently/experimental/dataset_generators/llm/splitter.py
+++ b/src/evidently/experimental/dataset_generators/llm/splitter.py
@@ -86,7 +86,7 @@ class SimpleSplitter(Splitter):
     split_re: ClassVar = re.compile(r"([^,.;。？！]+[,.;。？！]?)")
 
     def split_text(self, text: TextSource) -> Sequence[Chunk]:
-        current_splits = []
+        current_splits: List[str] = []
         current_size = 0
         for split in self.split_re.split(text.get_text()):
             split_size = len(split)
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index 36663a64c2..c716323f36 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -439,7 +439,7 @@ def get_output_format(self) -> OutputFormatBlock:
         output: Optional[OutputFormatBlock] = next(
             (b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None
         )
-        return output if output is not None else NoopOutputFormat()
+        return output if output is not None else NoopOutputFormat()  # type: ignore[return-value]
 
     def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, Any]:
         output = self.get_output_format()

From f0ed6d4b35851daab69a8357f70ffe7a36cdc2e1 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 19:03:41 +0400
Subject: [PATCH 54/63] lint

---
 src/evidently/experimental/dataset_generators/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evidently/experimental/dataset_generators/base.py b/src/evidently/experimental/dataset_generators/base.py
index 61fe38ab6f..27c8b2b0cc 100644
--- a/src/evidently/experimental/dataset_generators/base.py
+++ b/src/evidently/experimental/dataset_generators/base.py
@@ -1,8 +1,8 @@
 from abc import ABC
 from abc import abstractmethod
-from typing import TypeAlias
 
 import pandas as pd
+from typing_extensions import TypeAlias
 
 from evidently.options.base import Options
 from evidently.pydantic_utils import EvidentlyBaseModel

From 8d50d7b0a3a3600750be439b783449db4f1336d5 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 19:05:43 +0400
Subject: [PATCH 55/63] audit

---
 .github/workflows/main.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0dd15a80a0..c26c805971 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -117,7 +117,11 @@ jobs:
       - name: Install package
         run: pip install -e .[dev,spark,fsspec,llm]
       - name: Run pip-audit
-        run: pip-audit --ignore-vuln PYSEC-2024-48 --ignore-vuln GHSA-jw8x-6495-233v --ignore-vuln GHSA-4hq2-rpgc-r8r7
+        run: pip-audit \
+          --ignore-vuln PYSEC-2024-48 \
+          --ignore-vuln GHSA-jw8x-6495-233v \
+          --ignore-vuln GHSA-4hq2-rpgc-r8r7 \
+          --ignore-vuln PYSEC-2024-38  # fastapi (needed by chromadb). we don't use fastapi
       - name: Run Tests
         run: python -m pytest --durations=50
   test:

From bb0ecc729126175229656b0c4f42e72cc6e53c0e Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 19:25:16 +0400
Subject: [PATCH 56/63] type aliases and audit

---
 .github/workflows/main.yml                    | 11 +++----
 .../experimental/dataset_generators/base.py   |  2 --
 .../dataset_generators/llm/index.py           |  9 +++++-
 .../dataset_generators/llm/prompts.py         | 30 +++++++++++++++----
 .../dataset_generators/llm/questions.py       | 20 ++++++++-----
 .../dataset_generators/llm/splitter.py        |  7 ++++-
 src/evidently/utils/llm.py                    | 25 ++++++++++++++--
 7 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index c26c805971..ca6260fdcd 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -117,11 +117,12 @@ jobs:
       - name: Install package
         run: pip install -e .[dev,spark,fsspec,llm]
       - name: Run pip-audit
-        run: pip-audit \
-          --ignore-vuln PYSEC-2024-48 \
-          --ignore-vuln GHSA-jw8x-6495-233v \
-          --ignore-vuln GHSA-4hq2-rpgc-r8r7 \
-          --ignore-vuln PYSEC-2024-38  # fastapi (needed by chromadb). we don't use fastapi
+        run: |
+          pip-audit \
+          --ignore-vuln PYSEC-2024-48 \ # black ReDoS, we dont use black in prod
+          --ignore-vuln GHSA-jw8x-6495-233v \ # sklearn TfidfVectorizer leak in github actions
+          --ignore-vuln GHSA-4hq2-rpgc-r8r7 \ # litestar repo CI/CD
+          --ignore-vuln PYSEC-2024-38  # fastapi ReDoS(needed by chromadb). we don't use fastapi
       - name: Run Tests
         run: python -m pytest --durations=50
   test:
diff --git a/src/evidently/experimental/dataset_generators/base.py b/src/evidently/experimental/dataset_generators/base.py
index 27c8b2b0cc..0aefc12c8e 100644
--- a/src/evidently/experimental/dataset_generators/base.py
+++ b/src/evidently/experimental/dataset_generators/base.py
@@ -12,9 +12,7 @@
 
 class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
     class Config:
-        type_alias = "evidently:dataset_generator:BaseDatasetGenerator"
         is_base_type = True
-        alias_required = False  # fixme
 
     options: Options
 
diff --git a/src/evidently/experimental/dataset_generators/llm/index.py b/src/evidently/experimental/dataset_generators/llm/index.py
index c249588d8e..3530f8126a 100644
--- a/src/evidently/experimental/dataset_generators/llm/index.py
+++ b/src/evidently/experimental/dataset_generators/llm/index.py
@@ -36,7 +36,8 @@ def read_text(filename: str) -> str:
 
 class DataCollectionProvider(EvidentlyBaseModel):
     class Config:
-        alias_required = False  # fixme
+        type_alias = "evidently:base:DataCollectionProvider"
+        is_base_type = True
 
     chunk_size: int = DEFAULT_CHUNK_SIZE
     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
@@ -63,6 +64,9 @@ def from_chunks(cls, chunks: List[str]):
 
 
 class ChunksDataCollectionProvider(DataCollectionProvider):
+    class Config:
+        type_alias = "evidently:data_provider:ChunksDataCollectionProvider"
+
     chunks: List[Chunk]
 
     def get_data_collection(self):
@@ -72,6 +76,9 @@ def get_data_collection(self):
 
 
 class FileDataCollectionProvider(DataCollectionProvider):
+    class Config:
+        type_alias = "evidently:data_provider:FileDataCollectionProvider"
+
     path: str
 
     def get_data_collection(self):
diff --git a/src/evidently/experimental/dataset_generators/llm/prompts.py b/src/evidently/experimental/dataset_generators/llm/prompts.py
index 7b8eab287a..795be2f722 100644
--- a/src/evidently/experimental/dataset_generators/llm/prompts.py
+++ b/src/evidently/experimental/dataset_generators/llm/prompts.py
@@ -7,7 +7,10 @@
 from evidently.utils.llm import llm_call
 
 
-class SimpleQuestionPrompt(BlockPromptTemplate):
+class SimpleQuestionPromptTemplate(BlockPromptTemplate):
+    class Config:
+        type_alias = "evidently:prompt_template:SimpleQuestionPromptTemplate"
+
     blocks: ClassVar = [
         "Please generate a {question_type} question about this:",
         PromptBlock.input("context").anchored(),
@@ -16,7 +19,10 @@ class SimpleQuestionPrompt(BlockPromptTemplate):
     question_type: str = "simple"
 
 
-class QuestionsFromSeed(BlockPromptTemplate):
+class QuestionsFromSeedPromptTemplate(BlockPromptTemplate):
+    class Config:
+        type_alias = "evidently:prompt_template:QuestionsFromSeedPromptTemplate"
+
     blocks: ClassVar = [
         """Write for me {number} alternative questions quite similar to the question you got.
         The question: """,
@@ -28,14 +34,20 @@ class QuestionsFromSeed(BlockPromptTemplate):
     def generate(self, seed_question: str, number: int) -> List[str]: ...
 
 
-class QuestionsFromContext(WithSystemPrompt, BlockPromptTemplate):
+class QuestionsFromContextPromptTemplate(WithSystemPrompt, BlockPromptTemplate):
+    class Config:
+        type_alias = "evidently:prompt_template:QuestionsFromContextPromptTemplate"
+
     system_prompt: str = "You are an assistant who generates questions based on provided context"
 
     @llm_call
     def generate_questions(self, context: str, number: int) -> List[str]: ...
 
 
-class NaiveQuestionsFromContext(QuestionsFromContext):
+class NaiveQuestionsFromContextPromptTemplate(QuestionsFromContextPromptTemplate):
+    class Config:
+        type_alias = "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate"
+
     blocks: ClassVar = [
         "Generate {number} conceptual questions based on the provided context and "
         "can be answered from the information in the provided context.\n"
@@ -48,7 +60,10 @@ class NaiveQuestionsFromContext(QuestionsFromContext):
     ]
 
 
-class ReformulateQuestionPrompt(QuestionsFromContext):
+class ReformulateQuestionPromptTemplate(QuestionsFromContextPromptTemplate):
+    class Config:
+        type_alias = "evidently:prompt_template:ReformulateQuestionPromptTemplate"
+
     blocks: ClassVar = [
         """Write for me {number} alternative questions quite similar to the question you got.
 The question:""",
@@ -59,7 +74,10 @@ class ReformulateQuestionPrompt(QuestionsFromContext):
     system_prompt: str = "You are a smart assistant who helps repharase questions"
 
 
-class BaselineAnswerPrompt(WithSystemPrompt, BlockPromptTemplate):
+class BaselineAnswerPromptTemplate(WithSystemPrompt, BlockPromptTemplate):
+    class Config:
+        type_alias = "evidently:prompt_template:BaselineAnswerPromptTemplate"
+
     blocks: ClassVar = [
         "Your task is to answer the following query:",
         PromptBlock.input("question").anchored(),
diff --git a/src/evidently/experimental/dataset_generators/llm/questions.py b/src/evidently/experimental/dataset_generators/llm/questions.py
index 7d18b55ed1..263d7f5fd7 100644
--- a/src/evidently/experimental/dataset_generators/llm/questions.py
+++ b/src/evidently/experimental/dataset_generators/llm/questions.py
@@ -10,10 +10,10 @@
 from evidently.experimental.dataset_generators.llm.index import Chunk
 from evidently.experimental.dataset_generators.llm.index import DataCollection
 from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
-from evidently.experimental.dataset_generators.llm.prompts import BaselineAnswerPrompt
-from evidently.experimental.dataset_generators.llm.prompts import NaiveQuestionsFromContext
-from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromContext
-from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromSeed
+from evidently.experimental.dataset_generators.llm.prompts import BaselineAnswerPromptTemplate
+from evidently.experimental.dataset_generators.llm.prompts import NaiveQuestionsFromContextPromptTemplate
+from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromContextPromptTemplate
+from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromSeedPromptTemplate
 
 Question = str
 Answer = str
@@ -22,10 +22,13 @@
 
 
 class QADatasetGenerator(BaseLLMDatasetGenerator):
+    class Config:
+        type_alias = "evidently:dataset_generator:QADatasetGenerator"
+
     data_collection: DataCollectionProvider
     num_questions: int
-    questions: QuestionsFromContext = NaiveQuestionsFromContext()
-    answers: BaselineAnswerPrompt = BaselineAnswerPrompt()
+    questions: QuestionsFromContextPromptTemplate = NaiveQuestionsFromContextPromptTemplate()
+    answers: BaselineAnswerPromptTemplate = BaselineAnswerPromptTemplate()
 
     def generate(self) -> DatasetGeneratorResult:
         documents = self.data_collection.get_data_collection()
@@ -57,9 +60,12 @@ def generate_answers(self, questions: List[Question], relevant_chunks: List[List
 
 
 class QADatasetFromSeedGenerator(BaseLLMDatasetGenerator):
+    class Config:
+        type_alias = "evidently:dataset_generator:QADatasetFromSeedGenerator"
+
     seed_question: str
     num_questions: int
-    prompt: QuestionsFromSeed = QuestionsFromSeed()
+    prompt: QuestionsFromSeedPromptTemplate = QuestionsFromSeedPromptTemplate()
 
     def generate(self) -> DatasetGeneratorResult:
         response = self.wrapper.run_sync(
diff --git a/src/evidently/experimental/dataset_generators/llm/splitter.py b/src/evidently/experimental/dataset_generators/llm/splitter.py
index a0c7041e8d..e4b775eb29 100644
--- a/src/evidently/experimental/dataset_generators/llm/splitter.py
+++ b/src/evidently/experimental/dataset_generators/llm/splitter.py
@@ -51,7 +51,6 @@ class Splitters(str, Enum):
 class Splitter(EvidentlyBaseModel, ABC):
     class Config:
         is_base_type = True
-        alias_required = False  # fixme
 
     chunk_size: int
     chunk_overlap: int
@@ -83,6 +82,9 @@ def from_any(cls, splitter: AnySplitter, chunk_size: int, chunk_overlap: int, **
 
 
 class SimpleSplitter(Splitter):
+    class Config:
+        type_alias = "evidently:splitter:SimpleSplitter"
+
     split_re: ClassVar = re.compile(r"([^,.;。？！]+[,.;。？！]?)")
 
     def split_text(self, text: TextSource) -> Sequence[Chunk]:
@@ -103,6 +105,9 @@ def split_text(self, text: TextSource) -> Sequence[Chunk]:
 
 
 class LlamaIndexSplitter(Splitter):
+    class Config:
+        type_alias = "evidently:splitter:LlamaIndexSplitter"
+
     separator: str = " "
     paragraph_separator: Optional[str] = None
     _splitter = PrivateAttr(None)
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm.py
index c716323f36..19464716a9 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm.py
@@ -256,7 +256,7 @@ async def complete(self, messages: List[LLMMessage]) -> str:
 
 class PromptBlock(EvidentlyBaseModel):
     class Config:
-        alias_required = False  # fixme
+        is_base_type = True
 
     def render(self):
         # )))
@@ -296,6 +296,9 @@ def anchored(self, start: str = "__start__", end: str = "__end__"):
 
 
 class Anchor(PromptBlock):
+    class Config:
+        type_alias = "evidently:prompt_block:Anchor"
+
     start: str
     block: PromptBlock
     end: str
@@ -305,6 +308,9 @@ def _render(self) -> str:
 
 
 class SimpleBlock(PromptBlock):
+    class Config:
+        type_alias = "evidently:prompt_block:SimpleBlock"
+
     value: str
 
     def _render(self) -> str:
@@ -318,6 +324,9 @@ def parse_response(self, response: str) -> TResult:
 
 
 class NoopOutputFormat(OutputFormatBlock[str]):
+    class Config:
+        type_alias = "evidently:prompt_block:NoopOutputFormat"
+
     def _render(self) -> str:
         return ""
 
@@ -326,6 +335,9 @@ def parse_response(self, response: str) -> str:
 
 
 class JsonOutputFormatBlock(OutputFormatBlock[Dict[str, Any]]):
+    class Config:
+        type_alias = "evidently:prompt_block:JsonOutputFormatBlock"
+
     fields: Dict[str, Union[Tuple[str, str], str]]
 
     def _render(self) -> str:
@@ -350,6 +362,9 @@ def parse_response(self, response: str) -> Dict[str, Any]:
 
 
 class StringListFormatBlock(OutputFormatBlock[List[str]]):
+    class Config:
+        type_alias = "evidently:prompt_block:StringListFormatBlock"
+
     of_what: str
 
     def _render(self) -> str:
@@ -361,6 +376,9 @@ def parse_response(self, response: str) -> List[str]:
 
 
 class StringFormatBlock(OutputFormatBlock[str]):
+    class Config:
+        type_alias = "evidently:prompt_block:StringFormatBlock"
+
     what: str
 
     def _render(self) -> str:
@@ -413,7 +431,7 @@ def _get_genric_arg(cls: Type):
 
 class PromptTemplate(EvidentlyBaseModel):
     class Config:
-        alias_required = False  # fixme
+        is_base_type = True
 
     # __run_func__ : ClassVar[Callable]
     @abstractmethod
@@ -465,6 +483,9 @@ def get_messages(self, values, template: Optional[str] = None) -> List[LLMMessag
 
 
 class BlockPromptTemplate(PromptTemplate):
+    class Config:
+        type_alias = "evidently:prompt_template:BlockPromptTemplate"
+
     blocks: ClassVar[List[AnyBlock]]
 
     def get_blocks(self) -> Sequence[PromptBlock]:

From e093f542715accff3c77863a928a8beddf5f5289 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 19:39:48 +0400
Subject: [PATCH 57/63] type aliases and audit and llm util refactor and stuff

---
 .../dataset_generators/__init__.py            |   3 +
 .../dataset_generators/_registry.py           |  67 +++++
 .../dataset_generators/llm/base.py            |   4 +-
 .../dataset_generators/llm/index.py           |   9 +-
 .../dataset_generators/llm/prompts.py         |   8 +-
 src/evidently/utils/llm/__init__.py           |   3 +
 src/evidently/utils/llm/_registry.py          |  21 ++
 src/evidently/utils/llm/base.py               |  20 ++
 src/evidently/utils/llm/errors.py             |  13 +
 .../utils/{llm.py => llm/prompts.py}          | 232 +-----------------
 src/evidently/utils/llm/wrapper.py            | 215 ++++++++++++++++
 tests/test_pydantic_aliases.py                |  10 +
 12 files changed, 366 insertions(+), 239 deletions(-)
 create mode 100644 src/evidently/experimental/dataset_generators/_registry.py
 create mode 100644 src/evidently/utils/llm/__init__.py
 create mode 100644 src/evidently/utils/llm/_registry.py
 create mode 100644 src/evidently/utils/llm/base.py
 create mode 100644 src/evidently/utils/llm/errors.py
 rename src/evidently/utils/{llm.py => llm/prompts.py} (54%)
 create mode 100644 src/evidently/utils/llm/wrapper.py

diff --git a/src/evidently/experimental/dataset_generators/__init__.py b/src/evidently/experimental/dataset_generators/__init__.py
index e69de29bb2..4bfe1f7c80 100644
--- a/src/evidently/experimental/dataset_generators/__init__.py
+++ b/src/evidently/experimental/dataset_generators/__init__.py
@@ -0,0 +1,3 @@
+from . import _registry
+
+__all__ = ["_registry"]
diff --git a/src/evidently/experimental/dataset_generators/_registry.py b/src/evidently/experimental/dataset_generators/_registry.py
new file mode 100644
index 0000000000..9f54d6a119
--- /dev/null
+++ b/src/evidently/experimental/dataset_generators/_registry.py
@@ -0,0 +1,67 @@
+from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
+from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
+from evidently.experimental.dataset_generators.llm.splitter import Splitter
+from evidently.pydantic_utils import register_type_alias
+from evidently.utils.llm.prompts import PromptTemplate
+
+register_type_alias(
+    BaseDatasetGenerator,
+    "evidently.experimental.dataset_generators.llm.questions.QADatasetFromSeedGenerator",
+    "evidently:dataset_generator:QADatasetFromSeedGenerator",
+)
+register_type_alias(
+    BaseDatasetGenerator,
+    "evidently.experimental.dataset_generators.llm.questions.QADatasetGenerator",
+    "evidently:dataset_generator:QADatasetGenerator",
+)
+register_type_alias(
+    DataCollectionProvider,
+    "evidently.experimental.dataset_generators.llm.index.ChunksDataCollectionProvider",
+    "evidently:data_provider:ChunksDataCollectionProvider",
+)
+register_type_alias(
+    DataCollectionProvider,
+    "evidently.experimental.dataset_generators.llm.index.FileDataCollectionProvider",
+    "evidently:data_provider:FileDataCollectionProvider",
+)
+
+register_type_alias(
+    PromptTemplate,
+    "evidently.experimental.dataset_generators.llm.prompts.BaselineAnswerPromptTemplate",
+    "evidently:prompt_template:BaselineAnswerPromptTemplate",
+)
+register_type_alias(
+    PromptTemplate,
+    "evidently.experimental.dataset_generators.llm.prompts.NaiveQuestionsFromContextPromptTemplate",
+    "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate",
+)
+register_type_alias(
+    PromptTemplate,
+    "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromContextPromptTemplate",
+    "evidently:prompt_template:QuestionsFromContextPromptTemplate",
+)
+register_type_alias(
+    PromptTemplate,
+    "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromSeedPromptTemplate",
+    "evidently:prompt_template:QuestionsFromSeedPromptTemplate",
+)
+register_type_alias(
+    PromptTemplate,
+    "evidently.experimental.dataset_generators.llm.prompts.ReformulateQuestionPromptTemplate",
+    "evidently:prompt_template:ReformulateQuestionPromptTemplate",
+)
+register_type_alias(
+    PromptTemplate,
+    "evidently.experimental.dataset_generators.llm.prompts.SimpleQuestionPromptTemplate",
+    "evidently:prompt_template:SimpleQuestionPromptTemplate",
+)
+register_type_alias(
+    Splitter,
+    "evidently.experimental.dataset_generators.llm.splitter.LlamaIndexSplitter",
+    "evidently:splitter:LlamaIndexSplitter",
+)
+register_type_alias(
+    Splitter,
+    "evidently.experimental.dataset_generators.llm.splitter.SimpleSplitter",
+    "evidently:splitter:SimpleSplitter",
+)
diff --git a/src/evidently/experimental/dataset_generators/llm/base.py b/src/evidently/experimental/dataset_generators/llm/base.py
index 09bfaa411a..112f71cfe0 100644
--- a/src/evidently/experimental/dataset_generators/llm/base.py
+++ b/src/evidently/experimental/dataset_generators/llm/base.py
@@ -3,8 +3,8 @@
 from evidently._pydantic_compat import PrivateAttr
 from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
 from evidently.options.base import Options
-from evidently.utils.llm import LLMWrapper
-from evidently.utils.llm import get_llm_wrapper
+from evidently.utils.llm.wrapper import LLMWrapper
+from evidently.utils.llm.wrapper import get_llm_wrapper
 
 
 class BaseLLMDatasetGenerator(
diff --git a/src/evidently/experimental/dataset_generators/llm/index.py b/src/evidently/experimental/dataset_generators/llm/index.py
index 3530f8126a..1b5d2c2bf5 100644
--- a/src/evidently/experimental/dataset_generators/llm/index.py
+++ b/src/evidently/experimental/dataset_generators/llm/index.py
@@ -1,3 +1,4 @@
+import abc
 import glob
 import os
 from pathlib import Path
@@ -34,15 +35,15 @@ def read_text(filename: str) -> str:
         return Path(filename).read_text()
 
 
-class DataCollectionProvider(EvidentlyBaseModel):
+class DataCollectionProvider(EvidentlyBaseModel, abc.ABC):
     class Config:
-        type_alias = "evidently:base:DataCollectionProvider"
         is_base_type = True
 
     chunk_size: int = DEFAULT_CHUNK_SIZE
     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
     splitter: AnySplitter = "llama_index"
 
+    @abc.abstractmethod
     def get_data_collection(self) -> "DataCollection":
         raise NotImplementedError
 
@@ -65,7 +66,7 @@ def from_chunks(cls, chunks: List[str]):
 
 class ChunksDataCollectionProvider(DataCollectionProvider):
     class Config:
-        type_alias = "evidently:data_provider:ChunksDataCollectionProvider"
+        type_alias = "evidently:data_collecton_provider:ChunksDataCollectionProvider"
 
     chunks: List[Chunk]
 
@@ -77,7 +78,7 @@ def get_data_collection(self):
 
 class FileDataCollectionProvider(DataCollectionProvider):
     class Config:
-        type_alias = "evidently:data_provider:FileDataCollectionProvider"
+        type_alias = "evidently:data_collecton_provider:FileDataCollectionProvider"
 
     path: str
 
diff --git a/src/evidently/experimental/dataset_generators/llm/prompts.py b/src/evidently/experimental/dataset_generators/llm/prompts.py
index 795be2f722..bb38038f57 100644
--- a/src/evidently/experimental/dataset_generators/llm/prompts.py
+++ b/src/evidently/experimental/dataset_generators/llm/prompts.py
@@ -1,10 +1,10 @@
 from typing import ClassVar
 from typing import List
 
-from evidently.utils.llm import BlockPromptTemplate
-from evidently.utils.llm import PromptBlock
-from evidently.utils.llm import WithSystemPrompt
-from evidently.utils.llm import llm_call
+from evidently.utils.llm.prompts import BlockPromptTemplate
+from evidently.utils.llm.prompts import PromptBlock
+from evidently.utils.llm.prompts import WithSystemPrompt
+from evidently.utils.llm.prompts import llm_call
 
 
 class SimpleQuestionPromptTemplate(BlockPromptTemplate):
diff --git a/src/evidently/utils/llm/__init__.py b/src/evidently/utils/llm/__init__.py
new file mode 100644
index 0000000000..4bfe1f7c80
--- /dev/null
+++ b/src/evidently/utils/llm/__init__.py
@@ -0,0 +1,3 @@
+from . import _registry
+
+__all__ = ["_registry"]
diff --git a/src/evidently/utils/llm/_registry.py b/src/evidently/utils/llm/_registry.py
new file mode 100644
index 0000000000..63f06a4ade
--- /dev/null
+++ b/src/evidently/utils/llm/_registry.py
@@ -0,0 +1,21 @@
+from evidently.pydantic_utils import register_type_alias
+from evidently.utils.llm.prompts import PromptBlock
+from evidently.utils.llm.prompts import PromptTemplate
+
+register_type_alias(PromptBlock, "evidently.utils.llm.prompts.Anchor", "evidently:prompt_block:Anchor")
+register_type_alias(
+    PromptBlock, "evidently.utils.llm.prompts.JsonOutputFormatBlock", "evidently:prompt_block:JsonOutputFormatBlock"
+)
+register_type_alias(
+    PromptBlock, "evidently.utils.llm.prompts.NoopOutputFormat", "evidently:prompt_block:NoopOutputFormat"
+)
+register_type_alias(PromptBlock, "evidently.utils.llm.prompts.SimpleBlock", "evidently:prompt_block:SimpleBlock")
+register_type_alias(
+    PromptBlock, "evidently.utils.llm.prompts.StringFormatBlock", "evidently:prompt_block:StringFormatBlock"
+)
+register_type_alias(
+    PromptBlock, "evidently.utils.llm.prompts.StringListFormatBlock", "evidently:prompt_block:StringListFormatBlock"
+)
+register_type_alias(
+    PromptTemplate, "evidently.utils.llm.prompts.BlockPromptTemplate", "evidently:prompt_template:BlockPromptTemplate"
+)
diff --git a/src/evidently/utils/llm/base.py b/src/evidently/utils/llm/base.py
new file mode 100644
index 0000000000..2abf77b571
--- /dev/null
+++ b/src/evidently/utils/llm/base.py
@@ -0,0 +1,20 @@
+import dataclasses
+from typing import Any
+from typing import Dict
+
+
+@dataclasses.dataclass
+class LLMMessage:  # a single chat message: who said it (`role`) and what (`content`)
+    role: str
+    content: str
+
+    @classmethod
+    def user(cls, message: str):  # build a message with role "user"
+        return cls("user", message)  # use cls so subclasses get instances of their own type
+
+    @classmethod
+    def system(cls, message: str):  # build a message with role "system"
+        return cls("system", message)  # use cls so subclasses get instances of their own type
+
+
+LLMResponse = Dict[str, Any]
diff --git a/src/evidently/utils/llm/errors.py b/src/evidently/utils/llm/errors.py
new file mode 100644
index 0000000000..606fb62542
--- /dev/null
+++ b/src/evidently/utils/llm/errors.py
@@ -0,0 +1,13 @@
+from evidently.errors import EvidentlyError
+
+
+class EvidentlyLLMError(EvidentlyError):
+    pass
+
+
+class LLMResponseParseError(EvidentlyLLMError):
+    pass
+
+
+class LLMRequestError(EvidentlyLLMError):
+    pass
diff --git a/src/evidently/utils/llm.py b/src/evidently/utils/llm/prompts.py
similarity index 54%
rename from src/evidently/utils/llm.py
rename to src/evidently/utils/llm/prompts.py
index 19464716a9..bc0eed4749 100644
--- a/src/evidently/utils/llm.py
+++ b/src/evidently/utils/llm/prompts.py
@@ -1,14 +1,8 @@
-import asyncio
-import dataclasses
-import datetime
 import inspect
 import json
 import re
 from abc import ABC
 from abc import abstractmethod
-from asyncio import Lock
-from asyncio import Semaphore
-from asyncio import sleep
 from functools import wraps
 from typing import Any
 from typing import Callable
@@ -26,234 +20,14 @@
 
 import typing_inspect
 
-from evidently._pydantic_compat import SecretStr
-from evidently.errors import EvidentlyError
-from evidently.options.base import Options
-from evidently.options.option import Option
 from evidently.pydantic_utils import EvidentlyBaseModel
-from evidently.ui.base import sync_api
-
-
-@dataclasses.dataclass
-class LLMMessage:
-    role: str
-    content: str
-
-    @classmethod
-    def user(cls, message: str):
-        return LLMMessage("user", message)
-
-    @classmethod
-    def system(cls, message: str):
-        return LLMMessage("system", message)
-
-
-LLMResponse = Dict[str, Any]
-
-
-class EvidentlyLLMError(EvidentlyError):
-    pass
-
-
-class LLMResponseParseError(EvidentlyLLMError):
-    pass
-
-
-class LLMRequestError(EvidentlyLLMError):
-    pass
-
-
-class RateLimiter:
-    def __init__(self, rate: Optional[int], interval: datetime.timedelta):
-        self.rate = rate
-        self.interval = interval
-        self.enters: List[datetime.datetime] = []
-        self.lock = Lock()
-
-    async def __aenter__(self):
-        if self.rate is None:
-            return
-        while True:
-            async with self.lock:
-                await self._clean()
-                if len(self.enters) < self.rate:
-                    self.enters.append(datetime.datetime.now())
-                    break
-            await sleep(0.1)
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        pass
-
-    async def _clean(self):
-        now = datetime.datetime.now()
-        self.enters = [e for e in self.enters if now - e < self.interval]
-
+from evidently.utils.llm.base import LLMMessage
+from evidently.utils.llm.errors import LLMResponseParseError
+from evidently.utils.llm.wrapper import LLMRequest
 
 TResult = TypeVar("TResult")
 
 
-@dataclasses.dataclass
-class LLMRequest(Generic[TResult]):
-    messages: List[LLMMessage]
-    response_parser: Callable[[str], TResult]
-    response_type: Type[TResult]
-    retries: int = 1
-
-
-class LLMWrapper(ABC):
-    __used_options__: ClassVar[List[Type[Option]]] = []
-
-    @abstractmethod
-    async def complete(self, messages: List[LLMMessage]) -> str:
-        raise NotImplementedError
-
-    async def complete_batch(
-        self, messages_batch: List[List[LLMMessage]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None
-    ) -> List[str]:
-        if batch_size is None:
-            batch_size = self.get_batch_size()
-        if rpm_limit is None:
-            rpm_limit = self.get_rpm_limit()
-        rate_limiter = RateLimiter(rate=rpm_limit, interval=datetime.timedelta(minutes=1))
-        semaphore = Semaphore(batch_size)
-
-        async def work(messages: List[LLMMessage]) -> str:
-            async with semaphore, rate_limiter:
-                return await self.complete(messages)
-
-        return await asyncio.gather(*[work(msgs) for msgs in messages_batch])
-
-    async def run(self, request: LLMRequest[TResult]) -> TResult:
-        num_retries = request.retries
-        error = None
-        while num_retries >= 0:
-            num_retries -= 1
-            try:
-                response = await self.complete(request.messages)
-                return request.response_parser(response)
-            except Exception as e:
-                error = e
-        raise error
-
-    async def run_batch(
-        self, requests: Sequence[LLMRequest[TResult]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None
-    ) -> List[TResult]:
-        if batch_size is None:
-            batch_size = self.get_batch_size()
-        if rpm_limit is None:
-            rpm_limit = self.get_rpm_limit()
-        rate_limiter = RateLimiter(rate=rpm_limit, interval=datetime.timedelta(minutes=1))
-        semaphore = Semaphore(batch_size)
-
-        async def work(request: LLMRequest[TResult]) -> TResult:
-            async with semaphore, rate_limiter:
-                return await self.run(request)
-
-        return await asyncio.gather(*[work(r) for r in requests])
-
-    def get_batch_size(self) -> int:
-        return 100
-
-    def get_rpm_limit(self) -> Optional[int]:
-        return None
-
-    def get_used_options(self) -> List[Type[Option]]:
-        return self.__used_options__
-
-    complete_batch_sync = sync_api(complete_batch)
-    run_sync = sync_api(run)
-    run_batch_sync = sync_api(run_batch)
-
-
-LLMProvider = str
-LLMModel = str
-LLMWrapperProvider = Callable[[LLMModel, Options], LLMWrapper]
-_wrappers: Dict[Tuple[LLMProvider, Optional[LLMModel]], LLMWrapperProvider] = {}
-
-
-def llm_provider(name: LLMProvider, model: Optional[LLMModel]):
-    def dec(f: LLMWrapperProvider):
-        _wrappers[(name, model)] = f
-        return f
-
-    return dec
-
-
-def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) -> LLMWrapper:
-    key: Tuple[str, Optional[str]] = (provider, model)
-    if key in _wrappers:
-        return _wrappers[key](model, options)
-    key = (provider, None)
-    if key in _wrappers:
-        return _wrappers[key](model, options)
-    raise ValueError(f"LLM wrapper for provider {provider} model {model} not found")
-
-
-class OpenAIKey(Option):
-    api_key: Optional[SecretStr] = None
-    rpm_limit: int = 500
-
-    def __init__(self, api_key: Optional[str] = None):
-        self.api_key = SecretStr(api_key) if api_key is not None else None
-        super().__init__()
-
-    def get_api_key(self) -> Optional[str]:
-        if self.api_key is None:
-            return None
-        return self.api_key.get_secret_value()
-
-
-@llm_provider("openai", None)
-class OpenAIWrapper(LLMWrapper):
-    __used_options__: ClassVar = [OpenAIKey]
-
-    def __init__(self, model: str, options: Options):
-        import openai
-
-        self.model = model
-        self.options = options.get(OpenAIKey)
-        self._clients: Dict[int, openai.AsyncOpenAI] = {}
-
-    @property
-    def client(self):
-        import openai
-
-        try:
-            loop = asyncio.get_running_loop()
-        except RuntimeError as e:
-            raise RuntimeError("Cannot access OpenAIWrapper client without loop") from e
-        loop_id = id(loop)
-        if loop_id not in self._clients:
-            self._clients[loop_id] = openai.AsyncOpenAI(api_key=self.options.get_api_key())
-        return self._clients[loop_id]
-
-    async def complete(self, messages: List[LLMMessage]) -> str:
-        import openai
-
-        messages = [{"role": msg.role, "content": msg.content} for msg in messages]
-        try:
-            response = await self.client.chat.completions.create(model=self.model, messages=messages)  # type: ignore[arg-type]
-        except openai.OpenAIError as e:
-            raise LLMRequestError("Failed to call OpenAI complete API") from e
-        content = response.choices[0].message.content
-        assert content is not None  # todo: better error
-        return content
-
-    def get_rpm_limit(self) -> Optional[int]:
-        return self.options.rpm_limit
-
-
-@llm_provider("litellm", None)
-class LiteLLMWrapper(LLMWrapper):
-    def __init__(self, model: str):
-        self.model = model
-
-    async def complete(self, messages: List[LLMMessage]) -> str:
-        from litellm import completion
-
-        return completion(model=self.model, messages=messages).choices[0].message.content
-
-
 class PromptBlock(EvidentlyBaseModel):
     class Config:
         is_base_type = True
diff --git a/src/evidently/utils/llm/wrapper.py b/src/evidently/utils/llm/wrapper.py
new file mode 100644
index 0000000000..a77e63b5a4
--- /dev/null
+++ b/src/evidently/utils/llm/wrapper.py
@@ -0,0 +1,215 @@
+import asyncio
+import dataclasses
+import datetime
+from abc import ABC
+from abc import abstractmethod
+from asyncio import Lock
+from asyncio import Semaphore
+from asyncio import sleep
+from typing import Callable
+from typing import ClassVar
+from typing import Dict
+from typing import Generic
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Type
+from typing import TypeVar
+
+from evidently._pydantic_compat import SecretStr
+from evidently.features.llm_judge import LLMMessage
+from evidently.options.base import Options
+from evidently.options.option import Option
+from evidently.ui.base import sync_api
+from evidently.utils.llm.errors import LLMRequestError
+
+TResult = TypeVar("TResult")
+
+
+class RateLimiter:  # async context manager allowing at most `rate` entries per `interval`
+    def __init__(self, rate: Optional[int], interval: datetime.timedelta):
+        self.rate = rate  # None disables limiting (see __aenter__)
+        self.interval = interval
+        self.enters: List[datetime.datetime] = []  # timestamps of entries still inside the window
+        self.lock = Lock()  # held while `enters` is pruned and appended to
+
+    async def __aenter__(self):
+        if self.rate is None:
+            return
+        while True:  # poll until fewer than `rate` entries remain in the window
+            async with self.lock:
+                await self._clean()
+                if len(self.enters) < self.rate:
+                    self.enters.append(datetime.datetime.now())
+                    break
+            await sleep(0.1)  # wait outside the lock before checking again
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        pass  # nothing to release; entries are only removed by _clean
+
+    async def _clean(self):  # drop timestamps that are `interval` or more old
+        now = datetime.datetime.now()
+        self.enters = [e for e in self.enters if now - e < self.interval]
+
+
+@dataclasses.dataclass
+class LLMRequest(Generic[TResult]):
+    messages: List[LLMMessage]
+    response_parser: Callable[[str], TResult]
+    response_type: Type[TResult]
+    retries: int = 1
+
+
+class LLMWrapper(ABC):
+    __used_options__: ClassVar[List[Type[Option]]] = []
+
+    @abstractmethod
+    async def complete(self, messages: List[LLMMessage]) -> str:
+        raise NotImplementedError
+
+    async def complete_batch(
+        self, messages_batch: List[List[LLMMessage]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None
+    ) -> List[str]:
+        if batch_size is None:
+            batch_size = self.get_batch_size()
+        if rpm_limit is None:
+            rpm_limit = self.get_rpm_limit()
+        rate_limiter = RateLimiter(rate=rpm_limit, interval=datetime.timedelta(minutes=1))
+        semaphore = Semaphore(batch_size)
+
+        async def work(messages: List[LLMMessage]) -> str:
+            async with semaphore, rate_limiter:
+                return await self.complete(messages)
+
+        return await asyncio.gather(*[work(msgs) for msgs in messages_batch])
+
+    async def run(self, request: LLMRequest[TResult]) -> TResult:
+        num_retries = request.retries
+        error = None
+        while num_retries >= 0:
+            num_retries -= 1
+            try:
+                response = await self.complete(request.messages)
+                return request.response_parser(response)
+            except Exception as e:
+                error = e
+        raise error
+
+    async def run_batch(
+        self, requests: Sequence[LLMRequest[TResult]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None
+    ) -> List[TResult]:
+        if batch_size is None:
+            batch_size = self.get_batch_size()
+        if rpm_limit is None:
+            rpm_limit = self.get_rpm_limit()
+        rate_limiter = RateLimiter(rate=rpm_limit, interval=datetime.timedelta(minutes=1))
+        semaphore = Semaphore(batch_size)
+
+        async def work(request: LLMRequest[TResult]) -> TResult:
+            async with semaphore, rate_limiter:
+                return await self.run(request)
+
+        return await asyncio.gather(*[work(r) for r in requests])
+
+    def get_batch_size(self) -> int:
+        return 100
+
+    def get_rpm_limit(self) -> Optional[int]:
+        return None
+
+    def get_used_options(self) -> List[Type[Option]]:
+        return self.__used_options__
+
+    complete_batch_sync = sync_api(complete_batch)
+    run_sync = sync_api(run)
+    run_batch_sync = sync_api(run_batch)
+
+
+LLMProvider = str
+LLMModel = str
+LLMWrapperProvider = Callable[[LLMModel, Options], LLMWrapper]
+_wrappers: Dict[Tuple[LLMProvider, Optional[LLMModel]], LLMWrapperProvider] = {}
+
+
+def llm_provider(name: LLMProvider, model: Optional[LLMModel]):
+    def dec(f: LLMWrapperProvider):
+        _wrappers[(name, model)] = f
+        return f
+
+    return dec
+
+
+def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) -> LLMWrapper:
+    key: Tuple[str, Optional[str]] = (provider, model)  # an exact (provider, model) registration wins
+    if key in _wrappers:
+        return _wrappers[key](model, options)
+    key = (provider, None)  # otherwise fall back to the provider-wide registration
+    if key in _wrappers:
+        return _wrappers[key](model, options)
+    raise ValueError(f"LLM wrapper for provider {provider} model {model} not found")
+
+
+class OpenAIKey(Option):
+    api_key: Optional[SecretStr] = None
+    rpm_limit: int = 500
+
+    def __init__(self, api_key: Optional[str] = None):
+        self.api_key = SecretStr(api_key) if api_key is not None else None
+        super().__init__()
+
+    def get_api_key(self) -> Optional[str]:
+        if self.api_key is None:
+            return None
+        return self.api_key.get_secret_value()
+
+
+@llm_provider("openai", None)
+class OpenAIWrapper(LLMWrapper):
+    __used_options__: ClassVar = [OpenAIKey]
+
+    def __init__(self, model: str, options: Options):
+        import openai
+
+        self.model = model
+        self.options = options.get(OpenAIKey)
+        self._clients: Dict[int, openai.AsyncOpenAI] = {}
+
+    @property
+    def client(self):
+        import openai
+
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError as e:
+            raise RuntimeError("Cannot access OpenAIWrapper client without loop") from e
+        loop_id = id(loop)
+        if loop_id not in self._clients:
+            self._clients[loop_id] = openai.AsyncOpenAI(api_key=self.options.get_api_key())
+        return self._clients[loop_id]
+
+    async def complete(self, messages: List[LLMMessage]) -> str:
+        import openai
+
+        messages = [{"role": msg.role, "content": msg.content} for msg in messages]
+        try:
+            response = await self.client.chat.completions.create(model=self.model, messages=messages)  # type: ignore[arg-type]
+        except openai.OpenAIError as e:
+            raise LLMRequestError("Failed to call OpenAI complete API") from e
+        content = response.choices[0].message.content
+        assert content is not None  # todo: better error
+        return content
+
+    def get_rpm_limit(self) -> Optional[int]:
+        return self.options.rpm_limit
+
+
+@llm_provider("litellm", None)
+class LiteLLMWrapper(LLMWrapper):  # LLMWrapper that delegates completion to litellm
+    def __init__(self, model: str, options: Optional[Options] = None):
+        self.model = model  # options unused; accepted because get_llm_wrapper calls provider(model, options)
+
+    async def complete(self, messages: List[LLMMessage]) -> str:
+        from litellm import completion  # imported at call time, not at module import
+        # NOTE(review): synchronous call; messages are passed as LLMMessage objects - confirm litellm accepts this
+        return completion(model=self.model, messages=messages).choices[0].message.content
diff --git a/tests/test_pydantic_aliases.py b/tests/test_pydantic_aliases.py
index 488322edd3..0cd96d923c 100644
--- a/tests/test_pydantic_aliases.py
+++ b/tests/test_pydantic_aliases.py
@@ -16,6 +16,9 @@
 from evidently.base_metric import MetricResult
 from evidently.collector.config import CollectorTrigger
 from evidently.collector.storage import CollectorStorage
+from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
+from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
+from evidently.experimental.dataset_generators.llm.splitter import Splitter
 from evidently.features.generated_features import BaseDescriptor
 from evidently.features.generated_features import GeneratedFeatures
 from evidently.features.llm_judge import BaseLLMPromptTemplate
@@ -32,6 +35,8 @@
 from evidently.tests.base_test import TestParameters
 from evidently.ui.components.base import Component
 from evidently.ui.dashboards.base import DashboardPanel
+from evidently.utils.llm.prompts import PromptBlock
+from evidently.utils.llm.prompts import PromptTemplate
 
 T = TypeVar("T")
 
@@ -105,6 +110,11 @@ def test_all_aliases_correct():
         CollectorStorage: "collector_storage",
         BaseLLMPromptTemplate: "prompt_template",
         DashboardPanel: "dashboard_panel",
+        BaseDatasetGenerator: "dataset_generator",
+        Splitter: "splitter",
+        DataCollectionProvider: "data_collecton_provider",
+        PromptBlock: "prompt_block",
+        PromptTemplate: "prompt_template",
     }
     skip = [Component]
     skip_literal = [EvidentlyBaseModel, WithTestAndMetricDependencies, BasePreset]

From bfcacb035f942039388b0dd2fb3c5a58db648b7b Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 19:50:16 +0400
Subject: [PATCH 58/63] fix import

---
 src/evidently/utils/llm/wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/evidently/utils/llm/wrapper.py b/src/evidently/utils/llm/wrapper.py
index a77e63b5a4..ef26cdb68d 100644
--- a/src/evidently/utils/llm/wrapper.py
+++ b/src/evidently/utils/llm/wrapper.py
@@ -18,10 +18,10 @@
 from typing import TypeVar
 
 from evidently._pydantic_compat import SecretStr
-from evidently.features.llm_judge import LLMMessage
 from evidently.options.base import Options
 from evidently.options.option import Option
 from evidently.ui.base import sync_api
+from evidently.utils.llm.base import LLMMessage
 from evidently.utils.llm.errors import LLMRequestError
 
 TResult = TypeVar("TResult")

From 30412251c03fb4651a092fe53df3de1c4ce762c7 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 19:52:57 +0400
Subject: [PATCH 59/63] pip audit

---
 .github/workflows/main.yml | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ca6260fdcd..f6f3c5a68a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -119,10 +119,14 @@ jobs:
       - name: Run pip-audit
         run: |
           pip-audit \
-          --ignore-vuln PYSEC-2024-48 \ # black ReDoS, we dont use black in prod
-          --ignore-vuln GHSA-jw8x-6495-233v \ # sklearn TfidfVectorizer leak in github actions
-          --ignore-vuln GHSA-4hq2-rpgc-r8r7 \ # litestar repo CI/CD
-          --ignore-vuln PYSEC-2024-38  # fastapi ReDoS(needed by chromadb). we don't use fastapi
+          # black ReDoS, we dont use black in prod
+          --ignore-vuln PYSEC-2024-48 \
+          # sklearn TfidfVectorizer leak in github actions
+          --ignore-vuln GHSA-jw8x-6495-233v \
+          # litestar repo CI/CD
+          --ignore-vuln GHSA-4hq2-rpgc-r8r7 \
+          # fastapi ReDoS(needed by chromadb). we don't use fastapi
+          --ignore-vuln PYSEC-2024-38
       - name: Run Tests
         run: python -m pytest --durations=50
   test:

From 70dfe5716f459d6aef059d33878523784dc84751 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 20:01:41 +0400
Subject: [PATCH 60/63] pip audit

---
 .github/workflows/main.yml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f6f3c5a68a..d5a37961b0 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -119,13 +119,9 @@ jobs:
       - name: Run pip-audit
         run: |
           pip-audit \
-          # black ReDoS, we dont use black in prod
-          --ignore-vuln PYSEC-2024-48 \
-          # sklearn TfidfVectorizer leak in github actions
+          \ # sklearn TfidfVectorizer leak in github actions
           --ignore-vuln GHSA-jw8x-6495-233v \
-          # litestar repo CI/CD
-          --ignore-vuln GHSA-4hq2-rpgc-r8r7 \
-          # fastapi ReDoS(needed by chromadb). we don't use fastapi
+          \ # fastapi ReDoS(needed by chromadb). we don't use fastapi
           --ignore-vuln PYSEC-2024-38
       - name: Run Tests
         run: python -m pytest --durations=50

From 71f236bd41aa28de095cfe1fa6c0423d9b4414c2 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Mon, 14 Oct 2024 20:20:08 +0400
Subject: [PATCH 61/63] reg

---
 src/evidently/experimental/dataset_generators/_registry.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/evidently/experimental/dataset_generators/_registry.py b/src/evidently/experimental/dataset_generators/_registry.py
index 9f54d6a119..74a027ac6a 100644
--- a/src/evidently/experimental/dataset_generators/_registry.py
+++ b/src/evidently/experimental/dataset_generators/_registry.py
@@ -17,12 +17,12 @@
 register_type_alias(
     DataCollectionProvider,
     "evidently.experimental.dataset_generators.llm.index.ChunksDataCollectionProvider",
-    "evidently:data_provider:ChunksDataCollectionProvider",
+    "evidently:data_collecton_provider:ChunksDataCollectionProvider",
 )
 register_type_alias(
     DataCollectionProvider,
     "evidently.experimental.dataset_generators.llm.index.FileDataCollectionProvider",
-    "evidently:data_provider:FileDataCollectionProvider",
+    "evidently:data_collecton_provider:FileDataCollectionProvider",
 )
 
 register_type_alias(

From 8ca6e53d4220e7dd66f1420fa0de18050f2765a4 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Tue, 15 Oct 2024 18:19:56 +0400
Subject: [PATCH 62/63] pip audit

---
 .github/workflows/main.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d5a37961b0..0532ebe5b2 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -119,9 +119,7 @@ jobs:
       - name: Run pip-audit
         run: |
           pip-audit \
-          \ # sklearn TfidfVectorizer leak in github actions
           --ignore-vuln GHSA-jw8x-6495-233v \
-          \ # fastapi ReDoS(needed by chromadb). we don't use fastapi
           --ignore-vuln PYSEC-2024-38
       - name: Run Tests
         run: python -m pytest --durations=50

From df8ca75234f5b8ac49f97c6ad46c509fc3026228 Mon Sep 17 00:00:00 2001
From: mike0sv <mike0sv@gmail.com>
Date: Wed, 16 Oct 2024 14:21:11 +0300
Subject: [PATCH 63/63] remove

---
 src/evidently/experimental/dataset_generators/llm/base.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/evidently/experimental/dataset_generators/llm/base.py b/src/evidently/experimental/dataset_generators/llm/base.py
index 112f71cfe0..9710610657 100644
--- a/src/evidently/experimental/dataset_generators/llm/base.py
+++ b/src/evidently/experimental/dataset_generators/llm/base.py
@@ -7,10 +7,7 @@
 from evidently.utils.llm.wrapper import get_llm_wrapper
 
 
-class BaseLLMDatasetGenerator(
-    # fixme WithLLMWrapper,
-    BaseDatasetGenerator
-):
+class BaseLLMDatasetGenerator(BaseDatasetGenerator):
     provider: str
     model: str
     _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)