diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e92b54ddfe..0532ebe5b2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -115,9 +115,12 @@ jobs: - name: Install minimal dependencies run: pip install -r requirements.min.txt - name: Install package - run: pip install -e .[dev,spark,fsspec] + run: pip install -e .[dev,spark,fsspec,llm] - name: Run pip-audit - run: pip-audit --ignore-vuln PYSEC-2024-48 --ignore-vuln GHSA-jw8x-6495-233v --ignore-vuln GHSA-4hq2-rpgc-r8r7 + run: | + pip-audit \ + --ignore-vuln GHSA-jw8x-6495-233v \ + --ignore-vuln PYSEC-2024-38 - name: Run Tests run: python -m pytest --durations=50 test: @@ -155,7 +158,7 @@ jobs: uses: ./.github/share-actions/get-bikes-dataset-cached - name: Install package - run: pip install -e .[dev,spark,fsspec] + run: pip install -e .[dev,spark,fsspec,llm] - name: Run Tests run: python -m pytest --durations=50 @@ -173,7 +176,7 @@ jobs: cache: "pip" cache-dependency-path: setup.py - name: Install dependencies - run: pip install -e ".[dev]" + run: pip install -e . - name: Install wheel run: pip install wheel - name: Build package diff --git a/examples/data_generators.py b/examples/data_generators.py new file mode 100644 index 0000000000..1cd8ef87e1 --- /dev/null +++ b/examples/data_generators.py @@ -0,0 +1,66 @@ +from evidently.experimental.dataset_generators.llm.questions import QADatasetFromSeedGenerator, QADatasetGenerator +from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider +from evidently.options.base import Options + + +def generate_from_file(): + file_path = "../cloud_quickstart_tracing.pdf" + data = DataCollectionProvider.from_files(file_path, chunk_size=50, chunk_overlap=20, splitter="simple") + + generator = QADatasetGenerator( + data_collection=data, + provider="openai", + model="gpt-4o-mini", + num_questions=5, + options=Options.from_any_options(None) + ) + generated = generator.generate() + for _, a in generated.iterrows(): + print("Q", a["questions"]) + if "answers" in a: + print("A", a["answers"]) + if "context" in a: + print("C", a["context"]) + print() + + +def main(): + data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"]) + generator = QADatasetGenerator( + data_collection=data, + provider="openai", + model="gpt-4o-mini", + num_questions=5, + options=Options.from_any_options(None) + ) + + generated = generator.generate() + for _, a in generated.iterrows(): + print("Q", a["questions"]) + if "answers" in a: + print("A", a["answers"]) + if "context" in a: + print("C", a["context"]) + print() + + generator = QADatasetFromSeedGenerator( + seed_question="What is 'kek'?", + num_questions=5, + provider="openai", + model="gpt-4o-mini", + options=Options.from_any_options(None) + ) + + generated = generator.generate() + for _, a in generated.iterrows(): + print("Q", a["questions"]) + if "answers" in a: + print("A", a["answers"]) + if "context" in a: + print("C", a["context"]) + print() + + +if __name__ == '__main__': + main() + # generate_from_file() diff --git a/requirements.min.txt b/requirements.min.txt index f858d8e507..e7a5d12f28 100644 --- a/requirements.min.txt +++ b/requirements.min.txt @@ -31,3 +31,4 @@ openai==1.16.2 evaluate==0.4.1 transformers[torch]==4.39.3 sentence-transformers==2.7.0 +chromadb==0.4.0 diff --git a/setup.cfg b/setup.cfg index 7f9f43d785..231d1f6f6c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -106,6 +106,15 @@ ignore_missing_imports = True [mypy-litellm.*] ignore_missing_imports = True 
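
The new `examples/data_generators.py` script above prints generated rows to stdout; since `QADatasetGenerator.generate()` returns a plain pandas `DataFrame` (see `DatasetGeneratorResult` in `base.py`), a minimal sketch of persisting its output for later review — the sample chunk and output path are illustrative only, and an OpenAI API key is assumed to be available in the environment:

```python
from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
from evidently.experimental.dataset_generators.llm.questions import QADatasetGenerator
from evidently.options.base import Options

# Build a tiny in-memory knowledge base and generate five Q&A pairs from it,
# then save them to disk instead of printing. Requires OPENAI_API_KEY in the env.
data = DataCollectionProvider.from_chunks(
    chunks=["Evidently is an open-source library for evaluating and monitoring ML models."]
)
generator = QADatasetGenerator(
    data_collection=data,
    provider="openai",
    model="gpt-4o-mini",
    num_questions=5,
    options=Options.from_any_options(None),
)

generated = generator.generate()  # DataFrame with "questions", "answers", "context" columns
generated.to_csv("qa_dataset.csv", index=False)  # output path is illustrative
print(f"Saved {len(generated)} generated question/answer pairs")
```
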
+[mypy-chromadb.*] +ignore_missing_imports = True + +[mypy-llama_index.*] +ignore_missing_imports = True + +[mypy-pypdf.*] +ignore_missing_imports = True + [tool:pytest] testpaths=tests python_classes=*Test diff --git a/setup.py b/setup.py index 46e9cd43aa..df67f1a052 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ "types-python-dateutil==2.8.19", "types-ujson>=5.4.0", "pillow==10.3.0", - "httpx==0.24.1", + "httpx==0.27.0", "ruff==0.3.7", "pre-commit==3.5.0", "pytest-asyncio==0.23.7", @@ -102,6 +102,7 @@ "evaluate>=0.4.1", "transformers[torch]>=4.39.3", "sentence-transformers>=2.7.0", + "chromadb>=0.4.0", ], "spark": ["pyspark>=3.4.0"], "fsspec": [ diff --git a/src/evidently/experimental/dataset_generators/__init__.py b/src/evidently/experimental/dataset_generators/__init__.py new file mode 100644 index 0000000000..4bfe1f7c80 --- /dev/null +++ b/src/evidently/experimental/dataset_generators/__init__.py @@ -0,0 +1,3 @@ +from . import _registry + +__all__ = ["_registry"] diff --git a/src/evidently/experimental/dataset_generators/_registry.py b/src/evidently/experimental/dataset_generators/_registry.py new file mode 100644 index 0000000000..74a027ac6a --- /dev/null +++ b/src/evidently/experimental/dataset_generators/_registry.py @@ -0,0 +1,67 @@ +from evidently.experimental.dataset_generators.base import BaseDatasetGenerator +from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider +from evidently.experimental.dataset_generators.llm.splitter import Splitter +from evidently.pydantic_utils import register_type_alias +from evidently.utils.llm.prompts import PromptTemplate + +register_type_alias( + BaseDatasetGenerator, + "evidently.experimental.dataset_generators.llm.questions.QADatasetFromSeedGenerator", + "evidently:dataset_generator:QADatasetFromSeedGenerator", +) +register_type_alias( + BaseDatasetGenerator, + "evidently.experimental.dataset_generators.llm.questions.QADatasetGenerator", + "evidently:dataset_generator:QADatasetGenerator", +) +register_type_alias( + DataCollectionProvider, + "evidently.experimental.dataset_generators.llm.index.ChunksDataCollectionProvider", + "evidently:data_collecton_provider:ChunksDataCollectionProvider", +) +register_type_alias( + DataCollectionProvider, + "evidently.experimental.dataset_generators.llm.index.FileDataCollectionProvider", + "evidently:data_collecton_provider:FileDataCollectionProvider", +) + +register_type_alias( + PromptTemplate, + "evidently.experimental.dataset_generators.llm.prompts.BaselineAnswerPromptTemplate", + "evidently:prompt_template:BaselineAnswerPromptTemplate", +) +register_type_alias( + PromptTemplate, + "evidently.experimental.dataset_generators.llm.prompts.NaiveQuestionsFromContextPromptTemplate", + "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate", +) +register_type_alias( + PromptTemplate, + "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromContextPromptTemplate", + "evidently:prompt_template:QuestionsFromContextPromptTemplate", +) +register_type_alias( + PromptTemplate, + "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromSeedPromptTemplate", + "evidently:prompt_template:QuestionsFromSeedPromptTemplate", +) +register_type_alias( + PromptTemplate, + "evidently.experimental.dataset_generators.llm.prompts.ReformulateQuestionPromptTemplate", + "evidently:prompt_template:ReformulateQuestionPromptTemplate", +) +register_type_alias( + PromptTemplate, + "evidently.experimental.dataset_generators.llm.prompts.SimpleQuestionPromptTemplate", + 
"evidently:prompt_template:SimpleQuestionPromptTemplate", +) +register_type_alias( + Splitter, + "evidently.experimental.dataset_generators.llm.splitter.LlamaIndexSplitter", + "evidently:splitter:LlamaIndexSplitter", +) +register_type_alias( + Splitter, + "evidently.experimental.dataset_generators.llm.splitter.SimpleSplitter", + "evidently:splitter:SimpleSplitter", +) diff --git a/src/evidently/experimental/dataset_generators/base.py b/src/evidently/experimental/dataset_generators/base.py new file mode 100644 index 0000000000..0aefc12c8e --- /dev/null +++ b/src/evidently/experimental/dataset_generators/base.py @@ -0,0 +1,21 @@ +from abc import ABC +from abc import abstractmethod + +import pandas as pd +from typing_extensions import TypeAlias + +from evidently.options.base import Options +from evidently.pydantic_utils import EvidentlyBaseModel + +DatasetGeneratorResult: TypeAlias = pd.DataFrame + + +class BaseDatasetGenerator(EvidentlyBaseModel, ABC): + class Config: + is_base_type = True + + options: Options + + @abstractmethod + def generate(self) -> DatasetGeneratorResult: + raise NotImplementedError diff --git a/src/evidently/experimental/dataset_generators/llm/__init__.py b/src/evidently/experimental/dataset_generators/llm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/evidently/experimental/dataset_generators/llm/base.py b/src/evidently/experimental/dataset_generators/llm/base.py new file mode 100644 index 0000000000..9710610657 --- /dev/null +++ b/src/evidently/experimental/dataset_generators/llm/base.py @@ -0,0 +1,22 @@ +from typing import Optional + +from evidently._pydantic_compat import PrivateAttr +from evidently.experimental.dataset_generators.base import BaseDatasetGenerator +from evidently.options.base import Options +from evidently.utils.llm.wrapper import LLMWrapper +from evidently.utils.llm.wrapper import get_llm_wrapper + + +class BaseLLMDatasetGenerator(BaseDatasetGenerator): + provider: str + model: str + _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None) + + def get_llm_wrapper(self, options: Options) -> LLMWrapper: + if self._llm_wrapper is None: + self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options) + return self._llm_wrapper + + @property + def wrapper(self): + return self.get_llm_wrapper(self.options) diff --git a/src/evidently/experimental/dataset_generators/llm/index.py b/src/evidently/experimental/dataset_generators/llm/index.py new file mode 100644 index 0000000000..1b5d2c2bf5 --- /dev/null +++ b/src/evidently/experimental/dataset_generators/llm/index.py @@ -0,0 +1,149 @@ +import abc +import glob +import os +from pathlib import Path +from typing import List +from typing import Optional + +import chromadb +from chromadb.types import Collection +from chromadb.utils import embedding_functions + +from evidently.experimental.dataset_generators.llm.splitter import AnySplitter +from evidently.experimental.dataset_generators.llm.splitter import Splitter +from evidently.pydantic_utils import EvidentlyBaseModel + +Chunk = str +DEFAULT_CHUNK_SIZE = 512 +DEFAULT_CHUNK_OVERLAP = 20 + + +def read_text(filename: str) -> str: + file_path = Path(filename) + if file_path.suffix.lower() == ".pdf": + try: + from pypdf import PdfReader + except ImportError as e: + raise ImportError("Please install pypdf to extract context from .pdf files") from e + reader = PdfReader(file_path) + text = "" + for page_num in range(len(reader.pages)): + page = reader.pages[page_num] + text += page.extract_text() + return text + else: + return 
Path(filename).read_text() + + +class DataCollectionProvider(EvidentlyBaseModel, abc.ABC): + class Config: + is_base_type = True + + chunk_size: int = DEFAULT_CHUNK_SIZE + chunk_overlap: int = DEFAULT_CHUNK_OVERLAP + splitter: AnySplitter = "llama_index" + + @abc.abstractmethod + def get_data_collection(self) -> "DataCollection": + raise NotImplementedError + + @classmethod + def from_files( + cls, + path: str, + chunk_size: int = DEFAULT_CHUNK_SIZE, + chunk_overlap: int = DEFAULT_CHUNK_OVERLAP, + splitter: AnySplitter = "llama_index", + ) -> "DataCollectionProvider": + return FileDataCollectionProvider( + path=path, chunk_size=chunk_size, chunk_overlap=chunk_overlap, splitter=splitter + ) + + @classmethod + def from_chunks(cls, chunks: List[str]): + return ChunksDataCollectionProvider(chunks=chunks) + + +class ChunksDataCollectionProvider(DataCollectionProvider): + class Config: + type_alias = "evidently:data_collecton_provider:ChunksDataCollectionProvider" + + chunks: List[Chunk] + + def get_data_collection(self): + dc = DataCollection(name="chunks", chunks=self.chunks) + dc.init_collection() + return dc + + +class FileDataCollectionProvider(DataCollectionProvider): + class Config: + type_alias = "evidently:data_collecton_provider:FileDataCollectionProvider" + + path: str + + def get_data_collection(self): + file_path = Path(self.path) + paths = [self.path] if file_path.is_file() else glob.glob(os.path.join(self.path, "*")) + + splitter = Splitter.from_any(self.splitter, self.chunk_size, self.chunk_overlap) + chunks = list(splitter.split([read_text(p) for p in paths])) + + data_collection = DataCollection(name=file_path.name, chunks=chunks) + data_collection.init_collection() + return data_collection + + +class DataCollection: + name: str + chunks: List[Chunk] + collection: Optional[Collection] = None + + def __init__(self, name: str, chunks: List[str], collection: Optional["Collection"] = None): + self.name = name + self.chunks = chunks + self.collection = collection + + def init_collection(self): + if self.collection is None: + # fixme: huggingface/tokenizers warns about clean_up_tokenization_spaces + import warnings + + os.environ["TOKENIZERS_PARALLELISM"] = "false" + warnings.filterwarnings("ignore", category=FutureWarning) + + default_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction( + model_name="all-MiniLM-L6-v2", + ) + chroma_client = chromadb.Client() + collection = chroma_client.get_or_create_collection( + name=self.name, + embedding_function=default_embedding_function, + ) + for i, chunk in enumerate(self.chunks): + collection.upsert( + ids=str(i), + documents=chunk, + ) + self.collection = collection + + def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]: + """ + Queries the collection with a given question and returns the relevant text chunks. + + Args: + question (str): The query or question text to search for. + n_results (int): Number of results to retrieve. Default is 3. + + Returns: + List[Chunk]: A list of relevant text chunks. 
+ """ + if self.collection is None: + raise ValueError("Collection is not initialized") + results = self.collection.query( + query_texts=question, + n_results=min(n_results, len(self.chunks)), + ) + + relevant_chunks = [chunk for document in results["documents"] for chunk in document] + return relevant_chunks diff --git a/src/evidently/experimental/dataset_generators/llm/prompts.py b/src/evidently/experimental/dataset_generators/llm/prompts.py new file mode 100644 index 0000000000..bb38038f57 --- /dev/null +++ b/src/evidently/experimental/dataset_generators/llm/prompts.py @@ -0,0 +1,95 @@ +from typing import ClassVar +from typing import List + +from evidently.utils.llm.prompts import BlockPromptTemplate +from evidently.utils.llm.prompts import PromptBlock +from evidently.utils.llm.prompts import WithSystemPrompt +from evidently.utils.llm.prompts import llm_call + + +class SimpleQuestionPromptTemplate(BlockPromptTemplate): + class Config: + type_alias = "evidently:prompt_template:SimpleQuestionPromptTemplate" + + blocks: ClassVar = [ + "Please generate a {question_type} question about this:", + PromptBlock.input("context").anchored(), + PromptBlock.json_output(question="question text", answer="answer text"), + ] + question_type: str = "simple" + + +class QuestionsFromSeedPromptTemplate(BlockPromptTemplate): + class Config: + type_alias = "evidently:prompt_template:QuestionsFromSeedPromptTemplate" + + blocks: ClassVar = [ + """Write for me {number} alternative questions quite similar to the question you got. + The question: """, + PromptBlock.input("seed_question").anchored(), + PromptBlock.string_list_output("questions"), + ] + + @llm_call + def generate(self, seed_question: str, number: int) -> List[str]: ... + + +class QuestionsFromContextPromptTemplate(WithSystemPrompt, BlockPromptTemplate): + class Config: + type_alias = "evidently:prompt_template:QuestionsFromContextPromptTemplate" + + system_prompt: str = "You are an assistant who generates questions based on provided context" + + @llm_call + def generate_questions(self, context: str, number: int) -> List[str]: ... + + +class NaiveQuestionsFromContextPromptTemplate(QuestionsFromContextPromptTemplate): + class Config: + type_alias = "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate" + + blocks: ClassVar = [ + "Generate {number} conceptual questions based on the provided context and " + "can be answered from the information in the provided context.\n" + "Here is a context", + PromptBlock.input("context").anchored(), + "Remain faithful to the above context.\n" + "Avoid providing any preamble!\n" + "Avoid providing any closing statement!", + PromptBlock.string_list_output("questions"), + ] + + +class ReformulateQuestionPromptTemplate(QuestionsFromContextPromptTemplate): + class Config: + type_alias = "evidently:prompt_template:ReformulateQuestionPromptTemplate" + + blocks: ClassVar = [ + """Write for me {number} alternative questions quite similar to the question you got. 
+The question:""", + PromptBlock.input("context").anchored(), + PromptBlock.string_list_output("questions"), + ] + number: int + system_prompt: str = "You are a smart assistant who helps repharase questions" + + +class BaselineAnswerPromptTemplate(WithSystemPrompt, BlockPromptTemplate): + class Config: + type_alias = "evidently:prompt_template:BaselineAnswerPromptTemplate" + + blocks: ClassVar = [ + "Your task is to answer the following query:", + PromptBlock.input("question").anchored(), + "You have access to the following documents which are meant to provide context as you answer the query:", + PromptBlock.input("context").anchored(), + """Please remain faithful to the underlying context, +and deviate from it only if you haven't found the answer in the provided context. +Avoid providing any preamble! +Avoid providing any closing statement!""", + PromptBlock.string_output("answer"), + ] + system_prompt: str = "You are a helpful assistant that answer a given question directly without any preamble" + + @llm_call + def generate_answers(self, question: str, context: str): ... diff --git a/src/evidently/experimental/dataset_generators/llm/questions.py b/src/evidently/experimental/dataset_generators/llm/questions.py new file mode 100644 index 0000000000..263d7f5fd7 --- /dev/null +++ b/src/evidently/experimental/dataset_generators/llm/questions.py @@ -0,0 +1,75 @@ +import random +from typing import List +from typing import Sequence +from typing import Tuple + +import pandas as pd + +from evidently.experimental.dataset_generators.base import DatasetGeneratorResult +from evidently.experimental.dataset_generators.llm.base import BaseLLMDatasetGenerator +from evidently.experimental.dataset_generators.llm.index import Chunk +from evidently.experimental.dataset_generators.llm.index import DataCollection +from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider +from evidently.experimental.dataset_generators.llm.prompts import BaselineAnswerPromptTemplate +from evidently.experimental.dataset_generators.llm.prompts import NaiveQuestionsFromContextPromptTemplate +from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromContextPromptTemplate +from evidently.experimental.dataset_generators.llm.prompts import QuestionsFromSeedPromptTemplate + +Question = str +Answer = str +GeneratedQuestion = Tuple[Question, Answer, Chunk] +ChunkSet = List[Chunk] + + +class QADatasetGenerator(BaseLLMDatasetGenerator): + class Config: + type_alias = "evidently:dataset_generator:QADatasetGenerator" + + data_collection: DataCollectionProvider + num_questions: int + questions: QuestionsFromContextPromptTemplate = NaiveQuestionsFromContextPromptTemplate() + answers: BaselineAnswerPromptTemplate = BaselineAnswerPromptTemplate() + + def generate(self) -> DatasetGeneratorResult: + documents = self.data_collection.get_data_collection() + chunk_set_count, chunks_in_set_count, questions_per_chunkset = self.get_chunks_and_question_count() + chunk_sets = self.generate_chunksets(documents, chunk_set_count, chunks_in_set_count) + questions: List[Question] = self.generate_questions(chunk_sets, questions_per_chunkset) + relevant_chunks = [documents.find_relevant_chunks(q) for q in questions] + answers = self.generate_answers(questions, relevant_chunks) + return pd.DataFrame({"questions": questions, "answers": answers, "context": relevant_chunks}) + + def get_chunks_and_question_count(self) -> Tuple[int, int, int]: + return 1, 1, self.num_questions + + def generate_chunksets(self, 
documents: DataCollection, count: int, chunks_per_set: int) -> List[ChunkSet]: + return [[random.choice(documents.chunks) for _ in range(chunks_per_set)] for _ in range(count)] + + def generate_questions(self, chunk_sets: Sequence[List[Chunk]], questions_per_chunkset: int) -> List[Question]: + questions = self.wrapper.run_batch_sync( + self.questions.generate_questions(context="\n\n".join(chunks), number=questions_per_chunkset) + for chunks in chunk_sets + ) + return [q for qs in questions for q in qs] + + def generate_answers(self, questions: List[Question], relevant_chunks: List[List[Chunk]]) -> List[str]: + return self.wrapper.run_batch_sync( + self.answers.generate_answers(question=question, context="\n".join(chunks)) + for question, chunks in zip(questions, relevant_chunks) + ) + + +class QADatasetFromSeedGenerator(BaseLLMDatasetGenerator): + class Config: + type_alias = "evidently:dataset_generator:QADatasetFromSeedGenerator" + + seed_question: str + num_questions: int + prompt: QuestionsFromSeedPromptTemplate = QuestionsFromSeedPromptTemplate() + + def generate(self) -> DatasetGeneratorResult: + response = self.wrapper.run_sync( + self.prompt.generate(number=self.num_questions, seed_question=self.seed_question) + ) + + return pd.DataFrame({"questions": response}) diff --git a/src/evidently/experimental/dataset_generators/llm/splitter.py b/src/evidently/experimental/dataset_generators/llm/splitter.py new file mode 100644 index 0000000000..e4b775eb29 --- /dev/null +++ b/src/evidently/experimental/dataset_generators/llm/splitter.py @@ -0,0 +1,130 @@ +import re +from abc import ABC +from abc import abstractmethod +from enum import Enum +from typing import ClassVar +from typing import List +from typing import Optional +from typing import Sequence +from typing import Union + +from evidently._pydantic_compat import PrivateAttr +from evidently.pydantic_utils import EvidentlyBaseModel + + +class TextSource: + @classmethod + def from_any(cls, text_source: "AnyTextSource"): + if isinstance(text_source, TextSource): + return text_source + if isinstance(text_source, str): + return StrSource(text_source) + raise NotImplementedError(f"Cannot create TextSource from {text_source.__class__.__name__}") + + @abstractmethod + def get_text(self) -> str: + raise NotImplementedError + + +class StrSource(TextSource): + def __init__(self, value: str): + self.value = value + + def get_text(self) -> str: + return self.value + + +AnyTextSource = Union[str, bytes, TextSource] + +Chunk = str +Split = str + + +class Splitters(str, Enum): + Simple = "simple" + LlamaIndex = "llama_index" + + +AnySplitter = Union[str, Splitters, "Splitter"] + + +class Splitter(EvidentlyBaseModel, ABC): + class Config: + is_base_type = True + + chunk_size: int + chunk_overlap: int + + def split(self, texts: Union[AnyTextSource, List[AnyTextSource]]) -> Sequence[Chunk]: + if not isinstance(texts, list): + texts = [texts] + + for text in texts: + yield from self.split_text(TextSource.from_any(text)) + + @abstractmethod + def split_text(self, text: TextSource) -> Sequence[Chunk]: + raise NotImplementedError + + @classmethod + def from_any(cls, splitter: AnySplitter, chunk_size: int, chunk_overlap: int, **kwargs): + if isinstance(splitter, Splitter): + return splitter + if isinstance(splitter, str): + splitter = Splitters(splitter) + if isinstance(splitter, Splitters): + if splitter == Splitters.Simple: + return SimpleSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + if splitter == Splitters.LlamaIndex: + return 
LlamaIndexSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs) + raise ValueError(f"Unknown splitter {splitter}") + raise NotImplementedError(f"Cannot create splitter from {splitter.__class__.__name__}") + + +class SimpleSplitter(Splitter): + class Config: + type_alias = "evidently:splitter:SimpleSplitter" + + split_re: ClassVar = re.compile(r"([^,.;。?!]+[,.;。?!]?)") + + def split_text(self, text: TextSource) -> Sequence[Chunk]: + current_splits: List[str] = [] + current_size = 0 + for split in self.split_re.split(text.get_text()): + split_size = len(split) + if len(current_splits) > 0 and current_size + split_size > self.chunk_size: + yield "".join(current_splits) + while current_size > self.chunk_overlap and len(current_splits) > 0: + last, *current_splits = current_splits + last_size = len(last) + current_size -= last_size + current_size += split_size + current_splits.append(split) + if current_size > 0: + yield "".join(current_splits) + + +class LlamaIndexSplitter(Splitter): + class Config: + type_alias = "evidently:splitter:LlamaIndexSplitter" + + separator: str = " " + paragraph_separator: Optional[str] = None + _splitter = PrivateAttr(None) + + @property + def splitter(self): + if self._splitter is None: + from llama_index.core.node_parser import SentenceSplitter + from llama_index.core.node_parser.text.sentence import DEFAULT_PARAGRAPH_SEP + + self._splitter = SentenceSplitter( + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + separator=self.separator, + paragraph_separator=self.paragraph_separator or DEFAULT_PARAGRAPH_SEP, + ) + return self._splitter + + def split_text(self, text: TextSource) -> Sequence[Chunk]: + yield from self.splitter.split_text(text.get_text()) diff --git a/src/evidently/utils/llm/__init__.py b/src/evidently/utils/llm/__init__.py new file mode 100644 index 0000000000..4bfe1f7c80 --- /dev/null +++ b/src/evidently/utils/llm/__init__.py @@ -0,0 +1,3 @@ +from . 
import _registry + +__all__ = ["_registry"] diff --git a/src/evidently/utils/llm/_registry.py b/src/evidently/utils/llm/_registry.py new file mode 100644 index 0000000000..63f06a4ade --- /dev/null +++ b/src/evidently/utils/llm/_registry.py @@ -0,0 +1,21 @@ +from evidently.pydantic_utils import register_type_alias +from evidently.utils.llm.prompts import PromptBlock +from evidently.utils.llm.prompts import PromptTemplate + +register_type_alias(PromptBlock, "evidently.utils.llm.prompts.Anchor", "evidently:prompt_block:Anchor") +register_type_alias( + PromptBlock, "evidently.utils.llm.prompts.JsonOutputFormatBlock", "evidently:prompt_block:JsonOutputFormatBlock" +) +register_type_alias( + PromptBlock, "evidently.utils.llm.prompts.NoopOutputFormat", "evidently:prompt_block:NoopOutputFormat" +) +register_type_alias(PromptBlock, "evidently.utils.llm.prompts.SimpleBlock", "evidently:prompt_block:SimpleBlock") +register_type_alias( + PromptBlock, "evidently.utils.llm.prompts.StringFormatBlock", "evidently:prompt_block:StringFormatBlock" +) +register_type_alias( + PromptBlock, "evidently.utils.llm.prompts.StringListFormatBlock", "evidently:prompt_block:StringListFormatBlock" +) +register_type_alias( + PromptTemplate, "evidently.utils.llm.prompts.BlockPromptTemplate", "evidently:prompt_template:BlockPromptTemplate" +) diff --git a/src/evidently/utils/llm/base.py b/src/evidently/utils/llm/base.py new file mode 100644 index 0000000000..2abf77b571 --- /dev/null +++ b/src/evidently/utils/llm/base.py @@ -0,0 +1,20 @@ +import dataclasses +from typing import Any +from typing import Dict + + +@dataclasses.dataclass +class LLMMessage: + role: str + content: str + + @classmethod + def user(cls, message: str): + return LLMMessage("user", message) + + @classmethod + def system(cls, message: str): + return LLMMessage("system", message) + + +LLMResponse = Dict[str, Any] diff --git a/src/evidently/utils/llm/errors.py b/src/evidently/utils/llm/errors.py new file mode 100644 index 0000000000..606fb62542 --- /dev/null +++ b/src/evidently/utils/llm/errors.py @@ -0,0 +1,13 @@ +from evidently.errors import EvidentlyError + + +class EvidentlyLLMError(EvidentlyError): + pass + + +class LLMResponseParseError(EvidentlyLLMError): + pass + + +class LLMRequestError(EvidentlyLLMError): + pass diff --git a/src/evidently/utils/llm/prompts.py b/src/evidently/utils/llm/prompts.py new file mode 100644 index 0000000000..bc0eed4749 --- /dev/null +++ b/src/evidently/utils/llm/prompts.py @@ -0,0 +1,275 @@ +import inspect +import json +import re +from abc import ABC +from abc import abstractmethod +from functools import wraps +from typing import Any +from typing import Callable +from typing import ClassVar +from typing import Dict +from typing import Generic +from typing import Iterator +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Type +from typing import TypeVar +from typing import Union + +import typing_inspect + +from evidently.pydantic_utils import EvidentlyBaseModel +from evidently.utils.llm.base import LLMMessage +from evidently.utils.llm.errors import LLMResponseParseError +from evidently.utils.llm.wrapper import LLMRequest + +TResult = TypeVar("TResult") + + +class PromptBlock(EvidentlyBaseModel): + class Config: + is_base_type = True + + def render(self): + # ))) + result = self._render() + for field in self.__fields__: + placeholder = f"{{{field}}}" + if placeholder in result: + result = result.replace(placeholder, getattr(self, field)) + return 
result + + @abstractmethod + def _render(self) -> str: + raise NotImplementedError + + @classmethod + def simple(cls, value: str): + return SimpleBlock(value=value) + + @classmethod + def input(cls, placeholder_name: str = "input"): + return SimpleBlock(value=f"{{{placeholder_name}}}") + + @classmethod + def json_output(cls, **fields: Union[str, Tuple[str, str]]): + return JsonOutputFormatBlock(fields=fields) + + @classmethod + def string_list_output(cls, of_what: str): + return StringListFormatBlock(of_what=of_what) + + @classmethod + def string_output(cls, what: str): + return StringFormatBlock(what=what) + + def anchored(self, start: str = "__start__", end: str = "__end__"): + return Anchor(start=start, block=self, end=end) + + +class Anchor(PromptBlock): + class Config: + type_alias = "evidently:prompt_block:Anchor" + + start: str + block: PromptBlock + end: str + + def _render(self) -> str: + return f"{self.start}\n{self.block.render()}\n{self.end}" + + +class SimpleBlock(PromptBlock): + class Config: + type_alias = "evidently:prompt_block:SimpleBlock" + + value: str + + def _render(self) -> str: + return self.value + + +class OutputFormatBlock(PromptBlock, ABC, Generic[TResult]): + @abstractmethod + def parse_response(self, response: str) -> TResult: + raise NotImplementedError + + +class NoopOutputFormat(OutputFormatBlock[str]): + class Config: + type_alias = "evidently:prompt_block:NoopOutputFormat" + + def _render(self) -> str: + return "" + + def parse_response(self, response: str) -> str: + return response + + +class JsonOutputFormatBlock(OutputFormatBlock[Dict[str, Any]]): + class Config: + type_alias = "evidently:prompt_block:JsonOutputFormatBlock" + + fields: Dict[str, Union[Tuple[str, str], str]] + + def _render(self) -> str: + values = [] + example_rows = [] + for field, descr in self.fields.items(): + if isinstance(descr, tuple): + descr, field_key = descr + else: + field_key = field + values.append(field) + example_rows.append(f'"{field_key}": "{descr}"') + + example_rows_str = "\n".join(example_rows) + return f"Return {', '.join(values)} formatted as json without formatting as follows:\n{{{{\n{example_rows_str}\n}}}}" + + def parse_response(self, response: str) -> Dict[str, Any]: + try: + return json.loads(response) + except json.JSONDecodeError as e: + raise LLMResponseParseError(f"Failed to parse response '{response}' as json") from e + + +class StringListFormatBlock(OutputFormatBlock[List[str]]): + class Config: + type_alias = "evidently:prompt_block:StringListFormatBlock" + + of_what: str + + def _render(self) -> str: + return f"""Return a list of {self.of_what}. 
+This should be only a list of string {self.of_what}, each one on a new line with no enumeration""" + + def parse_response(self, response: str) -> List[str]: + return response.split("\n") + + +class StringFormatBlock(OutputFormatBlock[str]): + class Config: + type_alias = "evidently:prompt_block:StringFormatBlock" + + what: str + + def _render(self) -> str: + return f"""Return {self.what} only.""" + + def parse_response(self, response: str) -> str: + return response + + +def llm_call(f: Callable) -> Callable[..., LLMRequest]: + sig = inspect.getfullargspec(f) + response_type = sig.annotations.get("return", str) + + @wraps(f) + def inner(self: PromptTemplate, *args, **kwargs): + kwargs = inspect.getcallargs(f, *args, **kwargs, self=self) + del kwargs["self"] + template = self.get_template() + placeholders = self.list_placeholders(template) + if set(placeholders) != set(kwargs.keys()): + raise TypeError( + f"{f} arg signature ({list(kwargs)}) does not correspond to placeholders in prompt ({placeholders})" + ) + + output_format = self.get_output_format() + prompt_response_type = _get_genric_arg(output_format.__class__) + if prompt_response_type != response_type: + raise TypeError( + f"{f} response type ({response_type}) does not correspond to prompt output type {prompt_response_type}" + ) + + # todo: validate kwargs against sig.annotations + # todo: define response parser with validation against response_type + + return LLMRequest( + messages=self.get_messages(kwargs, template=template), + response_parser=self.parse, + response_type=response_type, + ) + + return inner + + +def _get_genric_arg(cls: Type): + return typing_inspect.get_args(next(b for b in cls.__orig_bases__ if typing_inspect.is_generic_type(b)))[0] + + +placeholders_re = re.compile(r"\{([a-zA-Z0-9_]+)}") + + +class PromptTemplate(EvidentlyBaseModel): + class Config: + is_base_type = True + + # __run_func__ : ClassVar[Callable] + @abstractmethod + def get_blocks(self) -> Sequence[PromptBlock]: + raise NotImplementedError + + def iterate(self, values: Sequence[Dict[str, str]]) -> Iterator[str]: + template = self.get_template() + for vals in values: + yield self.render(vals, template) + + def render(self, values: dict, template: Optional[str] = None): + return (template or self.get_template()).format(**values) + + def get_template(self) -> str: + return "\n".join(block.render() for block in self.get_blocks()) + + def list_placeholders(self, template: Optional[str] = None): + template = template or self.get_template() + return list(placeholders_re.findall(template)) + + def get_output_format(self) -> OutputFormatBlock: + output: Optional[OutputFormatBlock] = next( + (b for b in self.get_blocks() if isinstance(b, OutputFormatBlock)), None + ) + return output if output is not None else NoopOutputFormat() # type: ignore[return-value] + + def parse(self, response: str, keys: Optional[List[str]] = None) -> Dict[str, Any]: + output = self.get_output_format() + parsed = output.parse_response(response) + if keys is not None and set(keys) != set(parsed.keys()): + raise LLMResponseParseError(f"Keys {keys} are required but got {list(parsed.keys())}") + return parsed + + def get_messages(self, values, template: Optional[str] = None) -> List[LLMMessage]: + return [LLMMessage.user(self.render(values, template))] + + +class WithSystemPrompt(PromptTemplate, ABC): + system_prompt: str + + def get_messages(self, values, template: Optional[str] = None) -> List[LLMMessage]: + msgs = super().get_messages(values, template) + msgs.insert(0, 
LLMMessage.system(self.system_prompt)) + return msgs + + +AnyBlock = Union[str, PromptBlock, Callable] + + +class BlockPromptTemplate(PromptTemplate): + class Config: + type_alias = "evidently:prompt_template:BlockPromptTemplate" + + blocks: ClassVar[List[AnyBlock]] + + def get_blocks(self) -> Sequence[PromptBlock]: + return [self._to_block(b) for b in self.blocks] + + def _to_block(self, block: AnyBlock) -> PromptBlock: + if isinstance(block, PromptBlock): + return block + if isinstance(block, str): + return PromptBlock.simple(block) + # if callable(block): todo + # return PromptBlock.func(block) + raise NotImplementedError(f"Cannot create promt block from {block}") diff --git a/src/evidently/utils/llm/wrapper.py b/src/evidently/utils/llm/wrapper.py new file mode 100644 index 0000000000..ef26cdb68d --- /dev/null +++ b/src/evidently/utils/llm/wrapper.py @@ -0,0 +1,215 @@ +import asyncio +import dataclasses +import datetime +from abc import ABC +from abc import abstractmethod +from asyncio import Lock +from asyncio import Semaphore +from asyncio import sleep +from typing import Callable +from typing import ClassVar +from typing import Dict +from typing import Generic +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Type +from typing import TypeVar + +from evidently._pydantic_compat import SecretStr +from evidently.options.base import Options +from evidently.options.option import Option +from evidently.ui.base import sync_api +from evidently.utils.llm.base import LLMMessage +from evidently.utils.llm.errors import LLMRequestError + +TResult = TypeVar("TResult") + + +class RateLimiter: + def __init__(self, rate: Optional[int], interval: datetime.timedelta): + self.rate = rate + self.interval = interval + self.enters: List[datetime.datetime] = [] + self.lock = Lock() + + async def __aenter__(self): + if self.rate is None: + return + while True: + async with self.lock: + await self._clean() + if len(self.enters) < self.rate: + self.enters.append(datetime.datetime.now()) + break + await sleep(0.1) + + async def __aexit__(self, exc_type, exc_val, exc_tb): + pass + + async def _clean(self): + now = datetime.datetime.now() + self.enters = [e for e in self.enters if now - e < self.interval] + + +@dataclasses.dataclass +class LLMRequest(Generic[TResult]): + messages: List[LLMMessage] + response_parser: Callable[[str], TResult] + response_type: Type[TResult] + retries: int = 1 + + +class LLMWrapper(ABC): + __used_options__: ClassVar[List[Type[Option]]] = [] + + @abstractmethod + async def complete(self, messages: List[LLMMessage]) -> str: + raise NotImplementedError + + async def complete_batch( + self, messages_batch: List[List[LLMMessage]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None + ) -> List[str]: + if batch_size is None: + batch_size = self.get_batch_size() + if rpm_limit is None: + rpm_limit = self.get_rpm_limit() + rate_limiter = RateLimiter(rate=rpm_limit, interval=datetime.timedelta(minutes=1)) + semaphore = Semaphore(batch_size) + + async def work(messages: List[LLMMessage]) -> str: + async with semaphore, rate_limiter: + return await self.complete(messages) + + return await asyncio.gather(*[work(msgs) for msgs in messages_batch]) + + async def run(self, request: LLMRequest[TResult]) -> TResult: + num_retries = request.retries + error = None + while num_retries >= 0: + num_retries -= 1 + try: + response = await self.complete(request.messages) + return request.response_parser(response) + 
except Exception as e: + error = e + raise error + + async def run_batch( + self, requests: Sequence[LLMRequest[TResult]], batch_size: Optional[int] = None, rpm_limit: Optional[int] = None + ) -> List[TResult]: + if batch_size is None: + batch_size = self.get_batch_size() + if rpm_limit is None: + rpm_limit = self.get_rpm_limit() + rate_limiter = RateLimiter(rate=rpm_limit, interval=datetime.timedelta(minutes=1)) + semaphore = Semaphore(batch_size) + + async def work(request: LLMRequest[TResult]) -> TResult: + async with semaphore, rate_limiter: + return await self.run(request) + + return await asyncio.gather(*[work(r) for r in requests]) + + def get_batch_size(self) -> int: + return 100 + + def get_rpm_limit(self) -> Optional[int]: + return None + + def get_used_options(self) -> List[Type[Option]]: + return self.__used_options__ + + complete_batch_sync = sync_api(complete_batch) + run_sync = sync_api(run) + run_batch_sync = sync_api(run_batch) + + +LLMProvider = str +LLMModel = str +LLMWrapperProvider = Callable[[LLMModel, Options], LLMWrapper] +_wrappers: Dict[Tuple[LLMProvider, Optional[LLMModel]], LLMWrapperProvider] = {} + + +def llm_provider(name: LLMProvider, model: Optional[LLMModel]): + def dec(f: LLMWrapperProvider): + _wrappers[(name, model)] = f + return f + + return dec + + +def get_llm_wrapper(provider: LLMProvider, model: LLMModel, options: Options) -> LLMWrapper: + key: Tuple[str, Optional[str]] = (provider, model) + if key in _wrappers: + return _wrappers[key](model, options) + key = (provider, None) + if key in _wrappers: + return _wrappers[key](model, options) + raise ValueError(f"LLM wrapper for provider {provider} model {model} not found") + + +class OpenAIKey(Option): + api_key: Optional[SecretStr] = None + rpm_limit: int = 500 + + def __init__(self, api_key: Optional[str] = None): + self.api_key = SecretStr(api_key) if api_key is not None else None + super().__init__() + + def get_api_key(self) -> Optional[str]: + if self.api_key is None: + return None + return self.api_key.get_secret_value() + + +@llm_provider("openai", None) +class OpenAIWrapper(LLMWrapper): + __used_options__: ClassVar = [OpenAIKey] + + def __init__(self, model: str, options: Options): + import openai + + self.model = model + self.options = options.get(OpenAIKey) + self._clients: Dict[int, openai.AsyncOpenAI] = {} + + @property + def client(self): + import openai + + try: + loop = asyncio.get_running_loop() + except RuntimeError as e: + raise RuntimeError("Cannot access OpenAIWrapper client without loop") from e + loop_id = id(loop) + if loop_id not in self._clients: + self._clients[loop_id] = openai.AsyncOpenAI(api_key=self.options.get_api_key()) + return self._clients[loop_id] + + async def complete(self, messages: List[LLMMessage]) -> str: + import openai + + messages = [{"role": msg.role, "content": msg.content} for msg in messages] + try: + response = await self.client.chat.completions.create(model=self.model, messages=messages) # type: ignore[arg-type] + except openai.OpenAIError as e: + raise LLMRequestError("Failed to call OpenAI complete API") from e + content = response.choices[0].message.content + assert content is not None # todo: better error + return content + + def get_rpm_limit(self) -> Optional[int]: + return self.options.rpm_limit + + +@llm_provider("litellm", None) +class LiteLLMWrapper(LLMWrapper): + def __init__(self, model: str): + self.model = model + + async def complete(self, messages: List[LLMMessage]) -> str: + from litellm import completion + + return 
completion(model=self.model, messages=messages).choices[0].message.content diff --git a/tests/dataset_generator/__init__.py b/tests/dataset_generator/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_pydantic_aliases.py b/tests/test_pydantic_aliases.py index 488322edd3..0cd96d923c 100644 --- a/tests/test_pydantic_aliases.py +++ b/tests/test_pydantic_aliases.py @@ -16,6 +16,9 @@ from evidently.base_metric import MetricResult from evidently.collector.config import CollectorTrigger from evidently.collector.storage import CollectorStorage +from evidently.experimental.dataset_generators.base import BaseDatasetGenerator +from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider +from evidently.experimental.dataset_generators.llm.splitter import Splitter from evidently.features.generated_features import BaseDescriptor from evidently.features.generated_features import GeneratedFeatures from evidently.features.llm_judge import BaseLLMPromptTemplate @@ -32,6 +35,8 @@ from evidently.tests.base_test import TestParameters from evidently.ui.components.base import Component from evidently.ui.dashboards.base import DashboardPanel +from evidently.utils.llm.prompts import PromptBlock +from evidently.utils.llm.prompts import PromptTemplate T = TypeVar("T") @@ -105,6 +110,11 @@ def test_all_aliases_correct(): CollectorStorage: "collector_storage", BaseLLMPromptTemplate: "prompt_template", DashboardPanel: "dashboard_panel", + BaseDatasetGenerator: "dataset_generator", + Splitter: "splitter", + DataCollectionProvider: "data_collecton_provider", + PromptBlock: "prompt_block", + PromptTemplate: "prompt_template", } skip = [Component] skip_literal = [EvidentlyBaseModel, WithTestAndMetricDependencies, BasePreset]
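
The registered `Splitter` aliases above cover both the llama_index-backed and the regex-based implementations; a small sketch of exercising the pure-Python `SimpleSplitter` through `Splitter.from_any`, with no LLM or chromadb dependency — the sample text and chunk parameters are illustrative only:

```python
from evidently.experimental.dataset_generators.llm.splitter import Splitter

# Resolve the string name "simple" into a SimpleSplitter instance and inspect
# how a short passage is chunked with a given chunk_size/chunk_overlap.
splitter = Splitter.from_any("simple", chunk_size=40, chunk_overlap=10)

text = (
    "Evidently can generate synthetic Q&A datasets. "
    "Documents are split into chunks, indexed, "
    "and relevant chunks are retrieved as context for each generated question."
)

for i, chunk in enumerate(splitter.split(text)):
    print(i, repr(chunk))
```
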