Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
d0729f1
base
mike0sv Oct 9, 2024
3c59469
i am a banana
mike0sv Oct 9, 2024
57e975a
fix example
mike0sv Oct 9, 2024
5f85465
generate_dataset_from_docs
svetleo-evidently Oct 9, 2024
79d58b0
wip
mike0sv Oct 9, 2024
3927121
wip
mike0sv Oct 9, 2024
74660bf
wip
mike0sv Oct 9, 2024
9d19718
wip
mike0sv Oct 9, 2024
b827f8f
a draft code for a RAG dataset generation
emeli-dral Oct 9, 2024
8c94d2a
generate_dataset_from_docs
svetleo-evidently Oct 9, 2024
e537bed
generate_dataset_from_docs
svetleo-evidently Oct 9, 2024
da7aa09
generate_dataset_from_docs
svetleo-evidently Oct 9, 2024
94452be
readable version
emeli-dral Oct 9, 2024
24e1f8d
wip
mike0sv Oct 9, 2024
7c4f536
wip
mike0sv Oct 9, 2024
c95b6ac
wip
mike0sv Oct 9, 2024
180b822
generate_dataset_from_docs
svetleo-evidently Oct 9, 2024
426a1fa
prompts
mike0sv Oct 9, 2024
90cfe1a
Merge remote-tracking branch 'origin/feature/dataset-generator' into …
mike0sv Oct 9, 2024
7369993
generate_dataset_from_docs
svetleo-evidently Oct 9, 2024
bbbfd55
generate_dataset_from_docs
svetleo-evidently Oct 9, 2024
996aab7
fixes
mike0sv Oct 9, 2024
f229b66
Merge remote-tracking branch 'origin/feature/dataset-generator' into …
mike0sv Oct 9, 2024
3dee2c3
fixes
mike0sv Oct 9, 2024
761946e
generate_dataset_from_docs
svetleo-evidently Oct 9, 2024
e36c8ce
chunks count
mike0sv Oct 10, 2024
49e821a
Merge remote-tracking branch 'origin/feature/dataset-generator' into …
mike0sv Oct 10, 2024
8051121
async
mike0sv Oct 10, 2024
37257bc
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
bd4ba86
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
446b275
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
d37473b
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
983b040
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
8eb5017
WIP
mike0sv Oct 10, 2024
466d77b
fix
mike0sv Oct 10, 2024
27166ae
rename
mike0sv Oct 10, 2024
dbf8907
fix import
mike0sv Oct 10, 2024
48cc469
move system prompts into user
mike0sv Oct 10, 2024
7b02572
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
c5aed45
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
efdc72f
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
468d42e
generate_dataset_from_docs
svetleo-evidently Oct 10, 2024
cb1dd09
prompt function signature
mike0sv Oct 11, 2024
ba7f0d1
Merge remote-tracking branch 'origin/feature/dataset-generator' into …
mike0sv Oct 11, 2024
4100c75
function signature validation
mike0sv Oct 11, 2024
42c58ce
requirements
mike0sv Oct 11, 2024
cdcadd5
requirements
mike0sv Oct 11, 2024
64533cd
requirements
mike0sv Oct 11, 2024
9e61435
requirements
mike0sv Oct 11, 2024
dbf543f
requirements
mike0sv Oct 11, 2024
f697955
requirements
mike0sv Oct 11, 2024
cecf4c2
lil cleanup
mike0sv Oct 11, 2024
f22f1f9
mypy
mike0sv Oct 11, 2024
d3f9122
move, add splitter
mike0sv Oct 14, 2024
a793433
fix example and deps
mike0sv Oct 14, 2024
fc320ee
rename
mike0sv Oct 14, 2024
fa1563c
lint
mike0sv Oct 14, 2024
f0ed6d4
lint
mike0sv Oct 14, 2024
8d50d7b
audit
mike0sv Oct 14, 2024
bb0ecc7
type aliases and audit
mike0sv Oct 14, 2024
e093f54
type aliases and sudit and llm util refactor and stuff
mike0sv Oct 14, 2024
bfcacb0
fix import
mike0sv Oct 14, 2024
3041225
pip audit
mike0sv Oct 14, 2024
70dfe57
pip audit
mike0sv Oct 14, 2024
71f236b
reg
mike0sv Oct 14, 2024
8ca6e53
pip audit
mike0sv Oct 15, 2024
df8ca75
remove
mike0sv Oct 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,12 @@ jobs:
- name: Install minimal dependencies
run: pip install -r requirements.min.txt
- name: Install package
run: pip install -e .[dev,spark,fsspec]
run: pip install -e .[dev,spark,fsspec,llm]
- name: Run pip-audit
run: pip-audit --ignore-vuln PYSEC-2024-48 --ignore-vuln GHSA-jw8x-6495-233v --ignore-vuln GHSA-4hq2-rpgc-r8r7
run: |
pip-audit \
--ignore-vuln GHSA-jw8x-6495-233v \
--ignore-vuln PYSEC-2024-38
- name: Run Tests
run: python -m pytest --durations=50
test:
Expand Down Expand Up @@ -155,7 +158,7 @@ jobs:
uses: ./.github/share-actions/get-bikes-dataset-cached

- name: Install package
run: pip install -e .[dev,spark,fsspec]
run: pip install -e .[dev,spark,fsspec,llm]
- name: Run Tests
run: python -m pytest --durations=50

Expand All @@ -173,7 +176,7 @@ jobs:
cache: "pip"
cache-dependency-path: setup.py
- name: Install dependencies
run: pip install -e ".[dev]"
run: pip install -e .
- name: Install wheel
run: pip install wheel
- name: Build package
Expand Down
66 changes: 66 additions & 0 deletions examples/data_generators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from evidently.experimental.dataset_generators.llm.questions import QADatasetFromSeedGenerator, QADatasetGenerator
from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
from evidently.options.base import Options


def print_generated(generated):
    """Print every generated row as a question / answer / context group.

    ``generated`` is the DataFrame returned by a generator's ``generate()``.
    The "questions" column is always printed; "answers" and "context" are
    printed only when the row has them. A blank line follows each row.
    """
    for _, row in generated.iterrows():
        print("Q", row["questions"])
        # `in` on a row (pandas Series) tests the index labels, i.e. the column names.
        if "answers" in row:
            print("A", row["answers"])
        if "context" in row:
            print("C", row["context"])
        print()


def generate_from_file():
    """Generate questions from a PDF file and print them.

    The file is split into chunks by the "simple" splitter before being
    handed to the generator.
    """
    file_path = "../cloud_quickstart_tracing.pdf"
    data = DataCollectionProvider.from_files(file_path, chunk_size=50, chunk_overlap=20, splitter="simple")

    generator = QADatasetGenerator(
        data_collection=data,
        provider="openai",
        model="gpt-4o-mini",
        num_questions=5,
        options=Options.from_any_options(None),
    )
    print_generated(generator.generate())


def main():
    """Run two examples: generation from in-memory chunks, then from a seed question."""
    data = DataCollectionProvider.from_chunks(chunks=["I am a banana", "My spoon is too big"])
    generator = QADatasetGenerator(
        data_collection=data,
        provider="openai",
        model="gpt-4o-mini",
        num_questions=5,
        options=Options.from_any_options(None),
    )
    print_generated(generator.generate())

    generator = QADatasetFromSeedGenerator(
        seed_question="What is 'kek'?",
        num_questions=5,
        provider="openai",
        model="gpt-4o-mini",
        options=Options.from_any_options(None),
    )
    print_generated(generator.generate())


if __name__ == "__main__":
    main()
    # To run the PDF-based example instead, call generate_from_file() here.
1 change: 1 addition & 0 deletions requirements.min.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ openai==1.16.2
evaluate==0.4.1
transformers[torch]==4.39.3
sentence-transformers==2.7.0
chromadb==0.4.0
9 changes: 9 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,15 @@ ignore_missing_imports = True
[mypy-litellm.*]
ignore_missing_imports = True

[mypy-chromadb.*]
ignore_missing_imports = True

[mypy-llama_index.*]
ignore_missing_imports = True

[mypy-pypdf.*]
ignore_missing_imports = True

[tool:pytest]
testpaths=tests
python_classes=*Test
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
"types-python-dateutil==2.8.19",
"types-ujson>=5.4.0",
"pillow==10.3.0",
"httpx==0.24.1",
"httpx==0.27.0",
"ruff==0.3.7",
"pre-commit==3.5.0",
"pytest-asyncio==0.23.7",
Expand All @@ -102,6 +102,7 @@
"evaluate>=0.4.1",
"transformers[torch]>=4.39.3",
"sentence-transformers>=2.7.0",
"chromadb>=0.4.0",
],
"spark": ["pyspark>=3.4.0"],
"fsspec": [
Expand Down
3 changes: 3 additions & 0 deletions src/evidently/experimental/dataset_generators/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from . import _registry

__all__ = ["_registry"]
67 changes: 67 additions & 0 deletions src/evidently/experimental/dataset_generators/_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
from evidently.experimental.dataset_generators.llm.index import DataCollectionProvider
from evidently.experimental.dataset_generators.llm.splitter import Splitter
from evidently.pydantic_utils import register_type_alias
from evidently.utils.llm.prompts import PromptTemplate

# Every entry is (base type, dotted import path of the concrete class, type alias).
# The entries are registered in the order listed.
# NOTE(review): the "data_collecton_provider" spelling is reproduced as-is; it
# presumably has to match the alias declared on the classes themselves --
# confirm before correcting it.
_TYPE_ALIASES = (
    (
        BaseDatasetGenerator,
        "evidently.experimental.dataset_generators.llm.questions.QADatasetFromSeedGenerator",
        "evidently:dataset_generator:QADatasetFromSeedGenerator",
    ),
    (
        BaseDatasetGenerator,
        "evidently.experimental.dataset_generators.llm.questions.QADatasetGenerator",
        "evidently:dataset_generator:QADatasetGenerator",
    ),
    (
        DataCollectionProvider,
        "evidently.experimental.dataset_generators.llm.index.ChunksDataCollectionProvider",
        "evidently:data_collecton_provider:ChunksDataCollectionProvider",
    ),
    (
        DataCollectionProvider,
        "evidently.experimental.dataset_generators.llm.index.FileDataCollectionProvider",
        "evidently:data_collecton_provider:FileDataCollectionProvider",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.BaselineAnswerPromptTemplate",
        "evidently:prompt_template:BaselineAnswerPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.NaiveQuestionsFromContextPromptTemplate",
        "evidently:prompt_template:NaiveQuestionsFromContextPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromContextPromptTemplate",
        "evidently:prompt_template:QuestionsFromContextPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.QuestionsFromSeedPromptTemplate",
        "evidently:prompt_template:QuestionsFromSeedPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.ReformulateQuestionPromptTemplate",
        "evidently:prompt_template:ReformulateQuestionPromptTemplate",
    ),
    (
        PromptTemplate,
        "evidently.experimental.dataset_generators.llm.prompts.SimpleQuestionPromptTemplate",
        "evidently:prompt_template:SimpleQuestionPromptTemplate",
    ),
    (
        Splitter,
        "evidently.experimental.dataset_generators.llm.splitter.LlamaIndexSplitter",
        "evidently:splitter:LlamaIndexSplitter",
    ),
    (
        Splitter,
        "evidently.experimental.dataset_generators.llm.splitter.SimpleSplitter",
        "evidently:splitter:SimpleSplitter",
    ),
)

for _base_type, _class_path, _alias in _TYPE_ALIASES:
    register_type_alias(_base_type, _class_path, _alias)
21 changes: 21 additions & 0 deletions src/evidently/experimental/dataset_generators/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from abc import ABC
from abc import abstractmethod

import pandas as pd
from typing_extensions import TypeAlias

from evidently.options.base import Options
from evidently.pydantic_utils import EvidentlyBaseModel

DatasetGeneratorResult: TypeAlias = pd.DataFrame


class BaseDatasetGenerator(EvidentlyBaseModel, ABC):
    """Abstract base class for dataset generators.

    Concrete generators implement :meth:`generate`, which returns a
    ``DatasetGeneratorResult`` (an alias of ``pandas.DataFrame``).
    """

    class Config:
        # NOTE(review): presumably marks this class as the root type under which
        # subclasses are registered by type alias -- confirm in pydantic_utils.
        is_base_type = True

    # Options carried by the generator.
    options: Options

    @abstractmethod
    def generate(self) -> DatasetGeneratorResult:
        """Produce and return the generated dataset."""
        raise NotImplementedError()
Empty file.
22 changes: 22 additions & 0 deletions src/evidently/experimental/dataset_generators/llm/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from typing import Optional

from evidently._pydantic_compat import PrivateAttr
from evidently.experimental.dataset_generators.base import BaseDatasetGenerator
from evidently.options.base import Options
from evidently.utils.llm.wrapper import LLMWrapper
from evidently.utils.llm.wrapper import get_llm_wrapper


class BaseLLMDatasetGenerator(BaseDatasetGenerator):
    """Base class for dataset generators that work through an LLM wrapper.

    ``provider`` and ``model`` are passed to the module-level
    ``get_llm_wrapper`` to build the wrapper, which is created on first use
    and then cached on the instance.
    """

    provider: str
    model: str
    # Cached wrapper, declared through PrivateAttr; None until first requested.
    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)

    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
        """Return the cached LLM wrapper, building it on the first call.

        ``options`` is only used when the wrapper has not been built yet;
        once a wrapper is cached it is returned regardless of ``options``.
        """
        cached = self._llm_wrapper
        if cached is not None:
            return cached
        # Inside the method body this name resolves to the imported
        # module-level factory, not to this method.
        cached = get_llm_wrapper(self.provider, self.model, options)
        self._llm_wrapper = cached
        return cached

    @property
    def wrapper(self):
        """LLM wrapper obtained using this generator's own ``options``."""
        return self.get_llm_wrapper(self.options)
Loading