Skip to content

Commit 746d723

Browse files
mspronesti, shahules786, jjmachan
authored
feat(generator): extend construction to any langchain LLM and Embeddings (explodinggradients#670)
## **User description** The current version of `with_openai` contains a hardcoded instantiation of `langchain_openai.chat_models.ChatOpenAI`, which makes `TestsetGenerator` very limited and not compatible with completion models, Azure OpenAI models, and open-source models. This PR extends `TestsetGenerator` to any `BaseLanguageModel` and `Embeddings` from langchain for versatility, addressing explodinggradients#230, explodinggradients#342, explodinggradients#635, and explodinggradients#636. Lastly, I've removed all the occurrences of mutable default arguments (bad antipattern, read [here](https://docs.python-guide.org/writing/gotchas/#mutable-default-arguments)). --------- Co-authored-by: Shahules786 <[email protected]> Co-authored-by: jjmachan <[email protected]>
1 parent 7c4e7e6 commit 746d723

File tree

13 files changed

+365
-69
lines changed

13 files changed

+365
-69
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ cython_debug/
162162
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163163
# and can be added to the global gitignore or merged into this file. For a more nuclear
164164
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
165-
#.idea/
165+
.idea/
166166

167167
# Ragas specific
168168
experiments/

docs/alfred.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
from __future__ import annotations
22

3-
import os
4-
from collections import namedtuple
53
import argparse
64
import asyncio
7-
from tqdm.asyncio import tqdm
5+
import os
86
import typing as t
9-
from langchain_openai.chat_models import ChatOpenAI
10-
from langchain_core.language_models.chat_models import BaseChatModel
7+
from collections import namedtuple
8+
119
from langchain.prompts import ChatPromptTemplate
10+
from langchain_core.language_models.chat_models import BaseChatModel
11+
from langchain_openai.chat_models import ChatOpenAI
12+
from tqdm.asyncio import tqdm
1213

1314
File = namedtuple("File", "name content")
1415

docs/concepts/testset_generation.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,20 @@ Checkout [llama-index](https://gpt-index.readthedocs.io/en/stable/core_modules/d
6060
:caption: Customising test data distribution
6161
from ragas.testset.generator import TestsetGenerator
6262
from ragas.testset.evolutions import simple, reasoning, multi_context
63+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
6364
6465
# documents = load your documents
6566
6667
# generator with openai models
67-
generator = TestsetGenerator.with_openai()
68+
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
69+
critic_llm = ChatOpenAI(model="gpt-4")
70+
embeddings = OpenAIEmbeddings()
71+
72+
generator = TestsetGenerator.from_langchain(
73+
generator_llm,
74+
critic_llm,
75+
embeddings
76+
)
6877
6978
# Change resulting question type distribution
7079
distributions = {

docs/getstarted/testset_generation.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,18 @@ Now, we'll import and use Ragas' `TestsetGenerator` to quickly generate a synthe
4141
:caption: Create 10 samples using default configuration
4242
from ragas.testset.generator import TestsetGenerator
4343
from ragas.testset.evolutions import simple, reasoning, multi_context
44+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
4445
4546
# generator with openai models
46-
generator = TestsetGenerator.with_openai()
47+
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
48+
critic_llm = ChatOpenAI(model="gpt-4")
49+
embeddings = OpenAIEmbeddings()
50+
51+
generator = TestsetGenerator.from_langchain(
52+
generator_llm,
53+
critic_llm,
54+
embeddings
55+
)
4756
4857
# generate testset
4958
testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

docs/howtos/applications/compare_embeddings.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,24 @@ For this tutorial notebook, I am using papers from Semantic Scholar that is rela
2929
:caption: load documents using llama-hub and create test data
3030
from llama_index import download_loader
3131
from ragas.testset.evolutions import simple, reasoning, multi_context
32+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
3233
3334
SemanticScholarReader = download_loader("SemanticScholarReader")
3435
loader = SemanticScholarReader()
3536
query_space = "large language models"
3637
documents = loader.load_data(query=query_space, limit=100)
3738
3839
# generator with openai models
39-
generator = TestsetGenerator.with_openai()
40+
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
41+
critic_llm = ChatOpenAI(model="gpt-4")
42+
embeddings = OpenAIEmbeddings()
43+
44+
generator = TestsetGenerator.from_langchain(
45+
generator_llm,
46+
critic_llm,
47+
embeddings
48+
)
49+
4050
4151
distributions = {
4252
simple: 0.5,

docs/howtos/applications/compare_llms.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ from llama_index import download_loader, SimpleDirectoryReader
3535
from ragas.testset import TestsetGenerator
3636
from ragas.testset.generator import TestsetGenerator
3737
from ragas.testset.evolutions import simple, reasoning, multi_context
38+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
3839
3940
os.environ['OPENAI_API_KEY'] = 'Your OPEN AI key'
4041
@@ -43,7 +44,15 @@ reader = SimpleDirectoryReader("./arxiv-papers/",num_files_limit=30)
4344
documents = reader.load_data()
4445
4546
# generator with openai models
46-
generator = TestsetGenerator.with_openai()
47+
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
48+
critic_llm = ChatOpenAI(model="gpt-4")
49+
embeddings = OpenAIEmbeddings()
50+
51+
generator = TestsetGenerator.from_langchain(
52+
generator_llm,
53+
critic_llm,
54+
embeddings
55+
)
4756
4857
distributions = {
4958
simple: 0.5,

docs/howtos/applications/use_prompt_adaptation.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,18 @@ Now we can import all the required evolutions and adapt it using `generator.adap
125125
126126
from ragas.testset.generator import TestsetGenerator
127127
from ragas.testset.evolutions import simple, reasoning, multi_context,conditional
128+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
128129
129130
# generator with openai models
130-
generator = TestsetGenerator.with_openai()
131+
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
132+
critic_llm = ChatOpenAI(model="gpt-4")
133+
embeddings = OpenAIEmbeddings()
134+
135+
generator = TestsetGenerator.from_langchain(
136+
generator_llm,
137+
critic_llm,
138+
embeddings
139+
)
131140
132141
# adapt to language
133142
language = "hindi"

docs/howtos/customisations/azure-openai.ipynb

+75-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
"source": [
88
"# Using Azure OpenAI\n",
99
"\n",
10-
"This tutorial will show you how to use Azure OpenAI endpoints instead of OpenAI endpoints."
10+
"This tutorial will show you how to use Azure OpenAI endpoints instead of OpenAI endpoints.\n",
11+
"\n",
12+
"\n",
13+
"- [Evaluation](#load-sample-dataset)\n",
14+
"- [Test set generation](#test-set-generation)"
1115
]
1216
},
1317
{
@@ -416,6 +420,75 @@
416420
"\n",
417421
"If you have any suggestions/feedback/things you're not happy about, please do share them in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁"
418422
]
423+
},
424+
{
425+
"cell_type": "markdown",
426+
"id": "3cee41e9",
427+
"metadata": {},
428+
"source": [
429+
"### Test set generation\n",
430+
"\n",
431+
"Here you will learn how to generate a test set from your dataset using the Azure OpenAI endpoints."
432+
]
433+
},
434+
{
435+
"cell_type": "code",
436+
"execution_count": null,
437+
"id": "aa9ff398",
438+
"metadata": {},
439+
"outputs": [],
440+
"source": [
441+
"! git clone https://huggingface.co/datasets/explodinggradients/2023-llm-papers"
442+
]
443+
},
444+
{
445+
"cell_type": "code",
446+
"execution_count": 2,
447+
"id": "d935a561",
448+
"metadata": {},
449+
"outputs": [],
450+
"source": [
451+
"from langchain.document_loaders import DirectoryLoader\n",
452+
"from ragas.testset.generator import TestsetGenerator\n",
453+
"from ragas.testset.evolutions import simple, reasoning, multi_context\n",
454+
"\n",
455+
"\n",
456+
"loader = DirectoryLoader(\"./2023-llm-papers/\", use_multithreading=True, silent_errors=True,sample_size=1)\n",
457+
"documents = loader.load()\n",
458+
"\n",
459+
"for document in documents:\n",
460+
" document.metadata['filename'] = document.metadata['source']"
461+
]
462+
},
463+
{
464+
"cell_type": "markdown",
465+
"id": "c8f735a7",
466+
"metadata": {},
467+
"source": [
468+
"Use the `azure_model` and `azure_embeddings` that we initialized in the above section to generate the test set"
469+
]
470+
},
471+
{
472+
"cell_type": "code",
473+
"execution_count": null,
474+
"id": "04abc4b1",
475+
"metadata": {},
476+
"outputs": [],
477+
"source": [
478+
"generator = TestsetGenerator.from_langchain(generator_llm=azure_model,critic_llm=azure_model,embeddings=azure_embeddings)\n",
479+
"\n",
480+
"testset = generator.generate_with_langchain_docs(documents, test_size=10, \n",
481+
" raise_exceptions=False, with_debugging_logs=False,\n",
482+
" distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25}) "
483+
]
484+
},
485+
{
486+
"cell_type": "markdown",
487+
"id": "d2f5a7f7",
488+
"metadata": {},
489+
"source": [
490+
"testset.to_pandas()"
491+
]
419492
}
420493
],
421494
"metadata": {
@@ -434,7 +507,7 @@
434507
"name": "python",
435508
"nbconvert_exporter": "python",
436509
"pygments_lexer": "ipython3",
437-
"version": "3.10.12"
510+
"version": "3.10.8"
438511
}
439512
},
440513
"nbformat": 4,

0 commit comments

Comments
 (0)