13 | 13 | from deepeval.test_case.llm_test_case import LLMTestCaseParams
14 | 14 |
15 | 15 | test_case = ConversationalTestCase(
   | 16 | +    chatbot_role="A programmer",
16 | 17 |     turns=[
17 | 18 |         LLMTestCase(
18 | 19 |             input="Message input", actual_output="Message actual output"
19 | 20 |         )
20 |    | -    ]
   | 21 | +    ],
21 | 22 | )
22 | 23 | test_case2 = ConversationalTestCase(
23 | 24 |     turns=[

36 | 37 |
37 | 38 | from deepeval.metrics import GEval
38 | 39 |
39 |    | -correctness_metric = GEval(
40 |    | -    name="Correctness",
41 |    | -    criteria="Determine whether the actual output is factually correct based on the expected output.",
42 |    | -    # NOTE: you can only provide either criteria or evaluation_steps, and not both
43 |    | -    evaluation_steps=[
44 |    | -        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
45 |    | -        "You should also heavily penalize omission of detail",
46 |    | -        "Vague language, or contradicting OPINIONS, are OK",
47 |    | -    ],
48 |    | -    evaluation_params=[
49 |    | -        LLMTestCaseParams.INPUT,
50 |    | -        LLMTestCaseParams.ACTUAL_OUTPUT,
51 |    | -    ],
52 |    | -)
   | 40 | +# correctness_metric = GEval(
   | 41 | +#     name="Correctness",
   | 42 | +#     criteria="Determine whether the actual output is factually correct based on the expected output.",
   | 43 | +#     # NOTE: you can only provide either criteria or evaluation_steps, and not both
   | 44 | +#     evaluation_steps=[
   | 45 | +#         "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
   | 46 | +#         "You should also heavily penalize omission of detail",
   | 47 | +#         "Vague language, or contradicting OPINIONS, are OK",
   | 48 | +#     ],
   | 49 | +#     evaluation_params=[
   | 50 | +#         LLMTestCaseParams.INPUT,
   | 51 | +#         LLMTestCaseParams.ACTUAL_OUTPUT,
   | 52 | +#     ],
   | 53 | +# )
53 | 54 |
54 |    | -evaluate(
55 |    | -    test_cases=[
56 |    | -        LLMTestCase(
57 |    | -            input="Message input number 1!",
58 |    | -            actual_output="Message actual output number 1...",
59 |    | -            retrieval_context=["I love dogs"],
60 |    | -        ),
61 |    | -        LLMTestCase(
62 |    | -            input="Message input 2, this is just a test",
63 |    | -            actual_output="Message actual output 2, this is just a test",
64 |    | -            retrieval_context=["I love dogs"],
65 |    | -        ),
66 |    | -    ],
67 |    | -    metrics=[
68 |    | -        # correctness_metric,
69 |    | -        # AnswerRelevancyMetric(),
70 |    | -        # BiasMetric(),
71 |    | -        SummarizationMetric(verbose_mode=True, truths_extraction_limit=3),
72 |    | -        FaithfulnessMetric(verbose_mode=True, truths_extraction_limit=3),
73 |    | -    ],
74 |    | -    # throttle_value=10,
75 |    | -    # max_concurrent=1,
76 |    | -)
   | 55 | +# evaluate(
   | 56 | +#     test_cases=[
   | 57 | +#         LLMTestCase(
   | 58 | +#             input="Message input number 1!",
   | 59 | +#             actual_output="Message actual output number 1...",
   | 60 | +#             retrieval_context=["I love dogs"],
   | 61 | +#         ),
   | 62 | +#         LLMTestCase(
   | 63 | +#             input="Message input 2, this is just a test",
   | 64 | +#             actual_output="Message actual output 2, this is just a test",
   | 65 | +#             retrieval_context=["I love dogs"],
   | 66 | +#         ),
   | 67 | +#     ],
   | 68 | +#     metrics=[
   | 69 | +#         # correctness_metric,
   | 70 | +#         # AnswerRelevancyMetric(),
   | 71 | +#         # BiasMetric(),
   | 72 | +#         SummarizationMetric(verbose_mode=True, truths_extraction_limit=3),
   | 73 | +#         FaithfulnessMetric(verbose_mode=True, truths_extraction_limit=3),
   | 74 | +#     ],
   | 75 | +#     # throttle_value=10,
   | 76 | +#     # max_concurrent=1,
   | 77 | +# )
77 | 78 |
78 |    | -# confident_evaluate(experiment_name="Convo", test_cases=[test_case])
   | 79 | +confident_evaluate(experiment_name="Convo", test_cases=[test_case])
79 | 80 |
80 | 81 |
81 | 82 | # evaluate(