NVIDIA · rapids-bot · May 1, 2025 · Mar 18, 2025 · Mar 18, 2025 · Mar 18, 2025
@@ -208,6 +208,55 @@ eval:
 ```
 The swe-bench evaluator uses unstructured dataset entries. The entire row is provided as input to the workflow.
 
+### Tunable RAG Evaluator
+The tunable RAG evaluator is a customizable LLM evaluator that allows for flexible evaluation of RAG workflows. 
+It includes a default scoring mechanism based on an expected answer description rather than a ground truth answer. 
+
+The judge LLM prompt is tunable and can be provided in the `config.yml` file.
+
+A default scoring method is provided as follows:
+- Coverage: Evaluates if the answer covers all mandatory elements of the expected answer.
+- Correctness: Evaluates if the answer is correct compared to the expected answer.
+- Relevance: Evaluates if the answer is relevant to the question.
+
+The weight of each of these scores can optionally be tuned by setting the `default_score_weights` parameter in the `config.yml` file. If not set, each score will be equally weighted.
+
+The default scoring can be overridden by setting the config boolean `default_scoring` to false and providing your own scoring mechanism which you describe in your custom judge LLM prompt.
+Note: if you do choose to use the default scoring method, you are still able to tune the judge LLM prompt.
+
+**Example:**
+```yaml
+eval:
+  evaluators:
+    custom_rag_evaluation:
+      _type: tunable_rag_evaluator
+      llm_name: nim_rag_eval_llm
+      default_scoring: false
+      default_score_weights:
+        coverage: 0.5
+        correctness: 0.3
+        relevance: 0.2
+      judge_llm_prompt: >
+        You are an intelligent evaluator that scores the generated answer based on the description of the expected answer.
+        The score is a measure of how well the generated answer matches the description of the expected answer based on the question.
+        Take into account the question, the relevance of the answer to the question and the quality compared to the description of the expected answer.
+
+        Rules:
+        - The score must be a float of any value between 0.0 and 1.0 on a sliding scale.
+        - The reasoning string must be concise and to the point. It should be 1 sentence and 2 only if extra description is needed. It must explain why the score was given and what is different between the generated answer and the expected answer.
+```
+
+Note: In your evaluation dataset, make sure that the `answer` field is a description of the expected answer with details on what is expected from the generated answer.
+
+**Example:**
+```json
+{
+  "id": 1,
+  "question": "What is the product of 3 and 7, and is it greater than the current hour?",
+  "answer": "The answer must state the product of 3 and 7 and whether that product is greater than the current hour"
+}
+```
+
 ## Adding Custom Evaluators
 You can add custom evaluators to evaluate the workflow output. To add a custom evaluator, you need to implement the evaluator and register it with the AgentIQ evaluator system. See the [Custom Evaluator](../guides/custom-evaluator.md) documentation for more information.
 

diff --git a/examples/simple_calculator/src/aiq_simple_calculator/configs/config-custom-eval.yml b/examples/simple_calculator/src/aiq_simple_calculator/configs/config-custom-eval.yml
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+general:
+  use_uvloop: true
+
+functions:
+  calculator_multiply:
+    _type: calculator_multiply
+  calculator_inequality:
+    _type: calculator_inequality
+  calculator_divide:
+    _type: aiq_simple_calculator/calculator_divide
+  current_datetime:
+    _type: current_datetime
+
+llms:
+  nim_llm:
+    _type: nim
+    model_name: nvdev/meta/llama-3.1-70b-instruct
+    temperature: 0.0
+    max_tokens: 1024
+  eval_llm:
+    _type: nim
+    model_name: mistralai/mixtral-8x22b-instruct-v0.1
+    temperature: 0.0
+    max_tokens: 1024
+  openai_llm:
+    _type: openai
+    model_name: gpt-3.5-turbo
+    max_tokens: 2000
+
+workflow:
+  _type: react_agent
+  tool_names:
+    - calculator_multiply
+    - calculator_inequality
+    - current_datetime
+    - calculator_divide
+  llm_name: nim_llm
+  verbose: true
+  retry_parsing_errors: true
+  max_retries: 3
+
+
+# Evaluation settings for the simple calculator workflow.
+eval:
+  general:
+    # Directory where evaluation output is written.
+    output_dir: examples/simple_calculator/.tmp/eval/simple_calculator
+    # Evaluation dataset: a JSON file of question/answer entries.
+    dataset:
+      _type: json
+      file_path: examples/simple_calculator/data/simple_calculator_questions_custom.json
+
+  evaluators:
+    custom_rag_evaluation:
+      # Selects the tunable RAG evaluator.
+      _type: tunable_rag_evaluator
+      # Name of the judge LLM; must match an entry under `llms`.
+      llm_name: nim_llm
+      # true: use the default coverage / correctness / relevance scoring.
+      # false: use the scoring mechanism described in `judge_llm_prompt`.
+      default_scoring: true
+      # Weights for the different scoring components when using default scoring.
+      default_score_weights:
+        coverage: 0.5
+        correctness: 0.3
+        relevance: 0.2
+      # Prompt for the judge LLM.
+      judge_llm_prompt: >
+        You are an intelligent evaluator that scores the generated answer based on the description of the expected answer.
+        The score is a measure of how well the generated answer matches the description of the expected answer based on the question.
+        Take into account the question, the relevance of the answer to the question and the quality compared to the description of the expected answer.
+
+        Rules:
+        - The score must be a float of any value between 0.0 and 1.0 on a sliding scale.
+        - The reasoning string must be concise and to the point. It should be 1 sentence and 2 only if extra description is needed. It must explain why the score was given and what is different between the generated answer and the expected answer.
+        - The tags <image> and <chart> are real images and charts.
diff --git a/.../simple_calculator/src/aiq_simple_calculator/data/simple_calculator_questions_custom.json b/.../simple_calculator/src/aiq_simple_calculator/data/simple_calculator_questions_custom.json
diff --git a/src/aiq/eval/custom_rag_evaluator/__init__.py b/src/aiq/eval/custom_rag_evaluator/__init__.py
diff --git a/src/aiq/eval/custom_rag_evaluator/evaluate.py b/src/aiq/eval/custom_rag_evaluator/evaluate.py
@@ -0,0 +1,206 @@
+import asyncio
+import logging
+from tqdm import tqdm
+
+from langchain_core.language_models import BaseChatModel
+
+from langchain.output_parsers import StructuredOutputParser, ResponseSchema
+from langchain.schema import SystemMessage, HumanMessage
+
+from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.evaluator_model import EvalOutput
+from aiq.eval.evaluator.evaluator_model import EvalOutputItem
+from aiq.eval.evaluator.evaluator_model import EvalInputItem
+from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
+
+logger = logging.getLogger(__name__)
+
+
+def evaluation_prompt(judge_llm_prompt: str, question: str, answer_description: str, generated_answer: str, format_instructions: str, default_scoring: bool):
+    """
+    This function generates a prompt for the judge LLM to evaluate the generated answer.
+    """
+
+    DEFAULT_SCORING_INSTRUCTIONS = """
+    The coverage score is a measure of how well the generated answer covers the critical aspects mentioned in the expected answer. A low coverage score indicates that the generated answer misses critical aspects of the expected answer. A middle coverage score indicates that the generated answer covers some of the must-haves of the expected answer but lacks other details. A high coverage score indicates that all of the expected aspects are present in the generated answer.
+    The correctness score is a measure of how well the generated answer matches the expected answer. A low correctness score indicates that the generated answer is incorrect or does not match the expected answer. A middle correctness score indicates that the generated answer is correct but lacks some details. A high correctness score indicates that the generated answer is exactly the same as the expected answer.
+    The relevance score is a measure of how well the generated answer is relevant to the question. A low relevance score indicates that the generated answer is not relevant to the question. A middle relevance score indicates that the generated answer is somewhat relevant to the question. A high relevance score indicates that the generated answer is exactly relevant to the question.
+    The reasoning is a 1-2 sentence explanation for the scoring.
+    """
+
+    DEFAULT_EVAL_PROMPT = (
+        f"You are an intelligent assistant that responds strictly in JSON format."
+        f"Judge based on the following scoring rubric: {DEFAULT_SCORING_INSTRUCTIONS}"
+        f"{judge_llm_prompt}\n"
+        f"{format_instructions}\n"
+        f"Here is the user's query: {question}"
+        f"Here is the description of the expected answer: {answer_description}"
+        f"Here is the generated answer: {generated_answer}"
+    )
+
+    EVAL_PROMPT = (
+        f"You are an intelligent assistant that responds strictly in JSON format. {judge_llm_prompt}\n"
+        f"{format_instructions}\n"
+        f"Here is the user's query: {question}"
+        f"Here is the description of the expected answer: {answer_description}"
+        f"Here is the generated answer: {generated_answer}"
+    )
+
+    return EVAL_PROMPT if not default_scoring else DEFAULT_EVAL_PROMPT
+
+class TunableRagEvaluator:
+    '''Customizable RAG evaluator class with customizable LLM prompt for scoring.'''
+
+    def __init__(self, llm: BaseChatModel, judge_llm_prompt: str, max_concurrency: int, default_scoring: bool, default_score_weights: dict):
+        self.llm = llm
+        self.max_concurrency = max_concurrency
+        self.judge_llm_prompt = judge_llm_prompt
+        self.semaphore = asyncio.Semaphore(self.max_concurrency)
+        self.default_scoring = default_scoring
+        # Set equal weights for each score
+        self.default_score_weights = {
+            "coverage": 1/3,
+            "correctness": 1/3,
+            "relevance": 1/3
+        }
+
+    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+        '''Evaluate function'''
+
+        async def process_item(item):
+            """Compute RAG evaluation for an individual item"""
+            question = item.input_obj
+            answer_description = item.expected_output_obj
+            generated_answer = item.output_obj
+
+            # Call judge LLM to generate score
+            score = 0.0
+
+            default_evaluation_schema = [
+                ResponseSchema(name="coverage_score", description="Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5", type="float"),
+                ResponseSchema(name="correctness_score", description="Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5", type="float"),
+                ResponseSchema(name="relevance_score", description="Score for the relevance of the generated answer to the question. Ex. 0.5", type="float"),
+                ResponseSchema(name="reasoning", description="1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'", type="string"),
+            ]
+
+            custom_evaluation_schema = [
+                ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
+                ResponseSchema(name="reasoning", description="1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'", type="string"),
+            ]
+
+            if self.default_scoring:
+                evaluation_schema = default_evaluation_schema
+            else:
+                evaluation_schema = custom_evaluation_schema
+
+            llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
+            format_instructions = llm_input_response_parser.get_format_instructions()
+
+            eval_prompt = evaluation_prompt(
+                judge_llm_prompt=self.judge_llm_prompt,
+                question = question,
+                answer_description = answer_description,
+                generated_answer = generated_answer,
+                format_instructions=format_instructions,
+                default_scoring=self.default_scoring
+            )
+
+            messages = [SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)]
+
+            response = await self.llm.ainvoke(messages)
+            try:
+                parsed_response = llm_input_response_parser.parse(response.content)
+                if self.default_scoring:
+                    try:
+                        coverage_score = parsed_response["coverage_score"]
+                        correctness_score = parsed_response["correctness_score"]
+                        relevance_score = parsed_response["relevance_score"]
+                        reasoning = parsed_response["reasoning"]
+                    except KeyError as e:
+                        logger.error(f"Missing required keys in default scoring response: {', '.join(str(arg) for arg in e.args)}")
+                        reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                        raise
+
+                    # Calculate score
+                    coverage_weight = self.default_score_weights.get("coverage", 0)
+                    correctness_weight = self.default_score_weights.get("correctness", 0) 
+                    relevance_weight = self.default_score_weights.get("relevance", 0)
+
+                    if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
+                        logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
+                        coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
+                        correctness_weight = correctness_weight / (coverage_weight + correctness_weight + relevance_weight)
+                        relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
+
+                    score = (coverage_weight * coverage_score + correctness_weight * correctness_score + relevance_weight * relevance_score)
+
+                else:
+                    try:
+                        score = parsed_response["score"]
+                        reasoning = parsed_response["reasoning"]
+                    except KeyError as e:
+                        logger.error(f"Missing required keys in custom scoring response: {', '.join(str(arg) for arg in e.args)}")
+                        reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                        raise
+            except (KeyError, ValueError) as e:
+                logger.error(f"Error parsing judge LLM response: {e}")
+                score = 0.0
+                reasoning = "Error in evaluator from parsing judge LLM response."
+
+            if self.default_scoring:
+                reasoning = {
+                    "question": question,
+                    "answer_description": answer_description,
+                    "generated_answer": generated_answer,
+                    "score_breakdown": {
+                        "coverage_score": coverage_score,
+                        "correctness_score": correctness_score,
+                        "relevance_score": relevance_score,
+                    },
+                    "reasoning": reasoning,
+                }
+            else:
+                reasoning = {
+                    "question": question,
+                    "answer_description": answer_description,
+                    "generated_answer": generated_answer,
+                    "reasoning": reasoning
+                }
+
+            return score, reasoning
+
+        async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
+            """
+            Process an item asynchronously and update the progress bar.
+            Use the semaphore to limit the number of concurrent items.
+            """
+            async with self.semaphore:
+              result = await process_item(item)
+              # Update the progress bar
+              pbar.update(1)
+              return result
+
+        try:
+            # Claim a tqdm position to display the progress bar
+            tqdm_position = TqdmPositionRegistry.claim()
+            # Create a progress bar
+            pbar = tqdm(total=len(eval_input.eval_input_items), desc="Evaluating RAG", position=tqdm_position)
+            # Process items concurrently with a limit on concurrency
+            results = await asyncio.gather(*[wrapped_process(item) for item in eval_input.eval_input_items])
+        finally:
+            pbar.close()
+            TqdmPositionRegistry.release(tqdm_position)
+
+        # Extract scores and reasonings
+        sample_scores, sample_reasonings = zip(*results) if results else ([], [])
+
+        # Compute average score
+        avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
+
+        # Construct EvalOutputItems
+        eval_output_items = [
+            EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
+            for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
+        ]
+
+        return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
diff --git a/src/aiq/eval/custom_rag_evaluator/register.py b/src/aiq/eval/custom_rag_evaluator/register.py
@@ -0,0 +1,32 @@
+from pydantic import Field
+
+from aiq.builder.builder import EvalBuilder
+from aiq.builder.evaluator import EvaluatorInfo
+from aiq.builder.framework_enum import LLMFrameworkEnum
+from aiq.cli.register_workflow import register_evaluator
+from aiq.data_models.evaluator import EvaluatorBaseConfig
+
+
+class TunableRagEvaluatorConfig(EvaluatorBaseConfig, name="tunable_rag_evaluator"):
+    '''Configuration for custom RAG evaluator'''
+    llm_name: str = Field(description="Name of the judge LLM")
+    judge_llm_prompt: str = Field(description="LLM prompt for the judge LLM")
+    default_scoring: bool = Field(description="Whether to use default scoring", default=False)
+    default_score_weights: dict = Field(
+        default={
+            "coverage": 0.5,
+            "correctness": 0.3,
+            "relevance": 0.2
+        },
+        description="Weights for the different scoring components when using default scoring"
+    )
+
+@register_evaluator(config_type=TunableRagEvaluatorConfig)
+async def register_tunable_rag_evaluator(config: TunableRagEvaluatorConfig, builder: EvalBuilder):
+    '''Register customizable RAG evaluator'''
+    from .evaluate import TunableRagEvaluator
+
+    llm = await builder.get_llm(config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
+    evaluator = TunableRagEvaluator(llm, config.judge_llm_prompt, builder.get_max_concurrency(), config.default_scoring, config.default_score_weights)
+
+    yield EvaluatorInfo(config=config, evaluate_fn=evaluator.evaluate, description="Customizable RAG Evaluator")
@@ -20,3 +20,4 @@
 from .rag_evaluator.register import register_ragas_evaluator
 from .swe_bench_evaluator.register import register_swe_bench_evaluator
 from .trajectory_evaluator.register import register_trajectory_evaluator
+from .custom_rag_evaluator.register import register_custom_rag_evaluator