-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'microsoft:main' into huggingface_agent
- Loading branch information
Showing
210 changed files
with
13,832 additions
and
8,361 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
Agents for running the AgentEval pipeline. | ||
|
||
AgentEval is a process for evaluating an LLM-based system's performance on a given task. | ||
|
||
When given a task to evaluate and a few example runs, the critic and subcritic agents create criteria for evaluating a system's solution. Once the criteria have been created, the quantifier agent can evaluate subsequent task solutions based on the generated criteria. | ||
|
||
For more information see: [AgentEval Integration Roadmap](https://github.com/microsoft/autogen/issues/2162) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
from typing import Dict, List, Literal, Optional, Union | ||
|
||
import autogen | ||
from autogen.agentchat.contrib.agent_eval.criterion import Criterion | ||
from autogen.agentchat.contrib.agent_eval.critic_agent import CriticAgent | ||
from autogen.agentchat.contrib.agent_eval.quantifier_agent import QuantifierAgent | ||
from autogen.agentchat.contrib.agent_eval.subcritic_agent import SubCriticAgent | ||
from autogen.agentchat.contrib.agent_eval.task import Task | ||
|
||
|
||
def generate_criteria(
    llm_config: Optional[Union[Dict, Literal[False]]] = None,
    task: Optional[Task] = None,
    additional_instructions: str = "",
    max_round: int = 2,
    use_subcritic: bool = False,
):
    """
    Creates a list of criteria for evaluating the utility of a given task.

    A critic agent (and, optionally, a subcritic agent) is placed in a
    round-robin group chat that is started by a user proxy with the task's
    system message. The content of the last message recorded by the user
    proxy is then parsed into Criterion objects.

    Args:
        llm_config (dict or bool): llm inference configuration.
        task (Task): The task to evaluate. Required, even though it has a
            default of None for signature compatibility.
        additional_instructions (str): Additional instructions for the criteria agent.
        max_round (int): The maximum number of rounds to run the conversation.
        use_subcritic (bool): Whether to use the subcritic agent to generate subcriteria.
    Returns:
        list: A list of Criterion objects for evaluating the utility of the given task.
    Raises:
        ValueError: If `task` is None.
    """
    if task is None:
        # Fail fast with a clear message instead of an AttributeError on
        # `task.get_sys_message()` after every agent has already been built.
        raise ValueError("generate_criteria requires a `task`; got None.")

    critic = CriticAgent(
        system_message=CriticAgent.DEFAULT_SYSTEM_MESSAGE + "\n" + additional_instructions,
        llm_config=llm_config,
    )

    critic_user = autogen.UserProxyAgent(
        name="critic_user",
        max_consecutive_auto_reply=0,  # terminate without auto-reply
        human_input_mode="NEVER",
        code_execution_config={"use_docker": False},
    )

    agents = [critic_user, critic]

    if use_subcritic:
        subcritic = SubCriticAgent(
            llm_config=llm_config,
        )
        agents.append(subcritic)

    groupchat = autogen.GroupChat(
        agents=agents, messages=[], max_round=max_round, speaker_selection_method="round_robin"
    )
    critic_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)

    critic_user.initiate_chat(critic_manager, message=task.get_sys_message())
    content = critic_user.last_message()["content"]
    # need to strip out any extra code around the returned json:
    # keep only the span from the first "[" to the last "]" (inclusive).
    content = content[content.find("[") : content.rfind("]") + 1]
    return Criterion.parse_json_str(content)
|
||
|
||
def quantify_criteria(
    llm_config: Optional[Union[Dict, Literal[False]]] = None,
    criteria: Optional[List[Criterion]] = None,
    task: Optional[Task] = None,
    test_case: str = "",
    ground_truth: str = "",
):
    """
    Quantifies the performance of a system using the provided criteria.

    A user proxy sends the quantifier agent a single message made of the
    task's system message, the criteria serialized as JSON, and the test case.
    The content of the last message recorded by the user proxy is returned as
    the estimated performance.

    Args:
        llm_config (dict or bool): llm inference configuration.
        criteria ([Criterion]): A list of criteria for evaluating the utility of a given task.
            Required, even though it has a default of None for signature compatibility.
        task (Task): The task to evaluate. Required, even though it has a
            default of None for signature compatibility.
        test_case (str): The test case to evaluate.
        ground_truth (str): The ground truth for the test case.
    Returns:
        dict: A dictionary with two keys: "actual_success" holds `ground_truth` unchanged and
            "estimated_performance" holds the content of the last message of the conversation.
    Raises:
        ValueError: If `task` or `criteria` is None.
    """
    # Validate up front: both values are dereferenced below, and without these
    # checks a missing argument surfaces as an opaque AttributeError/TypeError
    # only after the agents have been constructed.
    if task is None:
        raise ValueError("quantify_criteria requires a `task`; got None.")
    if criteria is None:
        raise ValueError("quantify_criteria requires `criteria`; got None.")

    quantifier = QuantifierAgent(
        llm_config=llm_config,
    )

    quantifier_user = autogen.UserProxyAgent(
        name="quantifier_user",
        max_consecutive_auto_reply=0,  # terminate without auto-reply
        human_input_mode="NEVER",
        code_execution_config={"use_docker": False},
    )

    # The prompt pieces are concatenated exactly as-is (no separators added),
    # because the wording is part of what the quantifier model receives.
    quantifier_user.initiate_chat(
        quantifier,
        message=task.get_sys_message()
        + "Evaluation dictionary: "
        + Criterion.write_json(criteria)
        + "actual test case to evaluate: "
        + test_case,
    )
    quantified_results = quantifier_user.last_message()
    return {"actual_success": ground_truth, "estimated_performance": quantified_results["content"]}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from __future__ import annotations | ||
|
||
import json | ||
from typing import List | ||
|
||
import pydantic_core | ||
from pydantic import BaseModel | ||
from pydantic.json import pydantic_encoder | ||
|
||
|
||
class Criterion(BaseModel):
    """
    Pydantic model for a single agent-evaluation criterion, which may nest further criteria.
    """

    # Name of the criterion.
    name: str
    # Text describing the criterion.
    description: str
    # Values that are accepted for this criterion.
    accepted_values: List[str]
    # Nested criteria; empty when none are supplied.
    sub_criteria: List[Criterion] = []

    @staticmethod
    def parse_json_str(criteria: str):
        """
        Build Criterion objects from a json string.
        Args:
            criteria (str): Json text whose decoded value is iterated; every element is
                unpacked as keyword arguments to Criterion.
        returns:
            [Criterion]: One Criterion per element of the decoded json, in order.
        """
        decoded = json.loads(criteria)
        parsed = []
        for entry in decoded:
            parsed.append(Criterion(**entry))
        return parsed

    @staticmethod
    def write_json(criteria):
        """
        Serialize Criterion objects to a json string.
        Args:
            criteria ([Criterion]): The Criterion objects to serialize.
        Returns:
            str: Json text (indented by 2) holding the `model_dump()` of every criterion, in order.
        """
        dumped = []
        for criterion in criteria:
            dumped.append(criterion.model_dump())
        return json.dumps(dumped, indent=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from typing import Optional | ||
|
||
from autogen.agentchat.conversable_agent import ConversableAgent | ||
|
||
|
||
class CriticAgent(ConversableAgent):
    """
    A ConversableAgent preconfigured to propose a list of criteria for evaluating a task.
    """

    # Default prompt: asks for criteria as a list of dictionaries with the keys
    # "name", "description" and "accepted_values".
    # NOTE(review): the continuation lines are kept exactly as they appear in the
    # reviewed text (flush left) - confirm leading whitespace against the original file.
    DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable and not redundant.
Convert the evaluation criteria into a list where each item is a criteria which consists of the following dictionary as follows
{"name": name of the criterion, "description": criteria description , "accepted_values": possible accepted inputs for this key}
Make sure "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels and "description" includes the criterion description.
Output just the criteria string you have created, no code.
"""

    # Default agent description passed to the base class.
    DEFAULT_DESCRIPTION = "An AI agent for creating list criteria for evaluating the utility of a given task."

    def __init__(
        self,
        name="critic",
        system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
        description: Optional[str] = DEFAULT_DESCRIPTION,
        **kwargs,
    ):
        """
        Forward the critic's defaults (or caller overrides) to ConversableAgent.

        Args:
            name (str): agent name.
            system_message (str): system message for the ChatCompletion inference.
                Override it to reprogram the agent.
            description (str): The description of the agent.
            **kwargs (dict): Passed through unchanged; see the other kwargs in
                [ConversableAgent](../../conversable_agent#__init__).
        """
        super().__init__(name=name, system_message=system_message, description=description, **kwargs)
Oops, something went wrong.