diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 8a75a1487ff1..755531953f4e 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,32 +1,32 @@ -#------------------------------------------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE file in the project root for license information. -#------------------------------------------------------------------------------------------------------------- - -FROM mcr.microsoft.com/vscode/devcontainers/python:3.10 - -# -# Update the OS and maybe install packages -# -ENV DEBIAN_FRONTEND=noninteractive - -# add git lhs to apt -RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get -y install --no-install-recommends build-essential npm git-lfs \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && arch=$(arch | sed s/aarch64/arm64/ | sed s/x86_64/amd64/) \ - && wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.5.23/quarto-1.5.23-linux-${arch}.deb \ - && dpkg -i quarto-1.5.23-linux-${arch}.deb \ - && rm -rf /var/lib/apt/lists/* quarto-1.5.23-linux-${arch}.deb -ENV DEBIAN_FRONTEND=dialog - -# For docs -RUN npm install --global yarn -RUN pip install --upgrade pip -RUN pip install pydoc-markdown -RUN pip install pyyaml -RUN pip install colored +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE file in the project root for license information. +#------------------------------------------------------------------------------------------------------------- + +FROM mcr.microsoft.com/vscode/devcontainers/python:3.10 + +# +# Update the OS and maybe install packages +# +ENV DEBIAN_FRONTEND=noninteractive + +# add git lhs to apt +RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get -y install --no-install-recommends build-essential npm git-lfs \ + && apt-get autoremove -y \ + && apt-get clean -y \ + && arch=$(arch | sed s/aarch64/arm64/ | sed s/x86_64/amd64/) \ + && wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.5.23/quarto-1.5.23-linux-${arch}.deb \ + && dpkg -i quarto-1.5.23-linux-${arch}.deb \ + && rm -rf /var/lib/apt/lists/* quarto-1.5.23-linux-${arch}.deb +ENV DEBIAN_FRONTEND=dialog + +# For docs +RUN npm install --global yarn +RUN pip install --upgrade pip +RUN pip install pydoc-markdown +RUN pip install pyyaml +RUN pip install colored diff --git a/.devcontainer/studio/Dockerfile b/.devcontainer/studio/Dockerfile index d612cea9dabf..4a08aea98724 100644 --- a/.devcontainer/studio/Dockerfile +++ b/.devcontainer/studio/Dockerfile @@ -1,27 +1,27 @@ -#------------------------------------------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE file in the project root for license information. 
-#------------------------------------------------------------------------------------------------------------- - -FROM mcr.microsoft.com/vscode/devcontainers/python:3.10 - -# -# Update the OS and maybe install packages -# -ENV DEBIAN_FRONTEND=noninteractive - -# add git lhs to apt -RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash - -RUN apt-get update \ - && apt-get upgrade -y \ - && apt-get -y install --no-install-recommends build-essential npm git-lfs \ - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /var/lib/apt/lists/* -ENV DEBIAN_FRONTEND=dialog - -# For docs -RUN npm install --global yarn -RUN pip install --upgrade pip -RUN pip install pydoc-markdown +#------------------------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE file in the project root for license information. +#------------------------------------------------------------------------------------------------------------- + +FROM mcr.microsoft.com/vscode/devcontainers/python:3.10 + +# +# Update the OS and maybe install packages +# +ENV DEBIAN_FRONTEND=noninteractive + +# add git lhs to apt +RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash + +RUN apt-get update \ + && apt-get upgrade -y \ + && apt-get -y install --no-install-recommends build-essential npm git-lfs \ + && apt-get autoremove -y \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* +ENV DEBIAN_FRONTEND=dialog + +# For docs +RUN npm install --global yarn +RUN pip install --upgrade pip +RUN pip install pydoc-markdown diff --git a/test/test_notebook.py b/test/test_notebook.py index 46622c287eb5..9d05533c9139 100755 --- a/test/test_notebook.py +++ b/test/test_notebook.py @@ -1,137 +1,137 @@ -#!/usr/bin/env python3 -m pytest - -import os -import sys - -import pytest -from conftest import skip_openai - -try: - import openai -except ImportError: - skip = True -else: - skip = False or skip_openai - - -here = os.path.abspath(os.path.dirname(__file__)) - - -def run_notebook(input_nb, output_nb="executed_openai_notebook.ipynb", save=False): - import nbformat - from nbconvert.preprocessors import CellExecutionError, ExecutePreprocessor - - try: - nb_loc = os.path.join(here, os.pardir, "notebook") - file_path = os.path.join(nb_loc, input_nb) - with open(file_path) as nb_file: - nb = nbformat.read(nb_file, as_version=4) - preprocessor = ExecutePreprocessor(timeout=4800, kernel_name="python3") - preprocessor.preprocess(nb, {"metadata": {"path": nb_loc}}) - - output_file_name = "executed_openai_notebook_output.txt" - output_file = os.path.join(here, output_file_name) - with open(output_file, "a") as nb_output_file: - for cell in nb.cells: - if cell.cell_type == "code" and "outputs" in cell: - for output in cell.outputs: - if "text" in output: - nb_output_file.write(output["text"].strip() + "\n") - elif "data" in output and "text/plain" in output["data"]: - nb_output_file.write(output["data"]["text/plain"].strip() + "\n") - except CellExecutionError: - raise - finally: - if save: - with open(os.path.join(here, output_nb), "w", encoding="utf-8") as nb_executed_file: - nbformat.write(nb, nb_executed_file) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.10"), - reason="do not run if openai is not installed or py!=3.10", -) -def test_agentchat_auto_feedback_from_code(save=False): - 
run_notebook("agentchat_auto_feedback_from_code_execution.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.11"), - reason="do not run if openai is not installed or py!=3.11", -) -def _test_oai_completion(save=False): - run_notebook("oai_completion.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.12"), - reason="do not run if openai is not installed or py!=3.12", -) -def test_agentchat_function_call(save=False): - run_notebook("agentchat_function_call.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.10"), - reason="do not run if openai is not installed or py!=3.10", -) -def test_agentchat_function_call_currency_calculator(save=False): - run_notebook("agentchat_function_call_currency_calculator.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.11"), - reason="do not run if openai is not installed or py!=3.11", -) -def test_agentchat_function_call_async(save=False): - run_notebook("agentchat_function_call_async.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.12"), - reason="do not run if openai is not installed or py!=3.12", -) -def _test_agentchat_MathChat(save=False): - run_notebook("agentchat_MathChat.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.10"), - reason="do not run if openai is not installed or py!=3.10", -) -def _test_oai_chatgpt_gpt4(save=False): - run_notebook("oai_chatgpt_gpt4.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.12"), - reason="do not run if openai is not installed or py!=3.12", -) -def test_agentchat_groupchat_finite_state_machine(save=False): - run_notebook("agentchat_groupchat_finite_state_machine.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.10"), - reason="do not run if openai is not installed or py!=3.10", -) -def test_agentchat_cost_token_tracking(save=False): - run_notebook("agentchat_cost_token_tracking.ipynb", save=save) - - -@pytest.mark.skipif( - skip or not sys.version.startswith("3.11"), - reason="do not run if openai is not installed or py!=3.11", -) -def test_agentchat_groupchat_stateflow(save=False): - run_notebook("agentchat_groupchat_stateflow.ipynb", save=save) - - -if __name__ == "__main__": - # test_agentchat_auto_feedback_from_code(save=True) - # test_oai_chatgpt_gpt4(save=True) - # test_oai_completion(save=True) - # test_agentchat_MathChat(save=True) - # test_agentchat_function_call(save=True) - # test_graph_modelling_language_using_select_speaker(save=True) - test_agentchat_function_call_async(save=True) +#!/usr/bin/env python3 -m pytest + +import os +import sys + +import pytest +from conftest import skip_openai + +try: + import openai +except ImportError: + skip = True +else: + skip = False or skip_openai + + +here = os.path.abspath(os.path.dirname(__file__)) + + +def run_notebook(input_nb, output_nb="executed_openai_notebook.ipynb", save=False): + import nbformat + from nbconvert.preprocessors import CellExecutionError, ExecutePreprocessor + + try: + nb_loc = os.path.join(here, os.pardir, "notebook") + file_path = os.path.join(nb_loc, input_nb) + with open(file_path) as nb_file: + nb = nbformat.read(nb_file, as_version=4) + preprocessor = ExecutePreprocessor(timeout=4800, kernel_name="python3") + preprocessor.preprocess(nb, {"metadata": {"path": nb_loc}}) + + output_file_name = "executed_openai_notebook_output.txt" + output_file = os.path.join(here, 
output_file_name) + with open(output_file, "a") as nb_output_file: + for cell in nb.cells: + if cell.cell_type == "code" and "outputs" in cell: + for output in cell.outputs: + if "text" in output: + nb_output_file.write(output["text"].strip() + "\n") + elif "data" in output and "text/plain" in output["data"]: + nb_output_file.write(output["data"]["text/plain"].strip() + "\n") + except CellExecutionError: + raise + finally: + if save: + with open(os.path.join(here, output_nb), "w", encoding="utf-8") as nb_executed_file: + nbformat.write(nb, nb_executed_file) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.10"), + reason="do not run if openai is not installed or py!=3.10", +) +def test_agentchat_auto_feedback_from_code(save=False): + run_notebook("agentchat_auto_feedback_from_code_execution.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.11"), + reason="do not run if openai is not installed or py!=3.11", +) +def _test_oai_completion(save=False): + run_notebook("oai_completion.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.12"), + reason="do not run if openai is not installed or py!=3.12", +) +def test_agentchat_function_call(save=False): + run_notebook("agentchat_function_call.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.10"), + reason="do not run if openai is not installed or py!=3.10", +) +def test_agentchat_function_call_currency_calculator(save=False): + run_notebook("agentchat_function_call_currency_calculator.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.11"), + reason="do not run if openai is not installed or py!=3.11", +) +def test_agentchat_function_call_async(save=False): + run_notebook("agentchat_function_call_async.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.12"), + reason="do not run if openai is not installed or py!=3.12", +) +def _test_agentchat_MathChat(save=False): + run_notebook("agentchat_MathChat.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.10"), + reason="do not run if openai is not installed or py!=3.10", +) +def _test_oai_chatgpt_gpt4(save=False): + run_notebook("oai_chatgpt_gpt4.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.12"), + reason="do not run if openai is not installed or py!=3.12", +) +def test_agentchat_groupchat_finite_state_machine(save=False): + run_notebook("agentchat_groupchat_finite_state_machine.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.10"), + reason="do not run if openai is not installed or py!=3.10", +) +def test_agentchat_cost_token_tracking(save=False): + run_notebook("agentchat_cost_token_tracking.ipynb", save=save) + + +@pytest.mark.skipif( + skip or not sys.version.startswith("3.11"), + reason="do not run if openai is not installed or py!=3.11", +) +def test_agentchat_groupchat_stateflow(save=False): + run_notebook("agentchat_groupchat_stateflow.ipynb", save=save) + + +if __name__ == "__main__": + # test_agentchat_auto_feedback_from_code(save=True) + # test_oai_chatgpt_gpt4(save=True) + # test_oai_completion(save=True) + # test_agentchat_MathChat(save=True) + # test_agentchat_function_call(save=True) + # test_graph_modelling_language_using_select_speaker(save=True) + test_agentchat_function_call_async(save=True) diff --git a/website/blog/2023-11-20-AgentEval/index.mdx b/website/blog/2023-11-20-AgentEval/index.mdx 
index 070d431b135e..1abb9e6c9f8d 100644 --- a/website/blog/2023-11-20-AgentEval/index.mdx +++ b/website/blog/2023-11-20-AgentEval/index.mdx @@ -1,151 +1,151 @@ ---- -title: How to Assess Utility of LLM-powered Applications? -authors: - - julianakiseleva - - narabzad -tags: [LLM, GPT, evaluation, task utility] ---- - - -![Fig.1: A verification framework](img/agenteval-CQ.png) - -

-Fig.1 illustrates the general flow of AgentEval

- -**TL;DR:** -* As a developer of an LLM-powered application, how can you assess the utility it brings to end users while helping them with their tasks? -* To shed light on the question above, we introduce `AgentEval` — the first version of the framework to assess the utility of any LLM-powered application crafted to assist users in specific tasks. AgentEval aims to simplify the evaluation process by automatically proposing a set of criteria tailored to the unique purpose of your application. This allows for a comprehensive assessment, quantifying the utility of your application against the suggested criteria. -* We demonstrate how `AgentEval` work using [math problems dataset](https://microsoft.github.io/autogen/blog/2023/06/28/MathChat) as an example in the [following notebook](https://github.com/microsoft/autogen/blob/main/notebook/agenteval_cq_math.ipynb). Any feedback would be useful for future development. Please contact us on our [Discord](http://aka.ms/autogen-dc). - - -## Introduction - - AutoGen aims to simplify the development of LLM-powered multi-agent systems for various applications, ultimately making end users' lives easier by assisting with their tasks. Next, we all yearn to understand how our developed systems perform, their utility for users, and, perhaps most crucially, how we can enhance them. Directly evaluating multi-agent systems poses challenges as current approaches predominantly rely on success metrics – essentially, whether the agent accomplishes tasks. However, comprehending user interaction with a system involves far more than success alone. Take math problems, for instance; it's not merely about the agent solving the problem. Equally significant is its ability to convey solutions based on various criteria, including completeness, conciseness, and the clarity of the provided explanation. Furthermore, success isn't always clearly defined for every task. - - Rapid advances in LLMs and multi-agent systems have brought forth many emerging capabilities that we're keen on translating into tangible utilities for end users. We introduce the first version of `AgentEval` framework - a tool crafted to empower developers in swiftly gauging the utility of LLM-powered applications designed to help end users accomplish the desired task. - - -![Fig.2: An overview of the tasks taxonomy](img/tasks-taxonomy.png) -

-Fig. 2 provides an overview of the tasks taxonomy

- - -Let's first look into an overview of the suggested task taxonomy that a multi-agent system can be designed for. In general, the tasks can be split into two types, where: -* _Success is not clearly defined_ - refer to instances when users utilize a system in an assistive manner, seeking suggestions rather than expecting the system to solve the task. For example, a user might request the system to generate an email. In many cases, this generated content serves as a template that the user will later edit. However, defining success precisely for such tasks is relatively complex. -* _Success is clearly defined_ - refer to instances where we can clearly define whether a system solved the task or not. Consider agents that assist in accomplishing household tasks, where the definition of success is clear and measurable. This category can be further divided into two separate subcategories: - * _The optimal solution exits_ - these are tasks where only one solution is possible. For example, if you ask your assistant to turn on the light, the success of this task is clearly defined, and there is only one way to accomplish it. - * _Multiple solutions exist_ - increasingly, we observe situations where multiple trajectories of agent behavior can lead to either success or failure. In such cases, it is crucial to differentiate between the various successful and unsuccessful trajectories. For example, when you ask the agent to suggest you a food recipe or tell you a joke. - -In our `AgentEval` framework, we are currently focusing on tasks where _Success is clearly defined_. Next, we will introduce the suggested framework. - -## `AgentEval` Framework - -Our previous research on [assistive agents in Minecraft](https://github.com/microsoft/iglu-datasets) suggested that the most optimal way to obtain human judgments is to present humans with two agents side by side and ask for preferences. In this setup of pairwise comparison, humans can develop criteria to explain why they prefer the behavior of one agent over another. For instance, _'the first agent was faster in execution,'_ or _'the second agent moves more naturally.'_ So, the comparative nature led humans to come up with a list of criteria that helps to infer the utility of the task. With this idea in mind, we designed `AgentEval` (shown in Fig. 1), where we employ LLMs to help us understand, verify, and assess task *utility* for the multi-agent system. Namely: - -* The goal of `CriticAgent` is to suggest the list of criteria (Fig. 1), that can be used to assess task utility. This is an example of how `CriticAgent` is defined using `Autogen`: - -```python -critic = autogen.AssistantAgent( - name="critic", - llm_config={"config_list": config_list}, - system_message="""You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable, and not redundant. - Convert the evaluation criteria into a dictionary where the keys are the criteria. - The value of each key is a dictionary as follows {"description": criteria description, "accepted_values": possible accepted inputs for this key} - Make sure the keys are criteria for assessing the given task. "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description. - Return only the dictionary.""" -) -``` - -Next, the critic is given successful and failed examples of the task execution; then, it is able to return a list of criteria (Fig. 1). 
For reference, use the [following notebook](https://github.com/microsoft/autogen/blob/main/notebook/agenteval_cq_math.ipynb). - -* The goal of `QuantifierAgent` is to quantify each of the suggested criteria (Fig. 1), providing us with an idea of the utility of this system for the given task. Here is an example of how it can be defined: - -```python -quantifier = autogen.AssistantAgent( - name="quantifier", - llm_config={"config_list": config_list}, - system_message = """You are a helpful assistant. You quantify the output of different tasks based on the given criteria. - The criterion is given in a dictionary format where each key is a distinct criteria. - The value of each key is a dictionary as follows {"description": criteria description , "accepted_values": possible accepted inputs for this key} - You are going to quantify each of the criteria for a given task based on the task description. - Return a dictionary where the keys are the criteria and the values are the assessed performance based on accepted values for each criteria. - Return only the dictionary.""" - -) -``` - -## `AgentEval` Results based on Math Problems Dataset - - As an example, after running CriticAgent, we obtained the following criteria to verify the results for math problem dataset: - -| Criteria | Description | Accepted Values| -|-----------|-----|----------------| -| Problem Interpretation | Ability to correctly interpret the problem | ["completely off", "slightly relevant", "relevant", "mostly accurate", "completely accurate"]| -| Mathematical Methodology | Adequacy of the chosen mathematical or algorithmic methodology for the question | ["inappropriate", "barely adequate", "adequate", "mostly effective", "completely effective"] | -| Calculation Correctness | Accuracy of calculations made and solutions given | ["completely incorrect", "mostly incorrect", "neither", "mostly correct", "completely correct"] | -| Explanation Clarity | Clarity and comprehensibility of explanations, including language use and structure | ["not at all clear", "slightly clear", "moderately clear", "very clear", "completely clear"] | -| Code Efficiency | Quality of code in terms of efficiency and elegance |["not at all efficient", "slightly efficient", "moderately efficient", "very efficient", "extremely efficient"] | -| Code Correctness | Correctness of the provided code | ["completely incorrect", "mostly incorrect", "partly correct", "mostly correct", "completely correct"] - - -Then, after running QuantifierAgent, we obtained the results presented in Fig. 3, where you can see three models: -* AgentChat -* ReAct -* GPT-4 Vanilla Solver - -Lighter colors represent estimates for failed cases, and brighter colors show how discovered criteria were quantified. - -![Fig.3: Results based on overall math problems dataset `_s` stands for successful cases, `_f` - stands for failed cases](img/math-problems-plot.png) -

-Fig.3 presents results based on the overall math problems dataset; `_s` stands for successful cases, `_f` stands for failed cases

- -We note that while applying agentEval to math problems, the agent was not exposed to any ground truth information about the problem. As such, this figure illustrates an estimated performance of the three different agents, namely, Autogen (blue), Gpt-4 (red), and ReAct (green). We observe that by comparing the performance of any of the three agents in successful cases (dark bars of any color) versus unsuccessful cases (lighter version of the same bar), we note that AgentEval was able to assign higher quantification to successful cases than that of failed ones. This observation verifies AgentEval's ability for task utility prediction. Additionally, AgentEval allows us to go beyond just a binary definition of success, enabling a more in-depth comparison between successful and failed cases. - -It's important not only to identify what is not working but also to recognize what and why actually went well. - -## Limitations and Future Work -The current implementation of `AgentEval` has a number of limitations which are planning to overcome in the future: -* The list of criteria varies per run (unless you store a seed). We would recommend to run `CriticAgent` at least two times, and pick criteria you think is important for your domain. -* The results of the `QuantifierAgent` can vary with each run, so we recommend conducting multiple runs to observe the extent of result variations. - -To mitigate the limitations mentioned above, we are working on VerifierAgent, whose goal is to stabilize the results and provide additional explanations. - -## Summary -`CriticAgent` and `QuantifierAgent` can be applied to the logs of any type of application, providing you with an in-depth understanding of the utility your solution brings to the user for a given task. - -We would love to hear about how AgentEval works for your application. Any feedback would be useful for future development. Please contact us on our [Discord](http://aka.ms/autogen-dc). - - -## Previous Research - -``` -@InProceedings{pmlr-v176-kiseleva22a, - title = "Interactive Grounded Language Understanding in a Collaborative Environment: IGLU 2021", - author = "Kiseleva, Julia and Li, Ziming and Aliannejadi, Mohammad and Mohanty, Shrestha and ter Hoeve, Maartje and Burtsev, Mikhail and Skrynnik, Alexey and Zholus, Artem and Panov, Aleksandr and Srinet, Kavya and Szlam, Arthur and Sun, Yuxuan and Hofmann, Katja and C{\^o}t{\'e}, Marc-Alexandre and Awadallah, Ahmed and Abdrazakov, Linar and Churin, Igor and Manggala, Putra and Naszadi, Kata and van der Meer, Michiel and Kim, Taewoon", - booktitle = "Proceedings of the NeurIPS 2021 Competitions and Demonstrations Track", - pages = "146--161", - year = 2022, - editor = "Kiela, Douwe and Ciccone, Marco and Caputo, Barbara", - volume = 176, - series = "Proceedings of Machine Learning Research", - month = "06--14 Dec", - publisher = "PMLR", - pdf = {https://proceedings.mlr.press/v176/kiseleva22a/kiseleva22a.pdf}, - url = {https://proceedings.mlr.press/v176/kiseleva22a.html}. 
-} -``` - - -``` -@InProceedings{pmlr-v220-kiseleva22a, - title = "Interactive Grounded Language Understanding in a Collaborative Environment: Retrospective on Iglu 2022 Competition", - author = "Kiseleva, Julia and Skrynnik, Alexey and Zholus, Artem and Mohanty, Shrestha and Arabzadeh, Negar and C\^{o}t\'e, Marc-Alexandre and Aliannejadi, Mohammad and Teruel, Milagro and Li, Ziming and Burtsev, Mikhail and ter Hoeve, Maartje and Volovikova, Zoya and Panov, Aleksandr and Sun, Yuxuan and Srinet, Kavya and Szlam, Arthur and Awadallah, Ahmed and Rho, Seungeun and Kwon, Taehwan and Wontae Nam, Daniel and Bivort Haiek, Felipe and Zhang, Edwin and Abdrazakov, Linar and Qingyam, Guo and Zhang, Jason and Guo, Zhibin", - booktitle = "Proceedings of the NeurIPS 2022 Competitions Track", - pages = "204--216", - year = 2022, - editor = "Ciccone, Marco and Stolovitzky, Gustavo and Albrecht, Jacob", - volume = 220, - series = "Proceedings of Machine Learning Research", - month = "28 Nov--09 Dec", - publisher = "PMLR", - pdf = "https://proceedings.mlr.press/v220/kiseleva22a/kiseleva22a.pdf", - url = "https://proceedings.mlr.press/v220/kiseleva22a.html". -} -``` +--- +title: How to Assess Utility of LLM-powered Applications? +authors: + - julianakiseleva + - narabzad +tags: [LLM, GPT, evaluation, task utility] +--- + + +![Fig.1: A verification framework](img/agenteval-CQ.png) + +

+Fig.1 illustrates the general flow of AgentEval

+ +**TL;DR:** +* As a developer of an LLM-powered application, how can you assess the utility it brings to end users while helping them with their tasks? +* To shed light on the question above, we introduce `AgentEval` — the first version of the framework to assess the utility of any LLM-powered application crafted to assist users in specific tasks. AgentEval aims to simplify the evaluation process by automatically proposing a set of criteria tailored to the unique purpose of your application. This allows for a comprehensive assessment, quantifying the utility of your application against the suggested criteria. +* We demonstrate how `AgentEval` work using [math problems dataset](https://microsoft.github.io/autogen/blog/2023/06/28/MathChat) as an example in the [following notebook](https://github.com/microsoft/autogen/blob/main/notebook/agenteval_cq_math.ipynb). Any feedback would be useful for future development. Please contact us on our [Discord](http://aka.ms/autogen-dc). + + +## Introduction + + AutoGen aims to simplify the development of LLM-powered multi-agent systems for various applications, ultimately making end users' lives easier by assisting with their tasks. Next, we all yearn to understand how our developed systems perform, their utility for users, and, perhaps most crucially, how we can enhance them. Directly evaluating multi-agent systems poses challenges as current approaches predominantly rely on success metrics – essentially, whether the agent accomplishes tasks. However, comprehending user interaction with a system involves far more than success alone. Take math problems, for instance; it's not merely about the agent solving the problem. Equally significant is its ability to convey solutions based on various criteria, including completeness, conciseness, and the clarity of the provided explanation. Furthermore, success isn't always clearly defined for every task. + + Rapid advances in LLMs and multi-agent systems have brought forth many emerging capabilities that we're keen on translating into tangible utilities for end users. We introduce the first version of `AgentEval` framework - a tool crafted to empower developers in swiftly gauging the utility of LLM-powered applications designed to help end users accomplish the desired task. + + +![Fig.2: An overview of the tasks taxonomy](img/tasks-taxonomy.png) +

+Fig. 2 provides an overview of the tasks taxonomy

+ + +Let's first look into an overview of the suggested task taxonomy that a multi-agent system can be designed for. In general, the tasks can be split into two types, where: +* _Success is not clearly defined_ - refer to instances when users utilize a system in an assistive manner, seeking suggestions rather than expecting the system to solve the task. For example, a user might request the system to generate an email. In many cases, this generated content serves as a template that the user will later edit. However, defining success precisely for such tasks is relatively complex. +* _Success is clearly defined_ - refer to instances where we can clearly define whether a system solved the task or not. Consider agents that assist in accomplishing household tasks, where the definition of success is clear and measurable. This category can be further divided into two separate subcategories: + * _The optimal solution exits_ - these are tasks where only one solution is possible. For example, if you ask your assistant to turn on the light, the success of this task is clearly defined, and there is only one way to accomplish it. + * _Multiple solutions exist_ - increasingly, we observe situations where multiple trajectories of agent behavior can lead to either success or failure. In such cases, it is crucial to differentiate between the various successful and unsuccessful trajectories. For example, when you ask the agent to suggest you a food recipe or tell you a joke. + +In our `AgentEval` framework, we are currently focusing on tasks where _Success is clearly defined_. Next, we will introduce the suggested framework. + +## `AgentEval` Framework + +Our previous research on [assistive agents in Minecraft](https://github.com/microsoft/iglu-datasets) suggested that the most optimal way to obtain human judgments is to present humans with two agents side by side and ask for preferences. In this setup of pairwise comparison, humans can develop criteria to explain why they prefer the behavior of one agent over another. For instance, _'the first agent was faster in execution,'_ or _'the second agent moves more naturally.'_ So, the comparative nature led humans to come up with a list of criteria that helps to infer the utility of the task. With this idea in mind, we designed `AgentEval` (shown in Fig. 1), where we employ LLMs to help us understand, verify, and assess task *utility* for the multi-agent system. Namely: + +* The goal of `CriticAgent` is to suggest the list of criteria (Fig. 1), that can be used to assess task utility. This is an example of how `CriticAgent` is defined using `Autogen`: + +```python +critic = autogen.AssistantAgent( + name="critic", + llm_config={"config_list": config_list}, + system_message="""You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable, and not redundant. + Convert the evaluation criteria into a dictionary where the keys are the criteria. + The value of each key is a dictionary as follows {"description": criteria description, "accepted_values": possible accepted inputs for this key} + Make sure the keys are criteria for assessing the given task. "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description. + Return only the dictionary.""" +) +``` + +Next, the critic is given successful and failed examples of the task execution; then, it is able to return a list of criteria (Fig. 1). 
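Below is a minimal, hypothetical sketch of how this step might be driven: a `UserProxyAgent` hands the critic one successful and one failed trajectory and parses the returned dictionary. The agent names, the placeholder logs, and the assumption that the reply parses as JSON are illustrative, not the notebook's exact code.

```python
import json
import autogen

# Placeholder trajectories - in practice these are full AutoGen chat logs
# from one solved and one unsolved instance of the task.
successful_log = "..."
failed_log = "..."

critic_user = autogen.UserProxyAgent(
    name="critic_user",
    human_input_mode="NEVER",      # fully automated, no human in the loop
    code_execution_config=False,   # the critic only writes text, no code to run
)

critic_user.initiate_chat(
    critic,  # the CriticAgent defined above
    message=(
        "Task: solve math word problems.\n\n"
        f"Successful execution:\n{successful_log}\n\n"
        f"Failed execution:\n{failed_log}"
    ),
)

# The critic is instructed to return only a dictionary of criteria.
criteria = json.loads(critic_user.last_message()["content"])
```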
For reference, use the [following notebook](https://github.com/microsoft/autogen/blob/main/notebook/agenteval_cq_math.ipynb). + +* The goal of `QuantifierAgent` is to quantify each of the suggested criteria (Fig. 1), providing us with an idea of the utility of this system for the given task. Here is an example of how it can be defined: + +```python +quantifier = autogen.AssistantAgent( + name="quantifier", + llm_config={"config_list": config_list}, + system_message = """You are a helpful assistant. You quantify the output of different tasks based on the given criteria. + The criterion is given in a dictionary format where each key is a distinct criteria. + The value of each key is a dictionary as follows {"description": criteria description , "accepted_values": possible accepted inputs for this key} + You are going to quantify each of the criteria for a given task based on the task description. + Return a dictionary where the keys are the criteria and the values are the assessed performance based on accepted values for each criteria. + Return only the dictionary.""" + +) +``` + +## `AgentEval` Results based on Math Problems Dataset + + As an example, after running CriticAgent, we obtained the following criteria to verify the results for math problem dataset: + +| Criteria | Description | Accepted Values| +|-----------|-----|----------------| +| Problem Interpretation | Ability to correctly interpret the problem | ["completely off", "slightly relevant", "relevant", "mostly accurate", "completely accurate"]| +| Mathematical Methodology | Adequacy of the chosen mathematical or algorithmic methodology for the question | ["inappropriate", "barely adequate", "adequate", "mostly effective", "completely effective"] | +| Calculation Correctness | Accuracy of calculations made and solutions given | ["completely incorrect", "mostly incorrect", "neither", "mostly correct", "completely correct"] | +| Explanation Clarity | Clarity and comprehensibility of explanations, including language use and structure | ["not at all clear", "slightly clear", "moderately clear", "very clear", "completely clear"] | +| Code Efficiency | Quality of code in terms of efficiency and elegance |["not at all efficient", "slightly efficient", "moderately efficient", "very efficient", "extremely efficient"] | +| Code Correctness | Correctness of the provided code | ["completely incorrect", "mostly incorrect", "partly correct", "mostly correct", "completely correct"] + + +Then, after running QuantifierAgent, we obtained the results presented in Fig. 3, where you can see three models: +* AgentChat +* ReAct +* GPT-4 Vanilla Solver + +Lighter colors represent estimates for failed cases, and brighter colors show how discovered criteria were quantified. + +![Fig.3: Results based on overall math problems dataset `_s` stands for successful cases, `_f` - stands for failed cases](img/math-problems-plot.png) +

+Fig.3 presents results based on the overall math problems dataset; `_s` stands for successful cases, `_f` stands for failed cases
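To connect the figure to the mechanics above, here is a hedged sketch of how per-sample, per-criterion scores behind a plot like Fig. 3 could be collected. The `quantifier` and `criteria` objects are the ones introduced earlier; the loop, the sample fields, and the assumption of JSON-parseable replies are illustrative.

```python
import json
import autogen

quantifier_user = autogen.UserProxyAgent(
    name="quantifier_user",
    human_input_mode="NEVER",
    code_execution_config=False,
)

results = []
# Hypothetical samples: each holds the problem text, the agent's chat log,
# and whether the run actually succeeded.
for sample in task_samples:
    quantifier_user.initiate_chat(
        quantifier,  # the QuantifierAgent defined above
        message=(
            f"Evaluation criteria:\n{json.dumps(criteria)}\n\n"
            f"Task description:\n{sample['problem']}\n\n"
            f"Task execution log:\n{sample['log']}"
        ),
    )
    scores = json.loads(quantifier_user.last_message()["content"])
    results.append({"is_successful": sample["is_successful"], **scores})

# Grouping `results` per criterion, split by success/failure, gives bars like those in Fig. 3.
```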

+ +We note that while applying agentEval to math problems, the agent was not exposed to any ground truth information about the problem. As such, this figure illustrates an estimated performance of the three different agents, namely, Autogen (blue), Gpt-4 (red), and ReAct (green). We observe that by comparing the performance of any of the three agents in successful cases (dark bars of any color) versus unsuccessful cases (lighter version of the same bar), we note that AgentEval was able to assign higher quantification to successful cases than that of failed ones. This observation verifies AgentEval's ability for task utility prediction. Additionally, AgentEval allows us to go beyond just a binary definition of success, enabling a more in-depth comparison between successful and failed cases. + +It's important not only to identify what is not working but also to recognize what and why actually went well. + +## Limitations and Future Work +The current implementation of `AgentEval` has a number of limitations which are planning to overcome in the future: +* The list of criteria varies per run (unless you store a seed). We would recommend to run `CriticAgent` at least two times, and pick criteria you think is important for your domain. +* The results of the `QuantifierAgent` can vary with each run, so we recommend conducting multiple runs to observe the extent of result variations. + +To mitigate the limitations mentioned above, we are working on VerifierAgent, whose goal is to stabilize the results and provide additional explanations. + +## Summary +`CriticAgent` and `QuantifierAgent` can be applied to the logs of any type of application, providing you with an in-depth understanding of the utility your solution brings to the user for a given task. + +We would love to hear about how AgentEval works for your application. Any feedback would be useful for future development. Please contact us on our [Discord](http://aka.ms/autogen-dc). + + +## Previous Research + +``` +@InProceedings{pmlr-v176-kiseleva22a, + title = "Interactive Grounded Language Understanding in a Collaborative Environment: IGLU 2021", + author = "Kiseleva, Julia and Li, Ziming and Aliannejadi, Mohammad and Mohanty, Shrestha and ter Hoeve, Maartje and Burtsev, Mikhail and Skrynnik, Alexey and Zholus, Artem and Panov, Aleksandr and Srinet, Kavya and Szlam, Arthur and Sun, Yuxuan and Hofmann, Katja and C{\^o}t{\'e}, Marc-Alexandre and Awadallah, Ahmed and Abdrazakov, Linar and Churin, Igor and Manggala, Putra and Naszadi, Kata and van der Meer, Michiel and Kim, Taewoon", + booktitle = "Proceedings of the NeurIPS 2021 Competitions and Demonstrations Track", + pages = "146--161", + year = 2022, + editor = "Kiela, Douwe and Ciccone, Marco and Caputo, Barbara", + volume = 176, + series = "Proceedings of Machine Learning Research", + month = "06--14 Dec", + publisher = "PMLR", + pdf = {https://proceedings.mlr.press/v176/kiseleva22a/kiseleva22a.pdf}, + url = {https://proceedings.mlr.press/v176/kiseleva22a.html}. 
+} +``` + + +``` +@InProceedings{pmlr-v220-kiseleva22a, + title = "Interactive Grounded Language Understanding in a Collaborative Environment: Retrospective on Iglu 2022 Competition", + author = "Kiseleva, Julia and Skrynnik, Alexey and Zholus, Artem and Mohanty, Shrestha and Arabzadeh, Negar and C\^{o}t\'e, Marc-Alexandre and Aliannejadi, Mohammad and Teruel, Milagro and Li, Ziming and Burtsev, Mikhail and ter Hoeve, Maartje and Volovikova, Zoya and Panov, Aleksandr and Sun, Yuxuan and Srinet, Kavya and Szlam, Arthur and Awadallah, Ahmed and Rho, Seungeun and Kwon, Taehwan and Wontae Nam, Daniel and Bivort Haiek, Felipe and Zhang, Edwin and Abdrazakov, Linar and Qingyam, Guo and Zhang, Jason and Guo, Zhibin", + booktitle = "Proceedings of the NeurIPS 2022 Competitions Track", + pages = "204--216", + year = 2022, + editor = "Ciccone, Marco and Stolovitzky, Gustavo and Albrecht, Jacob", + volume = 220, + series = "Proceedings of Machine Learning Research", + month = "28 Nov--09 Dec", + publisher = "PMLR", + pdf = "https://proceedings.mlr.press/v220/kiseleva22a/kiseleva22a.pdf", + url = "https://proceedings.mlr.press/v220/kiseleva22a.html". +} +``` diff --git a/website/blog/2023-12-01-AutoGenStudio/index.mdx b/website/blog/2023-12-01-AutoGenStudio/index.mdx index 4d893b144a78..a2558acb01d6 100644 --- a/website/blog/2023-12-01-AutoGenStudio/index.mdx +++ b/website/blog/2023-12-01-AutoGenStudio/index.mdx @@ -1,237 +1,237 @@ ---- -title: "AutoGen Studio: Interactively Explore Multi-Agent Workflows" -authors: - - victordibia - - gagb - - samershi -tags: [AutoGen, UI, web, UX] ---- - -![AutoGen Studio Playground View: Solving a task with multiple agents that generate a pdf document with images.](img/autogenstudio_home.png) - -

-AutoGen Studio: Solving a task with multiple agents that generate a pdf document with images.

- -## TL;DR - -To help you rapidly prototype multi-agent solutions for your tasks, we are introducing AutoGen Studio, an interface powered by [AutoGen](https://github.com/microsoft/autogen/tree/main/autogen). It allows you to: - -- Declaratively define and modify agents and multi-agent workflows through a point and click, drag and drop interface (e.g., you can select the parameters of two agents that will communicate to solve your task). -- Use our UI to create chat sessions with the specified agents and view results (e.g., view chat history, generated files, and time taken). -- Explicitly add skills to your agents and accomplish more tasks. -- Publish your sessions to a local gallery. - - -See the official AutoGen Studio documentation [here](https://microsoft.github.io/autogen/docs/autogen-studio/getting-started) for more details. - -AutoGen Studio is open source [code here](https://github.com/microsoft/autogen/tree/main/samples/apps/autogen-studio), and can be installed via pip. Give it a try! - -```bash -pip install autogenstudio -``` - -## Introduction - -The accelerating pace of technology has ushered us into an era where digital assistants (or agents) are becoming integral to our lives. [AutoGen](https://github.com/microsoft/autogen/tree/main/autogen) has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: **AutoGen Studio**. - -With AutoGen Studio, users can rapidly create, manage, and interact with agents that can learn, adapt, and collaborate. As we release this interface into the open-source community, our ambition is not only to enhance productivity but to inspire a level of personalized interaction between humans and agents. - -> **Note**: AutoGen Studio is meant to help you rapidly prototype multi-agent workflows and demonstrate an example of end user interfaces built with AutoGen. It is not meant to be a production-ready app. - -## Getting Started with AutoGen Studio - -The following guide will help you get AutoGen Studio up and running on your system. - -### Configuring an LLM Provider - -To get started, you need access to a language model. You can get this set up by following the steps in the AutoGen documentation [here](https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints). Configure your environment with either `OPENAI_API_KEY` or `AZURE_OPENAI_API_KEY`. - -For example, in your terminal, you would set the API key like this: - -```bash -export OPENAI_API_KEY= -``` - -You can also specify the model directly in the agent's configuration as shown below. - -```python -llm_config = LLMConfig( - config_list=[{ - "model": "gpt-4", - "api_key": "", - "base_url": "", - "api_type": "azure", - "api_version": "2024-02-01" - }], - temperature=0, -) -``` - -### Installation - -There are two ways to install AutoGen Studio - from PyPi or from source. We **recommend installing from PyPi** unless you plan to modify the source code. - -1. **Install from PyPi** - - We recommend using a virtual environment (e.g., conda) to avoid conflicts with existing Python packages. With Python 3.10 or newer active in your virtual environment, use pip to install AutoGen Studio: - - ```bash - pip install autogenstudio - ``` - -2. **Install from Source** - - > Note: This approach requires some familiarity with building interfaces in React. 
- - If you prefer to install from source, ensure you have Python 3.10+ and Node.js (version above 14.15.0) installed. Here's how you get started: - - - Clone the AutoGen Studio repository and install its Python dependencies: - - ```bash - pip install -e . - ``` - - - Navigate to the `samples/apps/autogen-studio/frontend` directory, install dependencies, and build the UI: - - ```bash - npm install -g gatsby-cli - npm install --global yarn - yarn install - yarn build - ``` - - For Windows users, to build the frontend, you may need alternative commands provided in the [autogen studio readme](https://github.com/microsoft/autogen/tree/main/samples/apps/autogen-studio). - -### Running the Application - -Once installed, run the web UI by entering the following in your terminal: - -```bash -autogenstudio ui --port 8081 -``` - -This will start the application on the specified port. Open your web browser and go to `http://localhost:8081/` to begin using AutoGen Studio. - -Now that you have AutoGen Studio installed and running, you are ready to explore its capabilities, including defining and modifying agent workflows, interacting with agents and sessions, and expanding agent skills. - -## What Can You Do with AutoGen Studio? - -The AutoGen Studio UI is organized into 3 high level sections - **Build**, **Playground**, and **Gallery**. - -### Build - -![Specify Agents.](img/autogenstudio_config.png) - -This section focuses on defining the properties of agents and agent workflows. It includes the following concepts: - -**Skills**: Skills are functions (e.g., Python functions) that describe how to solve a task. In general, a good skill has a descriptive name (e.g. `generate_images`), extensive docstrings and good defaults (e.g., writing out files to disk for persistence and reuse). You can add new skills to AutoGen Studio via the provided UI. At inference time, these skills are made available to the assistant agent as they address your tasks. - -![View and add skills.](img/autogenstudio_skills.png) - -

-AutoGen Studio Build View: View, add or edit skills that an agent can leverage in addressing tasks.

- -**Agents**: This provides an interface to declaratively specify properties for an AutoGen agent (mirrors most of the members of a base [AutoGen conversable agent](https://github.com/microsoft/autogen/blob/main/autogen/agentchat/conversable_agent.py) class). - -**Agent Workflows**: An agent workflow is a specification of a set of agents that can work together to accomplish a task. The simplest version of this is a setup with two agents – a user proxy agent (that represents a user i.e. it compiles code and prints result) and an assistant that can address task requests (e.g., generating plans, writing code, evaluating responses, proposing error recovery steps, etc.). A more complex flow could be a group chat where even more agents work towards a solution. - -### Playground - -![AutoGen Studio Playground View: Solving a task with multiple agents that generate a pdf document with images.](img/autogenstudio_home.png) - -

-AutoGen Studio Playground View: Agents collaborate, use available skills (ability to generate images) to address a user task (generate pdf's).

- -The playground section is focused on interacting with agent workflows defined in the previous build section. It includes the following concepts: - -**Session**: A session refers to a period of continuous interaction or engagement with an agent workflow, typically characterized by a sequence of activities or operations aimed at achieving specific objectives. It includes the agent workflow configuration, the interactions between the user and the agents. A session can be “published” to a “gallery”. - -**Chat View**: A chat is a sequence of interactions between a user and an agent. It is a part of a session. - -### Gallery - -This section is focused on sharing and reusing artifacts (e.g., workflow configurations, sessions, etc.). - -AutoGen Studio comes with 3 example skills: `fetch_profile`, `find_papers`, `generate_images`. Please feel free to review the repo to learn more about how they work. - -## The AutoGen Studio API - -While AutoGen Studio is a web interface, it is powered by an underlying python API that is reusable and modular. Importantly, we have implemented an API where agent workflows can be declaratively specified (in JSON), loaded and run. An example of the current API is shown below. Please consult the [AutoGen Studio repo](https://github.com/microsoft/autogen/tree/main/samples/apps/autogen-studio) for more details. - -```python -import json -from autogenstudio import AutoGenWorkFlowManager, AgentWorkFlowConfig - -# load an agent specification in JSON -agent_spec = json.load(open('agent_spec.json')) - -# Create an AutoGen Workflow Configuration from the agent specification -agent_work_flow_config = FlowConfig(**agent_spec) - -# Create a Workflow from the configuration -agent_work_flow = AutoGenWorkFlowManager(agent_work_flow_config) - -# Run the workflow on a task -task_query = "What is the height of the Eiffel Tower?" -agent_work_flow.run(message=task_query) -``` - -## Road Map and Next Steps - -As we continue to develop and refine AutoGen Studio, the road map below outlines an array of enhancements and new features planned for future releases. Here's what users can look forward to: - -- **Complex Agent Workflows**: We're working on integrating support for more sophisticated agent workflows, such as `GroupChat`, allowing for richer interaction between multiple agents or dynamic topologies. -- **Improved User Experience**: This includes features like streaming intermediate model output for real-time feedback, better summarization of agent responses, information on costs of each interaction. We will also invest in improving the workflow for composing and reusing agents. We will also explore support for more interactive human in the loop feedback to agents. -- **Expansion of Agent Skills**: We will work towards improving the workflow for authoring, composing and reusing agent skills. -- **Community Features**: Facilitation of sharing and collaboration within AutoGen Studio user community is a key goal. We're exploring options for sharing sessions and results more easily among users and contributing to a shared repository of skills, agents, and agent workflows. - -## Contribution Guide - -We welcome contributions to AutoGen Studio. We recommend the following general steps to contribute to the project: - -- Review the overall AutoGen project [contribution guide](https://github.com/microsoft/autogen?tab=readme-ov-file#contributing). 
-- Please review the AutoGen Studio [roadmap](https://github.com/microsoft/autogen/issues/737) to get a sense of the current priorities for the project. Help is appreciated especially with Studio issues tagged with `help-wanted`. -- Please initiate a discussion on the roadmap issue or a new issue to discuss your proposed contribution. -- Please review the autogenstudio dev branch here [dev branch].(https://github.com/microsoft/autogen/tree/autogenstudio) and use as a base for your contribution. This way, your contribution will be aligned with the latest changes in the AutoGen Studio project. -- Submit a pull request with your contribution! -- If you are modifying AutoGen Studio in vscode, it has its own devcontainer to simplify dev work. See instructions in `.devcontainer/README.md` on how to use it. -- Please use the tag `studio` for any issues, questions, and PRs related to Studio. - -### FAQ - -**Q: Where can I adjust the default skills, agent and workflow configurations?** -A: You can modify agent configurations directly from the UI or by editing the `autogentstudio/utils/dbdefaults.json` file which is used to initialize the database. - -**Q: If I want to reset the entire conversation with an agent, how do I go about it?** -A: To reset your conversation history, you can delete the `database.sqlite` file. If you need to clear user-specific data, remove the relevant `autogenstudio/web/files/user/` folder. - -**Q: Is it possible to view the output and messages generated by the agents during interactions?** -A: Yes, you can view the generated messages in the debug console of the web UI, providing insights into the agent interactions. Alternatively, you can inspect the `database.sqlite` file for a comprehensive record of messages. - -**Q: Where can I find documentation and support for AutoGen Studio?** -A: We are constantly working to improve AutoGen Studio. For the latest updates, please refer to the [AutoGen Studio Readme](https://github.com/microsoft/autogen/tree/main/samples/apps/autogen-studio). For additional support, please open an issue on [GitHub](https://github.com/microsoft/autogen) or ask questions on [Discord](https://aka.ms/autogen-dc). - -**Q: Can I use Other Models with AutoGen Studio?** -Yes. AutoGen standardizes on the openai model api format, and you can use any api server that offers an openai compliant endpoint. In the AutoGen Studio UI, each agent has an `llm_config` field where you can input your model endpoint details including `model name`, `api key`, `base url`, `model type` and `api version`. For Azure OpenAI models, you can find these details in the Azure portal. Note that for Azure OpenAI, the `model name` is the deployment id or engine, and the `model type` is "azure". -For other OSS models, we recommend using a server such as vllm to instantiate an openai compliant endpoint. - -**Q: The Server Starts But I Can't Access the UI** -A: If you are running the server on a remote machine (or a local machine that fails to resolve localhost correstly), you may need to specify the host address. By default, the host address is set to `localhost`. You can specify the host address using the `--host ` argument. For example, to start the server on port 8081 and local address such that it is accessible from other machines on the network, you can run the following command: - -```bash -autogenstudio ui --port 8081 --host 0.0.0.0 -``` - -
+--- +title: "AutoGen Studio: Interactively Explore Multi-Agent Workflows" +authors: + - victordibia + - gagb + - samershi +tags: [AutoGen, UI, web, UX] +--- + +![AutoGen Studio Playground View: Solving a task with multiple agents that generate a pdf document with images.](img/autogenstudio_home.png) + +

+AutoGen Studio: Solving a task with multiple agents that generate a pdf document with images.

+ +## TL;DR + +To help you rapidly prototype multi-agent solutions for your tasks, we are introducing AutoGen Studio, an interface powered by [AutoGen](https://github.com/microsoft/autogen/tree/main/autogen). It allows you to: + +- Declaratively define and modify agents and multi-agent workflows through a point and click, drag and drop interface (e.g., you can select the parameters of two agents that will communicate to solve your task). +- Use our UI to create chat sessions with the specified agents and view results (e.g., view chat history, generated files, and time taken). +- Explicitly add skills to your agents and accomplish more tasks. +- Publish your sessions to a local gallery. + + +See the official AutoGen Studio documentation [here](https://microsoft.github.io/autogen/docs/autogen-studio/getting-started) for more details. + +AutoGen Studio is open source [code here](https://github.com/microsoft/autogen/tree/main/samples/apps/autogen-studio), and can be installed via pip. Give it a try! + +```bash +pip install autogenstudio +``` + +## Introduction + +The accelerating pace of technology has ushered us into an era where digital assistants (or agents) are becoming integral to our lives. [AutoGen](https://github.com/microsoft/autogen/tree/main/autogen) has emerged as a leading framework for orchestrating the power of agents. In the spirit of expanding this frontier and democratizing this capability, we are thrilled to introduce a new user-friendly interface: **AutoGen Studio**. + +With AutoGen Studio, users can rapidly create, manage, and interact with agents that can learn, adapt, and collaborate. As we release this interface into the open-source community, our ambition is not only to enhance productivity but to inspire a level of personalized interaction between humans and agents. + +> **Note**: AutoGen Studio is meant to help you rapidly prototype multi-agent workflows and demonstrate an example of end user interfaces built with AutoGen. It is not meant to be a production-ready app. + +## Getting Started with AutoGen Studio + +The following guide will help you get AutoGen Studio up and running on your system. + +### Configuring an LLM Provider + +To get started, you need access to a language model. You can get this set up by following the steps in the AutoGen documentation [here](https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints). Configure your environment with either `OPENAI_API_KEY` or `AZURE_OPENAI_API_KEY`. + +For example, in your terminal, you would set the API key like this: + +```bash +export OPENAI_API_KEY= +``` + +You can also specify the model directly in the agent's configuration as shown below. + +```python +llm_config = LLMConfig( + config_list=[{ + "model": "gpt-4", + "api_key": "", + "base_url": "", + "api_type": "azure", + "api_version": "2024-02-01" + }], + temperature=0, +) +``` + +### Installation + +There are two ways to install AutoGen Studio - from PyPi or from source. We **recommend installing from PyPi** unless you plan to modify the source code. + +1. **Install from PyPi** + + We recommend using a virtual environment (e.g., conda) to avoid conflicts with existing Python packages. With Python 3.10 or newer active in your virtual environment, use pip to install AutoGen Studio: + + ```bash + pip install autogenstudio + ``` + +2. **Install from Source** + + > Note: This approach requires some familiarity with building interfaces in React. 
+ + If you prefer to install from source, ensure you have Python 3.10+ and Node.js (version above 14.15.0) installed. Here's how you get started: + + - Clone the AutoGen Studio repository and install its Python dependencies: + + ```bash + pip install -e . + ``` + + - Navigate to the `samples/apps/autogen-studio/frontend` directory, install dependencies, and build the UI: + + ```bash + npm install -g gatsby-cli + npm install --global yarn + yarn install + yarn build + ``` + + For Windows users, to build the frontend, you may need alternative commands provided in the [autogen studio readme](https://github.com/microsoft/autogen/tree/main/samples/apps/autogen-studio). + +### Running the Application + +Once installed, run the web UI by entering the following in your terminal: + +```bash +autogenstudio ui --port 8081 +``` + +This will start the application on the specified port. Open your web browser and go to `http://localhost:8081/` to begin using AutoGen Studio. + +Now that you have AutoGen Studio installed and running, you are ready to explore its capabilities, including defining and modifying agent workflows, interacting with agents and sessions, and expanding agent skills. + +## What Can You Do with AutoGen Studio? + +The AutoGen Studio UI is organized into 3 high level sections - **Build**, **Playground**, and **Gallery**. + +### Build + +![Specify Agents.](img/autogenstudio_config.png) + +This section focuses on defining the properties of agents and agent workflows. It includes the following concepts: + +**Skills**: Skills are functions (e.g., Python functions) that describe how to solve a task. In general, a good skill has a descriptive name (e.g. `generate_images`), extensive docstrings and good defaults (e.g., writing out files to disk for persistence and reuse). You can add new skills to AutoGen Studio via the provided UI. At inference time, these skills are made available to the assistant agent as they address your tasks. + +![View and add skills.](img/autogenstudio_skills.png) + +

+ + AutoGen Studio Build View: View, add or edit skills that an agent can + leverage in addressing tasks. + +
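To make this concrete, here is a sketch of what a user-defined skill might look like. The function below is illustrative and is not one of the skills bundled with AutoGen Studio; the point is the descriptive name, the docstring the assistant can read, and the sensible default that persists results to disk for reuse.

```python
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from typing import List


def find_arxiv_papers(query: str, max_results: int = 5, output_file: str = "papers.txt") -> List[str]:
    """Search arXiv for papers matching `query`, save the titles to `output_file`, and return them."""
    url = (
        "http://export.arxiv.org/api/query?search_query=all:"
        + urllib.parse.quote(query)
        + f"&max_results={max_results}"
    )
    with urllib.request.urlopen(url) as response:
        tree = ET.parse(response)
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    titles = [entry.text.strip() for entry in tree.getroot().findall("atom:entry/atom:title", ns)]
    # Writing results to disk by default makes them easy to reuse in later turns.
    with open(output_file, "w") as f:
        f.write("\n".join(titles))
    return titles
```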

+ +**Agents**: This provides an interface to declaratively specify properties for an AutoGen agent (mirrors most of the members of a base [AutoGen conversable agent](https://github.com/microsoft/autogen/blob/main/autogen/agentchat/conversable_agent.py) class). + +**Agent Workflows**: An agent workflow is a specification of a set of agents that can work together to accomplish a task. The simplest version of this is a setup with two agents – a user proxy agent (that represents a user i.e. it compiles code and prints result) and an assistant that can address task requests (e.g., generating plans, writing code, evaluating responses, proposing error recovery steps, etc.). A more complex flow could be a group chat where even more agents work towards a solution. + +### Playground + +![AutoGen Studio Playground View: Solving a task with multiple agents that generate a pdf document with images.](img/autogenstudio_home.png) + +

+ + AutoGen Studio Playground View: Agents collaborate, use available skills + (ability to generate images) to address a user task (generate pdf's). + +

+
+The playground section is focused on interacting with agent workflows defined in the previous build section. It includes the following concepts:
+
+**Session**: A session refers to a period of continuous interaction or engagement with an agent workflow, typically characterized by a sequence of activities or operations aimed at achieving specific objectives. It includes the agent workflow configuration and the interactions between the user and the agents. A session can be “published” to a “gallery”.
+
+**Chat View**: A chat is a sequence of interactions between a user and an agent. It is a part of a session.
+
+### Gallery
+
+This section is focused on sharing and reusing artifacts (e.g., workflow configurations, sessions, etc.).
+
+AutoGen Studio comes with 3 example skills: `fetch_profile`, `find_papers`, `generate_images`. Please feel free to review the repo to learn more about how they work.
+
+## The AutoGen Studio API
+
+While AutoGen Studio is a web interface, it is powered by an underlying Python API that is reusable and modular. Importantly, we have implemented an API where agent workflows can be declaratively specified (in JSON), loaded and run. An example of the current API is shown below. Please consult the [AutoGen Studio repo](https://github.com/microsoft/autogen/tree/main/samples/apps/autogen-studio) for more details.
+
+```python
+import json
+from autogenstudio import AutoGenWorkFlowManager, AgentWorkFlowConfig
+
+# load an agent specification in JSON
+agent_spec = json.load(open('agent_spec.json'))
+
+# Create an AutoGen Workflow Configuration from the agent specification
+agent_work_flow_config = AgentWorkFlowConfig(**agent_spec)
+
+# Create a Workflow from the configuration
+agent_work_flow = AutoGenWorkFlowManager(agent_work_flow_config)
+
+# Run the workflow on a task
+task_query = "What is the height of the Eiffel Tower?"
+agent_work_flow.run(message=task_query)
+```
+
+## Road Map and Next Steps
+
+As we continue to develop and refine AutoGen Studio, the road map below outlines an array of enhancements and new features planned for future releases. Here's what users can look forward to:
+
+- **Complex Agent Workflows**: We're working on integrating support for more sophisticated agent workflows, such as `GroupChat`, allowing for richer interaction between multiple agents or dynamic topologies.
+- **Improved User Experience**: This includes features like streaming intermediate model output for real-time feedback, better summarization of agent responses, and information on the cost of each interaction. We will also invest in improving the workflow for composing and reusing agents, and explore support for more interactive human-in-the-loop feedback to agents.
+- **Expansion of Agent Skills**: We will work towards improving the workflow for authoring, composing, and reusing agent skills.
+- **Community Features**: Facilitating sharing and collaboration within the AutoGen Studio user community is a key goal. We're exploring options for sharing sessions and results more easily among users and contributing to a shared repository of skills, agents, and agent workflows.
+
+## Contribution Guide
+
+We welcome contributions to AutoGen Studio. We recommend the following general steps to contribute to the project:
+
+- Review the overall AutoGen project [contribution guide](https://github.com/microsoft/autogen?tab=readme-ov-file#contributing).
+- Please review the AutoGen Studio [roadmap](https://github.com/microsoft/autogen/issues/737) to get a sense of the current priorities for the project. Help is especially appreciated with Studio issues tagged `help-wanted`.
+- Please start a discussion on the roadmap issue, or open a new issue, to discuss your proposed contribution.
+- Please review the AutoGen Studio [dev branch](https://github.com/microsoft/autogen/tree/autogenstudio) and use it as the base for your contribution. This way, your contribution will be aligned with the latest changes in the AutoGen Studio project.
+- Submit a pull request with your contribution!
+- If you are modifying AutoGen Studio in VS Code, it has its own devcontainer to simplify dev work. See instructions in `.devcontainer/README.md` on how to use it.
+- Please use the tag `studio` for any issues, questions, and PRs related to Studio.
+
+### FAQ
+
+**Q: Where can I adjust the default skills, agent and workflow configurations?**
+A: You can modify agent configurations directly from the UI or by editing the `autogenstudio/utils/dbdefaults.json` file, which is used to initialize the database.
+
+**Q: If I want to reset the entire conversation with an agent, how do I go about it?**
+A: To reset your conversation history, you can delete the `database.sqlite` file. If you need to clear user-specific data, remove the relevant `autogenstudio/web/files/user/` folder.
+
+**Q: Is it possible to view the output and messages generated by the agents during interactions?**
+A: Yes, you can view the generated messages in the debug console of the web UI, providing insights into the agent interactions. Alternatively, you can inspect the `database.sqlite` file for a comprehensive record of messages.
+
+**Q: Where can I find documentation and support for AutoGen Studio?**
+A: We are constantly working to improve AutoGen Studio. For the latest updates, please refer to the [AutoGen Studio Readme](https://github.com/microsoft/autogen/tree/main/samples/apps/autogen-studio). For additional support, please open an issue on [GitHub](https://github.com/microsoft/autogen) or ask questions on [Discord](https://aka.ms/autogen-dc).
+
+**Q: Can I use other models with AutoGen Studio?**
+A: Yes. AutoGen standardizes on the OpenAI model API format, and you can use any API server that offers an OpenAI-compliant endpoint. In the AutoGen Studio UI, each agent has an `llm_config` field where you can input your model endpoint details including `model name`, `api key`, `base url`, `model type` and `api version`. For Azure OpenAI models, you can find these details in the Azure portal. Note that for Azure OpenAI, the `model name` is the deployment ID or engine, and the `model type` is "azure".
+For other open-source models, we recommend using a server such as vLLM to expose an OpenAI-compliant endpoint (a sketch of such a configuration is shown at the end of this FAQ).
+
+**Q: The server starts but I can't access the UI**
+A: If you are running the server on a remote machine (or a local machine that fails to resolve localhost correctly), you may need to specify the host address. By default, the host address is set to `localhost`. You can specify the host address using the `--host` argument. For example, to start the server on port 8081 and bind it to an address that is accessible from other machines on the network, you can run the following command:
+
+```bash
+autogenstudio ui --port 8081 --host 0.0.0.0
+```
+
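As a concrete illustration of the "other models" answer above, below is a hypothetical configuration for a model served behind an OpenAI-compliant endpoint (for example, one hosted with vLLM), written in the underlying AutoGen `config_list` format. The model name, URL, and key are placeholders rather than values taken from AutoGen Studio.

```python
# A sketch of an llm_config entry for an OpenAI-compliant endpoint; all values are placeholders.
llm_config = {
    "config_list": [
        {
            "model": "mistral-7b-instruct",           # model name (or Azure deployment id)
            "api_key": "EMPTY",                       # many local servers accept any non-empty key
            "base_url": "http://localhost:8000/v1",   # the OpenAI-compliant endpoint
        }
    ],
    "temperature": 0,
}
```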
diff --git a/website/blog/2023-12-29-AgentDescriptions/index.mdx b/website/blog/2023-12-29-AgentDescriptions/index.mdx index 0471d545dc44..f1201c6f956b 100644 --- a/website/blog/2023-12-29-AgentDescriptions/index.mdx +++ b/website/blog/2023-12-29-AgentDescriptions/index.mdx @@ -1,139 +1,139 @@ ---- -title: "All About Agent Descriptions" -authors: - - afourney -tags: [AutoGen] ---- - - -## TL;DR - -AutoGen 0.2.2 introduces a [description](https://microsoft.github.io/autogen/docs/reference/agentchat/conversable_agent#__init__) field to ConversableAgent (and all subclasses), and changes GroupChat so that it uses agent `description`s rather than `system_message`s when choosing which agents should speak next. - -This is expected to simplify GroupChat’s job, improve orchestration, and make it easier to implement new GroupChat or GroupChat-like alternatives. - -If you are a developer, and things were already working well for you, no action is needed -- backward compatibility is ensured because the `description` field defaults to the `system_message` when no description is provided. - -However, if you were struggling with getting GroupChat to work, you can now try updating the `description` field. - -## Introduction - -As AutoGen matures and developers build increasingly complex combinations of agents, orchestration is becoming an important capability. At present, [GroupChat](https://microsoft.github.io/autogen/docs/reference/agentchat/groupchat#groupchat-objects) and the [GroupChatManager](https://microsoft.github.io/autogen/docs/reference/agentchat/groupchat#groupchatmanager-objects) are the main built-in tools for orchestrating conversations between 3 or more agents. For orchestrators like GroupChat to work well, they need to know something about each agent so that they can decide who should speak and when. Prior to AutoGen 0.2.2, GroupChat relied on each agent's `system_message` and `name` to learn about each participating agent. This is likely fine when the system prompt is short and sweet, but can lead to problems when the instructions are very long (e.g., with the [AssistantAgent](https://microsoft.github.io/autogen/docs/reference/agentchat/assistant_agent)), or non-existent (e.g., with the [UserProxyAgent](https://microsoft.github.io/autogen/docs/reference/agentchat/user_proxy_agent)). - -AutoGen 0.2.2 introduces a [description](https://microsoft.github.io/autogen/docs/reference/agentchat/conversable_agent#__init__) field to all agents, and replaces the use of the `system_message` for orchestration in GroupChat and all future orchestrators. The `description` field defaults to the `system_message` to ensure backwards compatibility, so you may not need to change anything with your code if things are working well for you. However, if you were struggling with GroupChat, give setting the `description` field a try. - -The remainder of this post provides an example of how using the `description` field simplifies GroupChat's job, provides some evidence of its effectiveness, and provides tips for writing good descriptions. - -## Example - -The current GroupChat orchestration system prompt has the following template: - -``` -You are in a role play game. The following roles are available: - -{self._participant_roles(agents)}. - -Read the following conversation. -Then select the next role from {[agent.name for agent in agents]} to play. Only return the role. -``` - -Suppose that you wanted to include 3 agents: A UserProxyAgent, an AssistantAgent, and perhaps a GuardrailsAgent. 
- -Prior to 0.2.2, this template would expand to: - -``` -You are in a role play game. The following roles are available: - -assistant: You are a helpful AI assistant. -Solve tasks using your coding and language skills. -In the following cases, suggest python code (in a python coding block) or shell script (in a sh coding block) for the user to execute. -1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, get the current date/time, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself. -2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly. -Solve the task step by step if you need to. If a plan is not provided, explain your plan first. Be clear which step uses code, and which step uses your language skill. -When using code, you must indicate the script type in the code block. The user cannot provide any other feedback or perform any other action beyond executing the code you suggest. The user can't modify your code. So do not suggest incomplete code which requires users to modify. Don't use a code block if it's not intended to be executed by the user. -If you want the user to save the code in a file before executing it, put # filename: inside the code block as the first line. Don't include multiple code blocks in one response. Do not ask users to copy and paste the result. Instead, use 'print' function for the output when relevant. Check the execution result returned by the user. -If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try. -When you find an answer, verify the answer carefully. Include verifiable evidence in your response if possible. -Reply "TERMINATE" in the end when everything is done. -user_proxy: -guardrails_agent: You are a guardrails agent and are tasked with ensuring that all parties adhere to the following responsible AI policies: -- You MUST TERMINATE the conversation if it involves writing or running HARMFUL or DESTRUCTIVE code. -- You MUST TERMINATE the conversation if it involves discussions of anything relating to hacking, computer exploits, or computer security. -- You MUST TERMINATE the conversation if it involves violent or graphic content such as Harm to Others, Self-Harm, Suicide. -- You MUST TERMINATE the conversation if it involves demeaning speech, hate speech, discriminatory remarks, or any form of harassment based on race, gender, sexuality, religion, nationality, disability, or any other protected characteristic. -- You MUST TERMINATE the conversation if it involves seeking or giving advice in highly regulated domains such as medical advice, mental health, legal advice or financial advice -- You MUST TERMINATE the conversation if it involves illegal activities including when encouraging or providing guidance on illegal activities. -- You MUST TERMINATE the conversation if it involves manipulative or deceptive Content including scams, phishing and spread false information. 
-- You MUST TERMINATE the conversation if it involves involve sexually explicit content or discussions. -- You MUST TERMINATE the conversation if it involves sharing or soliciting personal, sensitive, or confidential information from users. This includes financial details, health records, and other private matters. -- You MUST TERMINATE the conversation if it involves deep personal problems such as dealing with serious personal issues, mental health concerns, or crisis situations. -If you decide that the conversation must be terminated, explain your reasoning then output the uppercase word "TERMINATE". If, on the other hand, you decide the conversation is acceptable by the above standards, indicate as much, then ask the other parties to proceed. - -Read the following conversation. -Then select the next role from [assistant, user_proxy, guardrails_agent] to play. Only return the role. - -``` - -As you can see, this description is super confusing: - -- It is hard to make out where each agent's role-description ends -- `You` appears numerous times, and refers to three separate agents (GroupChatManager, AssistantAgent, and GuardrailsAgent) -- It takes a lot of tokens! - -Consequently, it's not hard to see why the GroupChat manager sometimes struggles with this orchestration task. - -With AutoGen 0.2.2 onward, GroupChat instead relies on the description field. With a description field the orchestration prompt becomes: - -``` -You are in a role play game. The following roles are available: - -assistant: A helpful and general-purpose AI assistant that has strong language skills, Python skills, and Linux command line skills. -user_proxy: A user that can run Python code or input command line commands at a Linux terminal and report back the execution results. -guradrails_agent: An agent that ensures the conversation conforms to responsible AI guidelines. - -Read the following conversation. -Then select the next role from [assistant, user_proxy, guardrails_agent] to play. Only return the role. -``` - -This is much easier to parse and understand, and it doesn't use nearly as many tokens. Moreover, the following experiment provides early evidence that it works. - -## An Experiment with Distraction - -To illustrate the impact of the `description` field, we set up a three-agent experiment with a reduced 26-problem subset of the HumanEval benchmark. Here, three agents were added to a GroupChat to solve programming problems. The three agents were: - -- Coder (default Assistant prompt) -- UserProxy (configured to execute code) -- ExecutiveChef (added as a distraction) - -The Coder and UserProxy used the AssistantAgent and UserProxy defaults (provided above), while the ExecutiveChef was given the system prompt: - -``` -You are an executive chef with 28 years of industry experience. You can answer questions about menu planning, meal preparation, and cooking techniques. -``` - -The ExecutiveChef is clearly the distractor here -- given that no HumanEval problems are food-related, the GroupChat should rarely consult with the chef. However, when configured with GPT-3.5-turbo-16k, we can clearly see the GroupChat struggling with orchestration: - -#### With versions prior to 0.2.2, using `system_message`: - -- The Agents solve 3 out of 26 problems on their first turn -- The ExecutiveChef is called upon 54 times! (almost as much as the Coder at 68 times) - -#### With version 0.2.2, using `description`: - -- The Agents solve 7 out of 26 problems on the first turn -- The ExecutiveChef is called upon 27 times! 
(versus 84 times for the Coder) - -Using the `description` field doubles performance on this task and halves the incidence of calling upon the distractor agent. - -## Tips for Writing Good Descriptions -Since `descriptions` serve a different purpose than `system_message`s, it is worth reviewing what makes a good agent description. While descriptions are new, the following tips appear to lead to good results: - -- Avoid using the 1st or 2nd person perspective. Descriptions should not contain "I" or "You", unless perhaps "You" is in reference to the GroupChat / orchestrator -- Include any details that might help the orchestrator know when to call upon the agent -- Keep descriptions short (e.g., "A helpful AI assistant with strong natural language and Python coding skills."). - -The main thing to remember is that **the description is for the benefit of the GroupChatManager, not for the Agent's own use or instruction**. - -## Conclusion - -AutoGen 0.2.2 introduces a `description`, becoming the main way agents describe themselves to orchestrators like GroupChat. Since the `description` defaults to the `system_message`, there's nothing you need to change if you were already satisfied with how your group chats were working. However, we expect this feature to generally improve orchestration, so please consider experimenting with the `description` field if you are struggling with GroupChat or want to boost performance. +--- +title: "All About Agent Descriptions" +authors: + - afourney +tags: [AutoGen] +--- + + +## TL;DR + +AutoGen 0.2.2 introduces a [description](https://microsoft.github.io/autogen/docs/reference/agentchat/conversable_agent#__init__) field to ConversableAgent (and all subclasses), and changes GroupChat so that it uses agent `description`s rather than `system_message`s when choosing which agents should speak next. + +This is expected to simplify GroupChat’s job, improve orchestration, and make it easier to implement new GroupChat or GroupChat-like alternatives. + +If you are a developer, and things were already working well for you, no action is needed -- backward compatibility is ensured because the `description` field defaults to the `system_message` when no description is provided. + +However, if you were struggling with getting GroupChat to work, you can now try updating the `description` field. + +## Introduction + +As AutoGen matures and developers build increasingly complex combinations of agents, orchestration is becoming an important capability. At present, [GroupChat](https://microsoft.github.io/autogen/docs/reference/agentchat/groupchat#groupchat-objects) and the [GroupChatManager](https://microsoft.github.io/autogen/docs/reference/agentchat/groupchat#groupchatmanager-objects) are the main built-in tools for orchestrating conversations between 3 or more agents. For orchestrators like GroupChat to work well, they need to know something about each agent so that they can decide who should speak and when. Prior to AutoGen 0.2.2, GroupChat relied on each agent's `system_message` and `name` to learn about each participating agent. This is likely fine when the system prompt is short and sweet, but can lead to problems when the instructions are very long (e.g., with the [AssistantAgent](https://microsoft.github.io/autogen/docs/reference/agentchat/assistant_agent)), or non-existent (e.g., with the [UserProxyAgent](https://microsoft.github.io/autogen/docs/reference/agentchat/user_proxy_agent)). 
+ +AutoGen 0.2.2 introduces a [description](https://microsoft.github.io/autogen/docs/reference/agentchat/conversable_agent#__init__) field to all agents, and replaces the use of the `system_message` for orchestration in GroupChat and all future orchestrators. The `description` field defaults to the `system_message` to ensure backwards compatibility, so you may not need to change anything with your code if things are working well for you. However, if you were struggling with GroupChat, give setting the `description` field a try. + +The remainder of this post provides an example of how using the `description` field simplifies GroupChat's job, provides some evidence of its effectiveness, and provides tips for writing good descriptions. + +## Example + +The current GroupChat orchestration system prompt has the following template: + +``` +You are in a role play game. The following roles are available: + +{self._participant_roles(agents)}. + +Read the following conversation. +Then select the next role from {[agent.name for agent in agents]} to play. Only return the role. +``` + +Suppose that you wanted to include 3 agents: A UserProxyAgent, an AssistantAgent, and perhaps a GuardrailsAgent. + +Prior to 0.2.2, this template would expand to: + +``` +You are in a role play game. The following roles are available: + +assistant: You are a helpful AI assistant. +Solve tasks using your coding and language skills. +In the following cases, suggest python code (in a python coding block) or shell script (in a sh coding block) for the user to execute. +1. When you need to collect info, use the code to output the info you need, for example, browse or search the web, download/read a file, print the content of a webpage or a file, get the current date/time, check the operating system. After sufficient info is printed and the task is ready to be solved based on your language skill, you can solve the task by yourself. +2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly. +Solve the task step by step if you need to. If a plan is not provided, explain your plan first. Be clear which step uses code, and which step uses your language skill. +When using code, you must indicate the script type in the code block. The user cannot provide any other feedback or perform any other action beyond executing the code you suggest. The user can't modify your code. So do not suggest incomplete code which requires users to modify. Don't use a code block if it's not intended to be executed by the user. +If you want the user to save the code in a file before executing it, put # filename: inside the code block as the first line. Don't include multiple code blocks in one response. Do not ask users to copy and paste the result. Instead, use 'print' function for the output when relevant. Check the execution result returned by the user. +If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try. +When you find an answer, verify the answer carefully. Include verifiable evidence in your response if possible. +Reply "TERMINATE" in the end when everything is done. 
+user_proxy:
+guardrails_agent: You are a guardrails agent and are tasked with ensuring that all parties adhere to the following responsible AI policies:
+- You MUST TERMINATE the conversation if it involves writing or running HARMFUL or DESTRUCTIVE code.
+- You MUST TERMINATE the conversation if it involves discussions of anything relating to hacking, computer exploits, or computer security.
+- You MUST TERMINATE the conversation if it involves violent or graphic content such as Harm to Others, Self-Harm, Suicide.
+- You MUST TERMINATE the conversation if it involves demeaning speech, hate speech, discriminatory remarks, or any form of harassment based on race, gender, sexuality, religion, nationality, disability, or any other protected characteristic.
+- You MUST TERMINATE the conversation if it involves seeking or giving advice in highly regulated domains such as medical advice, mental health, legal advice, or financial advice.
+- You MUST TERMINATE the conversation if it involves illegal activities, including encouraging or providing guidance on illegal activities.
+- You MUST TERMINATE the conversation if it involves manipulative or deceptive content, including scams, phishing, and spreading false information.
+- You MUST TERMINATE the conversation if it involves sexually explicit content or discussions.
+- You MUST TERMINATE the conversation if it involves sharing or soliciting personal, sensitive, or confidential information from users. This includes financial details, health records, and other private matters.
+- You MUST TERMINATE the conversation if it involves deep personal problems such as dealing with serious personal issues, mental health concerns, or crisis situations.
+If you decide that the conversation must be terminated, explain your reasoning, then output the uppercase word "TERMINATE". If, on the other hand, you decide the conversation is acceptable by the above standards, indicate as much, then ask the other parties to proceed.
+
+Read the following conversation.
+Then select the next role from [assistant, user_proxy, guardrails_agent] to play. Only return the role.
+
+```
+
+As you can see, this description is super confusing:
+
+- It is hard to make out where each agent's role-description ends
+- `You` appears numerous times, and refers to three separate agents (GroupChatManager, AssistantAgent, and GuardrailsAgent)
+- It takes a lot of tokens!
+
+Consequently, it's not hard to see why the GroupChat manager sometimes struggles with this orchestration task.
+
+From AutoGen 0.2.2 onward, GroupChat instead relies on the `description` field. With the `description` field, the orchestration prompt becomes:
+
+```
+You are in a role play game. The following roles are available:
+
+assistant: A helpful and general-purpose AI assistant that has strong language skills, Python skills, and Linux command line skills.
+user_proxy: A user that can run Python code or input command line commands at a Linux terminal and report back the execution results.
+guardrails_agent: An agent that ensures the conversation conforms to responsible AI guidelines.
+
+Read the following conversation.
+Then select the next role from [assistant, user_proxy, guardrails_agent] to play. Only return the role.
+```
+
+This is much easier to parse and understand, and it doesn't use nearly as many tokens. Moreover, the experiment in the next section provides early evidence that it works.
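To see how this looks in code, here is a small sketch of constructing agents with explicit `description` fields for a GroupChat. The agent names and description strings are illustrative, and the `llm_config` is a placeholder rather than a working configuration.

```python
from autogen import AssistantAgent, UserProxyAgent, GroupChat, GroupChatManager

llm_config = {"config_list": [{"model": "gpt-4", "api_key": "YOUR_KEY_HERE"}]}  # placeholder

assistant = AssistantAgent(
    name="assistant",
    llm_config=llm_config,
    # The (long) default system_message still instructs the agent itself;
    # this short description is what the GroupChatManager sees when selecting speakers.
    description="A helpful and general-purpose AI assistant that has strong language skills, "
    "Python skills, and Linux command line skills.",
)

user_proxy = UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    code_execution_config={"work_dir": "coding", "use_docker": False},
    description="A user that can run Python code or input command line commands at a Linux "
    "terminal and report back the execution results.",
)

groupchat = GroupChat(agents=[assistant, user_proxy], messages=[], max_round=12)
manager = GroupChatManager(groupchat=groupchat, llm_config=llm_config)
# If description were omitted, it would default to each agent's system_message.
```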
+ +## An Experiment with Distraction + +To illustrate the impact of the `description` field, we set up a three-agent experiment with a reduced 26-problem subset of the HumanEval benchmark. Here, three agents were added to a GroupChat to solve programming problems. The three agents were: + +- Coder (default Assistant prompt) +- UserProxy (configured to execute code) +- ExecutiveChef (added as a distraction) + +The Coder and UserProxy used the AssistantAgent and UserProxy defaults (provided above), while the ExecutiveChef was given the system prompt: + +``` +You are an executive chef with 28 years of industry experience. You can answer questions about menu planning, meal preparation, and cooking techniques. +``` + +The ExecutiveChef is clearly the distractor here -- given that no HumanEval problems are food-related, the GroupChat should rarely consult with the chef. However, when configured with GPT-3.5-turbo-16k, we can clearly see the GroupChat struggling with orchestration: + +#### With versions prior to 0.2.2, using `system_message`: + +- The Agents solve 3 out of 26 problems on their first turn +- The ExecutiveChef is called upon 54 times! (almost as much as the Coder at 68 times) + +#### With version 0.2.2, using `description`: + +- The Agents solve 7 out of 26 problems on the first turn +- The ExecutiveChef is called upon 27 times! (versus 84 times for the Coder) + +Using the `description` field doubles performance on this task and halves the incidence of calling upon the distractor agent. + +## Tips for Writing Good Descriptions +Since `descriptions` serve a different purpose than `system_message`s, it is worth reviewing what makes a good agent description. While descriptions are new, the following tips appear to lead to good results: + +- Avoid using the 1st or 2nd person perspective. Descriptions should not contain "I" or "You", unless perhaps "You" is in reference to the GroupChat / orchestrator +- Include any details that might help the orchestrator know when to call upon the agent +- Keep descriptions short (e.g., "A helpful AI assistant with strong natural language and Python coding skills."). + +The main thing to remember is that **the description is for the benefit of the GroupChatManager, not for the Agent's own use or instruction**. + +## Conclusion + +AutoGen 0.2.2 introduces a `description`, becoming the main way agents describe themselves to orchestrators like GroupChat. Since the `description` defaults to the `system_message`, there's nothing you need to change if you were already satisfied with how your group chats were working. However, we expect this feature to generally improve orchestration, so please consider experimenting with the `description` field if you are struggling with GroupChat or want to boost performance. diff --git a/website/blog/2024-01-25-AutoGenBench/index.mdx b/website/blog/2024-01-25-AutoGenBench/index.mdx index 28cdcd8e6a52..3f4b2d4f216b 100644 --- a/website/blog/2024-01-25-AutoGenBench/index.mdx +++ b/website/blog/2024-01-25-AutoGenBench/index.mdx @@ -1,148 +1,148 @@ ---- -title: "AutoGenBench -- A Tool for Measuring and Evaluating AutoGen Agents" -authors: - - afourney - - qingyunwu -tags: [AutoGen] ---- - -![AutoGenBench](img/teaser.jpg) - -

- - AutoGenBench is a standalone tool for evaluating AutoGen agents and - workflows on common benchmarks. - -

- -## TL;DR - -Today we are releasing AutoGenBench - a tool for evaluating AutoGen agents and workflows on established LLM and agentic benchmarks. - -AutoGenBench is a standalone command line tool, installable from PyPI, which handles downloading, configuring, running, and reporting supported benchmarks. AutoGenBench works best when run alongside Docker, since it uses Docker to isolate tests from one another. - -- See the [AutoGenBench README](https://github.com/microsoft/autogen/blob/main/samples/tools/autogenbench/README.md) for information on installation and running benchmarks. -- See the [AutoGenBench CONTRIBUTING guide](https://github.com/microsoft/autogen/blob/main/samples/tools/autogenbench/CONTRIBUTING.md) for information on developing or contributing benchmark datasets. - -### Quick Start - -Get started quickly by running the following commands in a bash terminal. - -_Note:_ You may need to adjust the path to the `OAI_CONFIG_LIST`, as appropriate. - -```sh -export OAI_CONFIG_LIST=$(cat ./OAI_CONFIG_LIST) -pip install autogenbench -autogenbench clone HumanEval -cd HumanEval -cat README.md -autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl -autogenbench tabulate Results/human_eval_two_agents -``` - -## Introduction - -Measurement and evaluation are core components of every major AI or ML research project. The same is true for AutoGen. To this end, today we are releasing AutoGenBench, a standalone command line tool that we have been using to guide development of AutoGen. Conveniently, AutoGenBench handles: downloading, configuring, running, and reporting results of agents on various public benchmark datasets. In addition to reporting top-line numbers, each AutoGenBench run produces a comprehensive set of logs and telemetry that can be used for debugging, profiling, computing custom metrics, and as input to [AgentEval](https://microsoft.github.io/autogen/blog/2023/11/20/AgentEval). In the remainder of this blog post, we outline core design principles for AutoGenBench (key to understanding its operation); present a guide to installing and running AutoGenBench; outline a roadmap for evaluation; and conclude with an open call for contributions. - -## Design Principles - -AutoGenBench is designed around three core design principles. Knowing these principles will help you understand the tool, its operation and its output. These three principles are: - -- **Repetition:** LLMs are stochastic, and in many cases, so too is the code they write to solve problems. For example, a Python script might call an external search engine, and the results may vary run-to-run. This can lead to variance in agent performance. Repetition is key to measuring and understanding this variance. To this end, AutoGenBench is built from the ground up with an understanding that tasks may be run multiple times, and that variance is a metric we often want to measure. - -- **Isolation:** Agents interact with their worlds in both subtle and overt ways. For example an agent may install a python library or write a file to disk. This can lead to ordering effects that can impact future measurements. Consider, for example, comparing two agents on a common benchmark. One agent may appear more efficient than the other simply because it ran second, and benefitted from the hard work the first agent did in installing and debugging necessary Python libraries. To address this, AutoGenBench isolates each task in its own Docker container. This ensures that all runs start with the same initial conditions. 
(Docker is also a _much safer way to run agent-produced code_, in general.) - -- **Instrumentation:** While top-line metrics are great for comparing agents or models, we often want much more information about how the agents are performing, where they are getting stuck, and how they can be improved. We may also later think of new research questions that require computing a different set of metrics. To this end, AutoGenBench is designed to log everything, and to compute metrics from those logs. This ensures that one can always go back to the logs to answer questions about what happened, run profiling software, or feed the logs into tools like [AgentEval](https://microsoft.github.io/autogen/blog/2023/11/20/AgentEval). - -## Installing and Running AutoGenBench - -As noted above, isolation is a key design principle, and so AutoGenBench must be run in an environment where Docker is available (desktop or Engine). **It will not run in GitHub codespaces**, unless you opt for native execution (which is strongly discouraged). To install Docker Desktop see [https://www.docker.com/products/docker-desktop/](https://www.docker.com/products/docker-desktop/). -Once Docker is installed, AutoGenBench can then be installed as a standalone tool from PyPI. With `pip`, installation can be achieved as follows: - -```sh -pip install autogenbench -``` - -After installation, you must configure your API keys. As with other AutoGen applications, AutoGenBench will look for the OpenAI keys in the OAI_CONFIG_LIST file in the current working directory, or the OAI_CONFIG_LIST environment variable. This behavior can be overridden using a command-line parameter. - -If you will be running multiple benchmarks, it is often most convenient to leverage the environment variable option. You can load your keys into the environment variable by executing: - -```sh -export OAI_CONFIG_LIST=$(cat ./OAI_CONFIG_LIST) -``` - -## A Typical Session - -Once AutoGenBench and necessary keys are installed, a typical session will look as follows: - -``` -autogenbench clone HumanEval -cd HumanEval -cat README.md -autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl -autogenbench tabulate results/human_eval_two_agents -``` - -Where: - -- `autogenbench clone HumanEval` downloads and expands the HumanEval benchmark scenario. -- `cd HumanEval; cat README.md` navigates to the benchmark directory, and prints the README (which you should always read!) -- `autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl` - runs a 10% subsample of the tasks defined in `Tasks/human_eval_two_agents.jsonl`. Each task is run 3 times. -- `autogenbench tabulate results/human_eval_two_agents` tabulates the results of the run. 
- -After running the above `tabulate` command, you should see output similar to the following: - -``` - Trial 0 Trial 1 Trial 2 -Task Id Success Success Success -------------- --------- --------- --------- -HumanEval_107 False True True -HumanEval_22 True True True -HumanEval_43 True True True -HumanEval_88 True True True -HumanEval_14 True True True -HumanEval_157 True True True -HumanEval_141 True True True -HumanEval_57 True True True -HumanEval_154 True True True -HumanEval_153 True True True -HumanEval_93 False True False -HumanEval_137 True True True -HumanEval_143 True True True -HumanEval_13 True True True -HumanEval_49 True True True -HumanEval_95 True True True -------------- --------- --------- --------- -Successes 14 16 15 -Failures 2 0 1 -Missing 0 0 0 -Total 16 16 16 - -CAUTION: 'autogenbench tabulate' is in early preview. -Please do not cite these values in academic work without first inspecting and verifying the results in the logs yourself. -``` - -From this output we can see the results of the three separate repetitions of each task, and final summary statistics of each run. In this case, the results were generated via GPT-4 (as defined in the OAI_CONFIG_LIST that was provided), and used the `TwoAgents` template. **It is important to remember that AutoGenBench evaluates _specific_ end-to-end configurations of agents (as opposed to evaluating a model or cognitive framework more generally).** - -Finally, complete execution traces and logs can be found in the `Results` folder. See the [AutoGenBench README](https://github.com/microsoft/autogen/blob/main/samples/tools/autogenbench/README.md) for more details about command-line options and output formats. Each of these commands also offers extensive in-line help via: - -- `autogenbench --help` -- `autogenbench clone --help` -- `autogenbench run --help` -- `autogenbench tabulate --help` - -## Roadmap - -While we are announcing AutoGenBench, we note that it is very much an evolving project in its own right. Over the next few weeks and months we hope to: - -- Onboard many additional benchmarks beyond those shipping today -- Greatly improve logging and telemetry -- Introduce new core metrics including total costs, task completion time, conversation turns, etc. -- Provide tighter integration with AgentEval and AutoGen Studio - -For an up to date tracking of our work items on this project, please see [AutoGenBench Work Items](https://github.com/microsoft/autogen/issues/973) - -## Call for Participation - -Finally, we want to end this blog post with an open call for contributions. AutoGenBench is still nascent, and has much opportunity for improvement. New benchmarks are constantly being published, and will need to be added. Everyone may have their own distinct set of metrics that they care most about optimizing, and these metrics should be onboarded. To this end, we welcome any and all contributions to this corner of the AutoGen project. If contributing is something that interests you, please see the [contributor’s guide](https://github.com/microsoft/autogen/blob/main/samples/tools/autogenbench/CONTRIBUTING.md) and join our [Discord](https://aka.ms/autogen-dc) discussion in the [#autogenbench](https://discord.com/channels/1153072414184452236/1199851779328847902) channel! +--- +title: "AutoGenBench -- A Tool for Measuring and Evaluating AutoGen Agents" +authors: + - afourney + - qingyunwu +tags: [AutoGen] +--- + +![AutoGenBench](img/teaser.jpg) + +

+ + AutoGenBench is a standalone tool for evaluating AutoGen agents and + workflows on common benchmarks. + +

+ +## TL;DR + +Today we are releasing AutoGenBench - a tool for evaluating AutoGen agents and workflows on established LLM and agentic benchmarks. + +AutoGenBench is a standalone command line tool, installable from PyPI, which handles downloading, configuring, running, and reporting supported benchmarks. AutoGenBench works best when run alongside Docker, since it uses Docker to isolate tests from one another. + +- See the [AutoGenBench README](https://github.com/microsoft/autogen/blob/main/samples/tools/autogenbench/README.md) for information on installation and running benchmarks. +- See the [AutoGenBench CONTRIBUTING guide](https://github.com/microsoft/autogen/blob/main/samples/tools/autogenbench/CONTRIBUTING.md) for information on developing or contributing benchmark datasets. + +### Quick Start + +Get started quickly by running the following commands in a bash terminal. + +_Note:_ You may need to adjust the path to the `OAI_CONFIG_LIST`, as appropriate. + +```sh +export OAI_CONFIG_LIST=$(cat ./OAI_CONFIG_LIST) +pip install autogenbench +autogenbench clone HumanEval +cd HumanEval +cat README.md +autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl +autogenbench tabulate Results/human_eval_two_agents +``` + +## Introduction + +Measurement and evaluation are core components of every major AI or ML research project. The same is true for AutoGen. To this end, today we are releasing AutoGenBench, a standalone command line tool that we have been using to guide development of AutoGen. Conveniently, AutoGenBench handles: downloading, configuring, running, and reporting results of agents on various public benchmark datasets. In addition to reporting top-line numbers, each AutoGenBench run produces a comprehensive set of logs and telemetry that can be used for debugging, profiling, computing custom metrics, and as input to [AgentEval](https://microsoft.github.io/autogen/blog/2023/11/20/AgentEval). In the remainder of this blog post, we outline core design principles for AutoGenBench (key to understanding its operation); present a guide to installing and running AutoGenBench; outline a roadmap for evaluation; and conclude with an open call for contributions. + +## Design Principles + +AutoGenBench is designed around three core design principles. Knowing these principles will help you understand the tool, its operation and its output. These three principles are: + +- **Repetition:** LLMs are stochastic, and in many cases, so too is the code they write to solve problems. For example, a Python script might call an external search engine, and the results may vary run-to-run. This can lead to variance in agent performance. Repetition is key to measuring and understanding this variance. To this end, AutoGenBench is built from the ground up with an understanding that tasks may be run multiple times, and that variance is a metric we often want to measure. + +- **Isolation:** Agents interact with their worlds in both subtle and overt ways. For example an agent may install a python library or write a file to disk. This can lead to ordering effects that can impact future measurements. Consider, for example, comparing two agents on a common benchmark. One agent may appear more efficient than the other simply because it ran second, and benefitted from the hard work the first agent did in installing and debugging necessary Python libraries. To address this, AutoGenBench isolates each task in its own Docker container. This ensures that all runs start with the same initial conditions. 
(Docker is also a _much safer way to run agent-produced code_, in general.) + +- **Instrumentation:** While top-line metrics are great for comparing agents or models, we often want much more information about how the agents are performing, where they are getting stuck, and how they can be improved. We may also later think of new research questions that require computing a different set of metrics. To this end, AutoGenBench is designed to log everything, and to compute metrics from those logs. This ensures that one can always go back to the logs to answer questions about what happened, run profiling software, or feed the logs into tools like [AgentEval](https://microsoft.github.io/autogen/blog/2023/11/20/AgentEval). + +## Installing and Running AutoGenBench + +As noted above, isolation is a key design principle, and so AutoGenBench must be run in an environment where Docker is available (desktop or Engine). **It will not run in GitHub codespaces**, unless you opt for native execution (which is strongly discouraged). To install Docker Desktop see [https://www.docker.com/products/docker-desktop/](https://www.docker.com/products/docker-desktop/). +Once Docker is installed, AutoGenBench can then be installed as a standalone tool from PyPI. With `pip`, installation can be achieved as follows: + +```sh +pip install autogenbench +``` + +After installation, you must configure your API keys. As with other AutoGen applications, AutoGenBench will look for the OpenAI keys in the OAI_CONFIG_LIST file in the current working directory, or the OAI_CONFIG_LIST environment variable. This behavior can be overridden using a command-line parameter. + +If you will be running multiple benchmarks, it is often most convenient to leverage the environment variable option. You can load your keys into the environment variable by executing: + +```sh +export OAI_CONFIG_LIST=$(cat ./OAI_CONFIG_LIST) +``` + +## A Typical Session + +Once AutoGenBench and necessary keys are installed, a typical session will look as follows: + +``` +autogenbench clone HumanEval +cd HumanEval +cat README.md +autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl +autogenbench tabulate results/human_eval_two_agents +``` + +Where: + +- `autogenbench clone HumanEval` downloads and expands the HumanEval benchmark scenario. +- `cd HumanEval; cat README.md` navigates to the benchmark directory, and prints the README (which you should always read!) +- `autogenbench run --subsample 0.1 --repeat 3 Tasks/human_eval_two_agents.jsonl` + runs a 10% subsample of the tasks defined in `Tasks/human_eval_two_agents.jsonl`. Each task is run 3 times. +- `autogenbench tabulate results/human_eval_two_agents` tabulates the results of the run. 
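For reference, the `OAI_CONFIG_LIST` mentioned above is a JSON list of model configurations. The snippet below writes out a minimal, hypothetical example; the model name and key are placeholders, and real keys should not be committed to source control.

```python
import json

# Minimal OAI_CONFIG_LIST with placeholder values.
config_list = [
    {"model": "gpt-4", "api_key": "sk-REPLACE_ME"},
]

with open("OAI_CONFIG_LIST", "w") as f:
    json.dump(config_list, f, indent=2)
```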
+ +After running the above `tabulate` command, you should see output similar to the following: + +``` + Trial 0 Trial 1 Trial 2 +Task Id Success Success Success +------------- --------- --------- --------- +HumanEval_107 False True True +HumanEval_22 True True True +HumanEval_43 True True True +HumanEval_88 True True True +HumanEval_14 True True True +HumanEval_157 True True True +HumanEval_141 True True True +HumanEval_57 True True True +HumanEval_154 True True True +HumanEval_153 True True True +HumanEval_93 False True False +HumanEval_137 True True True +HumanEval_143 True True True +HumanEval_13 True True True +HumanEval_49 True True True +HumanEval_95 True True True +------------- --------- --------- --------- +Successes 14 16 15 +Failures 2 0 1 +Missing 0 0 0 +Total 16 16 16 + +CAUTION: 'autogenbench tabulate' is in early preview. +Please do not cite these values in academic work without first inspecting and verifying the results in the logs yourself. +``` + +From this output we can see the results of the three separate repetitions of each task, and final summary statistics of each run. In this case, the results were generated via GPT-4 (as defined in the OAI_CONFIG_LIST that was provided), and used the `TwoAgents` template. **It is important to remember that AutoGenBench evaluates _specific_ end-to-end configurations of agents (as opposed to evaluating a model or cognitive framework more generally).** + +Finally, complete execution traces and logs can be found in the `Results` folder. See the [AutoGenBench README](https://github.com/microsoft/autogen/blob/main/samples/tools/autogenbench/README.md) for more details about command-line options and output formats. Each of these commands also offers extensive in-line help via: + +- `autogenbench --help` +- `autogenbench clone --help` +- `autogenbench run --help` +- `autogenbench tabulate --help` + +## Roadmap + +While we are announcing AutoGenBench, we note that it is very much an evolving project in its own right. Over the next few weeks and months we hope to: + +- Onboard many additional benchmarks beyond those shipping today +- Greatly improve logging and telemetry +- Introduce new core metrics including total costs, task completion time, conversation turns, etc. +- Provide tighter integration with AgentEval and AutoGen Studio + +For an up to date tracking of our work items on this project, please see [AutoGenBench Work Items](https://github.com/microsoft/autogen/issues/973) + +## Call for Participation + +Finally, we want to end this blog post with an open call for contributions. AutoGenBench is still nascent, and has much opportunity for improvement. New benchmarks are constantly being published, and will need to be added. Everyone may have their own distinct set of metrics that they care most about optimizing, and these metrics should be onboarded. To this end, we welcome any and all contributions to this corner of the AutoGen project. If contributing is something that interests you, please see the [contributor’s guide](https://github.com/microsoft/autogen/blob/main/samples/tools/autogenbench/CONTRIBUTING.md) and join our [Discord](https://aka.ms/autogen-dc) discussion in the [#autogenbench](https://discord.com/channels/1153072414184452236/1199851779328847902) channel! 
diff --git a/website/blog/2024-02-11-FSM-GroupChat/index.mdx b/website/blog/2024-02-11-FSM-GroupChat/index.mdx index 74b5b49e35f5..3f2f7a5dba1e 100644 --- a/website/blog/2024-02-11-FSM-GroupChat/index.mdx +++ b/website/blog/2024-02-11-FSM-GroupChat/index.mdx @@ -1,288 +1,288 @@ ---- -title: "FSM Group Chat -- User-specified agent transitions" -authors: - - joshkyh - - freedeaths -tags: [AutoGen] ---- - -![FSM Group Chat](img/teaser.jpg) -

Finite State Machine (FSM) Group Chat allows the user to constrain agent transitions.

- - -## TL;DR -Recently, FSM Group Chat is released that allows the user to input a transition graph to constrain agent transitions. This is useful as the number of agents increases because the number of transition pairs (N choose 2 combinations) increases exponentially increasing the risk of sub-optimal transitions, which leads to wastage of tokens and/or poor outcomes. - -## Possible use-cases for transition graph -1. One-pass workflow, i.e., we want each agent to only have one pass at the problem, Agent A -> B -> C. -2. Decision tree flow, like a decision tree, we start with a root node (agent), and flow down the decision tree with agents being nodes. For example, if the query is a SQL query, hand over to the SQL agent, else if the query is a RAG query, hand over to the RAG agent. -3. Sequential Team Ops. Suppose we have a team of 3 developer agents, each responsible for a different GitHub repo. We also have a team of business analyst that discuss and debate the overall goal of the user. We could have the manager agent of the developer team speak to the manager agent of the business analysis team. That way, the discussions are more focused team-wise, and better outcomes can be expected. - -Note that we are not enforcing a directed acyclic graph; the user can specify the graph to be acyclic, but cyclic workflows can also be useful to iteratively work on a problem, and layering additional analysis onto the solution. - - -## Usage Guide -We have added two parameters `allowed_or_disallowed_speaker_transitions` and `speaker_transitions_type`. -- `allowed_or_disallowed_speaker_transitions`: is a dictionary with the type expectation of `{Agent: [Agent]}`. The key refers to the source agent, while the value(s) in the list refers to the target agent(s). If none, a fully connection graph is assumed. -- `speaker_transitions_type`: is a string with the type expectation of string, and specifically, one of ["allowed", "disallowed"]. We wanted the user to be able to supply a dictionary of allowed or disallowed transitions to improve the ease of use. In the code base, we would invert the disallowed transition into a allowed transition dictionary `allowed_speaker_transitions_dict`. - - -### Application of the FSM Feature - -A quick demonstration of how to initiate a FSM-based `GroupChat` in the `AutoGen` framework. In this demonstration, if we consider each agent as a state, and each agent speaks according to certain conditions. For example, User always initiates the task first, followed by Planner creating a plan. Then Engineer and Executor work alternately, with Critic intervening when necessary, and after Critic, only Planner should revise additional plans. Each state can only exist at a time, and there are transition conditions between states. Therefore, GroupChat can be well abstracted as a Finite-State Machine (FSM). - -![visualization](img/FSM_logic.png) - - -### Usage - -0. Pre-requisites -```bash -pip install autogen[graph] -``` - -1. Import dependencies - - ```python - from autogen.agentchat import GroupChat, AssistantAgent, UserProxyAgent, GroupChatManager - from autogen.oai.openai_utils import config_list_from_dotenv - ``` -2. 
Configure LLM parameters - - ```python - # Please feel free to change it as you wish - config_list = config_list_from_dotenv( - dotenv_file_path='.env', - model_api_key_map={'gpt-4-1106-preview':'OPENAI_API_KEY'}, - filter_dict={ - "model": { - "gpt-4-1106-preview" - } - } - ) - - gpt_config = { - "cache_seed": None, - "temperature": 0, - "config_list": config_list, - "timeout": 100, - } - ``` - -3. Define the task - - ```python - # describe the task - task = """Add 1 to the number output by the previous role. If the previous number is 20, output "TERMINATE".""" - ``` - -4. Define agents - - ```python - # agents configuration - engineer = AssistantAgent( - name="Engineer", - llm_config=gpt_config, - system_message=task, - description="""I am **ONLY** allowed to speak **immediately** after `Planner`, `Critic` and `Executor`. - If the last number mentioned by `Critic` is not a multiple of 5, the next speaker must be `Engineer`. - """ - ) - - planner = AssistantAgent( - name="Planner", - system_message=task, - llm_config=gpt_config, - description="""I am **ONLY** allowed to speak **immediately** after `User` or `Critic`. - If the last number mentioned by `Critic` is a multiple of 5, the next speaker must be `Planner`. - """ - ) - - executor = AssistantAgent( - name="Executor", - system_message=task, - is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("FINISH"), - llm_config=gpt_config, - description="""I am **ONLY** allowed to speak **immediately** after `Engineer`. - If the last number mentioned by `Engineer` is a multiple of 3, the next speaker can only be `Executor`. - """ - ) - - critic = AssistantAgent( - name="Critic", - system_message=task, - llm_config=gpt_config, - description="""I am **ONLY** allowed to speak **immediately** after `Engineer`. - If the last number mentioned by `Engineer` is not a multiple of 3, the next speaker can only be `Critic`. - """ - ) - - user_proxy = UserProxyAgent( - name="User", - system_message=task, - code_execution_config=False, - human_input_mode="NEVER", - llm_config=False, - description=""" - Never select me as a speaker. - """ - ) - ``` - - 1. Here, I have configured the `system_messages` as "task" because every agent should know what it needs to do. In this example, each agent has the same task, which is to count in sequence. - 2. **The most important point is the `description` parameter, where I have used natural language to describe the transition conditions of the FSM. Because the manager knows which agents are available next based on the constraints of the graph, I describe in the `description` field of each candidate agent when it can speak, effectively describing the transition conditions in the FSM.** - -5. Define the graph - - ```python - graph_dict = {} - graph_dict[user_proxy] = [planner] - graph_dict[planner] = [engineer] - graph_dict[engineer] = [critic, executor] - graph_dict[critic] = [engineer, planner] - graph_dict[executor] = [engineer] - ``` - - 1. **The graph here and the transition conditions mentioned above together form a complete FSM. Both are essential and cannot be missing.** - 2. You can visualize it as you wish, which is shown as follows - - ![visualization](img/FSM_of_multi-agents.png) - -6. 
Define a `GroupChat` and a `GroupChatManager` - - ```python - agents = [user_proxy, engineer, planner, executor, critic] - - # create the groupchat - group_chat = GroupChat(agents=agents, messages=[], max_round=25, allowed_or_disallowed_speaker_transitions=graph_dict, allow_repeat_speaker=None, speaker_transitions_type="allowed") - - # create the manager - manager = GroupChatManager( - groupchat=group_chat, - llm_config=gpt_config, - is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"), - code_execution_config=False, - ) - ``` - -7. Initiate the chat - - ```python - # initiate the task - user_proxy.initiate_chat( - manager, - message="1", - clear_history=True - ) - ``` - -8. You may get the following output(I deleted the ignorable warning): - - ``` - User (to chat_manager): - - 1 - - -------------------------------------------------------------------------------- - Planner (to chat_manager): - - 2 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 3 - - -------------------------------------------------------------------------------- - Executor (to chat_manager): - - 4 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 5 - - -------------------------------------------------------------------------------- - Critic (to chat_manager): - - 6 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 7 - - -------------------------------------------------------------------------------- - Critic (to chat_manager): - - 8 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 9 - - -------------------------------------------------------------------------------- - Executor (to chat_manager): - - 10 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 11 - - -------------------------------------------------------------------------------- - Critic (to chat_manager): - - 12 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 13 - - -------------------------------------------------------------------------------- - Critic (to chat_manager): - - 14 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 15 - - -------------------------------------------------------------------------------- - Executor (to chat_manager): - - 16 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 17 - - -------------------------------------------------------------------------------- - Critic (to chat_manager): - - 18 - - -------------------------------------------------------------------------------- - Engineer (to chat_manager): - - 19 - - -------------------------------------------------------------------------------- - Critic (to chat_manager): - - 20 - - -------------------------------------------------------------------------------- - Planner (to chat_manager): - - TERMINATE - ``` - -## Notebook examples -More examples can be found in the [notebook](https://microsoft.github.io/autogen/docs/notebooks/agentchat_groupchat_finite_state_machine/). 
The notebook includes more examples of possible transition paths such as (1) hub and spoke, (2) sequential team operations, and (3) think aloud and debate. It also uses the function `visualize_speaker_transitions_dict` from `autogen.graph_utils` to visualize the various graphs. +--- +title: "FSM Group Chat -- User-specified agent transitions" +authors: + - joshkyh + - freedeaths +tags: [AutoGen] +--- + +![FSM Group Chat](img/teaser.jpg) +

Finite State Machine (FSM) Group Chat allows the user to constrain agent transitions.

+ + +## TL;DR +Recently, FSM Group Chat was released, which allows the user to input a transition graph to constrain agent transitions. This is useful as the number of agents increases, because the number of transition pairs (N choose 2 combinations) grows quadratically, increasing the risk of sub-optimal transitions, which leads to wasted tokens and/or poor outcomes. + +## Possible use-cases for transition graph +1. One-pass workflow, i.e., we want each agent to have only one pass at the problem: Agent A -> B -> C. +2. Decision tree flow: like a decision tree, we start with a root node (agent) and flow down the tree with agents as nodes. For example, if the query is a SQL query, hand it over to the SQL agent; if it is a RAG query, hand it over to the RAG agent. +3. Sequential Team Ops. Suppose we have a team of 3 developer agents, each responsible for a different GitHub repo. We also have a team of business analysts who discuss and debate the overall goal of the user. We could have the manager agent of the developer team speak to the manager agent of the business analysis team. That way, the discussions are more focused team-wise, and better outcomes can be expected. + +Note that we are not enforcing a directed acyclic graph; the user can specify the graph to be acyclic, but cyclic workflows can also be useful for working on a problem iteratively and layering additional analysis onto the solution. + + +## Usage Guide +We have added two parameters `allowed_or_disallowed_speaker_transitions` and `speaker_transitions_type`. +- `allowed_or_disallowed_speaker_transitions`: a dictionary of type `{Agent: [Agent]}`. The key is the source agent, and the values in the list are the target agents. If `None`, a fully connected graph is assumed. +- `speaker_transitions_type`: a string, one of ["allowed", "disallowed"], indicating how the supplied dictionary should be interpreted. We wanted the user to be able to supply either allowed or disallowed transitions for ease of use. In the code base, disallowed transitions are inverted into an allowed transition dictionary, `allowed_speaker_transitions_dict`. + + +### Application of the FSM Feature + +A quick demonstration of how to initiate an FSM-based `GroupChat` in the `AutoGen` framework. In this demonstration, we treat each agent as a state, and each agent speaks according to certain conditions. For example, User always initiates the task first, followed by Planner creating a plan. Then Engineer and Executor work alternately, with Critic intervening when necessary, and after Critic, only Planner should revise additional plans. Only one state is active at a time, and there are transition conditions between states. Therefore, GroupChat can be well abstracted as a Finite-State Machine (FSM). + +![visualization](img/FSM_logic.png) + + +### Usage + +0. Pre-requisites +```bash +pip install autogen[graph] +``` + +1. Import dependencies + + ```python + from autogen.agentchat import GroupChat, AssistantAgent, UserProxyAgent, GroupChatManager + from autogen.oai.openai_utils import config_list_from_dotenv + ``` +2. 
Configure LLM parameters + + ```python + # Please feel free to change it as you wish + config_list = config_list_from_dotenv( + dotenv_file_path='.env', + model_api_key_map={'gpt-4-1106-preview':'OPENAI_API_KEY'}, + filter_dict={ + "model": { + "gpt-4-1106-preview" + } + } + ) + + gpt_config = { + "cache_seed": None, + "temperature": 0, + "config_list": config_list, + "timeout": 100, + } + ``` + +3. Define the task + + ```python + # describe the task + task = """Add 1 to the number output by the previous role. If the previous number is 20, output "TERMINATE".""" + ``` + +4. Define agents + + ```python + # agents configuration + engineer = AssistantAgent( + name="Engineer", + llm_config=gpt_config, + system_message=task, + description="""I am **ONLY** allowed to speak **immediately** after `Planner`, `Critic` and `Executor`. + If the last number mentioned by `Critic` is not a multiple of 5, the next speaker must be `Engineer`. + """ + ) + + planner = AssistantAgent( + name="Planner", + system_message=task, + llm_config=gpt_config, + description="""I am **ONLY** allowed to speak **immediately** after `User` or `Critic`. + If the last number mentioned by `Critic` is a multiple of 5, the next speaker must be `Planner`. + """ + ) + + executor = AssistantAgent( + name="Executor", + system_message=task, + is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("FINISH"), + llm_config=gpt_config, + description="""I am **ONLY** allowed to speak **immediately** after `Engineer`. + If the last number mentioned by `Engineer` is a multiple of 3, the next speaker can only be `Executor`. + """ + ) + + critic = AssistantAgent( + name="Critic", + system_message=task, + llm_config=gpt_config, + description="""I am **ONLY** allowed to speak **immediately** after `Engineer`. + If the last number mentioned by `Engineer` is not a multiple of 3, the next speaker can only be `Critic`. + """ + ) + + user_proxy = UserProxyAgent( + name="User", + system_message=task, + code_execution_config=False, + human_input_mode="NEVER", + llm_config=False, + description=""" + Never select me as a speaker. + """ + ) + ``` + + 1. Here, I have configured the `system_messages` as "task" because every agent should know what it needs to do. In this example, each agent has the same task, which is to count in sequence. + 2. **The most important point is the `description` parameter, where I have used natural language to describe the transition conditions of the FSM. Because the manager knows which agents are available next based on the constraints of the graph, I describe in the `description` field of each candidate agent when it can speak, effectively describing the transition conditions in the FSM.** + +5. Define the graph + + ```python + graph_dict = {} + graph_dict[user_proxy] = [planner] + graph_dict[planner] = [engineer] + graph_dict[engineer] = [critic, executor] + graph_dict[critic] = [engineer, planner] + graph_dict[executor] = [engineer] + ``` + + 1. **The graph here and the transition conditions mentioned above together form a complete FSM. Both are essential and cannot be missing.** + 2. You can visualize it as you wish, which is shown as follows + + ![visualization](img/FSM_of_multi-agents.png) + +6. 
Define a `GroupChat` and a `GroupChatManager` + + ```python + agents = [user_proxy, engineer, planner, executor, critic] + + # create the groupchat + group_chat = GroupChat(agents=agents, messages=[], max_round=25, allowed_or_disallowed_speaker_transitions=graph_dict, allow_repeat_speaker=None, speaker_transitions_type="allowed") + + # create the manager + manager = GroupChatManager( + groupchat=group_chat, + llm_config=gpt_config, + is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"), + code_execution_config=False, + ) + ``` + +7. Initiate the chat + + ```python + # initiate the task + user_proxy.initiate_chat( + manager, + message="1", + clear_history=True + ) + ``` + +8. You may get the following output(I deleted the ignorable warning): + + ``` + User (to chat_manager): + + 1 + + -------------------------------------------------------------------------------- + Planner (to chat_manager): + + 2 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 3 + + -------------------------------------------------------------------------------- + Executor (to chat_manager): + + 4 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 5 + + -------------------------------------------------------------------------------- + Critic (to chat_manager): + + 6 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 7 + + -------------------------------------------------------------------------------- + Critic (to chat_manager): + + 8 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 9 + + -------------------------------------------------------------------------------- + Executor (to chat_manager): + + 10 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 11 + + -------------------------------------------------------------------------------- + Critic (to chat_manager): + + 12 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 13 + + -------------------------------------------------------------------------------- + Critic (to chat_manager): + + 14 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 15 + + -------------------------------------------------------------------------------- + Executor (to chat_manager): + + 16 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 17 + + -------------------------------------------------------------------------------- + Critic (to chat_manager): + + 18 + + -------------------------------------------------------------------------------- + Engineer (to chat_manager): + + 19 + + -------------------------------------------------------------------------------- + Critic (to chat_manager): + + 20 + + -------------------------------------------------------------------------------- + Planner (to chat_manager): + + TERMINATE + ``` + +## Notebook examples +More examples can be found in the [notebook](https://microsoft.github.io/autogen/docs/notebooks/agentchat_groupchat_finite_state_machine/). 
The notebook includes more examples of possible transition paths such as (1) hub and spoke, (2) sequential team operations, and (3) think aloud and debate. It also uses the function `visualize_speaker_transitions_dict` from `autogen.graph_utils` to visualize the various graphs. diff --git a/website/blog/authors.yml b/website/blog/authors.yml index b52fffbdd0fa..70a4e5c0f9c1 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -1,125 +1,125 @@ -sonichi: - name: Chi Wang - title: Principal Researcher at Microsoft Research - url: https://www.linkedin.com/in/chi-wang-49b15b16/ - image_url: https://github.com/sonichi.png - -qingyunwu: - name: Qingyun Wu - title: Assistant Professor at the Pennsylvania State University - url: https://qingyun-wu.github.io/ - image_url: https://github.com/qingyun-wu.png - -yiranwu: - name: Yiran Wu - title: PhD student at Pennsylvania State University - url: https://github.com/kevin666aa - image_url: https://github.com/kevin666aa.png - -jialeliu: - name: Jiale Liu - title: Undergraduate student at Xidian University - url: https://leoljl.github.io - image_url: https://github.com/LeoLjl/leoljl.github.io/blob/main/profile.jpg?raw=true - -thinkall: - name: Li Jiang - title: Senior Software Engineer at Microsoft - url: https://github.com/thinkall - image_url: https://github.com/thinkall.png - -rickyloynd-microsoft: - name: Ricky Loynd - title: Senior Research Engineer at Microsoft - url: https://github.com/rickyloynd-microsoft - image_url: https://github.com/rickyloynd-microsoft.png - -samershi: - name: Saleema Amershi - title: Senior Principal Research Manager at Microsoft Research - url: https://github.com/samershi - image_url: https://github.com/samershi.png - -pcdeadeasy: - name: Piali Choudhury - title: Principal RSDE at Microsoft Research - url: https://github.com/pcdeadeasy - image_url: https://github.com/pcdeadeasy.png - -victordibia: - name: Victor Dibia - title: Principal RSDE at Microsoft Research - url: https://github.com/victordibia - image_url: https://github.com/victordibia.png - -afourney: - name: Adam Fourney - title: Principal Researcher Microsoft Research - url: https://www.adamfourney.com - image_url: https://github.com/afourney.png - -beibinli: - name: Beibin Li - title: Senior Research Engineer at Microsoft - url: https://github.com/beibinli - image_url: https://github.com/beibinli.png - -gagb: - name: Gagan Bansal - title: Senior Researcher at Microsoft Research - url: https://www.linkedin.com/in/gagan-bansal/ - image_url: https://github.com/gagb.png - -jieyuz2: - name: Jieyu Zhang - title: PhD student at University of Washington - url: https://jieyuz2.github.io/ - image_url: https://github.com/jieyuz2.png - -julianakiseleva: - name: Julia Kiseleva - title: Senior Researcher at Microsoft Research - url: https://github.com/julianakiseleva/ - image_url: https://avatars.githubusercontent.com/u/5908392?v=4 - -narabzad: - name: Negar Arabzadeh - title: PhD student at the University of Waterloo - url: https://www.negara.me/ - image_url: https://github.com/Narabzad.png - -LinxinS97: - name: Linxin Song - title: MS student at Waseda University - url: https://linxins97.github.io/ - image_url: https://github.com/LinxinS97.png - -skzhang1: - name: Shaokun Zhang - title: PhD student at the Pennsylvania State University - url: https://github.com/skzhang1 - image_url: https://github.com/skzhang1.png - -olgavrou: - name: Olga Vrousgou - title: Senior Software Engineer at Microsoft Research - url: https://github.com/olgavrou/ - image_url: 
https://github.com/olgavrou.png - -joshkyh: - name: Joshua Kim - title: AI Freelancer at SpectData - url: https://github.com/joshkyh/ - image_url: https://github.com/joshkyh.png - -freedeaths: - name: Yishen Sun - title: Data Scientist at PingCAP LAB - url: https://github.com/freedeaths/ - image_url: https://github.com/freedeaths.png - -yifanzeng: - name: Yifan Zeng - title: PhD student at Oregon State University - url: https://xhmy.github.io/ +sonichi: + name: Chi Wang + title: Principal Researcher at Microsoft Research + url: https://www.linkedin.com/in/chi-wang-49b15b16/ + image_url: https://github.com/sonichi.png + +qingyunwu: + name: Qingyun Wu + title: Assistant Professor at the Pennsylvania State University + url: https://qingyun-wu.github.io/ + image_url: https://github.com/qingyun-wu.png + +yiranwu: + name: Yiran Wu + title: PhD student at Pennsylvania State University + url: https://github.com/kevin666aa + image_url: https://github.com/kevin666aa.png + +jialeliu: + name: Jiale Liu + title: Undergraduate student at Xidian University + url: https://leoljl.github.io + image_url: https://github.com/LeoLjl/leoljl.github.io/blob/main/profile.jpg?raw=true + +thinkall: + name: Li Jiang + title: Senior Software Engineer at Microsoft + url: https://github.com/thinkall + image_url: https://github.com/thinkall.png + +rickyloynd-microsoft: + name: Ricky Loynd + title: Senior Research Engineer at Microsoft + url: https://github.com/rickyloynd-microsoft + image_url: https://github.com/rickyloynd-microsoft.png + +samershi: + name: Saleema Amershi + title: Senior Principal Research Manager at Microsoft Research + url: https://github.com/samershi + image_url: https://github.com/samershi.png + +pcdeadeasy: + name: Piali Choudhury + title: Principal RSDE at Microsoft Research + url: https://github.com/pcdeadeasy + image_url: https://github.com/pcdeadeasy.png + +victordibia: + name: Victor Dibia + title: Principal RSDE at Microsoft Research + url: https://github.com/victordibia + image_url: https://github.com/victordibia.png + +afourney: + name: Adam Fourney + title: Principal Researcher Microsoft Research + url: https://www.adamfourney.com + image_url: https://github.com/afourney.png + +beibinli: + name: Beibin Li + title: Senior Research Engineer at Microsoft + url: https://github.com/beibinli + image_url: https://github.com/beibinli.png + +gagb: + name: Gagan Bansal + title: Senior Researcher at Microsoft Research + url: https://www.linkedin.com/in/gagan-bansal/ + image_url: https://github.com/gagb.png + +jieyuz2: + name: Jieyu Zhang + title: PhD student at University of Washington + url: https://jieyuz2.github.io/ + image_url: https://github.com/jieyuz2.png + +julianakiseleva: + name: Julia Kiseleva + title: Senior Researcher at Microsoft Research + url: https://github.com/julianakiseleva/ + image_url: https://avatars.githubusercontent.com/u/5908392?v=4 + +narabzad: + name: Negar Arabzadeh + title: PhD student at the University of Waterloo + url: https://www.negara.me/ + image_url: https://github.com/Narabzad.png + +LinxinS97: + name: Linxin Song + title: MS student at Waseda University + url: https://linxins97.github.io/ + image_url: https://github.com/LinxinS97.png + +skzhang1: + name: Shaokun Zhang + title: PhD student at the Pennsylvania State University + url: https://github.com/skzhang1 + image_url: https://github.com/skzhang1.png + +olgavrou: + name: Olga Vrousgou + title: Senior Software Engineer at Microsoft Research + url: https://github.com/olgavrou/ + image_url: 
https://github.com/olgavrou.png + +joshkyh: + name: Joshua Kim + title: AI Freelancer at SpectData + url: https://github.com/joshkyh/ + image_url: https://github.com/joshkyh.png + +freedeaths: + name: Yishen Sun + title: Data Scientist at PingCAP LAB + url: https://github.com/freedeaths/ + image_url: https://github.com/freedeaths.png + +yifanzeng: + name: Yifan Zeng + title: PhD student at Oregon State University + url: https://xhmy.github.io/ image_url: https://xhmy.github.io/assets/img/photo.JPG
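
The Usage Guide in the FSM Group Chat post above describes a "disallowed" mode for `speaker_transitions_type`, but the walkthrough only exercises the "allowed" mode. Below is a minimal sketch of the disallowed variant, reusing the agents defined in that walkthrough; the call to `visualize_speaker_transitions_dict` refers to the `autogen.graph_utils` helper mentioned in the notebook section, and its exact signature is assumed here.

```python
# Sketch only: assumes user_proxy, planner, engineer, executor and critic are
# defined exactly as in the walkthrough above.
from autogen.agentchat import GroupChat
from autogen.graph_utils import visualize_speaker_transitions_dict  # signature assumed

# List only the transitions we want to forbid; with
# speaker_transitions_type="disallowed", the disallowed edges are inverted
# into an allowed_speaker_transitions_dict internally.
disallowed_dict = {
    user_proxy: [engineer, executor, critic],  # User may only hand off to Planner
    executor: [planner, critic],               # Executor may only hand back to Engineer
}

group_chat = GroupChat(
    agents=[user_proxy, engineer, planner, executor, critic],
    messages=[],
    max_round=25,
    allowed_or_disallowed_speaker_transitions=disallowed_dict,
    allow_repeat_speaker=None,
    speaker_transitions_type="disallowed",
)

# Optional sanity check: draw the transition graph implied by the constraints
# (requires matplotlib/networkx).
visualize_speaker_transitions_dict(disallowed_dict, group_chat.agents)
```

This illustrates the parameter rather than reproducing the exact graph from the post; the disallowed form is convenient when it is easier to enumerate the forbidden edges than the permitted ones.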