From b3fba9734ebf7ce504b180fac1783442389b85bf Mon Sep 17 00:00:00 2001
From: Chi Wang
Date: Thu, 4 May 2023 19:48:31 -0700
Subject: [PATCH] Mark experimental classes; doc; multi-config trial (#1021)

* Mark experimental classes

* template

* multi model

* test

* multi-config doc

* doc

* doc

* test

---------

Co-authored-by: Li Jiang
---
 flaml/autogen/agent/agent.py               |  2 +-
 flaml/autogen/agent/coding_agent.py        |  2 +-
 flaml/autogen/agent/execution_agent.py     |  2 +-
 flaml/autogen/oai/completion.py            | 70 +++++++++++++++++--
 setup.py                                   |  1 +
 test/openai/test_completion.py             | 68 +++++++++++++++---
 website/docs/Use-Cases/Auto-Generation.md  | 47 +++++++++++--
 .../docs/Use-Cases/Task-Oriented-AutoML.md |  2 +-
 8 files changed, 170 insertions(+), 24 deletions(-)

diff --git a/flaml/autogen/agent/agent.py b/flaml/autogen/agent/agent.py
index 9eb4ee4114..be7743b6ce 100644
--- a/flaml/autogen/agent/agent.py
+++ b/flaml/autogen/agent/agent.py
@@ -2,7 +2,7 @@

 class Agent:
-    """An abstract class for AI agent.
+    """(Experimental) An abstract class for AI agents.

     An agent can communicate with other agents and humans, and perform actions.
     Different agents can differ in how and with whom they communicate, and what actions they can perform.
     For example, an autonomous agent can communicate with humans and other agents, and perform actions by creating agents and sending messages to other agents.
     A planning agent can communicate with other agents to make a plan and keep track of tasks.
     An execution agent can only communicate with other agents, and perform actions such as executing a command or code.
     """

diff --git a/flaml/autogen/agent/coding_agent.py b/flaml/autogen/agent/coding_agent.py
index 6cd51717a3..159b603ac5 100644
--- a/flaml/autogen/agent/coding_agent.py
+++ b/flaml/autogen/agent/coding_agent.py
@@ -5,7 +5,7 @@

 class PythonAgent(Agent):
-    """Suggest code blocks."""
+    """(Experimental) Suggest code blocks."""

     DEFAULT_SYSTEM_MESSAGE = """You are a coding agent. You suggest python code for a user to execute for a given task. Don't suggest shell command. Output the code in a coding block. Check the execution result. If the result indicates there is an error, fix the error and output the code again.
     """

diff --git a/flaml/autogen/agent/execution_agent.py b/flaml/autogen/agent/execution_agent.py
index cafae427f9..4cb9f92deb 100644
--- a/flaml/autogen/agent/execution_agent.py
+++ b/flaml/autogen/agent/execution_agent.py
@@ -3,7 +3,7 @@

 class ExecutionAgent(Agent):
-    """Perform actions based on instructions from other agents.
+    """(Experimental) Perform actions based on instructions from other agents.

     An execution agent can only communicate with other agents, and perform actions such as executing a command or code.
     """

diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py
index 7cc7be9d6c..4cb5edfa7e 100644
--- a/flaml/autogen/oai/completion.py
+++ b/flaml/autogen/oai/completion.py
@@ -164,11 +164,17 @@ def _book_keeping(cls, config: Dict, response):
         cls._count_create += 1

     @classmethod
-    def _get_response(cls, config: dict, eval_only=False, use_cache=True):
+    def _get_response(cls, config: Dict, eval_only=False, use_cache=True):
         """Get the response from the openai api call.

         Try cache first. If not found, call the openai api. If the api call fails, retry after retry_time.
""" + config = config.copy() + openai.api_key = config.pop("api_key", openai.api_key) + openai.api_base = config.pop("api_base", openai.api_base) + openai.api_key_path = config.pop("api_key_path", openai.api_key_path) + openai.api_type = config.pop("api_type", openai.api_type) + openai.api_version = config.pop("api_version", openai.api_version) key = get_key(config) if use_cache: response = cls._cache.get(key, None) @@ -222,7 +228,6 @@ def _get_response(cls, config: dict, eval_only=False, use_cache=True): except InvalidRequestError: if "azure" == openai.api_type and "model" in config: # azure api uses "engine" instead of "model" - config = config.copy() config["engine"] = config.pop("model").replace("gpt-3.5-turbo", "gpt-35-turbo") else: raise @@ -671,16 +676,56 @@ def eval_func(responses, **data): return params, analysis @classmethod - def create(cls, context: Optional[Dict] = None, use_cache: Optional[bool] = True, **config): + def create( + cls, + context: Optional[Dict] = None, + use_cache: Optional[bool] = True, + config_list: Optional[List] = None, + **config, + ): """Make a completion for a given context. Args: - context (dict, Optional): The context to instantiate the prompt. + context (Dict, Optional): The context to instantiate the prompt. It needs to contain keys that are used by the prompt template. E.g., `prompt="Complete the following sentence: {prefix}, context={"prefix": "Today I feel"}`. - The actual prompt sent to OpenAI will be: + The actual prompt will be: "Complete the following sentence: Today I feel". + More examples can be found at [templating](/docs/Use-Cases/Auto-Generation#templating). use_cache (bool, Optional): Whether to use cached responses. + config_list (List, Optional): List of configurations for the completion to try. + The first one that does not raise an error will be used. + Only the differences from the default config need to be provided. + E.g., + + ```python + response = oai.Completion.create( + config_list=[ + { + "model": "gpt-4", + "api_key": os.environ.get("AZURE_OPENAI_API_KEY"), + "api_type": "azure", + "api_base": os.environ.get("AZURE_OPENAI_API_BASE"), + "api_version": "2023-03-15-preview", + }, + { + "model": "gpt-3.5-turbo", + "api_key": os.environ.get("OPENAI_API_KEY"), + "api_type": "open_ai", + "api_base": "https://api.openai.com/v1", + "api_version": None, + }, + { + "model": "llama-7B", + "api_base": "http://127.0.0.1:8080", + "api_type": "open_ai", + "api_version": None, + } + ], + prompt="Hi", + ) + ``` + **config: Configuration for the completion. Besides the parameters for the openai API call, it can also contain a seed (int) for the cache. This is useful when implementing "controlled randomness" for the completion. 
@@ -691,6 +736,21 @@ def create(cls, context: Optional[Dict] = None, use_cache: Optional[bool] = True
         """
         if ERROR:
             raise ERROR
+        if config_list:
+            retry_timeout = cls.retry_timeout
+            for i, each_config in enumerate(config_list):
+                base_config = config.copy()
+                base_config.update(each_config)
+                try:
+                    cls.retry_timeout = 0 if i < len(config_list) - 1 else retry_timeout
+                    # retry_timeout = 0 to avoid retrying
+                    return cls.create(context, use_cache, **base_config)
+                except (RateLimitError, Timeout):
+                    logger.info(f"failed with config {i}", exc_info=1)
+                    if i == len(config_list) - 1:
+                        raise
+                finally:
+                    cls.retry_timeout = retry_timeout
         params = cls._construct_params(context, config)
         if not use_cache:
             return cls._get_response(params, eval_only=True, use_cache=False)
diff --git a/setup.py b/setup.py
index ac55422644..5e59da800e 100644
--- a/setup.py
+++ b/setup.py
@@ -76,6 +76,7 @@
         "nbformat",
         "ipykernel",
         "pytorch-lightning<1.9.1",  # test_forecast_panel
+        "requests<2.29.0",  # https://github.com/docker/docker-py/issues/3113
     ],
     "catboost": ["catboost>=0.26"],
     "blendsearch": ["optuna==2.8.0"],
diff --git a/test/openai/test_completion.py b/test/openai/test_completion.py
index fc5f5558cc..d8c557165e 100644
--- a/test/openai/test_completion.py
+++ b/test/openai/test_completion.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 from functools import partial
+import os
 from flaml import oai
 from flaml.autogen.code_utils import (
     eval_function_completions,
@@ -17,6 +18,48 @@
 from flaml.autogen.math_utils import eval_math_responses, solve_problem


+def test_multi_model():
+    try:
+        import openai
+    except ImportError as exc:
+        print(exc)
+        return
+    response = oai.Completion.create(
+        config_list=[
+            {
+                "model": "gpt-4",
+                "api_key": os.environ.get("OPENAI_API_KEY"),
+                "api_type": "open_ai",
+                "api_base": "https://api.openai.com/v1",
+                "api_version": None,
+            },
+            {
+                "model": "gpt-4",
+                "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
+                "api_type": "azure",
+                "api_base": os.environ.get("AZURE_OPENAI_API_BASE"),
+                "api_version": "2023-03-15-preview",
+            },
+            {
+                "model": "gpt-3.5-turbo",
+                "api_key": os.environ.get("OPENAI_API_KEY"),
+                "api_type": "open_ai",
+                "api_base": "https://api.openai.com/v1",
+                "api_version": None,
+            },
+            {
+                "model": "gpt-3.5-turbo",
+                "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
+                "api_type": "azure",
+                "api_base": os.environ.get("AZURE_OPENAI_API_BASE"),
+                "api_version": "2023-03-15-preview",
+            },
+        ],
+        prompt="Hi",
+    )
+    print(response)
+
+
 @pytest.mark.skipif(
     sys.platform in ["darwin", "win32"],
     reason="do not run on MacOS or windows",
@@ -239,6 +282,13 @@ def test_humaneval(num_samples=1):


 def test_math(num_samples=-1):
+    try:
+        import openai
+        import diskcache
+    except ImportError as exc:
+        print(exc)
+        return
+
     seed = 41
     data = datasets.load_dataset("competition_math")
     train_data = data["train"].shuffle(seed=seed)
@@ -271,13 +321,6 @@
         % data["problem"]
     ]

-    try:
-        import openai
-        import diskcache
-    except ImportError as exc:
-        print(exc)
-        return
-
     oai.ChatCompletion.set_cache(seed)
     vanilla_config = {
         "model": "gpt-3.5-turbo",
@@ -341,11 +384,14 @@ def my_average(results):


 if __name__ == "__main__":
-    # import openai
+    import openai

-    # openai.api_key_path = "test/openai/key.txt"
-    test_execute_code()
-    # test_improve()
+    openai.api_key = os.environ["OPENAI_API_KEY"] = open("test/openai/key.txt").read().strip()
+    os.environ["AZURE_OPENAI_API_KEY"] = open("test/openai/key_azure.txt").read().strip()
+    os.environ["AZURE_OPENAI_API_BASE"] = open("test/openai/base_azure.txt").read().strip()
+    # test_multi_model()
+    # test_execute_code()
+    test_improve()
     # test_nocontext()
     # test_humaneval(1)
     # test_math(1)
diff --git a/website/docs/Use-Cases/Auto-Generation.md b/website/docs/Use-Cases/Auto-Generation.md
index e8e42932b8..cb82ff45db 100644
--- a/website/docs/Use-Cases/Auto-Generation.md
+++ b/website/docs/Use-Cases/Auto-Generation.md
@@ -98,16 +98,19 @@ config, analysis = oai.Completion.tune(

 `num_samples` is the number of configurations to sample. -1 means unlimited (until optimization budget is exhausted).
 The returned `config` contains the optimized configuration and `analysis` contains an [ExperimentAnalysis](../reference/tune/analysis#experimentanalysis-objects) object for all the tried configurations and results.

-## Perform inference with the tuned config
+The tuned config can be used to perform inference.
+
+## Perform Inference

 One can use [`flaml.oai.Completion.create`](../reference/autogen/oai/completion#create) to perform inference.
 There are a number of benefits of using `flaml.oai.Completion.create` to perform inference.

-
 ### API unification

 `flaml.oai.Completion.create` is compatible with both `openai.Completion.create` and `openai.ChatCompletion.create`, and both OpenAI API and Azure OpenAI API. So models such as "text-davinci-003", "gpt-3.5-turbo" and "gpt-4" can share a common API.
 When only tuning the chat-based models, `flaml.oai.ChatCompletion` can be used.

+For local LLMs, one can spin up an endpoint using a package like [simple_ai_server](https://github.com/lhenault/simpleAI), and then use the same API to send a request.
+
 ### Caching

 API call results are cached locally and reused when the same request is issued. This is useful when repeating or continuing experiments for reproducibility and cost saving. It still allows controlled randomness by setting the "seed", using [`set_cache`](../reference/autogen/oai/completion#set_cache) or specifying in `create()`.

@@ -116,12 +119,48 @@

 It is easy to hit errors when calling OpenAI APIs, due to connection, rate limit, or timeout. Some of the errors are transient. `flaml.oai.Completion.create` deals with the transient errors and retries automatically. Initial request timeout, retry timeout and retry time interval can be configured via `flaml.oai.request_timeout`, `flaml.oai.retry_timeout` and `flaml.oai.retry_time`.

+Moreover, one can pass a list of configurations for different models/endpoints to mitigate the rate limits. For example,
+
+```python
+response = oai.Completion.create(
+    config_list=[
+        {
+            "model": "gpt-4",
+            "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
+            "api_type": "azure",
+            "api_base": os.environ.get("AZURE_OPENAI_API_BASE"),
+            "api_version": "2023-03-15-preview",
+        },
+        {
+            "model": "gpt-3.5-turbo",
+            "api_key": os.environ.get("OPENAI_API_KEY"),
+            "api_type": "open_ai",
+            "api_base": "https://api.openai.com/v1",
+            "api_version": None,
+        },
+        {
+            "model": "llama-7B",
+            "api_base": "http://127.0.0.1:8080",
+            "api_type": "open_ai",
+            "api_version": None,
+        }
+    ],
+    prompt="Hi",
+)
+```
+
+It will try querying Azure OpenAI gpt-4, OpenAI gpt-3.5-turbo, and llama-7B one by one until a valid result is returned. This can speed up the development process when the rate limit is a bottleneck.
+
 ### Templating

 If the provided prompt or message is a template, it will be automatically materialized with a given context.
For example,

 ```python
-response = oai.Completion.create(problme=problem, prompt="{problem} Solve the problem carefully.", **config)
+response = oai.Completion.create(
+    context={"problem": "How many positive integers, not exceeding 100, are multiples of 2 or 3 but not 4?"},
+    prompt="{problem} Solve the problem carefully.",
+    **config
+)
 ```

 A template is either a format str, like the example above, or a function which produces a str from several input fields, like the example below.

@@ -291,7 +330,7 @@

 Set `compact=False` in `start_logging()` to switch.
 It can be seen that the individual API call history contains redundant information about the conversation. For a long conversation the degree of redundancy is high.
 The compact history is more efficient and the individual API call history contains more details.

-## Other utilities
+## Other Utilities

 ### Completion

diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md
index 28b5ce255e..ca9367244f 100644
--- a/website/docs/Use-Cases/Task-Oriented-AutoML.md
+++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md
@@ -515,7 +515,7 @@ automl_settings = {

 automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
 ```

-## Retrieve and analyze the outcomes of AutoML.fit()
+## Retrieve the Outcomes

 ### Get best model
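The templating hunk above mentions a function-based template ("the example below") without showing it in full. Here is a hedged sketch of one, assuming a callable passed as `prompt` receives the context dict and returns the final prompt string, as the templating section describes; the field names `problem` and `hint` are illustrative assumptions, not taken from the patch.

```python
from flaml import oai

def my_prompt(context):
    # Assemble the prompt from several context fields (field names assumed).
    return f"{context['hint']}\n{context['problem']} Solve the problem carefully."

# A callable template is materialized with the given context, like a format str.
response = oai.Completion.create(
    context={"problem": "What is 2 + 2?", "hint": "Think step by step."},
    prompt=my_prompt,
)
```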
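Relatedly, the error-handling section names the request timeout, retry timeout, and retry time interval as configurable. A minimal sketch of adjusting them, set here on the `Completion` class whose `retry_timeout` attribute the fallback loop in this patch manipulates (the attribute location and values are the editor's assumptions, not recommendations):

```python
from flaml import oai

# Illustrative values only; defaults may differ across FLAML versions.
oai.Completion.request_timeout = 60  # seconds allowed for a single API request
oai.Completion.retry_timeout = 120   # total seconds to keep retrying transient errors
oai.Completion.retry_time = 10       # seconds to wait between consecutive retries
```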