diff --git a/.github/workflows/openai.yml b/.github/workflows/openai.yml index 6c4cbfdc7b2f..d0027a5b8bee 100644 --- a/.github/workflows/openai.yml +++ b/.github/workflows/openai.yml @@ -10,10 +10,11 @@ on: - 'flaml/integrations/oai/**' - 'test/openai/**' - 'notebook/integrate_openai.ipynb' + - 'notebook/integrate_chatgpt_math.ipynb' - '.github/workflows/openai.yml' jobs: - build: + test: strategy: matrix: os: [ubuntu-latest] diff --git a/.gitignore b/.gitignore index 641a5d3ee3d1..a9c35a1a1d15 100644 --- a/.gitignore +++ b/.gitignore @@ -159,6 +159,6 @@ automl.pkl test/nlp/testtmp.py test/nlp/testtmpfl.py - +output/ flaml/tune/spark/mylearner.py *.pkl diff --git a/README.md b/README.md index 80d7ef13c840..b465b2967fe5 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@

-:fire: OpenAI GPT-3 models support in v1.1.3. ChatGPT support is coming. +:fire: OpenAI GPT-3 models support in v1.1.3. ChatGPT and GPT-4 support will be added in v1.2.0. :fire: A [lab forum](https://github.com/microsoft/FLAML/tree/tutorial-aaai23/tutorial) on FLAML at AAAI 2023. diff --git a/flaml/integrations/oai/completion.py b/flaml/integrations/oai/completion.py index ef8bd8b5beb8..0a7ce3d40c1c 100644 --- a/flaml/integrations/oai/completion.py +++ b/flaml/integrations/oai/completion.py @@ -1,6 +1,7 @@ from time import sleep import logging import numpy as np +import time from flaml import tune, BlendSearch try: @@ -11,9 +12,9 @@ APIError, InvalidRequestError, APIConnectionError, + Timeout, ) import diskcache - from urllib3.exceptions import ReadTimeoutError ERROR = None except ImportError: @@ -46,7 +47,14 @@ class Completion: """ # set of models that support chat completion - chat_models = {"gpt-3.5-turbo"} + chat_models = { + "gpt-3.5-turbo", + "gpt-3.5-turbo-0301", + "gpt-4", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-0314", + } # price per 1k tokens price1K = { @@ -58,10 +66,23 @@ class Completion: "text-davinci-002": 0.02, "text-davinci-003": 0.02, "gpt-3.5-turbo": 0.002, + "gpt-3.5-turbo-0301": 0.002, + "gpt-4": (0.03, 0.06), + "gpt-4-0314": (0.03, 0.06), + "gpt-4-32k": (0.06, 0.12), + "gpt-4-32k-0314": (0.06, 0.12), } default_search_space = { - "model": tune.choice(list(price1K.keys())), + "model": tune.choice( + [ + "text-ada-001", + "text-babbage-001", + "text-davinci-003", + "gpt-3.5-turbo", + "gpt-4", + ] + ), "temperature_or_top_p": tune.choice( [ {"temperature": tune.uniform(0, 1)}, @@ -107,13 +128,13 @@ def _get_response(cls, config: dict, eval_only=False): if response is not None and (response != -1 or not eval_only): # print("using cached response") return response - retry = 0 openai_completion = ( openai.ChatCompletion if config["model"] in cls.chat_models else openai.Completion ) - while eval_only or retry * cls.retry_time < cls.retry_timeout: + start_time = time.time() + while True: try: response = openai_completion.create(**config) cls._cache.set(key, response) @@ -122,21 +143,26 @@ def _get_response(cls, config: dict, eval_only=False): ServiceUnavailableError, APIError, APIConnectionError, - ReadTimeoutError, ): + # transient error logger.warning(f"retrying in {cls.retry_time} seconds...", exc_info=1) sleep(cls.retry_time) - except RateLimitError: - logger.info(f"retrying in {cls.retry_time} seconds...", exc_info=1) - retry += 1 + except (RateLimitError, Timeout): + # retry after retry_time seconds + if time.time() - start_time + cls.retry_time < cls.retry_timeout: + logger.info(f"retrying in {cls.retry_time} seconds...", exc_info=1) + elif not eval_only: + break + sleep(cls.retry_time) except InvalidRequestError: - if "model" in config: + if "azure" == openai.api_type and "model" in config: + # azure api uses "engine" instead of "model" config = config.copy() config["engine"] = config.pop("model") else: raise logger.warning( - f"Failed to get response from openai api due to getting RateLimitError for {cls.retry_timeout} seconds." + f"Failed to get response from openai api due to getting RateLimitError or Timeout for {cls.retry_timeout} seconds." 
) response = -1 cls._cache.set(key, response) @@ -205,16 +231,18 @@ def eval(cls, config: dict, prune=True, eval_only=False): data = cls.data model = config["model"] data_length = len(data) - target_n_tokens = getattr(cls, "inference_budget", None) and ( - 1000 * cls.inference_budget / cls.price1K[model] - if cls.inference_budget and cls.price1K.get(model) - else None + price = cls.price1K.get(model) + price_input, price_output = ( + price if isinstance(price, tuple) else (price, price) ) + inference_budget = getattr(cls, "inference_budget", None) prune_hp = getattr(cls, "_prune_hp", "n") metric = cls._metric config_n = config.get(prune_hp, 1) # default value in OpenAI is 1 - max_tokens = config.get("max_tokens", 16) # default value in OpenAI is 16 - region_key = cls._get_region_key(config) + max_tokens = config.get( + "max_tokens", np.inf if model in cls.chat_models else 16 + ) + # default value in OpenAI if model in cls.chat_models: # either "prompt" should be in config (for being compatible with non-chat models) # or "messages" should be in config (for tuning chat models only) @@ -231,17 +259,23 @@ def eval(cls, config: dict, prune=True, eval_only=False): else: prompt = cls._prompts[config["prompt"]] stop = cls._stops and cls._stops[config["stop"]] - if prune and target_n_tokens: + target_output_tokens = None + if not cls.avg_input_tokens: + input_tokens = [None] * data_length + prune = prune and inference_budget and not eval_only + if prune: + region_key = cls._get_region_key(config) max_valid_n = cls._get_max_valid_n(region_key, max_tokens) if cls.avg_input_tokens: + target_output_tokens = ( + inference_budget * 1000 - cls.avg_input_tokens * price_input + ) / price_output # max_tokens bounds the maximum tokens # so using it we can calculate a valid n according to the avg # input tokens max_valid_n = max( max_valid_n, - int((target_n_tokens - cls.avg_input_tokens) // max_tokens), + int(target_output_tokens // max_tokens), ) - else: - input_tokens = [None] * data_length if config_n <= max_valid_n: start_n = config_n else: @@ -316,24 +350,15 @@ def eval(cls, config: dict, prune=True, eval_only=False): if model in cls.chat_models else [r["text"].rstrip() for r in response["choices"]] ) - n_tokens = ( - response["usage"]["completion_tokens"] - if previous_num_completions - else response["usage"]["total_tokens"] - ) - if ( - prune - and target_n_tokens - and not cls.avg_input_tokens - and not input_tokens[i] - ): + usage = response["usage"] + n_input_tokens = usage["prompt_tokens"] + n_output_tokens = usage.get("completion_tokens", 0) + if not cls.avg_input_tokens and not input_tokens[i]: # store the # input tokens - input_tokens[i] = response["usage"]["prompt_tokens"] - # Under Assumption 1, we should count both the input and output tokens in the first query, - # and only count ouput tokens afterwards + input_tokens[i] = n_input_tokens query_cost = ( - response["usage"]["total_tokens"] * cls.price1K[model] / 1000 - ) + price_input * n_input_tokens + price_output * n_output_tokens + ) / 1000 cls._total_cost += query_cost cost += query_cost if ( @@ -348,12 +373,12 @@ def eval(cls, config: dict, prune=True, eval_only=False): "cost": cost, } if previous_num_completions: - n_tokens_list[i] += n_tokens + n_tokens_list[i] += n_output_tokens responses_list[i].extend(responses) # Assumption 1: assuming requesting n1, n2 responses separatively then combining them # is the same as requesting (n1+n2) responses together else: - n_tokens_list.append(n_tokens) + n_tokens_list.append(n_output_tokens) 
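# Editorial aside (illustrative comments, not lines from this diff): with the (input, output)
# per-1K pricing introduced above, the per-query cost computed here is
#     (price_input * n_input_tokens + price_output * n_output_tokens) / 1000.
# For example, a "gpt-4" call priced at (0.03, 0.06) that uses 500 prompt tokens and
# 200 completion tokens costs (0.03 * 500 + 0.06 * 200) / 1000 = $0.027.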
responses_list.append(responses) avg_n_tokens = np.mean(n_tokens_list[:data_limit]) rho = ( @@ -364,8 +389,8 @@ def eval(cls, config: dict, prune=True, eval_only=False): # Hoeffding-Serfling bound ratio = 0.1 * np.sqrt(rho / data_limit) if ( - target_n_tokens - and avg_n_tokens > target_n_tokens * (1 + ratio) + target_output_tokens + and avg_n_tokens > target_output_tokens * (1 + ratio) and not eval_only ): cls._update_invalid_n( @@ -377,8 +402,8 @@ def eval(cls, config: dict, prune=True, eval_only=False): return result if ( prune - and target_n_tokens - and avg_n_tokens <= target_n_tokens * (1 - ratio) + and target_output_tokens + and avg_n_tokens <= target_output_tokens * (1 - ratio) and ( num_completions < config_n or num_completions == config_n @@ -410,16 +435,24 @@ def eval(cls, config: dict, prune=True, eval_only=False): metrics = cls._eval_func(responses, **data_i) if result: for key, value in metrics.items(): - result[key] += value + if isinstance(value, (float, int)): + result[key] += value else: result = metrics for key in result.keys(): - result[key] /= data_limit + if isinstance(result[key], (float, int)): + result[key] /= data_limit result["total_cost"] = cls._total_cost result["cost"] = cost - result["inference_cost"] = avg_n_tokens * cls.price1K[model] / 1000 - if prune and target_n_tokens and not cls.avg_input_tokens: + if not cls.avg_input_tokens: cls.avg_input_tokens = np.mean(input_tokens) + if prune: + target_output_tokens = ( + inference_budget * 1000 - cls.avg_input_tokens * price_input + ) / price_output + result["inference_cost"] = ( + avg_n_tokens * price_output + cls.avg_input_tokens * price_input + ) / 1000 break else: if data_early_stop: @@ -559,11 +592,12 @@ def eval_func(responses, **data): mode=mode, space=space, ) - if len(space["model"]) > 1: + space_model = space["model"] + if not isinstance(space_model, str) and len(space_model) > 1: # start all the models with the same hp config config0 = search_alg.suggest("t0") points_to_evaluate = [config0] - for model in space["model"]: + for model in space_model: if model != config0["model"]: point = config0.copy() point["model"] = model @@ -652,8 +686,13 @@ class ChatCompletion(Completion): price1K = { "gpt-3.5-turbo": 0.002, + "gpt-3.5-turbo-0301": 0.002, + "gpt-4": (0.03, 0.06), + "gpt-4-0314": (0.03, 0.06), + "gpt-4-32k": (0.06, 0.12), + "gpt-4-32k-0314": (0.06, 0.12), } default_search_space = Completion.default_search_space.copy() - default_search_space["model"] = tune.choice(list(price1K.keys())) + default_search_space["model"] = tune.choice(["gpt-3.5-turbo", "gpt-4"]) openai_completion_class = not ERROR and openai.ChatCompletion diff --git a/notebook/integrate_chatgpt.ipynb b/notebook/integrate_chatgpt.ipynb new file mode 100644 index 000000000000..5495b37c7883 --- /dev/null +++ b/notebook/integrate_chatgpt.ipynb @@ -0,0 +1,1797 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "\n", + "Licensed under the MIT License.\n", + "\n", + "# Use FLAML to Tune ChatGPT\n", + "\n", + "FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of LLMs.\n", + "\n", + "In this notebook, we tune OpenAI ChatGPT (both GPT-3.5 and GPT-4) models for math problem solving. 
We use [the MATH benchmark](https://crfm.stanford.edu/helm/latest/?group=math_chain_of_thought) for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning. \n", + "\n", + "## Requirements\n", + "\n", + "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the [openai] option:\n", + "```bash\n", + "pip install flaml[openai]==1.2.0\n", + "```" + ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.317406Z", + "iopub.status.busy": "2023-02-13T23:40:52.316561Z", + "iopub.status.idle": "2023-02-13T23:40:52.321193Z", + "shell.execute_reply": "2023-02-13T23:40:52.320628Z" + } + }, + "outputs": [], + "source": [ + "# %pip install flaml[openai]==1.2.0 datasets" + ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set your OpenAI key:" + ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.324240Z", + "iopub.status.busy": "2023-02-13T23:40:52.323783Z", + "iopub.status.idle": "2023-02-13T23:40:52.330570Z", + "shell.execute_reply": "2023-02-13T23:40:52.329750Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment the following to use Azure OpenAI:" + ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.333547Z", + "iopub.status.busy": "2023-02-13T23:40:52.333249Z", + "iopub.status.idle": "2023-02-13T23:40:52.336508Z", + "shell.execute_reply": "2023-02-13T23:40:52.335858Z" + } + }, + "outputs": [], + "source": [ + "# openai.api_type = \"azure\"\n", + "# openai.api_base = \"https://.openai.azure.com/\"\n", + "# openai.api_version = \"2022-12-01\"" + ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset\n", + "\n", + "First, we load the competition_math dataset. The dataset contains 201 \"Level 2\" Algebra examples. We use a random sample of 20 examples for tuning the generation hyperparameters and the rest for evaluation. We use one demonstration example in the prompt."
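The loading and splitting done in the next cell amount to roughly the following sketch (the `seed` value, variable names, and the per-split filtering here are illustrative assumptions, not the notebook's exact code):

```python
import datasets

seed = 41  # an assumed seed; the same seed is reused by oai.ChatCompletion.set_cache below
data = datasets.load_dataset("competition_math")
train_data = data["train"].shuffle(seed=seed)
test_data = data["test"].shuffle(seed=seed)
n_tune_data = 20
# keep only "Level 2" Algebra problems and take 20 shuffled examples for tuning
tune_data = [
    {"problem": x["problem"], "solution": x["solution"]}
    for x in train_data
    if x["level"] == "Level 2" and x["type"] == "Algebra"
][:n_tune_data]
# the remaining "Level 2" Algebra examples are kept for evaluation
test_data = [
    {"problem": x["problem"], "solution": x["solution"]}
    for x in test_data
    if x["level"] == "Level 2" and x["type"] == "Algebra"
]
```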
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.339977Z", + "iopub.status.busy": "2023-02-13T23:40:52.339556Z", + "iopub.status.idle": "2023-02-13T23:40:54.603349Z", + "shell.execute_reply": "2023-02-13T23:40:54.602630Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using custom data configuration default\n", + "Found cached dataset competition_math (/home/vscode/.cache/huggingface/datasets/competition_math/default/1.0.0/2a2a2995c2847186883ecd64f69be7d602b8a6f6b51950624d4dc2263f93333b)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8358c4bf9cc44b99916c9b6cb1e3a279", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00 Optional[str]:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Extract the text within a \\\\boxed{...} environment.\n", + " Example:\n", + " >>> remove_boxed(\\\\boxed{\\\\frac{2}{3}})\n", + " \\\\frac{2}{3}\n", + " \"\"\"\n", + " left = \"\\\\boxed{\"\n", + " try:\n", + " assert string[: len(left)] == left\n", + " assert string[-1] == \"}\"\n", + " return string[len(left) : -1]\n", + " except Exception:\n", + " return None\n", + "\n", + "\n", + "def last_boxed_only_string(string: str) -> Optional[str]:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Extract the last \\\\boxed{...} or \\\\fbox{...} element from a string.\n", + " \"\"\"\n", + " idx = string.rfind(\"\\\\boxed\")\n", + " if idx < 0:\n", + " idx = string.rfind(\"\\\\fbox\")\n", + " if idx < 0:\n", + " return None\n", + "\n", + " i = idx\n", + " right_brace_idx = None\n", + " num_left_braces_open = 0\n", + " while i < len(string):\n", + " if string[i] == \"{\":\n", + " num_left_braces_open += 1\n", + " if string[i] == \"}\":\n", + " num_left_braces_open -= 1\n", + " if num_left_braces_open == 0:\n", + " right_brace_idx = i\n", + " break\n", + " i += 1\n", + "\n", + " if right_brace_idx is None:\n", + " retval = None\n", + " else:\n", + " retval = string[idx : right_brace_idx + 1]\n", + "\n", + " return retval\n", + "\n", + "\n", + "def _fix_fracs(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Reformat fractions.\n", + " Examples:\n", + " >>> _fix_fracs(\"\\\\frac1b\")\n", + " \\frac{1}{b}\n", + " >>> _fix_fracs(\"\\\\frac12\")\n", + " \\frac{1}{2}\n", + " >>> _fix_fracs(\"\\\\frac1{72}\")\n", + " \\frac{1}{72}\n", + " \"\"\"\n", + " substrs = string.split(\"\\\\frac\")\n", + " new_str = substrs[0]\n", + " if len(substrs) > 1:\n", + " substrs = substrs[1:]\n", + " for substr in substrs:\n", + " new_str += \"\\\\frac\"\n", + " if substr[0] == \"{\":\n", + " new_str += substr\n", + " else:\n", + " try:\n", + " assert len(substr) >= 2\n", + " except Exception:\n", + " return string\n", + " a = substr[0]\n", + " b = substr[1]\n", + " if b != \"{\":\n", + " if len(substr) > 2:\n", + " post_substr = substr[2:]\n", + " new_str += \"{\" + a + \"}{\" + b + \"}\" + post_substr\n", + " else:\n", + " new_str += \"{\" + a + \"}{\" + b + \"}\"\n", + " else:\n", + " if len(substr) > 2:\n", + " post_substr = substr[2:]\n", + " new_str += \"{\" + a + \"}\" + b + post_substr\n", + " else:\n", + " new_str += \"{\" + a + \"}\" + b\n", + " string = new_str\n", + " return string\n", + "\n", + "\n", + "def _fix_a_slash_b(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Reformat fractions formatted as a/b to 
\\\\frac{a}{b}.\n", + " Example:\n", + " >>> _fix_a_slash_b(\"2/3\")\n", + " \\frac{2}{3}\n", + " \"\"\"\n", + " if len(string.split(\"/\")) != 2:\n", + " return string\n", + " a_str = string.split(\"/\")[0]\n", + " b_str = string.split(\"/\")[1]\n", + " try:\n", + " a = int(a_str)\n", + " b = int(b_str)\n", + " assert string == \"{}/{}\".format(a, b)\n", + " new_string = \"\\\\frac{\" + str(a) + \"}{\" + str(b) + \"}\"\n", + " return new_string\n", + " except Exception:\n", + " return string\n", + "\n", + "\n", + "def _remove_right_units(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Remove units (on the right).\n", + " \"\\\\text{ \" only ever occurs (at least in the val set) when describing units.\n", + " \"\"\"\n", + " if \"\\\\text{ \" in string:\n", + " splits = string.split(\"\\\\text{ \")\n", + " assert len(splits) == 2\n", + " return splits[0]\n", + " else:\n", + " return string\n", + "\n", + "\n", + "def _fix_sqrt(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Reformat square roots.\n", + " Example:\n", + " >>> _fix_sqrt(\"\\\\sqrt3\")\n", + " \\sqrt{3}\n", + " \"\"\"\n", + " if \"\\\\sqrt\" not in string:\n", + " return string\n", + " splits = string.split(\"\\\\sqrt\")\n", + " new_string = splits[0]\n", + " for split in splits[1:]:\n", + " if split[0] != \"{\":\n", + " a = split[0]\n", + " new_substr = \"\\\\sqrt{\" + a + \"}\" + split[1:]\n", + " else:\n", + " new_substr = \"\\\\sqrt\" + split\n", + " new_string += new_substr\n", + " return new_string\n", + "\n", + "\n", + "def _strip_string(string: str) -> str:\n", + " \"\"\"Source: https://github.com/hendrycks/math\n", + " Apply the reformatting helper functions above.\n", + " \"\"\"\n", + " # linebreaks\n", + " string = string.replace(\"\\n\", \"\")\n", + " # print(string)\n", + "\n", + " # remove inverse spaces\n", + " string = string.replace(\"\\\\!\", \"\")\n", + " # print(string)\n", + "\n", + " # replace \\\\ with \\\n", + " string = string.replace(\"\\\\\\\\\", \"\\\\\")\n", + " # print(string)\n", + "\n", + " # replace tfrac and dfrac with frac\n", + " string = string.replace(\"tfrac\", \"frac\")\n", + " string = string.replace(\"dfrac\", \"frac\")\n", + " # print(string)\n", + "\n", + " # remove \\left and \\right\n", + " string = string.replace(\"\\\\left\", \"\")\n", + " string = string.replace(\"\\\\right\", \"\")\n", + " # print(string)\n", + "\n", + " # Remove circ (degrees)\n", + " string = string.replace(\"^{\\\\circ}\", \"\")\n", + " string = string.replace(\"^\\\\circ\", \"\")\n", + "\n", + " # remove dollar signs\n", + " string = string.replace(\"\\\\$\", \"\")\n", + "\n", + " # remove units (on the right)\n", + " string = _remove_right_units(string)\n", + "\n", + " # remove percentage\n", + " string = string.replace(\"\\\\%\", \"\")\n", + " string = string.replace(\"\\%\", \"\")\n", + "\n", + " # \" 0.\" equivalent to \" .\" and \"{0.\" equivalent to \"{.\" Alternatively, add \"0\" if \".\" is the start of the string\n", + " string = string.replace(\" .\", \" 0.\")\n", + " string = string.replace(\"{.\", \"{0.\")\n", + " # if empty, return empty string\n", + " if len(string) == 0:\n", + " return string\n", + " if string[0] == \".\":\n", + " string = \"0\" + string\n", + "\n", + " # to consider: get rid of e.g. 
\"k = \" or \"q = \" at beginning\n", + " if len(string.split(\"=\")) == 2:\n", + " if len(string.split(\"=\")[0]) <= 2:\n", + " string = string.split(\"=\")[1]\n", + "\n", + " # fix sqrt3 --> sqrt{3}\n", + " string = _fix_sqrt(string)\n", + "\n", + " # remove spaces\n", + " string = string.replace(\" \", \"\")\n", + "\n", + " # \\frac1b or \\frac12 --> \\frac{1}{b} and \\frac{1}{2}, etc.\n", + " # Even works with \\frac1{72} (but not \\frac{72}1).\n", + " # Also does a/b --> \\\\frac{a}{b}\n", + " string = _fix_fracs(string)\n", + "\n", + " # manually change 0.5 --> \\frac{1}{2}\n", + " if string == \"0.5\":\n", + " string = \"\\\\frac{1}{2}\"\n", + "\n", + " # NOTE: X/Y changed to \\frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y\n", + " string = _fix_a_slash_b(string)\n", + "\n", + " return string\n", + "\n", + "\n", + "def get_answer(solution: Optional[str]) -> Optional[str]:\n", + " if solution is None:\n", + " return None\n", + " last_boxed = last_boxed_only_string(solution)\n", + " if last_boxed is None:\n", + " return None\n", + " answer = remove_boxed(last_boxed)\n", + " if answer is None:\n", + " return None\n", + " return answer\n", + "\n", + "\n", + "def is_equiv(str1: Optional[str], str2: Optional[str]) -> float:\n", + " \"\"\"Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in\n", + " - units\n", + " - fractions\n", + " - square roots\n", + " - superfluous LaTeX.\n", + " Source: https://github.com/hendrycks/math\n", + " \"\"\"\n", + " if str1 is None and str2 is None:\n", + " print(\"WARNING: Both None\")\n", + " return 1.0\n", + " if str1 is None or str2 is None:\n", + " return 0.0\n", + "\n", + " try:\n", + " ss1 = _strip_string(str1)\n", + " ss2 = _strip_string(str2)\n", + " return float(ss1 == ss2)\n", + " except Exception:\n", + " return float(str1 == str2)\n", + "\n", + "\n", + "def is_equiv_chain_of_thought(str1: str, str2: str) -> float:\n", + " \"\"\"Strips the solution first before calling `is_equiv`.\"\"\"\n", + " ans1 = get_answer(str1)\n", + " ans2 = get_answer(str2)\n", + "\n", + " return is_equiv(ans1, ans2)\n", + "\n", + "\n", + "def success_metrics(responses, solution, **args):\n", + " \"\"\"Check if each response is correct.\n", + " \n", + " Args:\n", + " responses (list): The list of responses.\n", + " solution (str): The canonical solution.\n", + " \n", + " Returns:\n", + " dict: The success metrics.\n", + " \"\"\"\n", + " success_list = []\n", + " n = len(responses)\n", + " for i in range(n):\n", + " response = responses[i]\n", + " succeed = is_equiv_chain_of_thought(response, solution)\n", + " success_list.append(succeed)\n", + " # voting\n", + " answers = {}\n", + " for i in range(n):\n", + " equiv = i\n", + " if get_answer(responses[i]) is None:\n", + " # ignore None answers\n", + " continue\n", + " for j in answers:\n", + " if is_equiv_chain_of_thought(responses[i], responses[j]):\n", + " equiv = j\n", + " break\n", + " if equiv in answers:\n", + " answers[equiv] += 1\n", + " else:\n", + " answers[equiv] = 1\n", + " # find the answer with highest votes in answers\n", + " answer = max(answers.items(), key=lambda x: x[1], default=(0, 0))[0]\n", + " # check if the answer is correct\n", + " success_vote = is_equiv_chain_of_thought(responses[answer], solution)\n", + " return {\n", + " \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n", + " \"success\": any(s for s in success_list),\n", + " \"success_vote\": success_vote,\n", + " \"voted_answer\": 
responses[answer],\n", + " }\n" + ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Use the tuning data to find a good configuration\n", + "\n", + "### Import the oai and tune subpackages from flaml.\n", + "\n", + "FLAML provides an API for hyperparameter optimization of OpenAI ChatGPT models, `oai.ChatCompletion.tune`, and an API for making a request with the tuned config, `oai.ChatCompletion.create`. First, we import oai from flaml:" + ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:54.634335Z", + "iopub.status.busy": "2023-02-13T23:40:54.633929Z", + "iopub.status.idle": "2023-02-13T23:40:56.105700Z", + "shell.execute_reply": "2023-02-13T23:40:56.105085Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "from flaml import oai" + ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." + ] }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.109177Z", + "iopub.status.busy": "2023-02-13T23:40:56.108624Z", + "iopub.status.idle": "2023-02-13T23:40:56.112651Z", + "shell.execute_reply": "2023-02-13T23:40:56.112076Z" + }, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "oai.ChatCompletion.set_cache(seed)" + ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. Caches for different seeds are stored separately.\n", + "\n", + "### Perform tuning\n", + "\n", + "The tuning will take a while to finish, depending on the optimization budget; it is performed under the budgets specified below.\n", + "\n", + "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.004 means the target inference budget is 0.004 dollars, which translates to 2000 tokens (input + output combined) if the gpt-3.5-turbo model is used.\n", + "* `optimization_budget` is the total budget allowed for the tuning. For example, 1 means 1 dollar is allowed in total, which translates to 500K tokens for the gpt-3.5-turbo model (the token arithmetic is spelled out in the sketch below).\n", + "* `num_samples` is the maximum number of different hyperparameter configurations to try. The tuning will stop after either `num_samples` trials or after `optimization_budget` dollars are spent, whichever happens first; -1 means no hard restriction on the number of trials, and the actual number is decided by `optimization_budget`.\n", + "\n", + "Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces, etc. 
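To make the budget arithmetic above concrete, here is a quick worked check at gpt-3.5-turbo's price of $0.002 per 1K tokens (plain arithmetic, not FLAML API code):

```python
price_per_1k = 0.002  # dollars per 1K tokens for gpt-3.5-turbo

inference_budget = 0.004  # target dollars per instance
print(inference_budget / price_per_1k * 1000)  # 2000.0 tokens (input + output)

optimization_budget = 1  # total dollars allowed for tuning
print(optimization_budget / price_per_1k * 1000)  # 500000.0 tokens, i.e. 500K
```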
The default search space is:\n", + "\n", + "```python\n", + "default_search_space = {\n", + " \"model\": tune.choice([\n", + " \"gpt-3.5-turbo\",\n", + " \"gpt-4\",\n", + " ]),\n", + " \"temperature_or_top_p\": tune.choice(\n", + " [\n", + " {\"temperature\": tune.uniform(0, 1)},\n", + " {\"top_p\": tune.uniform(0, 1)},\n", + " ]\n", + " ),\n", + " \"max_tokens\": tune.lograndint(50, 1000),\n", + " \"n\": tune.randint(1, 100),\n", + " \"prompt\": \"{prompt}\",\n", + "}\n", + "```\n", + "\n", + "The default search space can be overridden by users' input.\n", + "For example, the following code specifies a fixed prompt template. For hyperparameters which don't appear in users' input, the default search space will be used." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:56.115383Z", + "iopub.status.busy": "2023-02-13T23:40:56.114975Z", + "iopub.status.idle": "2023-02-13T23:41:55.045654Z", + "shell.execute_reply": "2023-02-13T23:41:55.044973Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m[I 2023-03-26 04:03:37,074]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", + "\u001b[32m[I 2023-03-26 04:03:37,077]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 1 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}\n", + "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.9, 'success': 0.9, 'success_vote': 0.9, 'voted_answer': 'We use the distance formula to find the distance between the two points: $\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{3^2+(-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=\\\\boxed{5}$.', 'total_cost': 0.13772999999999996, 'cost': 0.13772999999999996, 'inference_cost': 0.0068864999999999985, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.004978179931640625}\n", + "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 2 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}\n", + "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.8, 'success': 0.8, 'success_vote': 0.8, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$$ Letting $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0)$, we have: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=5$$ Therefore, the distance between the points $(0,4)$ and $(3,0)$ is $\\\\boxed{5}$.', 'total_cost': 0.145722, 'cost': 0.007992, 'inference_cost': 0.00039759999999999996, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0047664642333984375}\n", + "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 3 config: 
{'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0}\n", + "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.5140933870421127, 'success': 0.55, 'success_vote': 0.5, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=5.$$ Therefore, the distance between the points (0,4) and (3,0) is $\\\\boxed{5}$.', 'total_cost': 0.21644799999999997, 'cost': 0.07072600000000001, 'inference_cost': 0.0035343, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.4985070123025904}, 'config/max_tokens': 97, 'config/n': 20, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.010622501373291016}\n", + "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9533933461949365}, 'max_tokens': 50, 'n': 51, 'prompt': 0}\n", + "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.3386014997741698, 'success': 0.4, 'success_vote': 0.35, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(3-0)^2+(0-4)^2}=\\\\sqrt{9+16}=\\\\sqrt{25}=\\\\boxed{5}.$$', 'total_cost': 0.3192479999999999, 'cost': 0.10279999999999999, 'inference_cost': 0.005138, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9533933461949365}, 'max_tokens': 50, 'n': 51, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.9533933461949365}, 'config/max_tokens': 50, 'config/n': 51, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.015543699264526367}\n", + "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 5 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}\n", + "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.9999998720099207, 'success': 1.0, 'success_vote': 0.95, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2},$$ where $(x_1,y_1)$ and $(x_2,y_2)$ are the given points. 
Plugging in the values $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0),$ we have: $$\\\\sqrt{(3-0)^2+(0-4)^2} = \\\\sqrt{9+16} = \\\\sqrt{25}.$$ Therefore, the distance between the two points is $\\\\boxed{5}$.', 'total_cost': 0.6322379999999999, 'cost': 0.31299, 'inference_cost': 0.015323400000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/n': 54, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.05237627029418945}\n", + "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 6 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.4340139933332937}, 'max_tokens': 317, 'n': 51, 'prompt': 0}\n", + "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'success_vote': 0, 'total_cost': 0.7246679999999999, 'cost': 0.09243, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.4340139933332937}, 'max_tokens': 317, 'n': 51, 'prompt': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.4340139933332937}, 'config/max_tokens': 317, 'config/n': 51, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.001924753189086914}\n", + "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 7 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0}\n", + "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'expected_success': 0.7572581384563789, 'success': 0.8, 'success_vote': 0.8, 'voted_answer': 'We use the distance formula: \\\\begin{align*}\\n\\\\text{distance}&=\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}\\\\\\\\\\n&=\\\\sqrt{(3-0)^2+(0-4)^2}\\\\\\\\\\n&=\\\\sqrt{9+16}\\\\\\\\\\n&=\\\\sqrt{25}\\\\\\\\\\n&=\\\\boxed{5}.\\n\\\\end{align*}', 'total_cost': 0.7647499999999999, 'cost': 0.04008199999999999, 'inference_cost': 0.0020021, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9086488808086682}, 'config/max_tokens': 129, 'config/n': 9, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.007839441299438477}\n", + "[flaml.tune.tune: 03-26 04:03:37] {811} INFO - trial 8 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6262871483113925}, 'max_tokens': 257, 'n': 82, 'prompt': 0}\n", + "[flaml.tune.tune: 03-26 04:03:37] {215} INFO - result: {'success_vote': 0, 'total_cost': 1.0214359999999998, 'cost': 0.25668599999999997, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6262871483113925}, 'max_tokens': 257, 'n': 82, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6262871483113925}, 'config/max_tokens': 257, 'config/n': 82, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.009511232376098633}\n", + "[flaml.tune.tune: 03-26 04:03:37] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" + ] + } + ], + "source": [ + "import logging\n", + "\n", + "prompts = [\"{problem} Solve the problem carefully. Simplify your answer as much as possible. 
Put the final answer in \\boxed{{}}.\"]\n", + "config, analysis = oai.ChatCompletion.tune(\n", + " data=tune_data, # the data for tuning\n", + " metric=\"success_vote\", # the metric to optimize\n", + " mode=\"max\", # the optimization mode\n", + " eval_func=success_metrics, # the evaluation function to return the success metrics\n", + " # log_file_name=\"logs/math.log\", # the log file name\n", + " inference_budget=0.03, # the inference budget (dollar)\n", + " optimization_budget=1, # the optimization budget (dollar)\n", + " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", + " # -1 means decided by the optimization budget only\n", + " num_samples=-1,\n", + " # model=\"chatgpt-35-turbo-0301\", # uncomment if using Azure OpenAI\n", + " # model=\"gpt-3.5-turbo\", # uncomment if you don't have access to gpt-4\n", + " prompt=prompts, # the prompt templates to choose from\n", + " # stop=\"###\", # the stop sequence\n", + " logging_level=logging.INFO, # the logging level\n", + ")\n" + ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Output tuning results\n", + "\n", + "After the tuning, we can print out the config and the result found by FLAML:" + ] }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:55.049204Z", + "iopub.status.busy": "2023-02-13T23:41:55.048871Z", + "iopub.status.idle": "2023-02-13T23:41:55.053284Z", + "shell.execute_reply": "2023-02-13T23:41:55.052574Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "optimized config {'model': 'gpt-3.5-turbo', 'max_tokens': 424, 'n': 54, 'prompt': '{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\\\boxed{{}}.', 'stop': None, 'temperature': 0.9177741225129434}\n", + "best result on tuning data {'expected_success': 0.9999998720099207, 'success': 1.0, 'success_vote': 0.95, 'voted_answer': 'We use the distance formula: $$\\\\sqrt{(x_2-x_1)^2+(y_2-y_1)^2},$$ where $(x_1,y_1)$ and $(x_2,y_2)$ are the given points. 
Plugging in the values $(x_1,y_1)=(0,4)$ and $(x_2,y_2)=(3,0),$ we have: $$\\\\sqrt{(3-0)^2+(0-4)^2} = \\\\sqrt{9+16} = \\\\sqrt{25}.$$ Therefore, the distance between the two points is $\\\\boxed{5}$.', 'total_cost': 0.6322379999999999, 'cost': 0.31299, 'inference_cost': 0.015323400000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'n': 54, 'prompt': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/n': 54, 'config/prompt': 0, 'experiment_tag': 'exp', 'time_total_s': 0.05237627029418945}\n" + ] + } + ], + "source": [ + "print(\"optimized config\", config)\n", + "print(\"best result on tuning data\", analysis.best_result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Make a request with the tuned config\n", + "\n", + "We can apply the tuned config on the request for an example task:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:55.056205Z", + "iopub.status.busy": "2023-02-13T23:41:55.055631Z", + "iopub.status.idle": "2023-02-13T23:41:56.039259Z", + "shell.execute_reply": "2023-02-13T23:41:56.038427Z" + }, + "slideshow": { + "slide_type": "subslide" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"choices\": [\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 0,\n", + " \"message\": {\n", + " \"content\": \"We want to get rid of the square root in the denominator. We can do this by multiplying both the numerator and denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\sqrt{21}.$$ Thus, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}=\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 1,\n", + " \"message\": {\n", + " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}.$$Therefore, the answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 2,\n", + " \"message\": {\n", + " \"content\": \"Using the definition of square roots, we see that $\\\\sqrt{21}\\\\cdot\\\\sqrt{21}=21$. 
Therefore, we can write $\\\\frac{21}{\\\\sqrt{21}}$ as $\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 3,\n", + " \"message\": {\n", + " \"content\": \"We start by multiplying both the numerator and the denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}.$$ Simplifying the fraction, we get: $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 4,\n", + " \"message\": {\n", + " \"content\": \"We can simplify $\\\\sqrt{21}$ by finding its prime factorization: $21=3\\\\cdot7$, so $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, \\\\[\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.\\\\] To rationalize this denominator, we need to multiply both the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$, which gives \\\\[\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}\\\\] \\\\[=\\\\boxed{3\\\\sqrt{3}}.\\\\]\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 5,\n", + " \"message\": {\n", + " \"content\": \"We can start by simplifying the denominator. Since $\\\\sqrt{21}$ equals $\\\\sqrt{3} \\\\cdot \\\\sqrt{7}$, we can write:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}}$$\\n\\nTo rationalize the denominator, we need to multiply both the numerator and denominator by $\\\\sqrt{3} \\\\cdot \\\\sqrt{7}$:\\n\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}} \\\\cdot \\\\frac{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}} &= \\\\frac{21 \\\\cdot \\\\sqrt{3} \\\\cdot \\\\sqrt{7}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7} \\\\cdot \\\\sqrt{3} \\\\cdot \\\\sqrt{7}} \\\\\\\\\\n&= \\\\frac{21 \\\\cdot \\\\sqrt{3} \\\\cdot \\\\sqrt{7}}{3 \\\\cdot 7} \\\\\\\\\\n&= \\\\boxed{\\\\frac{3 \\\\sqrt{21}}{7}}\\n\\\\end{align*}\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 6,\n", + " \"message\": {\n", + " \"content\": \"We have $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}$. Therefore, $\\\\boxed{\\\\sqrt{21}}$ is our final answer.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 7,\n", + " \"message\": {\n", + " \"content\": \"We have $\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\sqrt{21}$. 
Thus, our final answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 8,\n", + " \"message\": {\n", + " \"content\": \"We can begin by multiplying both the numerator and denominator of the fraction by $\\\\sqrt{21}$, since $\\\\sqrt{21}/\\\\sqrt{21} = 1$:\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}}&=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}\\\\\\\\\\n&=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}\\\\\\\\\\n&=\\\\sqrt{21}.\\n\\\\end{align*}\\nTherefore, the simplified expression is $\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 9,\n", + " \"message\": {\n", + " \"content\": \"We start by multiplying the numerator and denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{(\\\\sqrt{21})^2} = \\\\frac{21\\\\sqrt{21}}{21}$$ Simplifying, we get: $$\\\\frac{21\\\\sqrt{21}}{21}= \\\\sqrt{21}$$ Therefore, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}=\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 10,\n", + " \"message\": {\n", + " \"content\": \"We can simplify the denominator by rationalizing it, which means getting rid of the square root in the denominator. To do this, we can multiply both the numerator and denominator by $\\\\sqrt{21}$:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}$$\\n\\nTherefore, $\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 11,\n", + " \"message\": {\n", + " \"content\": \"We start by rationalizing the denominator. We want to get rid of the square root in the denominator, so we need to multiply both the numerator and denominator by something that will give us a perfect square in the denominator. We notice that $\\\\sqrt{21} = \\\\sqrt{3 \\\\cdot 7}$, so we can multiply both the numerator and denominator by $\\\\sqrt{3}$ to get:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{3}}{\\\\sqrt{3}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{21}\\\\sqrt{3}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3 \\\\cdot 21}}$$\\n\\nNow we can simplify the denominator:\\n\\n$$\\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3 \\\\cdot 21}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{21}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{3 \\\\cdot 7}} = \\\\frac{21\\\\sqrt{3}}{\\\\sqrt{3^2 \\\\cdot 7}} = \\\\boxed{\\\\frac{3\\\\sqrt{7}}{2}}$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 12,\n", + " \"message\": {\n", + " \"content\": \"We can simplify $\\\\sqrt{21}$ by recognizing that $21=3\\\\cdot7$ and $\\\\sqrt{3^2\\\\cdot7}=3\\\\sqrt{7}$. 
Therefore, we have $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3^2\\\\cdot7}} = \\\\frac{21}{3\\\\sqrt{7}} = \\\\frac{7\\\\cdot3}{3\\\\cdot\\\\sqrt{7}} = \\\\frac{7}{\\\\sqrt{7}}.$$We can rationalize this by multiplying top and bottom by $\\\\sqrt{7}$, giving us $$\\\\frac{7}{\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{7}}{\\\\sqrt{7}} = \\\\frac{7\\\\sqrt{7}}{7} = \\\\boxed{\\\\sqrt{7}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 13,\n", + " \"message\": {\n", + " \"content\": \"We want to get rid of the square root sign in the denominator. One way to do this is to multiply both the numerator and denominator by $\\\\sqrt{21}$. We get \\\\[\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21\\\\cdot\\\\sqrt{21}}{(\\\\sqrt{21})\\\\cdot(\\\\sqrt{21})} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\boxed{\\\\sqrt{21}}.\\\\]Note that we simplified by dividing $21$ by $21$ to get $1$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 14,\n", + " \"message\": {\n", + " \"content\": \"To rationalize the denominator, we need to eliminate the square root. We can do this by multiplying both the numerator and denominator by $\\\\sqrt{21}$:$$\\\\dfrac{21}{\\\\sqrt{21}}\\\\cdot\\\\dfrac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\dfrac{21\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 15,\n", + " \"message\": {\n", + " \"content\": \"To rationalize the denominator, we need to eliminate the radical from the denominator. We can do this by multiplying both the numerator and denominator by $\\\\sqrt{21}$. \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}&=\\\\frac{21\\\\sqrt{21}}{21}\\\\\\\\\\n&=\\\\frac{\\\\cancelto{1}{21}\\\\cdot \\\\cancel{\\\\sqrt{21}}\\\\cdot \\\\sqrt{21}}{\\\\cancel{21}}\\\\\\\\\\n&=\\\\sqrt{21}\\n\\\\end{align*}Therefore, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}=\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 16,\n", + " \"message\": {\n", + " \"content\": \"We begin by multiplying the top and bottom of the fraction by $\\\\sqrt{21}$: \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} &= \\\\frac{21\\\\sqrt{21}}{21}\\\\\\\\\\n&= \\\\frac{\\\\cancel{21}\\\\cdot\\\\sqrt{21}}{\\\\cancel{21}}\\\\\\\\\\n&= \\\\sqrt{\\\\boxed{21}}.\\n\\\\end{align*}\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 17,\n", + " \"message\": {\n", + " \"content\": \"We rationalize the denominator as follows: \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21\\\\cdot \\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}}\\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21}\\\\\\\\\\n&= \\\\boxed{\\\\sqrt{21}}.\\n\\\\end{align*}\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 18,\n", + " \"message\": {\n", + " \"content\": \"We can begin by simplifying the square root in the denominator. 
The largest perfect square that divides into 21 is 3, so we can break apart the square root as follows: $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3\\\\cdot7}} = \\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}$$ Now we can use the property that $\\\\frac{1}{\\\\sqrt{x}} = \\\\frac{\\\\sqrt{x}}{x}$. Applying this to our expression, we have: $$\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}} = \\\\frac{21\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}} = \\\\frac{21\\\\sqrt{21}}{3\\\\cdot7} = \\\\frac{\\\\cancel{21}\\\\sqrt{21}}{\\\\cancel{3}\\\\cdot\\\\cancel{7}} = \\\\boxed{\\\\sqrt{21}}$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 19,\n", + " \"message\": {\n", + " \"content\": \"We start by writing $\\\\sqrt{21}$ as a product of simpler terms. Since $21=3\\\\cdot7$ and neither factor has a perfect square factor other than 1, we can write $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, we have \\\\[\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.\\\\] To rationalize the denominator, we multiply both the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$: \\\\[\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}} =\\\\frac{21\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7} = \\\\boxed{\\\\frac{3\\\\sqrt{21}}{7}}.\\\\]\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 20,\n", + " \"message\": {\n", + " \"content\": \"We want to remove the radical from the denominator. To do this, we multiply the top and bottom of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}.$$ Simplifying the numerator, we have $21\\\\cdot\\\\sqrt{21}=21^{\\\\frac{3}{2}}$. Therefore, the simplified version of $\\\\frac{21}{\\\\sqrt{21}}$ is $\\\\boxed{21^{\\\\frac{3}{2}}}$ or $\\\\boxed{21\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 21,\n", + " \"message\": {\n", + " \"content\": \"We can simplify $\\\\frac{21}{\\\\sqrt{21}}$ by multiplying both the numerator and denominator by $\\\\sqrt{21}$:\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21\\\\cdot\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}} \\\\\\\\\\n&= \\\\frac{21\\\\cdot\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\boxed{\\\\sqrt{21}}.\\n\\\\end{align*}\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 22,\n", + " \"message\": {\n", + " \"content\": \"We can begin by simplifying the square root in the denominator. We notice that $\\\\sqrt{21}$ can be divided into $\\\\sqrt{3}$ and $\\\\sqrt{7}$. Specifically, $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. 
Therefore, we have \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}} \\\\\\\\\\n&= \\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{1}{\\\\sqrt{7}} \\\\\\\\\\n&= \\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{\\\\sqrt{7}}{7} \\\\\\\\\\n&= \\\\frac{21\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot 7} \\\\\\\\\\n&= \\\\boxed{\\\\frac{3\\\\sqrt{7}}{\\\\sqrt{3}}}.\\n\\\\end{align*}\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 23,\n", + " \"message\": {\n", + " \"content\": \"We start by noticing that $\\\\sqrt{21}$ can be simplified. The prime factorization of 21 is $3\\\\cdot7$, so $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.$$ To rationalize the denominator, we need to get rid of the $\\\\sqrt{3}$ and $\\\\sqrt{7}$ in the denominator. We can do this by multiplying the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. We get $$\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\sqrt{21}}{3\\\\cdot7}=\\\\frac{3\\\\cdot7\\\\cdot\\\\sqrt{21}}{3\\\\cdot7}.$$ Cancelling the 3's and the 7's, we get $$\\\\frac{3\\\\cdot7\\\\cdot\\\\sqrt{21}}{3\\\\cdot7} = \\\\boxed{\\\\sqrt{21}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 24,\n", + " \"message\": {\n", + " \"content\": \"We want to get rid of the radical in the denominator. One way to do this is to multiply both the numerator and the denominator by the radical (or a simplified version of it). In this case, $\\\\sqrt{21}$ is already simplified, so we can just multiply top and bottom by $\\\\sqrt{21}$: \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}\\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\boxed{\\\\sqrt{21}}.\\n\\\\end{align*}We simplified $\\\\frac{21\\\\sqrt{21}}{21}$ by canceling the common factor of 21 and leaving only $\\\\sqrt{21}$ in the numerator.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 25,\n", + " \"message\": {\n", + " \"content\": \"We know that $\\\\sqrt{21}$ can be simplified as $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, \\n\\n$$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.$$ \\n\\nWe can now rationalize the denominator by multiplying both the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$:\\n\\n$$\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}=\\\\boxed{3\\\\sqrt{3}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 26,\n", + " \"message\": {\n", + " \"content\": \"Since the denominator has a radical, we want to get rid of it. 
We can do this by multiplying both the numerator and denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}$$\\nSimplifying, we see that the $21$s cancel: $$\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 27,\n", + " \"message\": {\n", + " \"content\": \"We can begin by simplifying $\\\\sqrt{21}$ as a product of its prime factors: $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.$$ To rationalize the denominator, we can multiply both the numerator and denominator by $\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$: $$\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}\\\\cdot\\\\frac{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}.$$ Simplifying the fraction in the numerator, we have $$\\\\frac{21\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}=\\\\frac{3\\\\cdot7\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot7}=\\\\boxed{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 28,\n", + " \"message\": {\n", + " \"content\": \"We want to get rid of the radical in the denominator. To do this, we can multiply the numerator and denominator by $\\\\sqrt{21}$, which is equal to $\\\\sqrt{21}\\\\cdot\\\\sqrt{21} = 21$: \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} \\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\sqrt{21}.\\n\\\\end{align*}Thus, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 29,\n", + " \"message\": {\n", + " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 30,\n", + " \"message\": {\n", + " \"content\": \"We begin by multiplying both the numerator and denominator of the fraction $\\\\frac{21}{\\\\sqrt{21}}$ by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}.$$ Simplifying the numerator by canceling $21$ from both the numerator and denominator, we get: $$\\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}.$$ Therefore, $\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{21}}$ (after rationalizing the denominator).\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 31,\n", + " \"message\": {\n", + " \"content\": \"We start by writing $\\\\dfrac{21}{\\\\sqrt{21}}$ as $\\\\dfrac{21}{\\\\sqrt{21}} \\\\cdot \\\\dfrac{\\\\sqrt{21}}{\\\\sqrt{21}}$. 
This gives us \\\\begin{align*}\\n\\\\dfrac{21}{\\\\sqrt{21}} \\\\cdot \\\\dfrac{\\\\sqrt{21}}{\\\\sqrt{21}} &= \\\\dfrac{21\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot \\\\sqrt{21}} \\\\\\\\\\n&= \\\\dfrac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&=\\\\boxed{\\\\sqrt{21}}.\\n\\\\end{align*}\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 32,\n", + " \"message\": {\n", + " \"content\": \"We can rationalize the denominator $\\\\sqrt{21}$ by multiplying both the numerator and denominator by $\\\\sqrt{21}$. \\n\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21 \\\\cdot \\\\sqrt{21}}{\\\\sqrt{21} \\\\cdot \\\\sqrt{21}} \\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\sqrt{21}\\n\\\\end{align*}\\n\\nTherefore, the simplified answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 33,\n", + " \"message\": {\n", + " \"content\": \"We can begin by multiplying the numerator and denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}.$$ Simplifying, we find that $\\\\frac{21\\\\sqrt{21}}{21} = \\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 34,\n", + " \"message\": {\n", + " \"content\": \"We can rationalize the denominator by multiplying the numerator and denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot \\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot \\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\boxed{\\\\sqrt{21}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 35,\n", + " \"message\": {\n", + " \"content\": \"We can start by simplifying $\\\\sqrt{21}$. Since $21$ has no perfect square factors other than $1$, we have $\\\\sqrt{21} = \\\\sqrt{3\\\\cdot7}= \\\\sqrt{3}\\\\sqrt{7}$. Thus, we have:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3}\\\\sqrt{7}}$$\\n\\nTo rationalize the denominator, we want to get rid of the radical in the denominator by multiplying both the numerator and denominator by $\\\\sqrt{3}\\\\sqrt{7}$. 
This gives:\\n\\n$$\\\\frac{21}{\\\\sqrt{3}\\\\sqrt{7}} \\\\cdot \\\\frac{\\\\sqrt{3}\\\\sqrt{7}}{\\\\sqrt{3}\\\\sqrt{7}} = \\\\frac{21\\\\sqrt{3}\\\\sqrt{7}}{3\\\\cdot7} = \\\\frac{3\\\\sqrt{3}\\\\sqrt{7}}{1}$$\\n\\nTherefore, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}$ simplified with a rationalized denominator is $\\\\boxed{3\\\\sqrt{3}\\\\sqrt{7}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 36,\n", + " \"message\": {\n", + " \"content\": \"We start by multiplying the numerator and denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21\\\\cdot\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}.$$ Now we can simplify by canceling the common factor of 21: $$\\\\frac{21\\\\sqrt{21}}{21} = \\\\boxed{\\\\sqrt{21}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 37,\n", + " \"message\": {\n", + " \"content\": \"We can simplify this fraction by rationalizing the denominator, which means getting rid of the square root in the denominator.\\n\\nTo do this, we can multiply both the numerator and denominator by $\\\\sqrt{21}$:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}$$\\n\\nSimplifying the fraction, we get:\\n\\n$$\\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}$$\\n\\nTherefore, $\\\\boxed{\\\\sqrt{21}}$ is the final, simplified answer.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 38,\n", + " \"message\": {\n", + " \"content\": \"We notice that $\\\\sqrt{21}=3\\\\sqrt{7}$. So, rationalizing the denominator of $\\\\frac{21}{\\\\sqrt{21}}$ is the same as multiplying it by $\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}$:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}= \\\\frac{3\\\\cdot7\\\\cdot\\\\sqrt{3\\\\cdot7}}{3\\\\cdot7} = \\\\frac{\\\\sqrt{3\\\\cdot7}}{1} = \\\\boxed{\\\\sqrt{21}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 39,\n", + " \"message\": {\n", + " \"content\": \"We start by noting that $\\\\sqrt{21} = \\\\sqrt{3\\\\cdot7} = \\\\sqrt{3}\\\\cdot\\\\sqrt{7}$. Therefore, $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}} = \\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{1}{\\\\sqrt{7}} = \\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{\\\\sqrt{7}}{7} = \\\\boxed{\\\\frac{3\\\\sqrt{7}}{\\\\sqrt{3}}}. $$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 40,\n", + " \"message\": {\n", + " \"content\": \"We can begin by simplifying the denominator. Recall that $\\\\sqrt{21}$ can be written as $21^{\\\\frac{1}{2}}$. So, we have:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{21^{\\\\frac{1}{2}}}$$\\n\\nTo rationalize the denominator, we need to get rid of the radical in the denominator. We can do this by multiplying both the numerator and the denominator by $\\\\sqrt{21}$. 
This gives:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21}$$\\n\\nNow we can simplify:\\n\\n$$\\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}$$\\n\\nTherefore, $$\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}} = \\\\sqrt{21}$$\\n\\nAnd our final answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 41,\n", + " \"message\": {\n", + " \"content\": \"We want to eliminate the radical in the denominator of the fraction $\\\\frac{21}{\\\\sqrt{21}}$. To do this, we can multiply both the numerator and denominator by $\\\\sqrt{21}$. This gives us \\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21 \\\\cdot \\\\sqrt{21}}{\\\\sqrt{21} \\\\cdot \\\\sqrt{21}}\\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21}\\\\\\\\\\n&= \\\\sqrt{21}.\\n\\\\end{align*} Therefore, $\\\\boxed{\\\\sqrt{21}}$ is our final answer.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 42,\n", + " \"message\": {\n", + " \"content\": \"We start by multiplying the numerator and denominator of the fraction by $\\\\sqrt{21}$:\\n\\\\begin{align*}\\n\\\\frac{21}{\\\\sqrt{21}} &= \\\\frac{21 \\\\cdot \\\\sqrt{21}}{\\\\sqrt{21} \\\\cdot \\\\sqrt{21}} \\\\\\\\\\n&= \\\\frac{21\\\\sqrt{21}}{21} \\\\\\\\\\n&= \\\\sqrt{21}\\n\\\\end{align*}\\nTherefore, $\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 43,\n", + " \"message\": {\n", + " \"content\": \"We can begin by simplifying the denominator using the property $\\\\sqrt{n^2}=n$ for any positive real number $n$. We have:\\n\\n$$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3^2\\\\cdot 7}}=\\\\frac{21}{3\\\\sqrt{7}}$$\\n\\nWe can simplify this fraction further by dividing both the numerator and denominator by the greatest common factor, which is 3. We obtain:\\n\\n$$\\\\frac{21}{3\\\\sqrt{7}}=\\\\frac{7}{\\\\sqrt{7}}=\\\\frac{7\\\\cdot\\\\sqrt{7}}{7}=\\\\boxed{\\\\sqrt{7}}$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 44,\n", + " \"message\": {\n", + " \"content\": \"We want to get rid of the square root in the denominator. 
To do this, we can multiply both the numerator and the denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{\\\\sqrt{21}\\\\cdot\\\\sqrt{21}}=\\\\frac{21\\\\cdot \\\\sqrt{21}}{21}.$$Simplifying the fraction, we have $\\\\frac{21}{21}=1$, so our final answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 45,\n", + " \"message\": {\n", + " \"content\": \"Since $\\\\sqrt{21}=\\\\sqrt{3\\\\cdot7}=\\\\sqrt{3}\\\\cdot\\\\sqrt{7}$, we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}=\\\\frac{21}{\\\\sqrt{3}}\\\\cdot\\\\frac{1}{\\\\sqrt{7}}=\\\\boxed{3\\\\sqrt{7}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 46,\n", + " \"message\": {\n", + " \"content\": \"We have \\\\[\\n\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\cdot \\\\sqrt{21}}{21}=\\\\sqrt{21}.\\n\\\\] Thus, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}= \\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 47,\n", + " \"message\": {\n", + " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\frac{\\\\cancel{21}\\\\sqrt{\\\\cancel{21}\\\\cdot3}}{\\\\cancel{21}}=\\\\boxed{\\\\sqrt{3}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 48,\n", + " \"message\": {\n", + " \"content\": \"We want to find a way to eliminate the radical in the denominator. One way to do this is to multiply the numerator and denominator by the radical, since $(\\\\sqrt{21})^2=21$. 
Doing this gives us: $$\\\\frac{21}{\\\\sqrt{21}} \\\\cdot \\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\frac{\\\\cancel{21}\\\\sqrt{3}\\\\cdot\\\\cancel{7}}{\\\\cancel{21}} = \\\\boxed{\\\\sqrt{3}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"length\",\n", + " \"index\": 49,\n", + " \"message\": {\n", + " \"content\": \"We begin by noticing that $\\\\sqrt{21}=21^{\\\\frac12}$, so we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{21^{\\\\frac12}}.$$ Using the rule $a^{m}\\\\cdot a^{n}=a^{m+n}$, we can write $21=3\\\\cdot7$ as $$21^{\\\\frac12}=3^{\\\\frac12}\\\\cdot7^{\\\\frac12}.$$ Thus, we have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{21^{\\\\frac12}}=\\\\frac{21}{3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}.$$ To rationalize the denominator, we multiply the top and bottom of the fraction by $3^{\\\\frac12}\\\\cdot7^{\\\\frac12}$: $$\\\\frac{21}{3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}\\\\cdot\\\\frac{3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}{3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}=\\\\frac{21\\\\cdot3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}{3\\\\cdot7}.$$ Simplifying the numerator, we have $$\\\\frac{21\\\\cdot3^{\\\\frac12}\\\\cdot7^{\\\\frac12}}{3\\\\cdot7}=\\\\frac{3^{\\\\frac12}\\\\cdot7^{\\\\frac32}}{3}.$$ Now, using the rule $a^{m}/a^{n}=a^{m-n}$, we simplify further: $$\\\\frac{3^{\\\\frac12}\\\\cdot7^{\\\\frac32}}{3}=3^{-\\\\frac12}\\\\cdot7^{\\\\frac32}=\\\\frac{7^{\\\\frac32}}{\\\\sqrt{3}}}.$$ Finally, we rationalize the denominator by multiplying top and bottom by $\\\\sqrt{3}$: $$\\\\frac{7^{\\\\frac32}}{\\\\sqrt{3}}\\\\cdot\\\\frac{\\\\sqrt{3}}{\\\\sqrt{3}}=\\\\boxed{\\\\\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 50,\n", + " \"message\": {\n", + " \"content\": \"To rationalize the denominator, we need to get rid of the square root. We can do this by multiplying both the numerator and denominator by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\sqrt{21}.$$ Therefore, the answer is $\\\\boxed{\\\\sqrt{21}}$.\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 51,\n", + " \"message\": {\n", + " \"content\": \"We start with $\\\\frac{21}{\\\\sqrt{21}}$. To rationalize the denominator, we want to get rid of the square root. We know that $\\\\sqrt{21}$ can be simplified as $\\\\sqrt{3 \\\\cdot 7} = \\\\sqrt{3} \\\\cdot \\\\sqrt{7}$. 
Therefore, we have \\\\[\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}}.\\\\] We can now use the property $\\\\frac{a}{b \\\\cdot c} = \\\\frac{a}{b} \\\\cdot \\\\frac{1}{c}$ to split up the denominator: \\\\[\\\\frac{21}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7}} = \\\\frac{21}{\\\\sqrt{3}} \\\\cdot \\\\frac{1}{\\\\sqrt{7}} = \\\\frac{21\\\\sqrt{7}}{\\\\sqrt{3} \\\\cdot \\\\sqrt{7} \\\\cdot \\\\sqrt{7}} = \\\\frac{21\\\\sqrt{7}}{7\\\\sqrt{3}}.\\\\] Finally, we simplify by dividing both the numerator and the denominator by $7$: \\\\[\\\\frac{21\\\\sqrt{7}}{7\\\\sqrt{3}} = \\\\frac{3\\\\sqrt{7}}{\\\\sqrt{3}} = \\\\frac{3\\\\sqrt{7}}{\\\\sqrt{3}} \\\\cdot \\\\frac{\\\\sqrt{3}}{\\\\sqrt{3}} = \\\\boxed{3\\\\sqrt{21}}.\\\\]\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 52,\n", + " \"message\": {\n", + " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}} = \\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}} = \\\\frac{21\\\\sqrt{21}}{21} = \\\\frac{3\\\\cdot 7\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot 7}.$$ Simplifying this, we get $$\\\\frac{3\\\\cdot 7\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{3\\\\cdot 7} = \\\\frac{\\\\cancel{3}\\\\cdot \\\\cancel{7}\\\\cdot\\\\sqrt{3}\\\\cdot\\\\sqrt{7}}{\\\\cancel{3}\\\\cdot \\\\cancel{7}} = \\\\sqrt{3}\\\\sqrt{7}.$$ Therefore, we have $$\\\\frac{21}{\\\\sqrt{21}} = \\\\boxed{\\\\sqrt{3}\\\\sqrt{7}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " },\n", + " {\n", + " \"finish_reason\": \"stop\",\n", + " \"index\": 53,\n", + " \"message\": {\n", + " \"content\": \"We have $$\\\\frac{21}{\\\\sqrt{21}}=\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\cdot\\\\sqrt{21}}{21}=\\\\frac{\\\\cancel{21}\\\\cdot\\\\sqrt{\\\\cancel{21}\\\\cdot 3}}{\\\\cancel{21}}=\\\\boxed{\\\\sqrt{3}}.$$\",\n", + " \"role\": \"assistant\"\n", + " }\n", + " }\n", + " ],\n", + " \"created\": 1679622917,\n", + " \"id\": \"chatcmpl-6xQw98DIHC3S1iQAacY3vjL6TLPRL\",\n", + " \"model\": \"gpt-3.5-turbo-0301\",\n", + " \"object\": \"chat.completion\",\n", + " \"usage\": {\n", + " \"completion_tokens\": 7762,\n", + " \"prompt_tokens\": 50,\n", + " \"total_tokens\": 7812\n", + " }\n", + "}\n", + "{'expected_success': 1.0, 'success': True, 'success_vote': 1.0, 'voted_answer': 'We want to get rid of the square root in the denominator. We can do this by multiplying both the numerator and denominator of the fraction by $\\\\sqrt{21}$: $$\\\\frac{21}{\\\\sqrt{21}}\\\\cdot\\\\frac{\\\\sqrt{21}}{\\\\sqrt{21}}=\\\\frac{21\\\\sqrt{21}}{21}=\\\\sqrt{21}.$$ Thus, $\\\\displaystyle\\\\frac{21}{\\\\sqrt{21}}=\\\\boxed{\\\\sqrt{21}}$.'}\n" + ] + } + ], + "source": [ + "responses = oai.ChatCompletion.create(context=tune_data[1], **config)\n", + "print(responses)\n", + "print(success_metrics([response[\"message\"][\"content\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate the success rate on the test data\n", + "\n", + "You can use flaml's `oai.ChatCompletion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.ChatCompletion.data` to the data to evaluate. The following code will take a while (30 mins to 1 hour) to evaluate all the test data instances if uncommented and run. It will cost roughly $3. 
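The 'success_vote' and 'voted_answer' fields in the printed result above come from majority voting over the sampled responses. The sketch below is only a minimal, hypothetical illustration of that idea; it is not this notebook's actual `success_metrics` function, and it compares raw answer strings, whereas a real grader would canonicalize the LaTeX first:

```python
from collections import Counter

def vote_metrics(responses, solution):
    """Toy majority-vote metric over answer strings (hypothetical helper)."""
    # Pick the most common answer; ties break by first occurrence.
    voted_answer, _ = Counter(responses).most_common(1)[0]
    # Naive exact-string comparison for illustration only.
    return {"voted_answer": voted_answer, "success_vote": float(voted_answer == solution)}

print(vote_metrics(["\\sqrt{21}", "\\sqrt{3}", "\\sqrt{21}"], "\\sqrt{21}"))
# -> {'voted_answer': '\\sqrt{21}', 'success_vote': 1.0}
```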
" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:41:56.042764Z", + "iopub.status.busy": "2023-02-13T23:41:56.042086Z", + "iopub.status.idle": "2023-02-13T23:53:05.597643Z", + "shell.execute_reply": "2023-02-13T23:53:05.596603Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'expected_success': 0.9878128576084944, 'success': 0.9950248756218906, 'success_vote': 0.9203980099502488, 'voted_answer': \"We have that $1$ kilowatt is equivalent to $1.36$ horsepower. Therefore, we can set up the proportion $\\\\frac{1\\\\text{ kW}}{1.36\\\\text{ hp}} = \\\\frac{x\\\\text{ kW}}{500\\\\text{ hp}}$, where $x$ is the number of kilowatts that Eric's car's engine can generate. Solving for $x$, we get $x = \\\\frac{(1\\\\text{ kW})(500\\\\text{ hp})}{1.36\\\\text{ hp}} \\\\approx \\\\boxed{368 \\\\text{ kW}}$.\", 'total_cost': 4.194939999999996, 'cost': 3.1735039999999994, 'inference_cost': 0.01577204825870647}\n" + ] + } + ], + "source": [ + "# oai.ChatCompletion.data = test_data\n", + "# result = oai.ChatCompletion.eval(analysis.best_config, prune=False, eval_only=True)\n", + "# print(result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What about the default, untuned gpt-4 config (with the same prompt as the tuned config)? We can evaluate it and compare:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'expected_success': 0.6965174129353234, 'success': 0.6965174129353234, 'success_vote': 0.6965174129353234, 'voted_answer': \"If we let $x$ be the number of kilowatts, then we can set up the proportion $\\\\frac{x\\\\text{ kW}}{500\\\\text{ hp}}=\\\\frac{1\\\\text{ kW}}{1.36\\\\text{ hp}}$. Solving for $x$, we get $x=\\\\frac{500}{1.36} = 367.65$. Rounding to the nearest integer, we get that Eric's car's engine has $\\\\boxed{368}$ kilowatts.\", 'total_cost': 6.009489999999993, 'cost': 1.8145500000000006, 'inference_cost': 0.008809679104477611}\n" + ] + } + ], + "source": [ + "# assuming you have access to gpt-4; otherwise use gpt-3.5-turbo\n", + "# the following code will cost roughly $2 if uncommented and run.\n", + "\n", + "# default_config = {\"model\": 'gpt-4', \"prompt\": 0}\n", + "# default_result = oai.ChatCompletion.eval(default_config, prune=False, eval_only=True)\n", + "# print(default_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tuned config succeeds in 92.0% test cases\n", + "untuned config succeeds in 69.7% test cases\n" + ] + } + ], + "source": [ + "# print(\"tuned config succeeds in {:.1f}% test cases\".format(result[\"success_vote\"] * 100))\n", + "# print(\"untuned config succeeds in {:.1f}% test cases\".format(default_result[\"success_vote\"] * 100))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the untuned config has a lower inference cost. What if we heuristically increase the number of responses n to 5?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'expected_success': 0.9181755223880596, 'success': 0.9552238805970149, 'success_vote': 0.8756218905472637, 'voted_answer': \"To figure out how many kilowatts of power Eric's car can generate, we need to find the conversion factor for metric horsepower to kilowatts. To do this, we start by dividing the power in Eric's car in horsepower by the number of kilowatts per horsepower: $$\\\\frac{500\\\\text{ hp}}{1.36\\\\text{ hp/kW}}$$Now, to get to kilowatts, we divide by 1 hp, which gives us $$\\\\frac{500}{1.36}\\\\approx \\\\boxed{368}\\\\text{ kW}$$\", 'total_cost': 14.071600000000004, 'cost': 8.06211, 'inference_cost': 0.039892067164179104}\n" + ] + } + ], + "source": [ + "# The following evaluation costs $8 and nearly one hour if you uncomment it and run it.\n", + "\n", + "# config_larger = {\"model\": 'gpt-4', \"prompt\": 0, \"n\": 5}\n", + "# default_result = oai.ChatCompletion.eval(config_larger, prune=False, eval_only=True)\n", + "# print(default_result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We find that the 'success_vote' metric is increased at the cost of exceeding the inference budget. But the tuned configuration has both higher 'success_vote' (92% vs. 87%) and lower average inference cost ($0.016 vs. $0.04 per instance).\n", + "\n", + "A developer could use flaml to tune the configuration to satisfy the target inference budget while maximizing the value out of it." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "2d910cfd2d2a4fc49fc30fbbdc5576a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "454146d0f7224f038689031002906e6f": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", + "IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", + "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" + ], + "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", + "tabbable": null, + "tooltip": null + } + }, + "577e1e3cc4db4942b0883577b3b52755": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", + "tabbable": null, + "tooltip": null, + "value": 1 + } + }, + "6086462a12d54bafa59d3c4566f06cb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74a6ba0c3cbc4051be0a83e152fe1e62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d3f3d9e15894d05a4d188ff4f466554": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "b40bdfb1ac1d4cffb7cefcb870c64d45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", + "placeholder": "​", + "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", + "tabbable": null, + "tooltip": null, + "value": " 1/1 [00:00<00:00, 44.69it/s]" + } + }, + "ca245376fd9f4354af6b2befe4af4466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "dc83c7bff2f241309537a8119dfc7555": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4ae2b6f5a974fd4bafb6abb9d12ff26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", + "placeholder": "​", + "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", + "tabbable": null, + "tooltip": null, + "value": "100%" + } + }, + "f1355871cc6f4dd4b50d9df5af20e5c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/integrate_chatgpt_code.ipynb b/notebook/integrate_chatgpt_code.ipynb deleted file mode 100644 index 735881d97136..000000000000 --- a/notebook/integrate_chatgpt_code.ipynb +++ /dev/null @@ -1,1082 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved. \n", - "\n", - "Licensed under the MIT License.\n", - "\n", - "# Use FLAML to Tune ChatGPT\n", - "\n", - "In this notebook, we tune OpenAI ChatGPT model for code generation. We use [the HumanEval benchmark](https://huggingface.co/datasets/openai_humaneval) released by OpenAI for synthesizing programs from docstrings. \n", - "\n", - "## Requirements\n", - "\n", - "FLAML requires `Python>=3.7`. 
To run this notebook example, please install flaml with the [openai] option:\n", - "```bash\n", - "pip install flaml[openai]==1.2.0\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.317406Z", - "iopub.status.busy": "2023-02-13T23:40:52.316561Z", - "iopub.status.idle": "2023-02-13T23:40:52.321193Z", - "shell.execute_reply": "2023-02-13T23:40:52.320628Z" - } - }, - "outputs": [], - "source": [ - "# %pip install flaml[openai]==1.2.0 datasets" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set your OpenAI key:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.324240Z", - "iopub.status.busy": "2023-02-13T23:40:52.323783Z", - "iopub.status.idle": "2023-02-13T23:40:52.330570Z", - "shell.execute_reply": "2023-02-13T23:40:52.329750Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When ChatGPT is available in Azure OpenAI, uncomment the following to use Azure OpenAI:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.333547Z", - "iopub.status.busy": "2023-02-13T23:40:52.333249Z", - "iopub.status.idle": "2023-02-13T23:40:52.336508Z", - "shell.execute_reply": "2023-02-13T23:40:52.335858Z" - } - }, - "outputs": [], - "source": [ - "# openai.api_type = \"azure\"\n", - "# openai.api_base = \"https://.openai.azure.com/\"\n", - "# openai.api_version = \"2023-3-01\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dataset\n", - "\n", - "First, we load the humaneval dataset. The dataset contains 164 examples. We use the first 20 for tuning the generation hyperparameters and the remaining for evaluation. In each example, the \"prompt\" is the prompt string for eliciting the code generation, \"test\" is the Python code of the unit test for the example, and \"entry_point\" is the name of the function to be tested."
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.339977Z", - "iopub.status.busy": "2023-02-13T23:40:52.339556Z", - "iopub.status.idle": "2023-02-13T23:40:54.603349Z", - "shell.execute_reply": "2023-02-13T23:40:54.602630Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset openai_humaneval (/home/vscode/.cache/huggingface/datasets/openai_humaneval/openai_humaneval/1.0.0/2955cebd73602e828fa8c0a424c594e5fab4ec863b316ca98f3d8fdb6a626e75)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "aac289608bce4a808e224c0a09e1e8cf", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 [0,0,0,0,3,3]\n", - " compare([0,5,0,0,0,4],[4,1,1,0,0,-2]) -> [4,4,1,0,0,6]\n", - " \"\"\"\n", - "\n" - ] - } - ], - "source": [ - "print(tune_data[1][\"prompt\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is one example of the unit test code for verifying the correctness of the generated code:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:54.613590Z", - "iopub.status.busy": "2023-02-13T23:40:54.613168Z", - "iopub.status.idle": "2023-02-13T23:40:54.616873Z", - "shell.execute_reply": "2023-02-13T23:40:54.616193Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "def check(candidate):\n", - "\n", - " # Check some simple cases\n", - " assert candidate([1,2,3,4,5,1],[1,2,3,4,2,-2])==[0,0,0,0,3,3], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([0,0,0,0,0,0],[0,0,0,0,0,0])==[0,0,0,0,0,0], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([1,2,3],[-1,-2,-3])==[2,4,6], \"This prints if this assert fails 1 (good for debugging!)\"\n", - " assert candidate([1,2,3,5],[-1,2,3,4])==[2,0,0,1], \"This prints if this assert fails 1 (good for debugging!)\"\n", - "\n", - " # Check some edge cases that are easy to work out by hand.\n", - " assert True, \"This prints if this assert fails 2 (also good for debugging!)\"\n", - "\n", - "\n" - ] - } - ], - "source": [ - "print(tune_data[1][\"test\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define Success Metric\n", - "\n", - "Before we start tuning, we need to define the success metric we want to opotimize. For each code generation task, if one of the returned responses can pass the test, we consider the task as successfully solved. Then we can define the mean success rate of a collection of tasks.\n", - "\n", - "### Define a code executor\n", - "\n", - "First, we write a simple code executor. The code executor takes the generated code and the test code as the input, and execute them with a timer." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:54.619618Z", - "iopub.status.busy": "2023-02-13T23:40:54.619218Z", - "iopub.status.idle": "2023-02-13T23:40:54.624272Z", - "shell.execute_reply": "2023-02-13T23:40:54.623664Z" - } - }, - "outputs": [], - "source": [ - "import signal\n", - "import subprocess\n", - "import sys\n", - "\n", - "def timeout_handler(signum, frame):\n", - " raise TimeoutError(\"Timed out!\")\n", - "\n", - "signal.signal(signal.SIGALRM, timeout_handler)\n", - "max_exec_time = 3 # seconds\n", - "\n", - "def execute_code(code):\n", - " code = code.strip()\n", - " with open(\"codetest.py\", \"w\") as fout:\n", - " fout.write(code)\n", - " try:\n", - " signal.alarm(max_exec_time)\n", - " result = subprocess.run(\n", - " [sys.executable, \"codetest.py\"],\n", - " stdout=subprocess.DEVNULL,\n", - " stderr=subprocess.PIPE,\n", - " )\n", - " signal.alarm(0)\n", - " except TimeoutError:\n", - " return 0\n", - " return int(result.returncode == 0)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function will create a temp file \"codetest.py\" and execute it in a separate process. It allows for 3 seconds to finish that code.\n", - "\n", - "### Define a function to evaluate the success for a given program synthesis task" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:54.626998Z", - "iopub.status.busy": "2023-02-13T23:40:54.626593Z", - "iopub.status.idle": "2023-02-13T23:40:54.631383Z", - "shell.execute_reply": "2023-02-13T23:40:54.630770Z" - } - }, - "outputs": [], - "source": [ - "def success_metrics(responses, prompt, test, entry_point):\n", - " \"\"\"Check if the task is successful.\n", - "\n", - " Args:\n", - " responses (list): The list of responses.\n", - " prompt (str): The input prompt.\n", - " test (str): The test code.\n", - " entry_point (str): The name of the function.\n", - "\n", - " Returns:\n", - " dict: The success metrics.\n", - " \"\"\"\n", - " success_list = []\n", - " n = len(responses)\n", - " for i in range(n):\n", - " response = responses[i]\n", - " code = f\"{prompt}{response}\\n{test}\\ncheck({entry_point})\"\n", - " succeed = execute_code(code)\n", - " success_list.append(succeed)\n", - " return {\n", - " \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n", - " \"success\": any(s for s in success_list),\n", - " }\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Use the tuning data to find a good configuration\n", - "\n", - "### Import the oai and tune subpackages from flaml.\n", - "\n", - "FLAML has provided an API for hyperparameter optimization of OpenAI ChatGPT completions: `oai.ChatCompletion.tune` and to make a request with the tuned config: `oai.ChatCompletion.create`. 
First, we import oai from flaml:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:54.634335Z", - "iopub.status.busy": "2023-02-13T23:40:54.633929Z", - "iopub.status.idle": "2023-02-13T23:40:56.105700Z", - "shell.execute_reply": "2023-02-13T23:40:56.105085Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "from flaml import oai" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:56.109177Z", - "iopub.status.busy": "2023-02-13T23:40:56.108624Z", - "iopub.status.idle": "2023-02-13T23:40:56.112651Z", - "shell.execute_reply": "2023-02-13T23:40:56.112076Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "oai.ChatCompletion.set_cache(seed)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. Caches for different seeds are stored separately.\n", - "\n", - "### Perform tuning\n", - "\n", - "The tuning will take a while to finish, depending on the optimization budget (~5 mins for the budget used in the following example). The tuning will be performed under the specified optimization budgets.\n", - "\n", - "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.002 means the target inference budget is 0.002 dollars, which translates to 1000 tokens (input + output combined) if the gpt-3.5-turbo model is used.\n", - "* `optimization_budget` is the total budget allowed for the tuning. For example, 0.1 means 0.1 dollars are allowed in total, which translates to 50K tokens for the gpt-3.5-turbo model.\n", - "* `num_samples` is the number of different hyperparameter configurations allowed to be tried. The tuning will stop after either num_samples trials or after optimization_budget dollars are spent, whichever happens first. -1 means no hard restriction on the number of trials; the actual number is decided by `optimization_budget`.\n", - "\n", - "Users can specify the tuning data, optimization metric, optimization mode, evaluation function, search spaces, etc. The default search space is:\n", - "\n", - "```python\n", - "price1K = {\n", - " \"gpt-3.5-turbo\": 0.002,\n", - "}\n", - "\n", - "default_search_space = {\n", - " \"model\": tune.choice(list(price1K.keys())),\n", - " \"temperature_or_top_p\": tune.choice(\n", - " [\n", - " {\"temperature\": tune.uniform(0, 1)},\n", - " {\"top_p\": tune.uniform(0, 1)},\n", - " ]\n", - " ),\n", - " \"max_tokens\": tune.lograndint(50, 1000),\n", - " \"n\": tune.randint(1, 100),\n", - " \"prompt\": \"{prompt}\",\n", - "}\n", - "```\n", - "\n", - "The default search space can be overridden by the user's input.\n", - "For example, the tuning cell below specifies four choices for the prompt and a fixed stop sequence and number of completions. For hyperparameters that don't appear in the user's input, the default search space will be used."
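Other entries of the default space can be overridden the same way. Purely as a hypothetical sketch (not executed in this notebook, with illustrative values, and assuming keyword overrides may themselves be `tune` domains, as the default space suggests), one could fix the model and narrow the `max_tokens` range:

```python
# Hypothetical override sketch -- illustrative values only.
from flaml import oai, tune

config, analysis = oai.ChatCompletion.tune(
    data=tune_data,
    metric="expected_success",
    mode="max",
    eval_func=success_metrics,
    inference_budget=0.002,
    optimization_budget=0.1,
    num_samples=-1,
    model="gpt-3.5-turbo",                 # fix the model instead of searching over it
    max_tokens=tune.lograndint(100, 600),  # narrower than the default lograndint(50, 1000)
)
```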
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:56.115383Z", - "iopub.status.busy": "2023-02-13T23:40:56.114975Z", - "iopub.status.idle": "2023-02-13T23:41:55.045654Z", - "shell.execute_reply": "2023-02-13T23:41:55.044973Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m[I 2023-03-04 03:34:16,379]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 03-04 03:34:16] {811} INFO - trial 1 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.36280922847807595}, 'max_tokens': 347, 'n': 1, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-04 03:34:17] {215} INFO - result: {'expected_success': 0.25, 'success': 0.25, 'total_cost': 0.00971, 'cost': 0.00971, 'inference_cost': 0.0004855, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.36280922847807595}, 'max_tokens': 347, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.36280922847807595}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5723528861999512}\n", - "[flaml.tune.tune: 03-04 03:34:17] {811} INFO - trial 2 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6336482349262754}, 'max_tokens': 470, 'prompt': 3, 'stop': 0, 'n': 1}\n", - "[flaml.tune.tune: 03-04 03:34:18] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.019959999999999995, 'cost': 0.01025, 'inference_cost': 0.0005124999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6336482349262754}, 'max_tokens': 470, 'prompt': 3, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6336482349262754}, 'config/max_tokens': 470, 'config/prompt': 3, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.8377933502197266}\n", - "[flaml.tune.tune: 03-04 03:34:18] {811} INFO - trial 3 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6853598183677972}, 'max_tokens': 869, 'prompt': 2, 'stop': 0, 'n': 1}\n", - "[flaml.tune.tune: 03-04 03:34:18] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.02982599999999999, 'cost': 0.009866, 'inference_cost': 0.0004933, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6853598183677972}, 'max_tokens': 869, 'prompt': 2, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6853598183677972}, 'config/max_tokens': 869, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.5643947124481201}\n", - "[flaml.tune.tune: 03-04 03:34:18] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 879, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.7857528212930487}}\n", - "[flaml.tune.tune: 03-04 03:34:19] {215} INFO - result: {'expected_success': 0.7, 'success': 0.7, 'total_cost': 0.039289999999999985, 'cost': 0.009464, 'inference_cost': 0.0004732, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 879, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 
0.7857528212930487}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 879, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.7857528212930487}, 'experiment_tag': 'exp', 'time_total_s': 0.615419864654541}\n", - "[flaml.tune.tune: 03-04 03:34:19] {811} INFO - trial 5 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'prompt': 3, 'stop': 0, 'n': 1}\n", - "[flaml.tune.tune: 03-04 03:34:20] {215} INFO - result: {'expected_success': 0.65, 'success': 0.65, 'total_cost': 0.049551999999999985, 'cost': 0.010261999999999999, 'inference_cost': 0.0005131, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9177741225129434}, 'max_tokens': 424, 'prompt': 3, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9177741225129434}, 'config/max_tokens': 424, 'config/prompt': 3, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.968498945236206}\n", - "[flaml.tune.tune: 03-04 03:34:20] {811} INFO - trial 6 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6177669784693172}, 'max_tokens': 231, 'prompt': 2, 'stop': 0, 'n': 1}\n", - "[flaml.tune.tune: 03-04 03:34:20] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.059558, 'cost': 0.010006000000000001, 'inference_cost': 0.0005003, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6177669784693172}, 'max_tokens': 231, 'prompt': 2, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6177669784693172}, 'config/max_tokens': 231, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.6833891868591309}\n", - "[flaml.tune.tune: 03-04 03:34:20] {811} INFO - trial 7 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 342, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.4957816798898031}}\n", - "[flaml.tune.tune: 03-04 03:34:21] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.069498, 'cost': 0.009940000000000001, 'inference_cost': 0.000497, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 342, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.4957816798898031}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 342, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.4957816798898031}, 'experiment_tag': 'exp', 'time_total_s': 0.5958354473114014}\n", - "[flaml.tune.tune: 03-04 03:34:21] {811} INFO - trial 8 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 156, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.7397522770488312}}\n", - "[flaml.tune.tune: 03-04 03:34:22] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.07878399999999999, 'cost': 0.009286000000000003, 'inference_cost': 0.00046430000000000006, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 156, 'prompt': 2, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.7397522770488312}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 156, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.7397522770488312}, 'experiment_tag': 'exp', 'time_total_s': 0.5807592868804932}\n", 
- "[flaml.tune.tune: 03-04 03:34:22] {811} INFO - trial 9 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 201, 'prompt': 3, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.6833066621306901}}\n", - "[flaml.tune.tune: 03-04 03:34:22] {215} INFO - result: {'expected_success': 0.7, 'success': 0.7, 'total_cost': 0.088372, 'cost': 0.009588000000000001, 'inference_cost': 0.0004794, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 201, 'prompt': 3, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.6833066621306901}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 201, 'config/prompt': 3, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.6833066621306901}, 'experiment_tag': 'exp', 'time_total_s': 0.756892204284668}\n", - "[flaml.tune.tune: 03-04 03:34:22] {811} INFO - trial 10 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 266, 'prompt': 1, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.5522272948079442}}\n", - "[flaml.tune.tune: 03-04 03:34:23] {215} INFO - result: {'expected_success': 0.35, 'success': 0.35, 'total_cost': 0.09748600000000002, 'cost': 0.009114, 'inference_cost': 0.0004557, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 266, 'prompt': 1, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.5522272948079442}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 266, 'config/prompt': 1, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.5522272948079442}, 'experiment_tag': 'exp', 'time_total_s': 0.5654494762420654}\n", - "[flaml.tune.tune: 03-04 03:34:23] {811} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 218, 'prompt': 3, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.6240777950749403}}\n", - "[flaml.tune.tune: 03-04 03:34:23] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.10044800000000001, 'cost': 0.0029620000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 218, 'prompt': 3, 'stop': 0, 'n': 1, 'temperature_or_top_p': {'temperature': 0.6240777950749403}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 218, 'config/prompt': 3, 'config/stop': 0, 'config/n': 1, 'config/temperature_or_top_p': {'temperature': 0.6240777950749403}, 'experiment_tag': 'exp', 'time_total_s': 0.003355741500854492}\n", - "[flaml.tune.tune: 03-04 03:34:23] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" - ] - } - ], - "source": [ - "import logging\n", - "\n", - "config, analysis = oai.ChatCompletion.tune(\n", - " data=tune_data, # the data for tuning\n", - " metric=\"expected_success\", # the metric to optimize\n", - " mode=\"max\", # the optimization mode\n", - " eval_func=success_metrics, # the evaluation function to return the success metrics\n", - " # log_file_name=\"logs/humaneval.log\", # the log file name\n", - " inference_budget=0.002, # the inference budget (dollar)\n", - " optimization_budget=0.1, # the optimization budget (dollar)\n", - " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", - " # -1 means decided by the optimization budget only\n", - " num_samples=-1,\n", - " prompt=[\n", - " \"{prompt}\",\n", - " \"# Python 3{prompt}\",\n", - " \"Complete the following Python function:{prompt}\",\n", - " \"Complete the following Python function while including necessary import statements inside the function:{prompt}\",\n", - " ], # the prompt templates to 
choose from\n", - " stop=[\"\\nprint\"], # the stop sequence\n", - " logging_level=logging.INFO, # the logging level\n", - " n=1, # the number of responses to generate\n", - ")\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output tuning results\n", - "\n", - "After the tuning, we can print out the config and the result found by FLAML:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:55.049204Z", - "iopub.status.busy": "2023-02-13T23:41:55.048871Z", - "iopub.status.idle": "2023-02-13T23:41:55.053284Z", - "shell.execute_reply": "2023-02-13T23:41:55.052574Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "optimized config {'model': 'gpt-3.5-turbo', 'max_tokens': 869, 'prompt': 'Complete the following Python function:{prompt}', 'stop': ['\\nprint'], 'n': 1, 'temperature': 0.6853598183677972}\n", - "best result on tuning data {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.02982599999999999, 'cost': 0.009866, 'inference_cost': 0.0004933, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6853598183677972}, 'max_tokens': 869, 'prompt': 2, 'stop': 0, 'n': 1}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6853598183677972}, 'config/max_tokens': 869, 'config/prompt': 2, 'config/stop': 0, 'config/n': 1, 'experiment_tag': 'exp', 'time_total_s': 0.5643947124481201}\n" - ] - } - ], - "source": [ - "print(\"optimized config\", config)\n", - "print(\"best result on tuning data\", analysis.best_result)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Make a request with the tuned config\n", - "\n", - "We can apply the tuned config on the request for an example task:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:55.056205Z", - "iopub.status.busy": "2023-02-13T23:41:55.055631Z", - "iopub.status.idle": "2023-02-13T23:41:56.039259Z", - "shell.execute_reply": "2023-02-13T23:41:56.038427Z" - }, - "slideshow": { - "slide_type": "subslide" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"content\": \"\\n\\ndef compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\\n\\n\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " }\n", - " ],\n", - " \"created\": 1677891117,\n", - " \"id\": \"chatcmpl-6qAPt1bjNEM80fK4JDOT3RqP3POjA\",\n", - " \"model\": \"gpt-3.5-turbo-0301\",\n", - " \"object\": \"chat.completion\",\n", - " \"usage\": {\n", - " \"completion_tokens\": 52,\n", - " \"prompt_tokens\": 237,\n", - " \"total_tokens\": 289\n", - " }\n", - "}\n", - "{'expected_success': 1.0, 'success': True}\n" - ] - } - ], - "source": [ - "responses = oai.ChatCompletion.create(context=tune_data[1], **config)\n", - "print(responses)\n", - "print(success_metrics([response[\"message\"][\"content\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "### Evaluate the success rate on the test data\n", - "\n", - "You can use flaml's `oai.ChatCompletion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.ChatCompletion.data` to the data to evaluate. The following code will take a while to evaluate all the 144 test data instances." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:56.042764Z", - "iopub.status.busy": "2023-02-13T23:41:56.042086Z", - "iopub.status.idle": "2023-02-13T23:53:05.597643Z", - "shell.execute_reply": "2023-02-13T23:53:05.596603Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'expected_success': 0.7152777777777778, 'success': 0.7152777777777778, 'total_cost': 0.17079400000000003, 'cost': 0.07034599999999996, 'inference_cost': 0.0004885138888888889}\n" - ] - } - ], - "source": [ - "oai.ChatCompletion.data = test_data\n", - "result = oai.ChatCompletion.eval(analysis.best_config, prune=False, eval_only=True)\n", - "print(result)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15 (main, Oct 26 2022, 03:47:43) \n[GCC 10.2.1 20210110]" - }, - "vscode": { - "interpreter": { - "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" - } - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "2d910cfd2d2a4fc49fc30fbbdc5576a7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "454146d0f7224f038689031002906e6f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", - 
"IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", - "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" - ], - "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", - "tabbable": null, - "tooltip": null - } - }, - "577e1e3cc4db4942b0883577b3b52755": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", - "tabbable": null, - "tooltip": null, - "value": 1 - } - }, - "6086462a12d54bafa59d3c4566f06cb2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "74a6ba0c3cbc4051be0a83e152fe1e62": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "7d3f3d9e15894d05a4d188ff4f466554": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "b40bdfb1ac1d4cffb7cefcb870c64d45": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", - "placeholder": "​", - "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", - "tabbable": null, - "tooltip": null, - "value": " 1/1 [00:00<00:00, 44.69it/s]" - } - }, - "ca245376fd9f4354af6b2befe4af4466": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "dc83c7bff2f241309537a8119dfc7555": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e4ae2b6f5a974fd4bafb6abb9d12ff26": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", - "placeholder": "​", - "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", - "tabbable": null, - "tooltip": null, - "value": "100%" - } - }, - "f1355871cc6f4dd4b50d9df5af20e5c8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - 
"grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebook/integrate_chatgpt_math.ipynb b/notebook/integrate_chatgpt_math.ipynb deleted file mode 100644 index 66392efc69e0..000000000000 --- a/notebook/integrate_chatgpt_math.ipynb +++ /dev/null @@ -1,1386 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved. \n", - "\n", - "Licensed under the MIT License.\n", - "\n", - "# Use FLAML to Tune ChatGPT\n", - "\n", - "In this notebook, we tune OpenAI ChatGPT model for math problem solving. We use [the MATH benchmark](https://crfm.stanford.edu/helm/latest/?group=math_chain_of_thought) for measuring mathematical problem solving on competition math problems with chain-of-thoughts style reasoning. \n", - "\n", - "## Requirements\n", - "\n", - "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the [openai] option:\n", - "```bash\n", - "pip install flaml[openai]==1.2.0\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.317406Z", - "iopub.status.busy": "2023-02-13T23:40:52.316561Z", - "iopub.status.idle": "2023-02-13T23:40:52.321193Z", - "shell.execute_reply": "2023-02-13T23:40:52.320628Z" - } - }, - "outputs": [], - "source": [ - "# %pip install flaml[openai]==1.2.0 datasets" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set your OpenAI key:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.324240Z", - "iopub.status.busy": "2023-02-13T23:40:52.323783Z", - "iopub.status.idle": "2023-02-13T23:40:52.330570Z", - "shell.execute_reply": "2023-02-13T23:40:52.329750Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = \"\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When ChatGPT is available in Azure OpenAI, uncomment the following to use Azure OpenAI:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.333547Z", - "iopub.status.busy": "2023-02-13T23:40:52.333249Z", - "iopub.status.idle": "2023-02-13T23:40:52.336508Z", - "shell.execute_reply": "2023-02-13T23:40:52.335858Z" - } - }, - "outputs": [], - "source": [ - "# openai.api_type = \"azure\"\n", - "# openai.api_base = \"https://.openai.azure.com/\"\n", - "# openai.api_version = \"2023-3-01\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load dataset\n", - 
"\n", - "First, we load the competition_math dataset. The dataset contains 457 \"Level 1\" examples. We use a random sample of 20 examples for tuning the generation hyperparameters and the remaining for evaluation. We use one demonstration example in the prompt." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:52.339977Z", - "iopub.status.busy": "2023-02-13T23:40:52.339556Z", - "iopub.status.idle": "2023-02-13T23:40:54.603349Z", - "shell.execute_reply": "2023-02-13T23:40:54.602630Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset competition_math (/home/vscode/.cache/huggingface/datasets/competition_math/default/1.0.0/2a2a2995c2847186883ecd64f69be7d602b8a6f6b51950624d4dc2263f93333b)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "79ced88ccf474030bda228436813e94b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/2 [00:00 Optional[str]:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Extract the text within a \\\\boxed{...} environment.\n", - " Example:\n", - " >>> remove_boxed(\\\\boxed{\\\\frac{2}{3}})\n", - " \\\\frac{2}{3}\n", - " \"\"\"\n", - " left = \"\\\\boxed{\"\n", - " try:\n", - " assert string[: len(left)] == left\n", - " assert string[-1] == \"}\"\n", - " return string[len(left) : -1]\n", - " except Exception:\n", - " return None\n", - "\n", - "\n", - "def last_boxed_only_string(string: str) -> Optional[str]:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Extract the last \\\\boxed{...} or \\\\fbox{...} element from a string.\n", - " \"\"\"\n", - " idx = string.rfind(\"\\\\boxed\")\n", - " if idx < 0:\n", - " idx = string.rfind(\"\\\\fbox\")\n", - " if idx < 0:\n", - " return None\n", - "\n", - " i = idx\n", - " right_brace_idx = None\n", - " num_left_braces_open = 0\n", - " while i < len(string):\n", - " if string[i] == \"{\":\n", - " num_left_braces_open += 1\n", - " if string[i] == \"}\":\n", - " num_left_braces_open -= 1\n", - " if num_left_braces_open == 0:\n", - " right_brace_idx = i\n", - " break\n", - " i += 1\n", - "\n", - " if right_brace_idx is None:\n", - " retval = None\n", - " else:\n", - " retval = string[idx : right_brace_idx + 1]\n", - "\n", - " return retval\n", - "\n", - "\n", - "def _fix_fracs(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat fractions.\n", - " Examples:\n", - " >>> _fix_fracs(\"\\\\frac1b\")\n", - " \\frac{1}{b}\n", - " >>> _fix_fracs(\"\\\\frac12\")\n", - " \\frac{1}{2}\n", - " >>> _fix_fracs(\"\\\\frac1{72}\")\n", - " \\frac{1}{72}\n", - " \"\"\"\n", - " substrs = string.split(\"\\\\frac\")\n", - " new_str = substrs[0]\n", - " if len(substrs) > 1:\n", - " substrs = substrs[1:]\n", - " for substr in substrs:\n", - " new_str += \"\\\\frac\"\n", - " if substr[0] == \"{\":\n", - " new_str += substr\n", - " else:\n", - " try:\n", - " assert len(substr) >= 2\n", - " except Exception:\n", - " return string\n", - " a = substr[0]\n", - " b = substr[1]\n", - " if b != \"{\":\n", - " if len(substr) > 2:\n", - " post_substr = substr[2:]\n", - " new_str += \"{\" + a + \"}{\" + b + \"}\" + post_substr\n", - " else:\n", - " new_str += \"{\" + a + \"}{\" + b + \"}\"\n", - " else:\n", - " if len(substr) > 2:\n", - " post_substr = substr[2:]\n", - " new_str += \"{\" + a + \"}\" + b + post_substr\n", - " else:\n", - " new_str += \"{\" + a + \"}\" + b\n", - " 
string = new_str\n", - " return string\n", - "\n", - "\n", - "def _fix_a_slash_b(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat fractions formatted as a/b to \\\\frac{a}{b}.\n", - " Example:\n", - " >>> _fix_a_slash_b(\"2/3\")\n", - " \\frac{2}{3}\n", - " \"\"\"\n", - " if len(string.split(\"/\")) != 2:\n", - " return string\n", - " a_str = string.split(\"/\")[0]\n", - " b_str = string.split(\"/\")[1]\n", - " try:\n", - " a = int(a_str)\n", - " b = int(b_str)\n", - " assert string == \"{}/{}\".format(a, b)\n", - " new_string = \"\\\\frac{\" + str(a) + \"}{\" + str(b) + \"}\"\n", - " return new_string\n", - " except Exception:\n", - " return string\n", - "\n", - "\n", - "def _remove_right_units(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Remove units (on the right).\n", - " \"\\\\text{ \" only ever occurs (at least in the val set) when describing units.\n", - " \"\"\"\n", - " if \"\\\\text{ \" in string:\n", - " splits = string.split(\"\\\\text{ \")\n", - " assert len(splits) == 2\n", - " return splits[0]\n", - " else:\n", - " return string\n", - "\n", - "\n", - "def _fix_sqrt(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Reformat square roots.\n", - " Example:\n", - " >>> _fix_sqrt(\"\\\\sqrt3\")\n", - " \\sqrt{3}\n", - " \"\"\"\n", - " if \"\\\\sqrt\" not in string:\n", - " return string\n", - " splits = string.split(\"\\\\sqrt\")\n", - " new_string = splits[0]\n", - " for split in splits[1:]:\n", - " if split[0] != \"{\":\n", - " a = split[0]\n", - " new_substr = \"\\\\sqrt{\" + a + \"}\" + split[1:]\n", - " else:\n", - " new_substr = \"\\\\sqrt\" + split\n", - " new_string += new_substr\n", - " return new_string\n", - "\n", - "\n", - "def _strip_string(string: str) -> str:\n", - " \"\"\"Source: https://github.com/hendrycks/math\n", - " Apply the reformatting helper functions above.\n", - " \"\"\"\n", - " # linebreaks\n", - " string = string.replace(\"\\n\", \"\")\n", - " # print(string)\n", - "\n", - " # remove inverse spaces\n", - " string = string.replace(\"\\\\!\", \"\")\n", - " # print(string)\n", - "\n", - " # replace \\\\ with \\\n", - " string = string.replace(\"\\\\\\\\\", \"\\\\\")\n", - " # print(string)\n", - "\n", - " # replace tfrac and dfrac with frac\n", - " string = string.replace(\"tfrac\", \"frac\")\n", - " string = string.replace(\"dfrac\", \"frac\")\n", - " # print(string)\n", - "\n", - " # remove \\left and \\right\n", - " string = string.replace(\"\\\\left\", \"\")\n", - " string = string.replace(\"\\\\right\", \"\")\n", - " # print(string)\n", - "\n", - " # Remove circ (degrees)\n", - " string = string.replace(\"^{\\\\circ}\", \"\")\n", - " string = string.replace(\"^\\\\circ\", \"\")\n", - "\n", - " # remove dollar signs\n", - " string = string.replace(\"\\\\$\", \"\")\n", - "\n", - " # remove units (on the right)\n", - " string = _remove_right_units(string)\n", - "\n", - " # remove percentage\n", - " string = string.replace(\"\\\\%\", \"\")\n", - " string = string.replace(\"\\%\", \"\")\n", - "\n", - " # \" 0.\" equivalent to \" .\" and \"{0.\" equivalent to \"{.\" Alternatively, add \"0\" if \".\" is the start of the string\n", - " string = string.replace(\" .\", \" 0.\")\n", - " string = string.replace(\"{.\", \"{0.\")\n", - " # if empty, return empty string\n", - " if len(string) == 0:\n", - " return string\n", - " if string[0] == \".\":\n", - " string = \"0\" + string\n", - "\n", - " # to consider: get rid of e.g. 
\"k = \" or \"q = \" at beginning\n", - " if len(string.split(\"=\")) == 2:\n", - " if len(string.split(\"=\")[0]) <= 2:\n", - " string = string.split(\"=\")[1]\n", - "\n", - " # fix sqrt3 --> sqrt{3}\n", - " string = _fix_sqrt(string)\n", - "\n", - " # remove spaces\n", - " string = string.replace(\" \", \"\")\n", - "\n", - " # \\frac1b or \\frac12 --> \\frac{1}{b} and \\frac{1}{2}, etc.\n", - " # Even works with \\frac1{72} (but not \\frac{72}1).\n", - " # Also does a/b --> \\\\frac{a}{b}\n", - " string = _fix_fracs(string)\n", - "\n", - " # manually change 0.5 --> \\frac{1}{2}\n", - " if string == \"0.5\":\n", - " string = \"\\\\frac{1}{2}\"\n", - "\n", - " # NOTE: X/Y changed to \\frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y\n", - " string = _fix_a_slash_b(string)\n", - "\n", - " return string\n", - "\n", - "\n", - "def get_answer(solution: Optional[str]) -> Optional[str]:\n", - " if solution is None:\n", - " return None\n", - " last_boxed = last_boxed_only_string(solution)\n", - " if last_boxed is None:\n", - " return None\n", - " answer = remove_boxed(last_boxed)\n", - " if answer is None:\n", - " return None\n", - " return answer\n", - "\n", - "\n", - "def is_equiv(str1: Optional[str], str2: Optional[str]) -> float:\n", - " \"\"\"Returns (as a float) whether two strings containing math are equivalent up to differences of formatting in\n", - " - units\n", - " - fractions\n", - " - square roots\n", - " - superfluous LaTeX.\n", - " Source: https://github.com/hendrycks/math\n", - " \"\"\"\n", - " if str1 is None and str2 is None:\n", - " print(\"WARNING: Both None\")\n", - " return 1.0\n", - " if str1 is None or str2 is None:\n", - " return 0.0\n", - "\n", - " try:\n", - " ss1 = _strip_string(str1)\n", - " ss2 = _strip_string(str2)\n", - " return float(ss1 == ss2)\n", - " except Exception:\n", - " return float(str1 == str2)\n", - "\n", - "\n", - "def is_equiv_chain_of_thought(str1: str, str2: str) -> float:\n", - " \"\"\"Strips the solution first before calling `is_equiv`.\"\"\"\n", - " ans1 = get_answer(str1)\n", - " ans2 = get_answer(str2)\n", - "\n", - " return is_equiv(ans1, ans2)\n", - "\n", - "\n", - "def success_metrics(responses, solution, **args):\n", - " \"\"\"Check if each response is correct.\n", - " \n", - " Args:\n", - " responses (list): The list of responses.\n", - " solution (str): The canonical solution.\n", - " \n", - " Returns:\n", - " dict: The success metrics.\n", - " \"\"\"\n", - " success_list = []\n", - " n = len(responses)\n", - " for i in range(n):\n", - " response = responses[i]\n", - " succeed = is_equiv_chain_of_thought(response, solution)\n", - " success_list.append(succeed)\n", - " return {\n", - " \"expected_success\": 1 - pow(1 - sum(success_list) / n, n),\n", - " \"success\": any(s for s in success_list),\n", - " }\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "## Use the tuning data to find a good configuration\n", - "\n", - "### Import the oai and tune subpackages from flaml.\n", - "\n", - "FLAML has provided an API for hyperparameter optimization of OpenAI ChatGPT models: `oai.ChatCompletion.tune` and to make a request with the tuned config: `oai.ChatCompletion.create`. 
First, we import oai from flaml:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:54.634335Z", - "iopub.status.busy": "2023-02-13T23:40:54.633929Z", - "iopub.status.idle": "2023-02-13T23:40:56.105700Z", - "shell.execute_reply": "2023-02-13T23:40:56.105085Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "from flaml import oai" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For (local) reproducibility and cost efficiency, we cache responses from OpenAI." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:56.109177Z", - "iopub.status.busy": "2023-02-13T23:40:56.108624Z", - "iopub.status.idle": "2023-02-13T23:40:56.112651Z", - "shell.execute_reply": "2023-02-13T23:40:56.112076Z" - }, - "slideshow": { - "slide_type": "slide" - } - }, - "outputs": [], - "source": [ - "oai.ChatCompletion.set_cache(seed)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This will create a disk cache in \".cache/{seed}\". You can change `cache_path` in `set_cache()`. The caches for different seeds are stored separately.\n", - "\n", - "### Perform tuning\n", - "\n", - "The tuning will take a while to finish, depending on the optimization budget; it is performed under the budget constraints specified below.\n", - "\n", - "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.002 means the target inference budget is 0.002 dollars, which translates to 1000 tokens (input + output combined) if the gpt-3.5-turbo model is used.\n", - "* `optimization_budget` is the total budget allowed for the tuning. For example, 0.5 means 0.5 dollars are allowed in total, which translates to 250K tokens for the gpt-3.5-turbo model.\n", - "* `num_samples` is the maximum number of different hyperparameter configurations to try. The tuning will stop after either num_samples trials are run or optimization_budget dollars are spent, whichever happens first. -1 means no hard restriction on the number of trials; the actual number is decided by `optimization_budget`.\n", - "\n", - "Users can specify the tuning data, optimization metric, optimization mode, evaluation function, search spaces, etc. The default search space is:\n", - "\n", - "```python\n", - "price1K = {\n", - " \"gpt-3.5-turbo\": 0.002,\n", - "}\n", - "\n", - "default_search_space = {\n", - " \"model\": tune.choice(list(price1K.keys())),\n", - " \"temperature_or_top_p\": tune.choice(\n", - " [\n", - " {\"temperature\": tune.uniform(0, 1)},\n", - " {\"top_p\": tune.uniform(0, 1)},\n", - " ]\n", - " ),\n", - " \"max_tokens\": tune.lograndint(50, 1000),\n", - " \"n\": tune.randint(1, 100),\n", - " \"prompt\": \"{prompt}\",\n", - "}\n", - "```\n", - "\n", - "The default search space can be overridden by the user's input.\n", - "For example, the following code specifies a fixed prompt template and a list of stop sequences. For hyperparameters that don't appear in the user's input, the default search space will be used."
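The source of the tuning cell whose logs follow is cut off in this excerpt. As a rough sketch under the same `oai.ChatCompletion.tune` signature used by the HumanEval example earlier in this diff, an override with a fixed prompt template and a stop list could look like this (the template, stop sequences, and budget values are illustrative, not the deleted notebook's exact ones):

```python
import logging

from flaml import oai

# Illustrative override: `prompt` and `stop` are fixed, so only the remaining
# hyperparameters (model, temperature_or_top_p, max_tokens, n) are searched.
config, analysis = oai.ChatCompletion.tune(
    data=tune_data,             # the tuning examples sampled above
    metric="expected_success",  # the metric to optimize
    mode="max",                 # the optimization mode
    eval_func=success_metrics,  # the evaluation function defined above
    inference_budget=0.002,     # target average inference cost per instance (dollars)
    optimization_budget=0.5,    # total tuning budget (dollars)
    num_samples=-1,             # let the optimization budget decide
    prompt="{problem} Solve the problem carefully.",  # illustrative fixed template
    stop=["###"],               # illustrative stop sequences
    logging_level=logging.INFO, # print per-trial logs like the ones below
)
```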
- ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:40:56.115383Z", - "iopub.status.busy": "2023-02-13T23:40:56.114975Z", - "iopub.status.idle": "2023-02-13T23:41:55.045654Z", - "shell.execute_reply": "2023-02-13T23:41:55.044973Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m[I 2023-03-05 05:01:24,381]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 1 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.36280922847807595}, 'max_tokens': 347, 'n': 10, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.011049999999999999, 'cost': 0.011049999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.36280922847807595}, 'max_tokens': 347, 'n': 10, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.36280922847807595}, 'config/max_tokens': 347, 'config/n': 10, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0027980804443359375}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 2 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6336482349262754}, 'max_tokens': 470, 'n': 50, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6336482349262754}, 'max_tokens': 470, 'n': 50, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6336482349262754}, 'config/max_tokens': 470, 'config/n': 50, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0004801750183105469}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 3 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0.5308234838865221, 'success': 0.6, 'total_cost': 0.043492, 'cost': 0.032442, 'inference_cost': 0.0016220999999999998, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.7605307121989587}, 'config/max_tokens': 82, 'config/n': 9, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0066220760345458984}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.003948266327914451}, 'max_tokens': 231, 'n': 81, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.049, 'cost': 0.005508, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.003948266327914451}, 'max_tokens': 231, 'n': 81, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.003948266327914451}, 'config/max_tokens': 231, 
'config/n': 81, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0020475387573242188}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 5 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.29187606817063316}, 'max_tokens': 781, 'n': 71, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.29187606817063316}, 'max_tokens': 781, 'n': 71, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.29187606817063316}, 'config/max_tokens': 781, 'config/n': 71, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005230903625488281}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 6 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3733407600514692}, 'max_tokens': 375, 'n': 44, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3733407600514692}, 'max_tokens': 375, 'n': 44, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.3733407600514692}, 'config/max_tokens': 375, 'config/n': 44, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.000446319580078125}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 7 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.5131382425543909}, 'max_tokens': 350, 'n': 60, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.5131382425543909}, 'max_tokens': 350, 'n': 60, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.5131382425543909}, 'config/max_tokens': 350, 'config/n': 60, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.00055694580078125}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 8 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.08172600000000001, 'cost': 0.032726000000000005, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9086488808086682}, 'max_tokens': 129, 'n': 9, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9086488808086682}, 'config/max_tokens': 129, 'config/n': 9, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.004898548126220703}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 9 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8286813263076767}, 'max_tokens': 57, 'n': 63, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.09077800000000001, 'cost': 0.009052000000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 
'temperature_or_top_p': {'temperature': 0.8286813263076767}, 'max_tokens': 57, 'n': 63, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.8286813263076767}, 'config/max_tokens': 57, 'config/n': 63, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0021355152130126953}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 10 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.1989475396788123}, 'max_tokens': 650, 'n': 35, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.1989475396788123}, 'max_tokens': 650, 'n': 35, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.1989475396788123}, 'config/max_tokens': 650, 'config/n': 35, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0006568431854248047}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8839364795611863}, 'max_tokens': 132, 'n': 17, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.09582600000000001, 'cost': 0.005048, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8839364795611863}, 'max_tokens': 132, 'n': 17, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.8839364795611863}, 'config/max_tokens': 132, 'config/n': 17, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.009762048721313477}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 12 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8211056578369285}, 'max_tokens': 78, 'n': 39, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:24] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.8211056578369285}, 'max_tokens': 78, 'n': 39, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.8211056578369285}, 'config/max_tokens': 78, 'config/n': 39, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.007121086120605469}\n", - "[flaml.tune.tune: 03-05 05:01:24] {811} INFO - trial 13 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.0422875090290305}, 'max_tokens': 56, 'n': 3, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:35] {215} INFO - result: {'expected_success': 0.15, 'success': 0.15, 'total_cost': 0.10778599999999998, 'cost': 0.011960000000000002, 'inference_cost': 0.000598, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.0422875090290305}, 'max_tokens': 56, 'n': 3, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.0422875090290305}, 'config/max_tokens': 56, 'config/n': 3, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 10.761135816574097}\n", - "[flaml.tune.tune: 03-05 05:01:35] {811} INFO - trial 14 config: {'model': 'gpt-3.5-turbo', 
'temperature_or_top_p': {'temperature': 0.11030610637969397}, 'max_tokens': 52, 'n': 3, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:52] {215} INFO - result: {'expected_success': 0.1, 'success': 0.1, 'total_cost': 0.11931399999999996, 'cost': 0.011528, 'inference_cost': 0.0005764, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.11030610637969397}, 'max_tokens': 52, 'n': 3, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.11030610637969397}, 'config/max_tokens': 52, 'config/n': 3, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 17.322299242019653}\n", - "[flaml.tune.tune: 03-05 05:01:52] {811} INFO - trial 15 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5632321190691856}, 'max_tokens': 89, 'n': 22, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:52] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5632321190691856}, 'max_tokens': 89, 'n': 22, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5632321190691856}, 'config/max_tokens': 89, 'config/n': 22, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0008306503295898438}\n", - "[flaml.tune.tune: 03-05 05:01:52] {811} INFO - trial 16 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.04561271084264061}, 'max_tokens': 51, 'n': 98, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:01:54] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.12412799999999996, 'cost': 0.004814, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.04561271084264061}, 'max_tokens': 51, 'n': 98, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.04561271084264061}, 'config/max_tokens': 51, 'config/n': 98, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 1.575875997543335}\n", - "[flaml.tune.tune: 03-05 05:01:54] {811} INFO - trial 17 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5087240651577944}, 'max_tokens': 95, 'n': 1, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:02:20] {215} INFO - result: {'expected_success': 0.3, 'success': 0.3, 'total_cost': 0.13279399999999997, 'cost': 0.008666, 'inference_cost': 0.0004333, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5087240651577944}, 'max_tokens': 95, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5087240651577944}, 'config/max_tokens': 95, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 26.14193034172058}\n", - "[flaml.tune.tune: 03-05 05:02:20] {811} INFO - trial 18 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6040740802039921}, 'max_tokens': 129, 'n': 25, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:02:20] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6040740802039921}, 'max_tokens': 129, 'n': 25, 'prompt': 0, 'stop': 0}, 
'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6040740802039921}, 'config/max_tokens': 129, 'config/n': 25, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0008137226104736328}\n", - "[flaml.tune.tune: 03-05 05:02:20] {811} INFO - trial 19 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3754115138138923}, 'max_tokens': 86, 'n': 12, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:02:33] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.149274, 'cost': 0.01648, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3754115138138923}, 'max_tokens': 86, 'n': 12, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.3754115138138923}, 'config/max_tokens': 86, 'config/n': 12, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 13.519219398498535}\n", - "[flaml.tune.tune: 03-05 05:02:33] {811} INFO - trial 20 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6887263877538047}, 'max_tokens': 173, 'n': 28, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:02:33] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.6887263877538047}, 'max_tokens': 173, 'n': 28, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.6887263877538047}, 'config/max_tokens': 173, 'config/n': 28, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005598068237304688}\n", - "[flaml.tune.tune: 03-05 05:02:33] {811} INFO - trial 21 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.40706161658517775}, 'max_tokens': 217, 'n': 5, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:03:20] {215} INFO - result: {'expected_success': 0.739152, 'success': 0.8, 'total_cost': 0.17876000000000006, 'cost': 0.029486000000000002, 'inference_cost': 0.0014743, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.40706161658517775}, 'max_tokens': 217, 'n': 5, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.40706161658517775}, 'config/max_tokens': 217, 'config/n': 5, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 47.16692495346069}\n", - "[flaml.tune.tune: 03-05 05:03:20] {811} INFO - trial 22 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 174, 'n': 2, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.27048488009754645}}\n", - "[flaml.tune.tune: 03-05 05:04:01] {215} INFO - result: {'expected_success': 0.5125, 'success': 0.55, 'total_cost': 0.19355200000000006, 'cost': 0.014792000000000003, 'inference_cost': 0.0007396000000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 174, 'n': 2, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.27048488009754645}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 174, 'config/n': 2, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.27048488009754645}, 'experiment_tag': 'exp', 'time_total_s': 40.51927351951599}\n", - "[flaml.tune.tune: 03-05 05:04:01] {811} INFO - trial 23 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': 
{'temperature': 0.3413175996734835}, 'max_tokens': 275, 'n': 52, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:04:01] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3413175996734835}, 'max_tokens': 275, 'n': 52, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.3413175996734835}, 'config/max_tokens': 275, 'config/n': 52, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0007867813110351562}\n", - "[flaml.tune.tune: 03-05 05:04:01] {811} INFO - trial 24 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.2645495244555195}, 'max_tokens': 499, 'n': 12, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:04:01] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.2645495244555195}, 'max_tokens': 499, 'n': 12, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.2645495244555195}, 'config/max_tokens': 499, 'config/n': 12, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0006549358367919922}\n", - "[flaml.tune.tune: 03-05 05:04:01] {811} INFO - trial 25 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.48492162197022287}, 'max_tokens': 174, 'n': 2, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:04:40] {215} INFO - result: {'expected_success': 0.55, 'success': 0.6, 'total_cost': 0.2079620000000001, 'cost': 0.01441, 'inference_cost': 0.0007205, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.48492162197022287}, 'max_tokens': 174, 'n': 2, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.48492162197022287}, 'config/max_tokens': 174, 'config/n': 2, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 38.88523626327515}\n", - "[flaml.tune.tune: 03-05 05:04:40] {811} INFO - trial 26 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7008948011018361}, 'max_tokens': 188, 'n': 2, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:05:20] {215} INFO - result: {'expected_success': 0.6375, 'success': 0.65, 'total_cost': 0.22241600000000009, 'cost': 0.014454, 'inference_cost': 0.0007227000000000001, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7008948011018361}, 'max_tokens': 188, 'n': 2, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.7008948011018361}, 'config/max_tokens': 188, 'config/n': 2, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 40.07520294189453}\n", - "[flaml.tune.tune: 03-05 05:05:20] {811} INFO - trial 27 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.45563880608336627}, 'max_tokens': 181, 'n': 1, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:05:54] {215} INFO - result: {'expected_success': 0.55, 'success': 0.55, 'total_cost': 0.23225200000000013, 'cost': 0.009836000000000001, 'inference_cost': 0.0004918, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 
0.45563880608336627}, 'max_tokens': 181, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.45563880608336627}, 'config/max_tokens': 181, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 34.365720987319946}\n", - "[flaml.tune.tune: 03-05 05:05:54] {811} INFO - trial 28 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.21155867162942757}, 'max_tokens': 183, 'n': 17, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:05:57] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.23748400000000014, 'cost': 0.005232, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.21155867162942757}, 'max_tokens': 183, 'n': 17, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.21155867162942757}, 'config/max_tokens': 183, 'config/n': 17, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 2.9915997982025146}\n", - "[flaml.tune.tune: 03-05 05:05:57] {811} INFO - trial 29 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.652909170066013}, 'max_tokens': 285, 'n': 31, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:05:57] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.652909170066013}, 'max_tokens': 285, 'n': 31, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.652909170066013}, 'config/max_tokens': 285, 'config/n': 31, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005283355712890625}\n", - "[flaml.tune.tune: 03-05 05:05:57] {811} INFO - trial 30 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9990495004030453}, 'max_tokens': 219, 'n': 18, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:06:02] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.24319000000000013, 'cost': 0.005706, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9990495004030453}, 'max_tokens': 219, 'n': 18, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.9990495004030453}, 'config/max_tokens': 219, 'config/n': 18, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.099469184875488}\n", - "[flaml.tune.tune: 03-05 05:06:02] {811} INFO - trial 31 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4467837016610728}, 'max_tokens': 404, 'n': 1, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:06:50] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.25467800000000024, 'cost': 0.011488, 'inference_cost': 0.0005744, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4467837016610728}, 'max_tokens': 404, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4467837016610728}, 'config/max_tokens': 404, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 47.18360900878906}\n", - "[flaml.tune.tune: 03-05 05:06:50] {811} INFO - trial 32 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 
0.7150017857658078}, 'max_tokens': 469, 'n': 9, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:06:50] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7150017857658078}, 'max_tokens': 469, 'n': 9, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.7150017857658078}, 'config/max_tokens': 469, 'config/n': 9, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.000614166259765625}\n", - "[flaml.tune.tune: 03-05 05:06:50] {811} INFO - trial 33 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.2594708806296415}, 'max_tokens': 352, 'n': 7, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:07:35] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.29123200000000016, 'cost': 0.036554, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.2594708806296415}, 'max_tokens': 352, 'n': 7, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.2594708806296415}, 'config/max_tokens': 352, 'config/n': 7, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 45.43464660644531}\n", - "[flaml.tune.tune: 03-05 05:07:35] {811} INFO - trial 34 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5691158455115929}, 'max_tokens': 520, 'n': 22, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:07:35] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5691158455115929}, 'max_tokens': 520, 'n': 22, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5691158455115929}, 'config/max_tokens': 520, 'config/n': 22, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0005013942718505859}\n", - "[flaml.tune.tune: 03-05 05:07:35] {811} INFO - trial 35 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4357505186889488}, 'max_tokens': 153, 'n': 1, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:08:11] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.3012180000000001, 'cost': 0.009986, 'inference_cost': 0.0004993, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4357505186889488}, 'max_tokens': 153, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4357505186889488}, 'config/max_tokens': 153, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 36.294803857803345}\n", - "[flaml.tune.tune: 03-05 05:08:11] {811} INFO - trial 36 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.43174456068612144}, 'max_tokens': 244, 'n': 1, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:08:50] {215} INFO - result: {'expected_success': 0.45, 'success': 0.45, 'total_cost': 0.3115360000000001, 'cost': 0.010318, 'inference_cost': 0.0005159, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.43174456068612144}, 'max_tokens': 244, 'n': 1, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 
'config/temperature_or_top_p': {'temperature': 0.43174456068612144}, 'config/max_tokens': 244, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 38.782007455825806}\n", - "[flaml.tune.tune: 03-05 05:08:50] {811} INFO - trial 37 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.31174598735063297}, 'max_tokens': 152, 'n': 93, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:08:50] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.31174598735063297}, 'max_tokens': 152, 'n': 93, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.31174598735063297}, 'config/max_tokens': 152, 'config/n': 93, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.000728607177734375}\n", - "[flaml.tune.tune: 03-05 05:08:50] {811} INFO - trial 38 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9998765149838305}, 'max_tokens': 968, 'n': 13, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:08:50] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.9998765149838305}, 'max_tokens': 968, 'n': 13, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.9998765149838305}, 'config/max_tokens': 968, 'config/n': 13, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0006527900695800781}\n", - "[flaml.tune.tune: 03-05 05:08:50] {811} INFO - trial 39 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4077967938262427}, 'max_tokens': 208, 'n': 6, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:09:37] {215} INFO - result: {'expected_success': 0.8148458933470506, 'success': 0.85, 'total_cost': 0.344804, 'cost': 0.03326799999999999, 'inference_cost': 0.0016634000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4077967938262427}, 'max_tokens': 208, 'n': 6, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4077967938262427}, 'config/max_tokens': 208, 'config/n': 6, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 46.54340124130249}\n", - "[flaml.tune.tune: 03-05 05:09:37] {811} INFO - trial 40 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 340, 'n': 1, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.4404342494313882}}\n", - "[flaml.tune.tune: 03-05 05:10:23] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 0.356122, 'cost': 0.011318000000000002, 'inference_cost': 0.0005658999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 340, 'n': 1, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.4404342494313882}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 340, 'config/n': 1, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.4404342494313882}, 'experiment_tag': 'exp', 'time_total_s': 45.89974808692932}\n", - "[flaml.tune.tune: 03-05 05:10:23] {811} INFO - trial 41 config: {'model': 'gpt-3.5-turbo', 'max_tokens': 127, 'n': 16, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': 
{'temperature': 0.37515933822109715}}\n", - "[flaml.tune.tune: 03-05 05:10:26] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.361062, 'cost': 0.00494, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'max_tokens': 127, 'n': 16, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.37515933822109715}}, 'config/model': 'gpt-3.5-turbo', 'config/max_tokens': 127, 'config/n': 16, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.37515933822109715}, 'experiment_tag': 'exp', 'time_total_s': 3.5503623485565186}\n", - "[flaml.tune.tune: 03-05 05:10:26] {811} INFO - trial 42 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.996156173020253}, 'max_tokens': 107, 'n': 7, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:11:06] {215} INFO - result: {'expected_success': 0.646968646445905, 'success': 0.7, 'total_cost': 0.39229600000000003, 'cost': 0.031234, 'inference_cost': 0.0015617, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.996156173020253}, 'max_tokens': 107, 'n': 7, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.996156173020253}, 'config/max_tokens': 107, 'config/n': 7, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 40.09834337234497}\n", - "[flaml.tune.tune: 03-05 05:11:06] {811} INFO - trial 43 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.712309746815617}, 'max_tokens': 112, 'n': 77, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:11:06] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.712309746815617}, 'max_tokens': 112, 'n': 77, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.712309746815617}, 'config/max_tokens': 112, 'config/n': 77, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0007219314575195312}\n", - "[flaml.tune.tune: 03-05 05:11:06] {811} INFO - trial 44 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7694213309158455}, 'max_tokens': 226, 'n': 8, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:11:55] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.42729200000000006, 'cost': 0.034996, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.7694213309158455}, 'max_tokens': 226, 'n': 8, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.7694213309158455}, 'config/max_tokens': 226, 'config/n': 8, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 48.949331283569336}\n", - "[flaml.tune.tune: 03-05 05:11:55] {811} INFO - trial 45 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9557646172390091}, 'max_tokens': 293, 'n': 45, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:11:55] {215} INFO - result: {'inference_cost': inf, 'expected_success': -inf, 'cost': 0, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9557646172390091}, 'max_tokens': 293, 'n': 45, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9557646172390091}, 'config/max_tokens': 293, 'config/n': 
45, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0007379055023193359}\n", - "[flaml.tune.tune: 03-05 05:11:55] {811} INFO - trial 46 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9767564075397783}, 'max_tokens': 65, 'n': 16, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:12:03] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.436042, 'cost': 0.008749999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9767564075397783}, 'max_tokens': 65, 'n': 16, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9767564075397783}, 'config/max_tokens': 65, 'config/n': 16, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 8.102897882461548}\n", - "[flaml.tune.tune: 03-05 05:12:03] {811} INFO - trial 47 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3783227519390696}, 'max_tokens': 111, 'n': 6, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:12:39] {215} INFO - result: {'expected_success': 0.5908468364197531, 'success': 0.65, 'total_cost': 0.46333, 'cost': 0.027288, 'inference_cost': 0.0013644, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.3783227519390696}, 'max_tokens': 111, 'n': 6, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.3783227519390696}, 'config/max_tokens': 111, 'config/n': 6, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 35.84658098220825}\n", - "[flaml.tune.tune: 03-05 05:12:39] {811} INFO - trial 48 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5239740220006481}, 'max_tokens': 150, 'n': 10, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:12:49] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.47180400000000006, 'cost': 0.008474, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.5239740220006481}, 'max_tokens': 150, 'n': 10, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.5239740220006481}, 'config/max_tokens': 150, 'config/n': 10, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 9.35022783279419}\n", - "[flaml.tune.tune: 03-05 05:12:49] {811} INFO - trial 49 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4090242730676276}, 'max_tokens': 198, 'n': 6, 'prompt': 0, 'stop': 0}\n", - "[flaml.tune.tune: 03-05 05:13:30] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.500916, 'cost': 0.029112000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4090242730676276}, 'max_tokens': 198, 'n': 6, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4090242730676276}, 'config/max_tokens': 198, 'config/n': 6, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 40.903329372406006}\n", - "[flaml.tune.tune: 03-05 05:13:30] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" - ] - } - ], - "source": [ - "import logging\n", - "\n", - "config, analysis = oai.ChatCompletion.tune(\n", - " data=tune_data, # the data for tuning\n", - " metric=\"expected_success\", # the metric to optimize\n", - " 
mode=\"max\", # the optimization mode\n", - " eval_func=success_metrics, # the evaluation function to return the success metrics\n", - " # log_file_name=\"logs/math.log\", # the log file name\n", - " inference_budget=0.002, # the inference budget (dollar)\n", - " optimization_budget=0.5, # the optimization budget (dollar)\n", - " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", - " # -1 means decided by the optimization budget only\n", - " num_samples=-1,\n", - " prompt=prompts, # the prompt templates to choose from\n", - " stop=\"###\", # the stop sequence\n", - " logging_level=logging.INFO, # the logging level\n", - ")\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output tuning results\n", - "\n", - "After the tuning, we can print out the config and the result found by FLAML:" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:55.049204Z", - "iopub.status.busy": "2023-02-13T23:41:55.048871Z", - "iopub.status.idle": "2023-02-13T23:41:55.053284Z", - "shell.execute_reply": "2023-02-13T23:41:55.052574Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "optimized config {'model': 'gpt-3.5-turbo', 'max_tokens': 208, 'n': 6, 'prompt': at 0x7f80e405b430>, 'stop': '###', 'temperature': 0.4077967938262427}\n", - "best result on tuning data {'expected_success': 0.8148458933470506, 'success': 0.85, 'total_cost': 0.344804, 'cost': 0.03326799999999999, 'inference_cost': 0.0016634000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4077967938262427}, 'max_tokens': 208, 'n': 6, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4077967938262427}, 'config/max_tokens': 208, 'config/n': 6, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 46.54340124130249}\n" - ] - } - ], - "source": [ - "print(\"optimized config\", config)\n", - "print(\"best result on tuning data\", analysis.best_result)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "slideshow": { - "slide_type": "slide" - } - }, - "source": [ - "### Make a request with the tuned config\n", - "\n", - "We can apply the tuned config on the request for an example task:" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:55.056205Z", - "iopub.status.busy": "2023-02-13T23:41:55.055631Z", - "iopub.status.idle": "2023-02-13T23:41:56.039259Z", - "shell.execute_reply": "2023-02-13T23:41:56.038427Z" - }, - "slideshow": { - "slide_type": "subslide" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"content\": \"\\n\\nAnswer: Using the logarithmic identity $\\\\log_{a}(b\\\\cdot c)=\\\\log_{a}(b)+\\\\log_{a}(c)$, we can simplify the expression as follows: $$\\\\log_{10} 40 +\\\\log_{10} 25=\\\\log_{10}(40\\\\cdot 25)=\\\\log_{10}(1000)=\\\\boxed{3}.$$\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": null,\n", - " \"index\": 1,\n", - " \"message\": {\n", - " \"content\": \"\\n\\nAnswer: Using the logarithmic property $\\\\log_a b + \\\\log_a c = 
\\\\log_a (bc)$, we can combine the two logarithms to get $\\\\log_{10} 40 \\\\cdot 25$. Simplifying, we get $\\\\log_{10} 1000$. Since $10^3 = 1000$, we have $\\\\log_{10} 1000 = \\\\boxed{3}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 2,\n", - " \"message\": {\n", - " \"content\": \"\\n\\nAnswer: Using the logarithmic property $\\\\log_a b + \\\\log_a c = \\\\log_a (bc)$, we can simplify the expression as follows: $$\\\\log_{10} 40 + \\\\log_{10} 25 = \\\\log_{10} (40 \\\\cdot 25) = \\\\log_{10} 1000$$ Since $1000$ is equal to $10^3$, we have $\\\\log_{10} 1000 = \\\\boxed{3}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 3,\n", - " \"message\": {\n", - " \"content\": \"\\n\\nAnswer: Using the logarithmic identity $\\\\log_{a}(b\\\\cdot c) = \\\\log_{a}(b) + \\\\log_{a}(c)$, we can simplify the expression as follows:\\n\\n$$\\\\log_{10} 40 +\\\\log_{10} 25 = \\\\log_{10} (40\\\\cdot 25) = \\\\log_{10} 1000$$\\n\\nSince $1000 = 10^3$, we have $\\\\log_{10} 1000 = \\\\boxed{3}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 4,\n", - " \"message\": {\n", - " \"content\": \"\\n\\nAnswer: Using the logarithmic property $\\\\log_{a}(b) + \\\\log_{a}(c) = \\\\log_{a}(bc)$, we can simplify the expression to $\\\\log_{10}(40 \\\\cdot 25)$. Multiplying $40$ and $25$ gives us $1000$. Therefore, the expression simplifies to $\\\\log_{10}1000$. Since $10^3=1000$, we have $\\\\log_{10}1000 = \\\\boxed{3}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 5,\n", - " \"message\": {\n", - " \"content\": \"\\n\\nAnswer: Using the logarithmic identity $\\\\log_{a}(b) + \\\\log_{a}(c) = \\\\log_{a}(bc)$, we can simplify the expression to $\\\\log_{10}(40\\\\cdot25)$. Evaluating $40\\\\cdot25$ gives us $1000$, so our final answer is $\\\\log_{10}(1000) = \\\\boxed{3}$.\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " }\n", - " ],\n", - " \"created\": 1677992931,\n", - " \"id\": \"chatcmpl-6qau3onXVENQuWDXUttbTe3rJ27vH\",\n", - " \"model\": \"gpt-3.5-turbo-0301\",\n", - " \"object\": \"chat.completion\",\n", - " \"usage\": {\n", - " \"completion_tokens\": 575,\n", - " \"prompt_tokens\": 112,\n", - " \"total_tokens\": 687\n", - " }\n", - "}\n", - "{'expected_success': 1.0, 'success': True}\n" - ] - } - ], - "source": [ - "responses = oai.ChatCompletion.create(context=tune_data[1], **config)\n", - "print(responses)\n", - "print(success_metrics([response[\"message\"][\"content\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluate the success rate on the test data\n", - "\n", - "You can use flaml's `oai.ChatCompletion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.ChatCompletion.data` to the data to evaluate. The following code will take a while to evaluate all the 438 test data instances." 
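[Editor's note on the hunk above: the notebook passes `success_metrics` to both `tune` and `eval`, but its definition does not appear in this diff. As a rough illustration only (the function body below is an assumption, not the notebook's actual implementation), an evaluation function for this API receives the list of decoded responses for one data instance together with that instance's fields, and returns a dict of metrics that FLAML aggregates across the dataset:

```python
def success_metrics(responses, solution, **data):
    """Hypothetical sketch of a per-instance evaluation function.

    `responses` holds the model outputs for one problem; `solution` is an
    assumed field carrying the reference answer. A real implementation
    would check mathematical equivalence of the boxed answer rather than
    use a plain substring match.
    """
    if not responses:
        return {"expected_success": 0, "success": False}
    # Mark each of the n sampled responses as correct or not.
    successes = [solution in response for response in responses]
    return {
        # fraction of the sampled responses that are correct
        "expected_success": sum(successes) / len(successes),
        # whether at least one of the n responses solved the problem
        "success": any(successes),
    }
```

The returned keys mirror the metric names in the logs above, which is why `metric="expected_success"` with `mode="max"` can be optimized directly.]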
- ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "execution": { - "iopub.execute_input": "2023-02-13T23:41:56.042764Z", - "iopub.status.busy": "2023-02-13T23:41:56.042086Z", - "iopub.status.idle": "2023-02-13T23:53:05.597643Z", - "shell.execute_reply": "2023-02-13T23:53:05.596603Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'expected_success': 0.7719714162844925, 'success': 0.8123569794050344, 'total_cost': 1.1100199999999998, 'cost': 0.6091040000000002, 'inference_cost': 0.001393830663615561}\n" - ] - } - ], - "source": [ - "oai.ChatCompletion.data = test_data\n", - "result = oai.ChatCompletion.eval(analysis.best_config, prune=False, eval_only=True)\n", - "print(result)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - }, - "vscode": { - "interpreter": { - "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" - } - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "2d910cfd2d2a4fc49fc30fbbdc5576a7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "454146d0f7224f038689031002906e6f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", - "IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", - "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" - ], - "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", - "tabbable": null, - "tooltip": null - } - }, - "577e1e3cc4db4942b0883577b3b52755": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", - "tabbable": null, - "tooltip": null, - "value": 1 - } - }, - "6086462a12d54bafa59d3c4566f06cb2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "74a6ba0c3cbc4051be0a83e152fe1e62": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "7d3f3d9e15894d05a4d188ff4f466554": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "b40bdfb1ac1d4cffb7cefcb870c64d45": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", - "placeholder": "​", - "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", - "tabbable": null, - "tooltip": null, - "value": " 1/1 [00:00<00:00, 44.69it/s]" - } - }, - 
"ca245376fd9f4354af6b2befe4af4466": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "StyleView", - "background": null, - "description_width": "", - "font_size": null, - "text_color": null - } - }, - "dc83c7bff2f241309537a8119dfc7555": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e4ae2b6f5a974fd4bafb6abb9d12ff26": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "2.0.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "2.0.0", - "_view_name": "HTMLView", - "description": "", - "description_allow_html": false, - "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", - "placeholder": "​", - "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", - "tabbable": null, - "tooltip": null, - "value": "100%" - } - }, - "f1355871cc6f4dd4b50d9df5af20e5c8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, 
- "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebook/integrate_openai.ipynb b/notebook/integrate_openai.ipynb index 8edf39635a9e..6b7434265cdf 100644 --- a/notebook/integrate_openai.ipynb +++ b/notebook/integrate_openai.ipynb @@ -15,6 +15,8 @@ "\n", "# Use FLAML to Tune OpenAI Models\n", "\n", + "FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of LLMs.\n", + "\n", "In this notebook, we tune OpenAI models for code generation. We use [the HumanEval benchmark](https://huggingface.co/datasets/openai_humaneval) released by OpenAI for synthesizing programs from docstrings. \n", "\n", "## Requirements\n", @@ -126,7 +128,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "35cd066a31b242bb87b2c106ee72e5f2", + "model_id": "d025d7cf0bc3438ba290e24d97855d8f", "version_major": 2, "version_minor": 0 }, @@ -441,7 +443,7 @@ "\n", "### Perform tuning\n", "\n", - "The tuning will take a while to finish, depending on the optimization budget (~1 min for the current budget). The tuning will be performed under the specified optimization budgets.\n", + "The tuning will take a while to finish, depending on the optimization budget. The tuning will be performed under the specified optimization budgets.\n", "\n", "* `inference_budget` is the target average inference budget per instance in the benchmark. For example, 0.02 means the target inference budget is 0.02 dollars, which translates to 1000 tokens (input + output combined) if the text Davinci model is used.\n", "* `optimization_budget` is the total budget allowed to perform the tuning. For example, 5 means 5 dollars are allowed in total, which translates to 250K tokens for the text Davinci model.\n", @@ -450,19 +452,14 @@ "Users can specify tuning data, optimization metric, optimization mode, evaluation function, search spaces etc.. The default search space is:\n", "\n", "```python\n", - "price1K = {\n", - " \"text-ada-001\": 0.0004,\n", - " \"text-babbage-001\": 0.0005,\n", - " \"text-curie-001\": 0.002,\n", - " \"code-cushman-001\": 0.024,\n", - " \"code-davinci-002\": 0.1,\n", - " \"text-davinci-002\": 0.02,\n", - " \"text-davinci-003\": 0.02,\n", - " \"gpt-3.5-turbo\": 0.002,\n", - "}\n", - "\n", "default_search_space = {\n", - " \"model\": tune.choice(list(price1K.keys())),\n", + " \"model\": tune.choice([\n", + " \"text-ada-001\",\n", + " \"text-babbage-001\",\n", + " \"text-davinci-003\",\n", + " \"gpt-3.5-turbo\",\n", + " \"gpt-4\",\n", + " ]),\n", " \"temperature_or_top_p\": tune.choice(\n", " [\n", " {\"temperature\": tune.uniform(0, 1)},\n", @@ -475,13 +472,13 @@ "}\n", "```\n", "\n", - "The default search space can be overriden by users' input.\n", - "For example, the following code specifies two choices for the model, four choices for the prompt and a fixed list of stop sequences. For hyperparameters which don't appear in users' input, the default search space will be used." + "The default search space can be overridden by users' input.\n", + "For example, the following code specifies four choices for the prompt and a fixed list of stop sequences. 
For hyperparameters which don't appear in users' input, the default search space will be used. If you don't have access to gpt-4 or would like to modify the choice of models, you can provide a different search space for model." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 21, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:25:40.593603Z", @@ -495,119 +492,69 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m[I 2023-02-24 23:25:40,643]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m[I 2023-02-24 23:25:40,646]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:25:40] {811} INFO - trial 1 config: {'model': 'code-davinci-002', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:25:44] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.4624999999999999, 'cost': 0.4624999999999999, 'inference_cost': 0.023125, 'training_iteration': 0, 'config': {'model': 'code-davinci-002', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'code-davinci-002', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 3.687161445617676}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:25:44] {811} INFO - trial 2 config: {'model': 'code-cushman-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:25:45] {215} INFO - result: {'expected_success': 0.35, 'success': 0.35, 'total_cost': 0.5671159999999997, 'cost': 0.104616, 'inference_cost': 0.0052308, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6666913032531738}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:25:45] {811} INFO - trial 3 config: {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0, 'stop': 0}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:01] {215} INFO - result: {'expected_success': 0.5080706992649381, 'success': 0.55, 'total_cost': 1.1424679999999998, 'cost': 0.575352, 'inference_cost': 0.0287676, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.4985070123025904}, 'max_tokens': 97, 'n': 20, 'prompt': 0, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'top_p': 0.4985070123025904}, 'config/max_tokens': 97, 'config/n': 20, 
'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 16.66586470603943}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:01] {811} INFO - trial 4 config: {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.6125260668293881}, 'max_tokens': 433, 'n': 29, 'prompt': 0, 'stop': 0}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:38] {215} INFO - result: {'expected_success': 0.6186627404336135, 'success': 0.65, 'total_cost': 2.3693479999999987, 'cost': 1.2268800000000002, 'inference_cost': 0.059620799999999995, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.6125260668293881}, 'max_tokens': 433, 'n': 29, 'prompt': 0, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'top_p': 0.6125260668293881}, 'config/max_tokens': 433, 'config/n': 29, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 36.605130434036255}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:38] {811} INFO - trial 5 config: {'model': 'code-davinci-002', 'temperature_or_top_p': {'temperature': 0.6177669784693172}, 'max_tokens': 231, 'n': 65, 'prompt': 3, 'stop': 0}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:38] {215} INFO - result: {'expected_success': 0, 'total_cost': 2.5295479999999984, 'cost': 0.1602, 'training_iteration': 0, 'config': {'model': 'code-davinci-002', 'temperature_or_top_p': {'temperature': 0.6177669784693172}, 'max_tokens': 231, 'n': 65, 'prompt': 3, 'stop': 0}, 'config/model': 'code-davinci-002', 'config/temperature_or_top_p': {'temperature': 0.6177669784693172}, 'config/max_tokens': 231, 'config/n': 65, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0020499229431152344}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:38] {811} INFO - trial 6 config: {'model': 'code-davinci-002', 'max_tokens': 263, 'n': 41, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'top_p': 0.49834557213253655}}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:38] {215} INFO - result: {'expected_success': 0, 'total_cost': 2.8578479999999984, 'cost': 0.32830000000000004, 'training_iteration': 0, 'config': {'model': 'code-davinci-002', 'max_tokens': 263, 'n': 41, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'top_p': 0.49834557213253655}}, 'config/model': 'code-davinci-002', 'config/max_tokens': 263, 'config/n': 41, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'top_p': 0.49834557213253655}, 'experiment_tag': 'exp', 'time_total_s': 0.002808809280395508}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:38] {811} INFO - trial 7 config: {'model': 'code-cushman-001', 'temperature_or_top_p': {'temperature': 0.8286813263076767}, 'max_tokens': 57, 'n': 63, 'prompt': 3, 'stop': 0}\n" + "\u001b[32m[I 2023-03-26 02:53:26,384]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n", + "\u001b[32m[I 2023-03-26 02:53:26,387]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[flaml.tune.tune: 02-24 23:26:38] {215} INFO - result: 
{'expected_success': 0, 'total_cost': 4.028831999999999, 'cost': 1.170984, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'temperature': 0.8286813263076767}, 'max_tokens': 57, 'n': 63, 'prompt': 3, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'temperature': 0.8286813263076767}, 'config/max_tokens': 57, 'config/n': 63, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.015198230743408203}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[flaml.tune.tune: 02-24 23:26:38] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" + "[flaml.tune.tune: 03-26 02:53:26] {811} INFO - trial 1 config: {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:29] {215} INFO - result: {'expected_success': 0.6, 'success': 0.6, 'total_cost': 0.09264000000000001, 'cost': 0.09264000000000001, 'inference_cost': 0.004632, 'training_iteration': 0, 'config': {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-davinci-003', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 3.5772321224212646}\n", + "[flaml.tune.tune: 03-26 02:53:29] {811} INFO - trial 2 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:30] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.09429879999999999, 'cost': 0.0016588, 'inference_cost': 7.264e-05, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5873167514801025}\n", + "[flaml.tune.tune: 03-26 02:53:30] {811} INFO - trial 3 config: {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:31] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.09782479999999999, 'cost': 0.003526, 'inference_cost': 0.00016342499999999997, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'text-babbage-001', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6068365573883057}\n", + "[flaml.tune.tune: 03-26 02:53:31] {811} INFO - trial 4 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:31] {215} INFO - result: {'expected_success': 0.2, 'success': 0.2, 'total_cost': 0.10643079999999999, 'cost': 0.008606, 'inference_cost': 
0.0004394, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.5878369808197021}\n", + "[flaml.tune.tune: 03-26 02:53:31] {811} INFO - trial 5 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:32] {215} INFO - result: {'expected_success': 0.8, 'success': 0.8, 'total_cost': 0.2603308, 'cost': 0.15389999999999998, 'inference_cost': 0.007861499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.36865945026811975}, 'max_tokens': 347, 'n': 1, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'config/max_tokens': 347, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.6071126461029053}\n", + "[flaml.tune.tune: 03-26 02:53:32] {811} INFO - trial 6 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 1, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.2629064, 'cost': 0.0025756000000000004, 'inference_cost': 0.00011848, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.7605307121989587}, 'max_tokens': 82, 'n': 9, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.7605307121989587}, 'config/max_tokens': 82, 'config/n': 9, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.4761645793914795}\n", + "[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 7 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.14217004760152696}, 'max_tokens': 152, 'n': 67, 'prompt': 2, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.4132364, 'cost': 0.15033000000000002, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.14217004760152696}, 'max_tokens': 152, 'n': 67, 'prompt': 2, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.14217004760152696}, 'config/max_tokens': 152, 'config/n': 67, 'config/prompt': 2, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0022079944610595703}\n", + "[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 8 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.30070005663620336}, 'max_tokens': 70, 'n': 83, 'prompt': 3, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:37] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.6260264, 'cost': 0.21278999999999998, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.30070005663620336}, 'max_tokens': 70, 'n': 83, 'prompt': 3, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.30070005663620336}, 'config/max_tokens': 70, 'config/n': 83, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0022161006927490234}\n", + 
"[flaml.tune.tune: 03-26 02:53:37] {811} INFO - trial 9 config: {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.16501589771914849}, 'max_tokens': 161, 'n': 10, 'prompt': 3, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:53:43] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.6310854, 'cost': 0.005059, 'inference_cost': 0.00023457499999999997, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'temperature_or_top_p': {'temperature': 0.16501589771914849}, 'max_tokens': 161, 'n': 10, 'prompt': 3, 'stop': 0}, 'config/model': 'text-babbage-001', 'config/temperature_or_top_p': {'temperature': 0.16501589771914849}, 'config/max_tokens': 161, 'config/n': 10, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 5.868851661682129}\n", + "[flaml.tune.tune: 03-26 02:53:43] {811} INFO - trial 10 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.5902013629854229}, 'max_tokens': 56, 'n': 36, 'prompt': 3, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:54:05] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.6344234000000001, 'cost': 0.003338, 'inference_cost': 0.0001522, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'temperature': 0.5902013629854229}, 'max_tokens': 56, 'n': 36, 'prompt': 3, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'temperature': 0.5902013629854229}, 'config/max_tokens': 56, 'config/n': 36, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 21.348156690597534}\n", + "[flaml.tune.tune: 03-26 02:54:05] {811} INFO - trial 11 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.763240587143681}, 'max_tokens': 693, 'n': 42, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:54:33] {215} INFO - result: {'expected_success': 0.3476191678990812, 'success': 0.35, 'total_cost': 0.7530034000000003, 'cost': 0.11858000000000002, 'inference_cost': 0.005490999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.763240587143681}, 'max_tokens': 693, 'n': 42, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.763240587143681}, 'config/max_tokens': 693, 'config/n': 42, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 28.24349284172058}\n", + "[flaml.tune.tune: 03-26 02:54:33] {811} INFO - trial 12 config: {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.2927979762895091}, 'max_tokens': 60, 'n': 97, 'prompt': 2, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:54:33] {215} INFO - result: {'expected_success': 0, 'total_cost': 0.9340534000000004, 'cost': 0.18105, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'temperature_or_top_p': {'temperature': 0.2927979762895091}, 'max_tokens': 60, 'n': 97, 'prompt': 2, 'stop': 0}, 'config/model': 'gpt-4', 'config/temperature_or_top_p': {'temperature': 0.2927979762895091}, 'config/max_tokens': 60, 'config/n': 97, 'config/prompt': 2, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.002497434616088867}\n", + "[flaml.tune.tune: 03-26 02:54:33] {811} INFO - trial 13 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.7186028103822503}, 'max_tokens': 288, 'n': 4, 'prompt': 1, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:54:35] {215} INFO - result: {'expected_success': 0.28359375, 'success': 0.35, 'total_cost': 0.9496594000000004, 
'cost': 0.015605999999999998, 'inference_cost': 0.0007894, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'top_p': 0.7186028103822503}, 'max_tokens': 288, 'n': 4, 'prompt': 1, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'top_p': 0.7186028103822503}, 'config/max_tokens': 288, 'config/n': 4, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 2.29030704498291}\n", + "[flaml.tune.tune: 03-26 02:54:35] {811} INFO - trial 14 config: {'model': 'text-ada-001', 'temperature_or_top_p': {'top_p': 0.3653649712141158}, 'max_tokens': 96, 'n': 75, 'prompt': 1, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:55:20] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 0.9550898000000005, 'cost': 0.0054304, 'inference_cost': 0.00026122, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'temperature_or_top_p': {'top_p': 0.3653649712141158}, 'max_tokens': 96, 'n': 75, 'prompt': 1, 'stop': 0}, 'config/model': 'text-ada-001', 'config/temperature_or_top_p': {'top_p': 0.3653649712141158}, 'config/max_tokens': 96, 'config/n': 75, 'config/prompt': 1, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 44.837317943573}\n", + "[flaml.tune.tune: 03-26 02:55:20] {811} INFO - trial 15 config: {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.3814115349046321}, 'max_tokens': 791, 'n': 92, 'prompt': 3, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:55:20] {215} INFO - result: {'expected_success': 0, 'total_cost': 1.0798498000000005, 'cost': 0.12475999999999998, 'training_iteration': 0, 'config': {'model': 'text-davinci-003', 'temperature_or_top_p': {'temperature': 0.3814115349046321}, 'max_tokens': 791, 'n': 92, 'prompt': 3, 'stop': 0}, 'config/model': 'text-davinci-003', 'config/temperature_or_top_p': {'temperature': 0.3814115349046321}, 'config/max_tokens': 791, 'config/n': 92, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 0.0024149417877197266}\n", + "[flaml.tune.tune: 03-26 02:55:20] {811} INFO - trial 16 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4284507389678964}, 'max_tokens': 398, 'n': 11, 'prompt': 3, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:55:29] {215} INFO - result: {'expected_success': 0.5484931390416686, 'success': 0.55, 'total_cost': 1.1118038000000003, 'cost': 0.031954, 'inference_cost': 0.0015885000000000003, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.4284507389678964}, 'max_tokens': 398, 'n': 11, 'prompt': 3, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.4284507389678964}, 'config/max_tokens': 398, 'config/n': 11, 'config/prompt': 3, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 9.271101951599121}\n", + "[flaml.tune.tune: 03-26 02:55:29] {811} INFO - trial 17 config: {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}\n", + "[flaml.tune.tune: 03-26 02:55:46] {215} INFO - result: {'expected_success': 0.8822303234803123, 'success': 0.9, 'total_cost': 2.1304238, 'cost': 1.0186199999999999, 'inference_cost': 0.0484995, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}, 'config/model': 'gpt-4', 'config/max_tokens': 211, 'config/n': 13, 'config/prompt': 1, 
'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.25447895557126815}, 'experiment_tag': 'exp', 'time_total_s': 16.604310512542725}\n", + "[flaml.tune.tune: 03-26 02:55:46] {811} INFO - trial 18 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9761031076386442}, 'max_tokens': 349, 'n': 23, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:56:05] {215} INFO - result: {'expected_success': 0.3551828400470255, 'success': 0.4, 'total_cost': 2.1919698000000003, 'cost': 0.061546, 'inference_cost': 0.0030944, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9761031076386442}, 'max_tokens': 349, 'n': 23, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9761031076386442}, 'config/max_tokens': 349, 'config/n': 23, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 19.451276063919067}\n", + "[flaml.tune.tune: 03-26 02:56:05] {811} INFO - trial 19 config: {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9822374507369328}, 'max_tokens': 393, 'n': 22, 'prompt': 0, 'stop': 0}\n", + "[flaml.tune.tune: 03-26 02:56:18] {215} INFO - result: {'expected_success': 0.2898979473186428, 'success': 0.35, 'total_cost': 2.2507018000000003, 'cost': 0.058732, 'inference_cost': 0.0029537, 'training_iteration': 0, 'config': {'model': 'gpt-3.5-turbo', 'temperature_or_top_p': {'temperature': 0.9822374507369328}, 'max_tokens': 393, 'n': 22, 'prompt': 0, 'stop': 0}, 'config/model': 'gpt-3.5-turbo', 'config/temperature_or_top_p': {'temperature': 0.9822374507369328}, 'config/max_tokens': 393, 'config/n': 22, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 13.075204372406006}\n", + "[flaml.tune.tune: 03-26 02:56:18] {811} INFO - trial 20 config: {'model': 'gpt-4', 'max_tokens': 348, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.36865945026811975}}\n", + "[flaml.tune.tune: 03-26 02:56:19] {215} INFO - result: {'expected_success': 0.75, 'success': 0.75, 'total_cost': 2.4012418000000006, 'cost': 0.15053999999999995, 'inference_cost': 0.007693499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 348, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.36865945026811975}}, 'config/model': 'gpt-4', 'config/max_tokens': 348, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.36865945026811975}, 'experiment_tag': 'exp', 'time_total_s': 0.6143312454223633}\n", + "[flaml.tune.tune: 03-26 02:56:19] {811} INFO - trial 21 config: {'model': 'text-ada-001', 'max_tokens': 130, 'n': 22, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.22084263211180838}}\n", + "[flaml.tune.tune: 03-26 02:56:32] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.4043414000000016, 'cost': 0.0030996000000000005, 'inference_cost': 0.00014468, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'max_tokens': 130, 'n': 22, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.22084263211180838}}, 'config/model': 'text-ada-001', 'config/max_tokens': 130, 'config/n': 22, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.22084263211180838}, 'experiment_tag': 'exp', 'time_total_s': 13.137321710586548}\n", + "[flaml.tune.tune: 03-26 02:56:32] {811} INFO - trial 22 config: {'model': 'text-ada-001', 
'max_tokens': 342, 'n': 4, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.2881152790307279}}\n", + "[flaml.tune.tune: 03-26 02:56:35] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.4061918000000024, 'cost': 0.0018504, 'inference_cost': 8.222e-05, 'training_iteration': 0, 'config': {'model': 'text-ada-001', 'max_tokens': 342, 'n': 4, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.2881152790307279}}, 'config/model': 'text-ada-001', 'config/max_tokens': 342, 'config/n': 4, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.2881152790307279}, 'experiment_tag': 'exp', 'time_total_s': 2.4484035968780518}\n", + "[flaml.tune.tune: 03-26 02:56:35] {811} INFO - trial 23 config: {'model': 'gpt-4', 'max_tokens': 253, 'n': 23, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.41254458573656}}\n", + "[flaml.tune.tune: 03-26 02:56:35] {215} INFO - result: {'expected_success': 0, 'total_cost': 2.618831800000003, 'cost': 0.21264, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 253, 'n': 23, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.41254458573656}}, 'config/model': 'gpt-4', 'config/max_tokens': 253, 'config/n': 23, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.41254458573656}, 'experiment_tag': 'exp', 'time_total_s': 0.003139972686767578}\n", + "[flaml.tune.tune: 03-26 02:56:35] {811} INFO - trial 24 config: {'model': 'gpt-4', 'max_tokens': 176, 'n': 3, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.0964133254059763}}\n", + "[flaml.tune.tune: 03-26 02:56:36] {215} INFO - result: {'expected_success': 0.8185185185185185, 'success': 0.85, 'total_cost': 2.912231800000003, 'cost': 0.29339999999999994, 'inference_cost': 0.014836499999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 176, 'n': 3, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.0964133254059763}}, 'config/model': 'gpt-4', 'config/max_tokens': 176, 'config/n': 3, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.0964133254059763}, 'experiment_tag': 'exp', 'time_total_s': 1.8556303977966309}\n", + "[flaml.tune.tune: 03-26 02:56:36] {811} INFO - trial 25 config: {'model': 'text-babbage-001', 'max_tokens': 343, 'n': 27, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24286268913046594}}\n", + "[flaml.tune.tune: 03-26 02:56:55] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.9569863000000023, 'cost': 0.04475450000000001, 'inference_cost': 0.00222485, 'training_iteration': 0, 'config': {'model': 'text-babbage-001', 'max_tokens': 343, 'n': 27, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24286268913046594}}, 'config/model': 'text-babbage-001', 'config/max_tokens': 343, 'config/n': 27, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.24286268913046594}, 'experiment_tag': 'exp', 'time_total_s': 19.013901472091675}\n", + "[flaml.tune.tune: 03-26 02:56:55] {811} INFO - trial 26 config: {'model': 'text-babbage-001', 'max_tokens': 130, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.26609522201207036}}\n", + "[flaml.tune.tune: 03-26 02:56:56] {215} INFO - result: {'expected_success': 0.0, 'success': 0.0, 'total_cost': 2.9595088000000023, 'cost': 0.0025224999999999996, 'inference_cost': 0.00011325, 'training_iteration': 0, 
'config': {'model': 'text-babbage-001', 'max_tokens': 130, 'n': 1, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.26609522201207036}}, 'config/model': 'text-babbage-001', 'config/max_tokens': 130, 'config/n': 1, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.26609522201207036}, 'experiment_tag': 'exp', 'time_total_s': 0.5786199569702148}\n", + "[flaml.tune.tune: 03-26 02:56:56] {811} INFO - trial 27 config: {'model': 'gpt-4', 'max_tokens': 212, 'n': 29, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24802150727233283}}\n", + "[flaml.tune.tune: 03-26 02:56:56] {215} INFO - result: {'expected_success': 0, 'total_cost': 3.0123088000000022, 'cost': 0.05279999999999999, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 212, 'n': 29, 'prompt': 0, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.24802150727233283}}, 'config/model': 'gpt-4', 'config/max_tokens': 212, 'config/n': 29, 'config/prompt': 0, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.24802150727233283}, 'experiment_tag': 'exp', 'time_total_s': 0.0019483566284179688}\n", + "[flaml.tune.tune: 03-26 02:56:56] {834} WARNING - fail to sample a trial for 100 times in a row, stopping.\n" ] } ], @@ -618,20 +565,11 @@ " mode=\"max\", # the optimization mode\n", " eval_func=success_metrics, # the evaluation function to return the success metrics\n", " # log_file_name=\"logs/humaneval.log\", # the log file name\n", - " inference_budget=0.1, # the inference budget (dollar)\n", - " optimization_budget=4, # the optimization budget (dollar)\n", + " inference_budget=0.05, # the inference budget (dollar)\n", + " optimization_budget=3, # the optimization budget (dollar)\n", " # num_samples can further limit the number of trials for different hyperparameter configurations;\n", " # -1 means decided by the optimization budget only\n", " num_samples=-1,\n", - " model=tune.choice(\n", - " [\n", - " # These two models are currently free to use from OpenAI,\n", - " # so no actual cost will incur. 
They are not free in Azure OpenAI.\n", - " # The optimization is based on the price in Azure OpenAI.\n", - " \"code-cushman-001\", \n", - " \"code-davinci-002\",\n", - " ]\n", - " ),\n", " prompt=[\n", " \"{prompt}\",\n", " \"# Python 3{prompt}\",\n", @@ -654,7 +592,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 22, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:26:38.352710Z", @@ -668,8 +606,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "optimized config {'model': 'code-cushman-001', 'max_tokens': 433, 'n': 29, 'prompt': '{prompt}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'top_p': 0.6125260668293881}\n", - "best result on tuning data {'expected_success': 0.6186627404336135, 'success': 0.65, 'total_cost': 2.3693479999999987, 'cost': 1.2268800000000002, 'inference_cost': 0.059620799999999995, 'training_iteration': 0, 'config': {'model': 'code-cushman-001', 'temperature_or_top_p': {'top_p': 0.6125260668293881}, 'max_tokens': 433, 'n': 29, 'prompt': 0, 'stop': 0}, 'config/model': 'code-cushman-001', 'config/temperature_or_top_p': {'top_p': 0.6125260668293881}, 'config/max_tokens': 433, 'config/n': 29, 'config/prompt': 0, 'config/stop': 0, 'experiment_tag': 'exp', 'time_total_s': 36.605130434036255}\n" + "optimized config {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': '# Python 3{prompt}', 'stop': ['\\nclass', '\\ndef', '\\nif', '\\nprint'], 'temperature': 0.25447895557126815}\n", + "best result on tuning data {'expected_success': 0.8822303234803123, 'success': 0.9, 'total_cost': 2.1304238, 'cost': 1.0186199999999999, 'inference_cost': 0.0484995, 'training_iteration': 0, 'config': {'model': 'gpt-4', 'max_tokens': 211, 'n': 13, 'prompt': 1, 'stop': 0, 'temperature_or_top_p': {'temperature': 0.25447895557126815}}, 'config/model': 'gpt-4', 'config/max_tokens': 211, 'config/n': 13, 'config/prompt': 1, 'config/stop': 0, 'config/temperature_or_top_p': {'temperature': 0.25447895557126815}, 'experiment_tag': 'exp', 'time_total_s': 16.604310512542725}\n" ] } ], @@ -694,7 +632,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 23, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:26:38.359902Z", @@ -717,194 +655,118 @@ " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 1,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(game))]\\n\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 2,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 3,\n", - 
" \"logprobs\": null,\n", - " \"text\": \" return [abs(game[i]-guess[i]) for i in range(len(game))]\\n\\n#print(compare([1,2,3,4,5,1],[1,2,3,4,2,-2]))\\n#print(compare([0,5,0,0,0,4],[4,1,1,0,0,-2]))\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n return [abs(game[i] - guess[i]) for i in range(len(game))]\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 4,\n", - " \"logprobs\": null,\n", - " \"text\": \" #\\n # TODO: Define compare\\n #\\n return None\\n\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 5,\n", - " \"logprobs\": null,\n", - " \"text\": \" # your code here\\n return [abs(game[i]-guess[i]) for i in range(len(game))]\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 6,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(game[i]-guess[i]) for i in range(len(game))]\\n \\n\\n# Recursion\\n\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", - " \"finish_reason\": \"length\",\n", + " \"finish_reason\": \"stop\",\n", " \"index\": 7,\n", - " \"logprobs\": null,\n", - " \"text\": \" #return a list of the absolute difference between guess and score\\n return [abs(score-guess) for score,guess in zip(game,guess)]\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n return [abs(game[i] - guess[i]) for i in range(len(game))]\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 8,\n", - " \"logprobs\": null,\n", - " \"text\": \" #Your code goes here\\n guess = np.array(guess)\\n score = np.array(game)\\n res = np.zeros(len(guess))\\n for i in range(len(guess)):\\n if guess[i] == score[i]:\\n res[i] = 0\\n else:\\n res[i] = abs(guess[i] - score[i])\\n return res\\n\\n#print(compare([1,2,3,4,5,1],[1,2,3,4,2,-2]))\\n#print(compare([0,5,0,0,0,4],[4,1,1,0,0,-2]))\\n\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n return [abs(a - b) for a, b in zip(game, guess)]\\n\",\n", + 
" \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 9,\n", - " \"logprobs\": null,\n", - " \"text\": \" if len(game) != len(guess):\\n return \\\"Your arrays are not of equal length.\\\"\\n else:\\n return [abs(guess[i]-game[i]) for i in range(len(guess))]\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", - " \"finish_reason\": \"length\",\n", + " \"finish_reason\": \"stop\",\n", " \"index\": 10,\n", - " \"logprobs\": null,\n", - " \"text\": \" l = []\\n for i in range(len(guess)):\\n if guess[i] == game[i]:\\n l.append(0)\\n else:\\n l.append(abs(guess[i] - game[i]))\\n return l\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 11,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(game))]\\n\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " },\n", " {\n", " \"finish_reason\": \"stop\",\n", " \"index\": 12,\n", - " \"logprobs\": null,\n", - " \"text\": \" assert len(game) == len(guess), \\\"the length of game and guess must be equal\\\"\\n return [abs(guess[i] - game[i]) for i in range(len(game))]\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 13,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(a-b) for a,b in zip(game,guess)]\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 14,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"length\",\n", - " \"index\": 15,\n", - " \"logprobs\": null,\n", - " \"text\": \" answer = []\\n for i in range(len(guess)):\\n answer.append(guess[i]-game[i])\\n return answer\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 16,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\\n\\n#%%\\n#EXAMPLE\\n#%%\\ngame = [1,2,3,4,5,1]\\nguess = [1,2,3,4,2,-2]\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 17,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(game[i]-guess[i]) for i in range(len(game))]\\n \\n\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 18,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(game))]\\n\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"length\",\n", - " \"index\": 19,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"length\",\n", - " \"index\": 20,\n", - " \"logprobs\": null,\n", - " \"text\": \" if len(game) != len(guess):\\n return []\\n results = []\\n for i in range(len(game)):\\n results.append(abs(guess[i] - game[i]))\\n return results\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 21,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(game[i]-guess[i]) for i in range(len(game))]\\n\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"length\",\n", - " \"index\": 22,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i] - game[i]) for i in range(len(game))]\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n 
\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"length\",\n", - " \"index\": 23,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i] - game[i]) for i in range(len(guess))]\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 24,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i] - game[i]) for i in range(len(guess))]\\n \"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 25,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(guess))]\\n\\n#or use the following solution\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 26,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(guess[i]-game[i]) for i in range(len(game))]\\n\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 27,\n", - " \"logprobs\": null,\n", - " \"text\": \" return [abs(score-guess) for score,guess in zip(game,guess)]\"\n", - " },\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 28,\n", - " \"logprobs\": null,\n", - " \"text\": \" results = []\\n for i in range(len(game)):\\n if guess[i] == game[i]:\\n results.append(0)\\n else:\\n results.append(abs(guess[i] - game[i]))\\n return results\\n\"\n", + " \"message\": {\n", + " \"content\": \"def compare(game, guess):\\n result = []\\n for i in range(len(game)):\\n if game[i] == guess[i]:\\n result.append(0)\\n else:\\n result.append(abs(game[i] - guess[i]))\\n return result\\n\",\n", + " \"role\": \"assistant\"\n", + " }\n", " }\n", " ],\n", - " \"created\": 1675617146,\n", - " \"id\": \"cmpl-6gcqwCz8JXC5eB62rsjxrcIgL3n4B\",\n", - " \"model\": \"code-cushman-001\",\n", - " \"object\": \"text_completion\",\n", + " \"created\": 1679636800,\n", + " \"id\": \"chatcmpl-6xUY4niTRrpJ5UShayb9QncgjS8rg\",\n", + " \"model\": \"gpt-4-0314\",\n", + " \"object\": \"chat.completion\",\n", " \"usage\": {\n", - " \"completion_tokens\": 3959,\n", - " \"prompt_tokens\": 239,\n", - " \"total_tokens\": 4198\n", + " \"completion_tokens\": 440,\n", 
+ " \"prompt_tokens\": 236,\n", + " \"total_tokens\": 676\n", " }\n", - "}\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "}\n", "{'expected_success': 1.0, 'success': True}\n" ] } @@ -912,7 +774,7 @@ "source": [ "responses = oai.Completion.create(context=tune_data[1], **config)\n", "print(responses)\n", - "print(success_metrics([response[\"text\"].rstrip() for response in responses[\"choices\"]], **tune_data[1]))\n" + "print(success_metrics([response[\"message\"][\"content\"] if config[\"model\"] in oai.Completion.chat_models else response[\"text\"] for response in responses[\"choices\"]], **tune_data[1]))\n" ] }, { @@ -922,12 +784,12 @@ "source": [ "### Evaluate the success rate on the test data\n", "\n", - "You can use flaml's `oai.Completion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.Completion.data` to the data to evaluate. The following code will take a while to evaluate all the 144 test data instances. Compared to the baseline success rate (46%) on the [HELM benchmark](https://crfm.stanford.edu/helm/latest/?group=code_humaneval), the tuned config has a success rate of 68%. It can be further improved if the inference budget and optimization budget are further increased." + "You can use flaml's `oai.Completion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.Completion.data` to the data to evaluate. The following code will take a while to evaluate all the 144 test data instances. The cost is about $7 if you uncomment it and run it." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "metadata": { "execution": { "iopub.execute_input": "2023-02-24T23:26:39.347295Z", @@ -941,14 +803,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'expected_success': 0.6364503360372493, 'success': 0.6805555555555556, 'total_cost': 12.210191999999997, 'cost': 8.181360000000003, 'inference_cost': 0.056815}\n" + "{'expected_success': 0.8326778348739547, 'success': 0.8472222222222222, 'total_cost': 10.024478799999999, 'cost': 7.01217, 'inference_cost': 0.049131249999999994}\n" ] } ], "source": [ - "oai.Completion.data = test_data\n", - "result = oai.Completion.eval(analysis.best_config, prune=False, eval_only=True)\n", - "print(result)\n" + "# oai.Completion.data = test_data\n", + "# result = oai.Completion.eval(analysis.best_config, prune=False, eval_only=True)\n", + "# print(result)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result will vary with the inference budget and optimization budget.\n" ] } ], @@ -968,7 +838,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.9.16" }, "vscode": { "interpreter": { diff --git a/setup.py b/setup.py index a5634ce36abf..b5764866063e 100644 --- a/setup.py +++ b/setup.py @@ -120,7 +120,7 @@ "pytorch-forecasting>=0.9.0", ], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], - "openai": ["openai==0.23.1", "diskcache", "optuna==2.8.0"], + "openai": ["openai==0.27.0", "diskcache", "optuna==2.8.0"], "synapse": ["joblibspark>=0.5.0", "optuna==2.8.0", "pyspark>=3.2.0"], }, classifiers=[ diff --git a/test/openai/test_completion.py b/test/openai/test_completion.py index 191f8be1e5e0..8c2cfa598b3a 100644 --- a/test/openai/test_completion.py +++ b/test/openai/test_completion.py @@ -109,7 +109,6 @@ def success_metrics(responses, prompt, test, 
entry_point): ) responses = oai.ChatCompletion.create(context=test_data[0], **config) print(responses) - return # a more comprehensive tuning example config, analysis = oai.Completion.tune( data=tune_data, diff --git a/test/openai/test_notebook.py b/test/openai/test_notebook.py index e8cb515881ce..0285350c8965 100644 --- a/test/openai/test_notebook.py +++ b/test/openai/test_notebook.py @@ -43,7 +43,7 @@ def test_integrate_openai(save=False): reason="do not run openai test if openai is not installed", ) def test_integrate_chatgpt(save=False): - run_notebook("integrate_chatgpt_math.ipynb", save=save) + run_notebook("integrate_chatgpt.ipynb", save=save) if __name__ == "__main__": diff --git a/website/docs/Examples/Integrate - OpenAI.md b/website/docs/Examples/Integrate - OpenAI.md index 65e8bfa06363..f6289fd3ce92 100644 --- a/website/docs/Examples/Integrate - OpenAI.md +++ b/website/docs/Examples/Integrate - OpenAI.md @@ -1,4 +1,5 @@ -FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673). In this example, we will tune several hyperparameters for the OpenAI's completion API, including the temperature, prompt and n (number of completions), to optimize the inference performance for a code generation task. Our study shows that tuning hyperparameters can significantly affect the utility of the OpenAI API. +FLAML offers a cost-effective hyperparameter optimization technique [EcoOptiGen](https://arxiv.org/abs/2303.04673) for tuning Large Language Models. Our study finds that tuning hyperparameters can significantly improve the utility of the OpenAI API. +In this example, we will tune several hyperparameters for OpenAI's completion API, including the temperature, prompt, and n (the number of completions), to optimize the inference performance for a code generation task. ### Prerequisites @@ -7,7 +8,6 @@ Install the [openai] option. The OpenAI integration is in preview. ChatGPT supp pip install "flaml[openai]==1.2.0" ``` - Set up your OpenAI key: ```python import os @@ -122,7 +122,7 @@ def success_metrics(responses, prompt, test, entry_point): ### Tuning Hyperparameters for OpenAI -The tuning will take a while to finish, depending on the optimization budget (~1 min for the current budget). The tuning will be performed under the specified optimization budgets. +The tuning will be performed under the specified optimization budgets. * inference_budget is the target average inference budget per instance in the benchmark. For example, 0.02 means the target inference budget is 0.02 dollars, which translates to 1000 tokens (input + output combined) if the text Davinci model is used. * optimization_budget is the total budget allowed to perform the tuning. For example, 5 means 5 dollars are allowed in total, which translates to 250K tokens for the text Davinci model. @@ -142,15 +142,6 @@ config, analysis = oai.Completion.tune( # num_samples can further limit the number of trials for different hyperparameter configurations; # -1 means decided by the optimization budget only num_samples=-1, - model=tune.choice( - [ - # These two models are in Beta test and free to use from OpenAI as of Feb 2023, - # so no actual cost will incur (please double check when you run it). They are not free in Azure OpenAI. - # The optimization is based on the price in Azure OpenAI as of Feb 2023.
- "code-cushman-001", - "code-davinci-002", - ] - ), prompt=[ "{prompt}", "# Python 3{prompt}", @@ -182,7 +173,7 @@ print(success_metrics([response["text"].rstrip() for response in responses["choi #### Evaluate the success rate on the test data -You can use flaml's oai.Completion.eval to evaluate the performance of an entire dataset with the tuned config. To do that you need to set oai.Completion.data to the data to evaluate. The following code will take a while to evaluate all the 144 test data instances. Compared to the baseline success rate (0.46) on the HELM benchmark, the tuned config has a success rate of 0.68. It can be further improved if the inference budget and optimization budget are further increased. +You can use flaml's `oai.Completion.eval` to evaluate the performance of an entire dataset with the tuned config. To do that you need to set `oai.Completion.data` to the data to evaluate. ```python oai.Completion.data = test_data @@ -190,4 +181,6 @@ result = oai.Completion.eval(analysis.best_config, prune=False, eval_only=True) print(result) ``` +The result will vary with the inference budget and optimization budget. + [Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_openai.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_openai.ipynb)