From 57761052bd5b5d8ddc73ea21696c9f9ab4238e2f Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 29 Jan 2024 02:10:01 +0000 Subject: [PATCH 1/4] Reproduce #1440 --- notebook/agentchat_groupchat_RAG.ipynb | 541 ++++++++++++++++++------- 1 file changed, 390 insertions(+), 151 deletions(-) diff --git a/notebook/agentchat_groupchat_RAG.ipynb b/notebook/agentchat_groupchat_RAG.ipynb index 1f5c20d95335..5a1ae00aff3a 100644 --- a/notebook/agentchat_groupchat_RAG.ipynb +++ b/notebook/agentchat_groupchat_RAG.ipynb @@ -51,12 +51,28 @@ "execution_count": 2, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "LLM models: ['gpt-35-turbo', 'gpt-35-turbo-0613']\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", + " return torch._C._cuda_getDeviceCount() > 0\n" + ] } ], "source": [ @@ -116,18 +132,7 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11060). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", - " return torch._C._cuda_getDeviceCount() > 0\n" - ] - } - ], + "outputs": [], "source": [ "llm_config = {\n", " \"timeout\": 60,\n", @@ -308,6 +313,17 @@ " )" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# # Uncomment below code if docker is not available.\n", + "# import os\n", + "# os.environ[\"AUTOGEN_USE_DOCKER\"] = \"0\"" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -321,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -335,62 +351,126 @@ "--------------------------------------------------------------------------------\n", "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", - "To use Spark for parallel training in FLAML, you can use the `SparkTrials` class provided by FLAML. 
Here is a sample code:\n", + "Sure, here's an example of how to use Spark for parallel training in FLAML:\n", "\n", "```python\n", "from flaml import AutoML\n", - "from flaml.data import load_credit\n", - "from flaml.model import SparkTrials\n", + "from pyspark.sql import SparkSession\n", "\n", - "# Load data\n", - "X_train, y_train, X_test, y_test = load_credit()\n", + "# Create a Spark session\n", + "spark = SparkSession.builder \\\n", + " .appName(\"FLAML Example\") \\\n", + " .getOrCreate()\n", "\n", - "# Define the search space\n", - "search_space = {\n", - " \"n_estimators\": {\"domain\": range(10, 100)},\n", - " \"max_depth\": {\"domain\": range(6, 10)},\n", - " \"learning_rate\": {\"domain\": (0.01, 0.1, 1)},\n", + "# Load your data into a Spark DataFrame\n", + "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n", + "\n", + "# Convert the Spark DataFrame to a Pandas DataFrame\n", + "pandas_df = data.toPandas()\n", + "\n", + "# Initialize FLAML with Spark\n", + "automl = AutoML()\n", + "\n", + "# Set the Spark context\n", + "automl.set_context(spark)\n", + "\n", + "# Specify the search space for hyperparameters\n", + "automl_settings = {\n", + " \"time_budget\": 60, # total running time in seconds\n", + " \"task\": \"classification\", # task type\n", + " \"log_file_name\": \"flaml.log\", # log file name\n", + " \"estimator_list\": [\"lgbm\", \"rf\", \"xgboost\"], # list of ML models\n", + " \"split_type\": \"stratified\", # data split type\n", + " \"eval_method\": \"auto\", # evaluation metric for optimization\n", + " \"n_splits\": 5, # number of splits for cross-validation\n", + " \"seed\": 42 # random seed\n", "}\n", "\n", - "# Create an AutoML instance with SparkTrials\n", - "automl = AutoML(\n", - " search_space=search_space,\n", - " task=\"classification\",\n", - " n_jobs=1,\n", - " ensemble_size=0,\n", - " max_trials=10,\n", - " trials=SparkTrials(parallelism=2),\n", - ")\n", + "# Train the model using FLAML\n", + "automl.fit(X_train=pandas_df.drop(columns=[\"target\"]), y_train=pandas_df[\"target\"], **automl_settings)\n", "\n", - "# Train the model\n", - "automl.fit(X_train=X_train, y_train=y_train)\n", + "# Get the best model found by FLAML\n", + "best_model = automl.best_model\n", "\n", - "# Evaluate the model\n", - "print(\"Best model:\", automl.best_model)\n", - "print(\"Best hyperparameters:\", automl.best_config)\n", - "print(\"Test accuracy:\", automl.score(X_test=X_test, y_test=y_test))\n", + "# Print the best model's hyperparameters and performance\n", + "print(\"Best model hyperparameters:\", best_model)\n", + "\n", + "# Terminate the Spark session\n", + "spark.stop()\n", "\n", - "# Terminate\n", + "# Terminate the conversation\n", "TERMINATE\n", "```\n", "\n", - "In this code, we first load the credit dataset. Then, we define the search space for the hyperparameters. We create an `AutoML` instance with `SparkTrials` as the `trials` parameter. We set the `parallelism` parameter to 2, which means that FLAML will use 2 Spark workers to run the trials in parallel. Finally, we fit the model and evaluate it.\n", + "Make sure to replace `\"data.csv\"` with the path to your actual data file. 
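Note that `automl.set_context` is not part of FLAML's documented interface; per the FLAML documentation retrieved later in this notebook, the documented way to parallelize training with Spark is to pass `use_spark=True` (together with `n_concurrent_trials`) to `fit`. A minimal sketch along those lines, reusing the `pandas_df` prepared above:\n",
"\n",
"```python\n",
"from flaml import AutoML\n",
"\n",
"automl = AutoML()\n",
"automl.fit(\n",
"    X_train=pandas_df.drop(columns=[\"target\"]),\n",
"    y_train=pandas_df[\"target\"],\n",
"    task=\"classification\",\n",
"    time_budget=60,\n",
"    use_spark=True,  # dispatch trials to parallel Spark jobs via joblib-spark\n",
"    n_concurrent_trials=2,  # number of trials to run concurrently\n",
")\n",
"```\n",
"\n",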
You can also modify the `automl_settings` dictionary to customize the FLAML configuration according to your needs.\n", + "\n", + "Remember to install FLAML and its dependencies before running this code.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", "\n", - "Great! That's a clear and concise example. No further questions from my side.\n", + "Looks good! Just a small correction: `automl.set_context(spark)` should be placed before `automl.fit()` to ensure that the Spark context is set before training begins. Here's the corrected code:\n", "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "```python\n", + "from flaml import AutoML\n", + "from pyspark.sql import SparkSession\n", + "\n", + "# Create a Spark session\n", + "spark = SparkSession.builder \\\n", + " .appName(\"FLAML Example\") \\\n", + " .getOrCreate()\n", + "\n", + "# Load your data into a Spark DataFrame\n", + "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n", + "\n", + "# Convert the Spark DataFrame to a Pandas DataFrame\n", + "pandas_df = data.toPandas()\n", + "\n", + "# Initialize FLAML with Spark\n", + "automl = AutoML()\n", + "\n", + "# Set the Spark context\n", + "automl.set_context(spark)\n", + "\n", + "# Specify the search space for hyperparameters\n", + "automl_settings = {\n", + " \"time_budget\": 60, # total running time in seconds\n", + " \"task\": \"classification\", # task type\n", + " \"log_file_name\": \"flaml.log\", # log file name\n", + " \"estimator_list\": [\"lgbm\", \"rf\", \"xgboost\"], # list of ML models\n", + " \"split_type\": \"stratified\", # data split type\n", + " \"eval_method\": \"auto\", # evaluation metric for optimization\n", + " \"n_splits\": 5, # number of splits for cross-validation\n", + " \"seed\": 42 # random seed\n", + "}\n", + "\n", + "# Set the Spark context\n", + "automl.set_context(spark)\n", + "\n", + "# Train the model using FLAML\n", + "automl.fit(X_train=pandas_df.drop(columns=[\"target\"]), y_train=pandas_df[\"target\"], **automl_settings)\n", + "\n", + "# Get the best model found by FLAML\n", + "best_model = automl.best_model\n", + "\n", + "# Print the best model's hyperparameters and performance\n", + "print(\"Best model hyperparameters:\", best_model)\n", + "\n", + "# Terminate the Spark session\n", + "spark.stop()\n", + "\n", + "# Terminate the conversation\n", + "TERMINATE\n", + "```\n", + "\n", + "Let me know if you have any further questions!\n", + "\n", + "--------------------------------------------------------------------------------\n", "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", - "Thank you! Let me know if you have any other questions.\n", + "Thank you for catching that mistake! I apologize for the oversight. The correction has been made in the code. I appreciate your attention to detail.\n", + "\n", + "If you have any more questions or need further assistance, feel free to ask.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mBoss\u001b[0m (to chat_manager):\n", @@ -404,8 +484,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "GroupChat select_speaker failed to resolve the next speaker's name. Speaker selection will default to the next speaker in the list. 
This is because the speaker selection OAI call returned:\n", - "The next role to play is not specified in the conversation. Please provide more information.\n" + "GroupChat select_speaker failed to resolve the next speaker's name. This is because the speaker selection OAI call returned:\n", + "TERMINATE\n" ] }, { @@ -438,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -599,12 +679,120 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", "\n", - "To use Spark for parallel training in FLAML, you can activate Spark as the parallel backend during parallel tuning in both AutoML and Hyperparameter Tuning, by setting the `use_spark` to `true`. FLAML will dispatch your job to the distributed Spark backend using `joblib-spark`. Here is an example code snippet for using parallel Spark jobs:\n", + "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", + "context provided by the user.\n", + "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", + "For code generation, you must obey the following rules:\n", + "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", + "Rule 2. You must follow the formats below to write your code:\n", + "```language\n", + "# your code\n", + "```\n", + "\n", + "User's question is: How to use spark for parallel training in FLAML? Give me sample code.\n", + "\n", + "Context is: # Integrate - Spark\n", + "\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", + "\n", + "## Spark ML Estimators\n", + "\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", + "\n", + "### Data\n", + "\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", + "\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", + "\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". 
More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", + "\n", + "Here is an example code snippet for Spark Data:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "# Creating a dictionary\n", + "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", + "\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "```\n", + "\n", + "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", + "\n", + "Here is an example of how to use it:\n", + "```python\n", + "from pyspark.ml.feature import VectorAssembler\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "```\n", + "\n", + "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", + "\n", + "### Estimators\n", + "#### Model List\n", + "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", + "\n", + "#### Usage\n", + "First, prepare your data in the required format as described in the previous section.\n", + "\n", + "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", + "\n", + "Here is an example code snippet using SparkML models in AutoML:\n", "\n", "```python\n", "import flaml\n", + "# prepare your data in pandas-on-spark format as we previously mentioned\n", + "\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", + " \"task\": \"regression\",\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", + "\n", + "\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", + "\n", + "## Parallel Spark Jobs\n", + "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. 
FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", + "\n", + "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", + "\n", + "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", + "\n", + "\n", + "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", + "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", + "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", + "\n", + "An example code snippet for using parallel Spark jobs:\n", + "```python\n", + "import flaml\n", "automl_experiment = flaml.AutoML()\n", "automl_settings = {\n", " \"time_budget\": 30,\n", @@ -622,14 +810,60 @@ ")\n", "```\n", "\n", - "Note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", "\n", - "I hope this helps! Let me know if you have any further questions.\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", + "\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\n", + "To use Spark for parallel training in FLAML, you need to set the `use_spark` parameter to `True` and specify the number of concurrent trials using the `n_concurrent_trials` parameter. Here's a sample code snippet:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", + "}\n", + "\n", + "automl_experiment.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings,\n", + ")\n", + "```\n", + "\n", + "Make sure to replace `dataframe` and `label` with your actual data and label. 
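For instance, since `use_spark=True` expects non-Spark input, a plain pandas DataFrame works; a toy sketch (the column names are illustrative only):\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"# Toy regression data; replace with your own dataset\n",
"dataframe = pd.DataFrame(\n",
"    {\"x1\": [1, 2, 3, 4, 5], \"x2\": [5, 4, 3, 2, 1], \"y\": [1.2, 2.3, 3.1, 4.7, 5.0]}\n",
")\n",
"label = \"y\"\n",
"```\n",
"\n",
"With `dataframe` and `label` defined this way, the snippet above applies unchanged.\n",
"\n",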
This code will run the FLAML AutoML experiment using Spark for parallel training with 2 concurrent trials.\n", + "\n", + "Please let me know if you need any further assistance.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", - "TERMINATE\n", + "Great! If you have any more questions, feel free to ask. Otherwise, feel free to terminate the conversation.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "\n", + "Thank you for your assistance! I don't have any more questions at the moment. I will terminate the conversation.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\n", + "Great! If you have any more questions in the future, feel free to ask. Have a great day! TERMINATE\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -651,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -662,135 +896,133 @@ "\n", "How to use spark for parallel training in FLAML? Give me sample code.\n", "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", - "\n", - "To use Spark for parallel training in FLAML, you can follow these steps:\n", + "--------------------------------------------------------------------------------\n", + "How to use spark for parallel training in FLAML? Give me sample code.\n", "\n", - "1. Install PySpark and FLAML on your machine.\n", - "2. Start a Spark cluster using the `pyspark` command.\n", - "3. Import the necessary libraries and initialize a SparkSession object.\n", - "4. Load your data into a Spark DataFrame.\n", - "5. Define your search space and search strategy using FLAML's API.\n", - "6. 
Create a SparkEstimator object and pass it to FLAML's `fit()` method.\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", - "Here's some sample code to get you started:\n", + "Sure, here's an example of how to use Spark for parallel training in FLAML:\n", "\n", "```python\n", - "from pyspark.sql import SparkSession\n", "from flaml import AutoML\n", - "from flaml.data import get_output_from_log\n", + "from pyspark.sql import SparkSession\n", "\n", - "# Initialize a SparkSession object\n", - "spark = SparkSession.builder.appName(\"FLAML-Spark\").getOrCreate()\n", + "# Create a Spark session\n", + "spark = SparkSession.builder \\\n", + " .appName(\"FLAML Example\") \\\n", + " .getOrCreate()\n", "\n", "# Load your data into a Spark DataFrame\n", - "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"path/to/data.csv\")\n", - "\n", - "# Define your search space and search strategy\n", - "search_space = {\n", - " \"n_estimators\": {\"domain\": range(10, 100)},\n", - " \"max_depth\": {\"domain\": range(1, 10)},\n", - " \"learning_rate\": {\"domain\": [0.001, 0.01, 0.1]},\n", - "}\n", - "search_strategy = \"skopt\"\n", + "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n", "\n", - "# Create a SparkEstimator object\n", - "from pyspark.ml.classification import GBTClassifier\n", - "estimator = GBTClassifier()\n", + "# Convert the Spark DataFrame to a Pandas DataFrame\n", + "pandas_df = data.toPandas()\n", "\n", - "# Pass the SparkEstimator object to FLAML's fit() method\n", + "# Initialize FLAML with Spark\n", "automl = AutoML()\n", - "automl.fit(\n", - " X_train=data,\n", - " estimator=estimator,\n", - " task=\"classification\",\n", - " search_space=search_space,\n", - " search_alg=search_strategy,\n", - " n_jobs=-1,\n", - ")\n", "\n", - "# Get the best model and its hyperparameters\n", - "best_model = automl.model\n", - "best_params = automl.best_config\n", + "# Set the Spark context\n", + "automl.set_context(spark)\n", + "\n", + "# Specify the search space for hyperparameters\n", + "automl_settings = {\n", + " \"time_budget\": 60, # total running time in seconds\n", + " \"task\": \"classification\", # task type\n", + " \"log_file_name\": \"flaml.log\", # log file name\n", + " \"estimator_list\": [\"lgbm\", \"rf\", \"xgboost\"], # list of ML models\n", + " \"split_type\": \"stratified\", # data split type\n", + " \"eval_method\": \"auto\", # evaluation metric for optimization\n", + " \"n_splits\": 5, # number of splits for cross-validation\n", + " \"seed\": 42 # random seed\n", + "}\n", + "\n", + "# Train the model using FLAML\n", + "automl.fit(X_train=pandas_df.drop(columns=[\"target\"]), y_train=pandas_df[\"target\"], **automl_settings)\n", "\n", - "# Print the results\n", - "print(f\"Best model: {best_model}\")\n", - "print(f\"Best hyperparameters: {best_params}\")\n", + "# Get the best model found by FLAML\n", + "best_model = automl.best_model\n", "\n", - "# Stop the SparkSession object\n", + "# Print the best model's hyperparameters and performance\n", + "print(\"Best model hyperparameters:\", best_model)\n", + "\n", + "# Terminate the Spark session\n", "spark.stop()\n", + "\n", + "# Terminate the conversation\n", + "TERMINATE\n", "```\n", "\n", - "Note that the `n_jobs` parameter is set to `-1` to use all available cores on the Spark cluster. You can adjust this value to control the level of parallelism. 
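If you only need parallel hyperparameter search rather than full AutoML, FLAML's `tune` module accepts the same Spark options (`use_spark`, `n_concurrent_trials`) per its parallel-tuning docs. A minimal sketch with an illustrative toy objective:\n",
"\n",
"```python\n",
"from flaml import tune\n",
"\n",
"def evaluate(config):\n",
"    # Toy objective to minimize; replace with real training and scoring\n",
"    return {\"score\": (config[\"x\"] - 2) ** 2}\n",
"\n",
"analysis = tune.run(\n",
"    evaluate,\n",
"    config={\"x\": tune.uniform(0, 4)},\n",
"    metric=\"score\",\n",
"    mode=\"min\",\n",
"    num_samples=8,\n",
"    use_spark=True,  # run trials as parallel Spark jobs\n",
"    n_concurrent_trials=2,\n",
")\n",
"print(analysis.best_config)\n",
"```\n",
"\n",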
Also, the `get_output_from_log()` function can be used to extract the results from the FLAML log file. \n", + "Make sure to replace `\"data.csv\"` with the path to your actual data file. You can also modify the `automl_settings` dictionary to customize the FLAML configuration according to your needs.\n", "\n", - "TERMINATE\n", + "Remember to install FLAML and its dependencies before running this code.\n", "\n", "--------------------------------------------------------------------------------\n", - "To use Spark for parallel training in FLAML, you can follow these steps:\n", - "\n", - "1. Install PySpark and FLAML on your machine.\n", - "2. Start a Spark cluster using the `pyspark` command.\n", - "3. Import the necessary libraries and initialize a SparkSession object.\n", - "4. Load your data into a Spark DataFrame.\n", - "5. Define your search space and search strategy using FLAML's API.\n", - "6. Create a SparkEstimator object and pass it to FLAML's `fit()` method.\n", + "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", "\n", - "Here's some sample code to get you started:\n", + "Looks good! Just a small correction: `automl.set_context(spark)` should be placed before `automl.fit()` to ensure that the Spark context is set before training begins. Here's the corrected code:\n", "\n", "```python\n", - "from pyspark.sql import SparkSession\n", "from flaml import AutoML\n", - "from flaml.data import get_output_from_log\n", + "from pyspark.sql import SparkSession\n", "\n", - "# Initialize a SparkSession object\n", - "spark = SparkSession.builder.appName(\"FLAML-Spark\").getOrCreate()\n", + "# Create a Spark session\n", + "spark = SparkSession.builder \\\n", + " .appName(\"FLAML Example\") \\\n", + " .getOrCreate()\n", "\n", "# Load your data into a Spark DataFrame\n", - "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"path/to/data.csv\")\n", + "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n", + "\n", + "# Convert the Spark DataFrame to a Pandas DataFrame\n", + "pandas_df = data.toPandas()\n", "\n", - "# Define your search space and search strategy\n", - "search_space = {\n", - " \"n_estimators\": {\"domain\": range(10, 100)},\n", - " \"max_depth\": {\"domain\": range(1, 10)},\n", - " \"learning_rate\": {\"domain\": [0.001, 0.01, 0.1]},\n", + "# Initialize FLAML with Spark\n", + "automl = AutoML()\n", + "\n", + "# Set the Spark context\n", + "automl.set_context(spark)\n", + "\n", + "# Specify the search space for hyperparameters\n", + "automl_settings = {\n", + " \"time_budget\": 60, # total running time in seconds\n", + " \"task\": \"classification\", # task type\n", + " \"log_file_name\": \"flaml.log\", # log file name\n", + " \"estimator_list\": [\"lgbm\", \"rf\", \"xgboost\"], # list of ML models\n", + " \"split_type\": \"stratified\", # data split type\n", + " \"eval_method\": \"auto\", # evaluation metric for optimization\n", + " \"n_splits\": 5, # number of splits for cross-validation\n", + " \"seed\": 42 # random seed\n", "}\n", - "search_strategy = \"skopt\"\n", "\n", - "# Create a SparkEstimator object\n", - "from pyspark.ml.classification import GBTClassifier\n", - "estimator = GBTClassifier()\n", + "# Set the Spark context\n", + "automl.set_context(spark)\n", "\n", - "# Pass the SparkEstimator object to FLAML's fit() method\n", - "automl = AutoML()\n", - "automl.fit(\n", - " X_train=data,\n", - " estimator=estimator,\n", - " task=\"classification\",\n", - " search_space=search_space,\n", - " 
search_alg=search_strategy,\n", - " n_jobs=-1,\n", - ")\n", + "# Train the model using FLAML\n", + "automl.fit(X_train=pandas_df.drop(columns=[\"target\"]), y_train=pandas_df[\"target\"], **automl_settings)\n", "\n", - "# Get the best model and its hyperparameters\n", - "best_model = automl.model\n", - "best_params = automl.best_config\n", + "# Get the best model found by FLAML\n", + "best_model = automl.best_model\n", "\n", - "# Print the results\n", - "print(f\"Best model: {best_model}\")\n", - "print(f\"Best hyperparameters: {best_params}\")\n", + "# Print the best model's hyperparameters and performance\n", + "print(\"Best model hyperparameters:\", best_model)\n", "\n", - "# Stop the SparkSession object\n", + "# Terminate the Spark session\n", "spark.stop()\n", + "\n", + "# Terminate the conversation\n", + "TERMINATE\n", "```\n", "\n", - "Note that the `n_jobs` parameter is set to `-1` to use all available cores on the Spark cluster. You can adjust this value to control the level of parallelism. Also, the `get_output_from_log()` function can be used to extract the results from the FLAML log file. \n", + "Let me know if you have any further questions!\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mBoss\u001b[0m (to chat_manager):\n", + "\n", + "Reply `TERMINATE` if the task is done.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", "TERMINATE\n", "\n", @@ -801,6 +1033,13 @@ "source": [ "call_rag_chat()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -819,7 +1058,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.12" } }, "nbformat": 4, From e9063196a3ef70eef1af18e3fb3e9ac572666d9c Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 13 Feb 2024 13:15:56 +0800 Subject: [PATCH 2/4] Updated code with latest APIs --- notebook/agentchat_groupchat_RAG.ipynb | 774 +++++++++++++------------ 1 file changed, 403 insertions(+), 371 deletions(-) diff --git a/notebook/agentchat_groupchat_RAG.ipynb b/notebook/agentchat_groupchat_RAG.ipynb index 1f5251d67417..3ff363185796 100644 --- a/notebook/agentchat_groupchat_RAG.ipynb +++ b/notebook/agentchat_groupchat_RAG.ipynb @@ -22,7 +22,7 @@ "\n", "AutoGen requires `Python>=3.8`. To run this notebook example, please install:\n", "```bash\n", - "pip install \"pyautogen[retrievechat]>=0.2.3\"\n", + "pip install \"pyautogen[retrievechat]>=0.2.13\"\n", "```" ] }, @@ -33,7 +33,7 @@ "outputs": [], "source": [ "%%capture --no-stderr\n", - "# %pip install \"pyautogen[retrievechat]>=0.2.3\"" + "# %pip install \"pyautogen[retrievechat]>=0.2.13\"" ] }, { @@ -48,37 +48,23 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "LLM models: ['gpt-35-turbo', 'gpt-35-turbo-0613']\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/torch/cuda/__init__.py:138: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", - " return torch._C._cuda_getDeviceCount() > 0\n" - ] } ], "source": [ + "from typing_extensions import Annotated\n", "import chromadb\n", "\n", "import autogen\n", + "from autogen.cache import Cache\n", "from autogen import AssistantAgent\n", "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", "\n", @@ -130,19 +116,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "llm_config = {\n", " \"timeout\": 60,\n", - " \"cache_seed\": 42,\n", " \"config_list\": config_list,\n", - " \"temperature\": 0,\n", "}\n", "\n", - "# autogen.ChatCompletion.start_logging()\n", - "\n", "\n", "def termination_msg(x):\n", " return isinstance(x, dict) and \"TERMINATE\" == str(x.get(\"content\", \"\"))[-9:].upper()\n", @@ -152,15 +134,14 @@ " name=\"Boss\",\n", " is_termination_msg=termination_msg,\n", " human_input_mode=\"NEVER\",\n", - " system_message=\"The boss who ask questions and give tasks.\",\n", " code_execution_config=False, # we don't want to execute code in this case.\n", " default_auto_reply=\"Reply `TERMINATE` if the task is done.\",\n", + " description=\"The boss who ask questions and give tasks.\",\n", ")\n", "\n", "boss_aid = RetrieveUserProxyAgent(\n", " name=\"Boss_Assistant\",\n", " is_termination_msg=termination_msg,\n", - " system_message=\"Assistant who has extra content retrieval power for solving difficult problems.\",\n", " human_input_mode=\"NEVER\",\n", " max_consecutive_auto_reply=3,\n", " retrieve_config={\n", @@ -173,13 +154,15 @@ " \"get_or_create\": True,\n", " },\n", " code_execution_config=False, # we don't want to execute code in this case.\n", + " description=\"Assistant who has extra content retrieval power for solving difficult problems.\",\n", ")\n", "\n", "coder = AssistantAgent(\n", " name=\"Senior_Python_Engineer\",\n", " is_termination_msg=termination_msg,\n", - " system_message=\"You are a senior python engineer. Reply `TERMINATE` in the end when everything is done.\",\n", + " system_message=\"You are a senior python engineer, you provide python code to answer questions. Reply `TERMINATE` in the end when everything is done.\",\n", " llm_config=llm_config,\n", + " description=\"Senior Python Engineer who can write code to solve problems and answer questions.\",\n", ")\n", "\n", "pm = autogen.AssistantAgent(\n", @@ -187,6 +170,7 @@ " is_termination_msg=termination_msg,\n", " system_message=\"You are a product manager. 
Reply `TERMINATE` in the end when everything is done.\",\n", " llm_config=llm_config,\n", + " description=\"Product Manager who can design and plan the project.\",\n", ")\n", "\n", "reviewer = autogen.AssistantAgent(\n", @@ -194,6 +178,7 @@ " is_termination_msg=termination_msg,\n", " system_message=\"You are a code reviewer. Reply `TERMINATE` in the end when everything is done.\",\n", " llm_config=llm_config,\n", + " description=\"Code Reviewer who can review the code.\",\n", ")\n", "\n", "PROBLEM = \"How to use spark for parallel training in FLAML? Give me sample code.\"\n", @@ -210,7 +195,7 @@ "def rag_chat():\n", " _reset_agents()\n", " groupchat = autogen.GroupChat(\n", - " agents=[boss_aid, coder, pm, reviewer], messages=[], max_round=12, speaker_selection_method=\"round_robin\"\n", + " agents=[boss_aid, pm, coder, reviewer], messages=[], max_round=12, speaker_selection_method=\"round_robin\"\n", " )\n", " manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)\n", "\n", @@ -225,7 +210,7 @@ "def norag_chat():\n", " _reset_agents()\n", " groupchat = autogen.GroupChat(\n", - " agents=[boss, coder, pm, reviewer],\n", + " agents=[boss, pm, coder, reviewer],\n", " messages=[],\n", " max_round=12,\n", " speaker_selection_method=\"auto\",\n", @@ -247,7 +232,13 @@ " # with RAG user proxy agent.\n", " # In order to use RAG user proxy agent, we need to wrap RAG agents in a function and call\n", " # it from other agents.\n", - " def retrieve_content(message, n_results=3):\n", + " def retrieve_content(\n", + " message: Annotated[\n", + " str,\n", + " \"Refined message which keeps the original meaning and can be used to retrieve content for code generation and question answering.\",\n", + " ],\n", + " n_results: int = 3,\n", + " ) -> str:\n", " boss_aid.n_results = n_results # Set the number of results to be retrieved.\n", " # Check if we need to update the context.\n", " update_context_case1, update_context_case2 = boss_aid._check_update_context(message)\n", @@ -260,51 +251,23 @@ "\n", " boss_aid.human_input_mode = \"NEVER\" # Disable human input for boss_aid since it only retrieves content.\n", "\n", - " llm_config = {\n", - " \"functions\": [\n", - " {\n", - " \"name\": \"retrieve_content\",\n", - " \"description\": \"retrieve content for code generation and question answering.\",\n", - " \"parameters\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"message\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"Refined message which keeps the original meaning and can be used to retrieve content for code generation and question answering.\",\n", - " }\n", - " },\n", - " \"required\": [\"message\"],\n", - " },\n", - " },\n", - " ],\n", - " \"config_list\": config_list,\n", - " \"timeout\": 60,\n", - " \"cache_seed\": 42,\n", - " }\n", - "\n", - " for agent in [coder, pm, reviewer]:\n", - " # update llm_config for assistant agents.\n", - " agent.llm_config.update(llm_config)\n", + " for caller in [pm, coder, reviewer]:\n", + " d_retrieve_content = caller.register_for_llm(\n", + " description=\"retrieve content for code generation and question answering.\", api_style=\"function\"\n", + " )(retrieve_content)\n", "\n", - " for agent in [boss, coder, pm, reviewer]:\n", - " # register functions for all agents.\n", - " agent.register_function(\n", - " function_map={\n", - " \"retrieve_content\": retrieve_content,\n", - " }\n", - " )\n", + " for executor in [boss, pm]:\n", + " executor.register_for_execution()(d_retrieve_content)\n", "\n", " groupchat = 
autogen.GroupChat(\n", - " agents=[boss, coder, pm, reviewer],\n", + " agents=[boss, pm, coder, reviewer],\n", " messages=[],\n", " max_round=12,\n", - " speaker_selection_method=\"random\",\n", + " speaker_selection_method=\"round_robin\",\n", " allow_repeat_speaker=False,\n", " )\n", "\n", - " manager_llm_config = llm_config.copy()\n", - " manager_llm_config.pop(\"functions\")\n", - " manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=manager_llm_config)\n", + " manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)\n", "\n", " # Start chatting with the boss as this is the user proxy agent.\n", " boss.initiate_chat(\n", @@ -313,17 +276,6 @@ " )" ] }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# # Uncomment below code if docker is not available.\n", - "# import os\n", - "# os.environ[\"AUTOGEN_USE_DOCKER\"] = \"0\"" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -337,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -351,128 +303,81 @@ "--------------------------------------------------------------------------------\n", "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", - "Sure, here's an example of how to use Spark for parallel training in FLAML:\n", + "To use Spark for parallel training in FLAML, you need to install PySpark by running the following command in your terminal:\n", + "```\n", + "pip install pyspark\n", + "```\n", + "Here's a sample code that shows how to use Spark for parallel training in FLAML:\n", "\n", "```python\n", "from flaml import AutoML\n", "from pyspark.sql import SparkSession\n", "\n", - "# Create a Spark session\n", - "spark = SparkSession.builder \\\n", - " .appName(\"FLAML Example\") \\\n", - " .getOrCreate()\n", + "# create a SparkSession\n", + "spark = SparkSession.builder.appName(\"Spark_FLAML\").getOrCreate()\n", "\n", - "# Load your data into a Spark DataFrame\n", - "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n", + "# load your data into a Spark DataFrame\n", + "data = spark.read.load('/path/to/your/data.csv', format='csv', header='true', inferSchema='true')\n", "\n", - "# Convert the Spark DataFrame to a Pandas DataFrame\n", - "pandas_df = data.toPandas()\n", + "# define the target column and other parameters\n", + "target_column = 'label'\n", + "time_budget = 600 # time_budget is in seconds\n", "\n", - "# Initialize FLAML with Spark\n", + "# create an instance of AutoML with Spark backend\n", "automl = AutoML()\n", - "\n", - "# Set the Spark context\n", - "automl.set_context(spark)\n", - "\n", - "# Specify the search space for hyperparameters\n", - "automl_settings = {\n", - " \"time_budget\": 60, # total running time in seconds\n", - " \"task\": \"classification\", # task type\n", - " \"log_file_name\": \"flaml.log\", # log file name\n", - " \"estimator_list\": [\"lgbm\", \"rf\", \"xgboost\"], # list of ML models\n", - " \"split_type\": \"stratified\", # data split type\n", - " \"eval_method\": \"auto\", # evaluation metric for optimization\n", - " \"n_splits\": 5, # number of splits for cross-validation\n", - " \"seed\": 42 # random seed\n", + "settings = {\n", + " \"time_budget\": time_budget,\n", + " \"task\": \"classification\",\n", + " \"log_file_name\": \"log.txt\",\n", + " \"log_training_metric\": True,\n", "}\n", + "automl.initialize(**settings)\n", + "automl.fit(\n", + " X_train=data.drop(target_column, 
axis=1),\n", + " y_train=data[target_column],\n", + " estimator_list=['lgbm', 'rf', 'xgboost', 'catboost'],\n", + " task='classification',\n", + " time_budget=time_budget,\n", + " spark=spark, # pass the SparkSession to FLAML\n", + ")\n", "\n", - "# Train the model using FLAML\n", - "automl.fit(X_train=pandas_df.drop(columns=[\"target\"]), y_train=pandas_df[\"target\"], **automl_settings)\n", - "\n", - "# Get the best model found by FLAML\n", - "best_model = automl.best_model\n", - "\n", - "# Print the best model's hyperparameters and performance\n", - "print(\"Best model hyperparameters:\", best_model)\n", + "# get the best model\n", + "best_model = automl.model\n", "\n", - "# Terminate the Spark session\n", + "# stop the SparkSession\n", "spark.stop()\n", "\n", - "# Terminate the conversation\n", + "# finally, print the best model found by FLAML\n", + "print(best_model)\n", + "\n", + "# terminate\n", "TERMINATE\n", "```\n", "\n", - "Make sure to replace `\"data.csv\"` with the path to your actual data file. You can also modify the `automl_settings` dictionary to customize the FLAML configuration according to your needs.\n", + "In this example, we first create a SparkSession and load our data into a Spark DataFrame. We then define the target column and other parameters such as `time_budget`. We create an instance of `AutoML` with `Spark` backend and pass the `SparkSession` to FLAML. Then, we call the `fit` method of the `AutoML` object to fit various estimators on our training data in a parallel way using Spark. Finally, we obtain the best model and stop the SparkSession.\n", "\n", - "Remember to install FLAML and its dependencies before running this code.\n", + "I hope this helps! Let me know if you have any other questions.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", "\n", - "Looks good! Just a small correction: `automl.set_context(spark)` should be placed before `automl.fit()` to ensure that the Spark context is set before training begins. Here's the corrected code:\n", - "\n", - "```python\n", - "from flaml import AutoML\n", - "from pyspark.sql import SparkSession\n", - "\n", - "# Create a Spark session\n", - "spark = SparkSession.builder \\\n", - " .appName(\"FLAML Example\") \\\n", - " .getOrCreate()\n", - "\n", - "# Load your data into a Spark DataFrame\n", - "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n", - "\n", - "# Convert the Spark DataFrame to a Pandas DataFrame\n", - "pandas_df = data.toPandas()\n", + "Great explanation! I have no further questions. 
TERMINATE.\n", "\n", - "# Initialize FLAML with Spark\n", - "automl = AutoML()\n", - "\n", - "# Set the Spark context\n", - "automl.set_context(spark)\n", - "\n", - "# Specify the search space for hyperparameters\n", - "automl_settings = {\n", - " \"time_budget\": 60, # total running time in seconds\n", - " \"task\": \"classification\", # task type\n", - " \"log_file_name\": \"flaml.log\", # log file name\n", - " \"estimator_list\": [\"lgbm\", \"rf\", \"xgboost\"], # list of ML models\n", - " \"split_type\": \"stratified\", # data split type\n", - " \"eval_method\": \"auto\", # evaluation metric for optimization\n", - " \"n_splits\": 5, # number of splits for cross-validation\n", - " \"seed\": 42 # random seed\n", - "}\n", - "\n", - "# Set the Spark context\n", - "automl.set_context(spark)\n", - "\n", - "# Train the model using FLAML\n", - "automl.fit(X_train=pandas_df.drop(columns=[\"target\"]), y_train=pandas_df[\"target\"], **automl_settings)\n", - "\n", - "# Get the best model found by FLAML\n", - "best_model = automl.best_model\n", - "\n", - "# Print the best model's hyperparameters and performance\n", - "print(\"Best model hyperparameters:\", best_model)\n", - "\n", - "# Terminate the Spark session\n", - "spark.stop()\n", - "\n", - "# Terminate the conversation\n", - "TERMINATE\n", - "```\n", - "\n", - "Let me know if you have any further questions!\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", - "\n", - "Thank you for catching that mistake! I apologize for the oversight. The correction has been made in the code. I appreciate your attention to detail.\n", - "\n", - "If you have any more questions or need further assistance, feel free to ask.\n", - "\n", - "--------------------------------------------------------------------------------\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GroupChat select_speaker failed to resolve the next speaker's name. This is because the speaker selection OAI call returned:\n", + "The role to play now depends on the situation, as there is no specific request or question from the previous role to continue the conversation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\u001b[33mBoss\u001b[0m (to chat_manager):\n", "\n", "Reply `TERMINATE` if the task is done.\n", @@ -485,14 +390,14 @@ "output_type": "stream", "text": [ "GroupChat select_speaker failed to resolve the next speaker's name. This is because the speaker selection OAI call returned:\n", - "TERMINATE\n" + "No role to play at the moment. 
The conversation is done.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", "TERMINATE\n", "\n", @@ -504,7 +409,8 @@ } ], "source": [ - "norag_chat()" + "with Cache.disk():\n", + " norag_chat()" ] }, { @@ -518,30 +424,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Trying to create collection.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "doc_ids: [['doc_0', 'doc_1']]\n", + "Trying to create collection.\n", + "doc_ids: [['doc_0', 'doc_1', 'doc_122']]\n", "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "\u001b[32mAdding doc_id doc_122 to context.\u001b[0m\n", "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -675,13 +569,234 @@ "\n", "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", "\n", + "2684,4/26/2011,2,0,4,17,0,2,1,1,0.68,0.6364,0.61,0.3582,521\n", + "2685,4/26/2011,2,0,4,18,0,2,1,1,0.68,0.6364,0.65,0.4478,528\n", + "2686,4/26/2011,2,0,4,19,0,2,1,1,0.64,0.6061,0.73,0.4179,328\n", + "2687,4/26/2011,2,0,4,20,0,2,1,1,0.64,0.6061,0.73,0.3582,234\n", + "2688,4/26/2011,2,0,4,21,0,2,1,1,0.62,0.5909,0.78,0.2836,195\n", + "2689,4/26/2011,2,0,4,22,0,2,1,2,0.6,0.5606,0.83,0.194,148\n", + "2690,4/26/2011,2,0,4,23,0,2,1,2,0.6,0.5606,0.83,0.2239,78\n", + "2691,4/27/2011,2,0,4,0,0,3,1,1,0.6,0.5606,0.83,0.2239,27\n", + "2692,4/27/2011,2,0,4,1,0,3,1,1,0.6,0.5606,0.83,0.2537,17\n", + "2693,4/27/2011,2,0,4,2,0,3,1,1,0.58,0.5455,0.88,0.2537,5\n", + "2694,4/27/2011,2,0,4,3,0,3,1,2,0.58,0.5455,0.88,0.2836,7\n", + "2695,4/27/2011,2,0,4,4,0,3,1,1,0.56,0.5303,0.94,0.2239,6\n", + "2696,4/27/2011,2,0,4,5,0,3,1,2,0.56,0.5303,0.94,0.2537,17\n", + "2697,4/27/2011,2,0,4,6,0,3,1,1,0.56,0.5303,0.94,0.2537,84\n", + "2698,4/27/2011,2,0,4,7,0,3,1,2,0.58,0.5455,0.88,0.2836,246\n", + "2699,4/27/2011,2,0,4,8,0,3,1,2,0.58,0.5455,0.88,0.3284,444\n", + "2700,4/27/2011,2,0,4,9,0,3,1,2,0.6,0.5455,0.88,0.4179,181\n", + "2701,4/27/2011,2,0,4,10,0,3,1,2,0.62,0.5758,0.83,0.2836,92\n", + "2702,4/27/2011,2,0,4,11,0,3,1,2,0.64,0.5909,0.78,0.2836,156\n", + "2703,4/27/2011,2,0,4,12,0,3,1,1,0.66,0.6061,0.78,0.3284,173\n", + "2704,4/27/2011,2,0,4,13,0,3,1,1,0.64,0.5909,0.78,0.2985,150\n", + "2705,4/27/2011,2,0,4,14,0,3,1,1,0.68,0.6364,0.74,0.2836,148\n", "\n", "\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", + "\n", + "UPDATE CONTEXT. Please provide more context or clarify your question.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\n", + "Sorry about that. 
Let me clarify my previous question.\n", + "\n", + "How can we use Spark for parallel training in FLAML? Can you provide some sample code?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel Spark jobs.\n", + "\n", + "To perform AutoML on Spark data, you need to first organize your data in the required format for Spark estimators. They require data in pandas-on-spark (psdf) format and [VectorAssembler](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column. Then you can use the `spark_lgbm` estimator in your estimator list for FLAML AutoML. See an example below.\n", + "\n", + "First, use `to_pandas_on_spark` to convert your data into pandas-on-spark format. Here is an example:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "\n", + "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "```\n", + "\n", + "To use Spark ML models, you need to format your data appropriately.\n", + "\n", + "```python\n", + "from pyspark.ml.feature.scaler import StandardScaler\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "data = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\", label]\n", + "data.show()\n", + "```\n", + "\n", + "You can then use FLAML's `AutoML` class to tune hyperparameters and find the best model.\n", + "\n", + "```python\n", + "import flaml\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"],\n", + " \"task\": \"regression\",\n", + "}\n", + "automl.fit(\n", + " psdf=data,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", + "\n", + "To use Spark for parallel training Spark jobs, you set the `use_spark` flag in `Hyperparameter-Tuner` or in `AutoML` to True. 
See example below:\n", + "\n", + "```python\n", + "from sklearn.datasets import load_diabetes\n", + "from flaml import AutoML\n", + "from flaml.data import convert_diabetes_to_spark\n", + "from pyspark.sql.session import SparkSession\n", + "import joblib\n", + "\n", + "data = load_diabetes(as_frame=True, return_X_y=False)\n", + "label = \"target\"\n", + "\n", + "spark = SparkSession.builder.master(\"local[4]\").appName(\"flaml-example\").getOrCreate()\n", + "ps_df = convert_diabetes_to_spark(spark, data, label)\n", + "\n", + "n_trials = 10\n", + "automl = AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_jobs\": -1, # Run wherever we have capacity\n", + " \"use_spark\": True, # Parallel training in Spark workers\n", + " \"n_trials\": n_trials,\n", + "}\n", + "\n", + "with joblib.parallel_backend(\"spark\", batch_size=1):\n", + " result = automl.fit(\n", + " psdf=ps_df,\n", + " label=label,\n", + " **automl_settings,\n", + " )\n", + "\n", + "print(f\"Best ML learner: {result['best_estimator']}\")\n", + "```\n", + "\n", + "Here we have used `joblib.parallel_backend` to specify the parallel backend (Spark). We have also set `use_spark` to True in `automl_settings` and passed it to `automl.fit()`. Spark related arguments have been set in this dict.\n", + "\n", + "I hope this helps!\n", + "\n", + "--------------------------------------------------------------------------------\n", "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", + "\n", + "Great job! That was a comprehensive answer to the question. No further action required.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\n", + "Thank you! Don't hesitate to ask if you have any further questions.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "\n", + "You're welcome! I'm always here to help if you have any other questions or concerns.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", + "\n", + "Thanks for your cooperation. I will close this task now.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\n", + "You're welcome! Feel free to reach out if you need more help in the future. Have a great day!\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "\n", + "Thank you! 
Have a great day ahead!\n",
    "\n",
    "--------------------------------------------------------------------------------\n"
   ]
  }
 ],
 "source": [
  "with Cache.disk():\n",
  "    rag_chat()\n",
  "# type exit to terminate the chat"
 ]
},
{
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "### Call RetrieveUserProxyAgent while initializing chat with another user proxy agent\n",
  "Sometimes you may need to use RetrieveUserProxyAgent in a group chat without initializing the chat with it. In such scenarios, you can create a function that wraps the RAG agents so that they can be called from other agents."
 ]
},
{
 "cell_type": "code",
 "execution_count": 9,
 "metadata": {},
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "\u001b[33mBoss\u001b[0m (to chat_manager):\n",
    "\n",
    "How to use spark for parallel training in FLAML? Give me sample code.\n",
    "\n",
    "--------------------------------------------------------------------------------\n",
    "How to use spark for parallel training in FLAML? Give me sample code.\n",
    "\n",
    "--------------------------------------------------------------------------------\n",
    "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n",
    "\n",
    "\u001b[32m***** Suggested function Call: retrieve_content *****\u001b[0m\n",
    "Arguments: \n",
    "{\n",
    "  \"message\": \"How to use spark for parallel training in FLAML? Give me sample code.\"\n",
    "}\n",
    "\u001b[32m*****************************************************\u001b[0m\n",
    "\n",
    "--------------------------------------------------------------------------------\n",
    "\u001b[35m\n",
    ">>>>>>>> EXECUTING FUNCTION retrieve_content...\u001b[0m\n",
    "Trying to create collection.\n",
    "doc_ids: [['doc_0', 'doc_1', 'doc_122']]\n",
    "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n",
    "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n",
    "\u001b[32mAdding doc_id doc_122 to context.\u001b[0m\n",
    "\u001b[33mBoss\u001b[0m (to chat_manager):\n",
    "\n",
    "\u001b[32m***** Response from calling function \"retrieve_content\" *****\u001b[0m\n",
    "You're a retrieve augmented coding assistant. 
You answer user's questions based on your own knowledge and the\n", "context provided by the user.\n", "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", @@ -813,216 +928,132 @@ "\n", "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", "\n", + "2684,4/26/2011,2,0,4,17,0,2,1,1,0.68,0.6364,0.61,0.3582,521\n", + "2685,4/26/2011,2,0,4,18,0,2,1,1,0.68,0.6364,0.65,0.4478,528\n", + "2686,4/26/2011,2,0,4,19,0,2,1,1,0.64,0.6061,0.73,0.4179,328\n", + "2687,4/26/2011,2,0,4,20,0,2,1,1,0.64,0.6061,0.73,0.3582,234\n", + "2688,4/26/2011,2,0,4,21,0,2,1,1,0.62,0.5909,0.78,0.2836,195\n", + "2689,4/26/2011,2,0,4,22,0,2,1,2,0.6,0.5606,0.83,0.194,148\n", + "2690,4/26/2011,2,0,4,23,0,2,1,2,0.6,0.5606,0.83,0.2239,78\n", + "2691,4/27/2011,2,0,4,0,0,3,1,1,0.6,0.5606,0.83,0.2239,27\n", + "2692,4/27/2011,2,0,4,1,0,3,1,1,0.6,0.5606,0.83,0.2537,17\n", + "2693,4/27/2011,2,0,4,2,0,3,1,1,0.58,0.5455,0.88,0.2537,5\n", + "2694,4/27/2011,2,0,4,3,0,3,1,2,0.58,0.5455,0.88,0.2836,7\n", + "2695,4/27/2011,2,0,4,4,0,3,1,1,0.56,0.5303,0.94,0.2239,6\n", + "2696,4/27/2011,2,0,4,5,0,3,1,2,0.56,0.5303,0.94,0.2537,17\n", + "2697,4/27/2011,2,0,4,6,0,3,1,1,0.56,0.5303,0.94,0.2537,84\n", + "2698,4/27/2011,2,0,4,7,0,3,1,2,0.58,0.5455,0.88,0.2836,246\n", + "2699,4/27/2011,2,0,4,8,0,3,1,2,0.58,0.5455,0.88,0.3284,444\n", + "2700,4/27/2011,2,0,4,9,0,3,1,2,0.6,0.5455,0.88,0.4179,181\n", + "2701,4/27/2011,2,0,4,10,0,3,1,2,0.62,0.5758,0.83,0.2836,92\n", + "2702,4/27/2011,2,0,4,11,0,3,1,2,0.64,0.5909,0.78,0.2836,156\n", + "2703,4/27/2011,2,0,4,12,0,3,1,1,0.66,0.6061,0.78,0.3284,173\n", + "2704,4/27/2011,2,0,4,13,0,3,1,1,0.64,0.5909,0.78,0.2985,150\n", + "2705,4/27/2011,2,0,4,14,0,3,1,1,0.68,0.6364,0.74,0.2836,148\n", + "\n", + "\n", + "\u001b[32m*************************************************************\u001b[0m\n", "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", + "To use Spark for parallel training in FLAML, you need to follow these steps:\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "1. Prepare your data in the required format as Spark data. FLAML provides a utility function `to_pandas_on_spark` to convert your pandas or PySpark data into a pandas-on-spark dataframe. Here is an example:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "\n", + "# Create a pandas DataFrame\n", + "dataframe = pd.DataFrame({\n", + " \"feature1\": [1, 2, 3, 4, 5],\n", + " \"feature2\": [0.1, 0.2, 0.3, 0.4, 0.5],\n", + " \"target\": [10, 20, 30, 40, 50]\n", + "})\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "```\n", + "\n", + "2. Choose the Spark ML estimator you want to use for training. FLAML integrates Spark ML estimators for AutoML. For example, you can use the `lgbm_spark` estimator for LightGBM models. Import the estimator class and create an instance. \n", + "\n", + "3. Define the settings for the AutoML experiment, including the time budget, metric, task type, and the list of estimators to try. 
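For example, a settings dictionary could look like the following sketch (the values are taken from the full example in step 4 below, so nothing here is new API):\n",
    "\n",
    "```python\n",
    "settings = {\n",
    "    \"time_budget\": 30,  # total running time in seconds\n",
    "    \"metric\": \"r2\",  # metric to optimize\n",
    "    \"task\": \"regression\",  # task type\n",
    "    \"estimator_list\": [\"lgbm_spark\"],  # Spark ML estimators to try\n",
    "}\n",
    "```\n",
    "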
\n", "\n", - "To use Spark for parallel training in FLAML, you need to set the `use_spark` parameter to `True` and specify the number of concurrent trials using the `n_concurrent_trials` parameter. Here's a sample code snippet:\n", + "4. Create an instance of `flaml.AutoML` and call the `fit` method, passing the pandas-on-spark dataframe, label column name, and the settings. Here is an example:\n", "\n", "```python\n", "import flaml\n", "\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", + "automl = flaml.AutoML()\n", + "settings = {\n", " \"time_budget\": 30,\n", " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"],\n", " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", "}\n", "\n", - "automl_experiment.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings,\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=\"target\",\n", + " **settings,\n", ")\n", "```\n", "\n", - "Make sure to replace `dataframe` and `label` with your actual data and label. This code will run the FLAML AutoML experiment using Spark for parallel training with 2 concurrent trials.\n", + "This code will run the AutoML experiment using Spark for parallel training with the specified Spark ML estimator.\n", "\n", - "Please let me know if you need any further assistance.\n", + "You can find example notebooks on how to integrate Spark with FLAML in the FLAML GitHub repository:\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", - "\n", - "Great! If you have any more questions, feel free to ask. Otherwise, feel free to terminate the conversation.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "- [Integrating FLAML with Spark for AutoML](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", + "- [Using Spark Estimators with FLAML](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", "\n", - "Thank you for your assistance! I don't have any more questions at the moment. I will terminate the conversation.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", - "\n", - "\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", - "\n", - "Great! If you have any more questions in the future, feel free to ask. Have a great day! TERMINATE\n", - "\n", - "--------------------------------------------------------------------------------\n" - ] - } - ], - "source": [ - "rag_chat()\n", - "# type exit to terminate the chat" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Call RetrieveUserProxyAgent while init chat with another user proxy agent\n", - "Sometimes, there might be a need to use RetrieveUserProxyAgent in group chat without initializing the chat with it. In such scenarios, it becomes essential to create a function that wraps the RAG agents and allows them to be called from other agents." 
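    "\n",
    "A condensed sketch of such a wrapper is shown below. It assumes a `RetrieveUserProxyAgent` instance named `boss_aid`, as defined earlier in this notebook; the full `retrieve_content` function used later additionally handles context updates:\n",
    "\n",
    "```python\n",
    "def retrieve_content(message: str, n_results: int = 3) -> str:\n",
    "    # Let the RAG user proxy retrieve documents and build an augmented prompt.\n",
    "    boss_aid.n_results = n_results  # number of results to retrieve\n",
    "    ret_msg = boss_aid.generate_init_message(message, n_results=n_results)\n",
    "    return ret_msg if ret_msg else message\n",
    "```"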
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "How to use spark for parallel training in FLAML? Give me sample code.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "How to use spark for parallel training in FLAML? Give me sample code.\n", + "Please let me know if you need any further assistance.\n", "\n", "--------------------------------------------------------------------------------\n", "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", - "Sure, here's an example of how to use Spark for parallel training in FLAML:\n", + "Here is the sample code to use Spark for parallel training in FLAML:\n", "\n", "```python\n", - "from flaml import AutoML\n", - "from pyspark.sql import SparkSession\n", - "\n", - "# Create a Spark session\n", - "spark = SparkSession.builder \\\n", - " .appName(\"FLAML Example\") \\\n", - " .getOrCreate()\n", - "\n", - "# Load your data into a Spark DataFrame\n", - "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n", - "\n", - "# Convert the Spark DataFrame to a Pandas DataFrame\n", - "pandas_df = data.toPandas()\n", - "\n", - "# Initialize FLAML with Spark\n", - "automl = AutoML()\n", - "\n", - "# Set the Spark context\n", - "automl.set_context(spark)\n", + "import pandas as pd\n", + "from flaml.automl import AutoML\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", "\n", - "# Specify the search space for hyperparameters\n", - "automl_settings = {\n", - " \"time_budget\": 60, # total running time in seconds\n", - " \"task\": \"classification\", # task type\n", - " \"log_file_name\": \"flaml.log\", # log file name\n", - " \"estimator_list\": [\"lgbm\", \"rf\", \"xgboost\"], # list of ML models\n", - " \"split_type\": \"stratified\", # data split type\n", - " \"eval_method\": \"auto\", # evaluation metric for optimization\n", - " \"n_splits\": 5, # number of splits for cross-validation\n", - " \"seed\": 42 # random seed\n", + "# Step 1: Prepare data\n", + "data = {\n", + " \"feature1\": [1, 2, 3, 4, 5],\n", + " \"feature2\": [0.1, 0.2, 0.3, 0.4, 0.5],\n", + " \"target\": [10, 20, 30, 40, 50]\n", "}\n", "\n", - "# Train the model using FLAML\n", - "automl.fit(X_train=pandas_df.drop(columns=[\"target\"]), y_train=pandas_df[\"target\"], **automl_settings)\n", - "\n", - "# Get the best model found by FLAML\n", - "best_model = automl.best_model\n", - "\n", - "# Print the best model's hyperparameters and performance\n", - "print(\"Best model hyperparameters:\", best_model)\n", - "\n", - "# Terminate the Spark session\n", - "spark.stop()\n", - "\n", - "# Terminate the conversation\n", - "TERMINATE\n", - "```\n", - "\n", - "Make sure to replace `\"data.csv\"` with the path to your actual data file. You can also modify the `automl_settings` dictionary to customize the FLAML configuration according to your needs.\n", - "\n", - "Remember to install FLAML and its dependencies before running this code.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", - "\n", - "Looks good! Just a small correction: `automl.set_context(spark)` should be placed before `automl.fit()` to ensure that the Spark context is set before training begins. 
Here's the corrected code:\n", - "\n", - "```python\n", - "from flaml import AutoML\n", - "from pyspark.sql import SparkSession\n", - "\n", - "# Create a Spark session\n", - "spark = SparkSession.builder \\\n", - " .appName(\"FLAML Example\") \\\n", - " .getOrCreate()\n", - "\n", - "# Load your data into a Spark DataFrame\n", - "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"data.csv\")\n", + "dataframe = pd.DataFrame(data)\n", + "psdf = to_pandas_on_spark(dataframe)\n", "\n", - "# Convert the Spark DataFrame to a Pandas DataFrame\n", - "pandas_df = data.toPandas()\n", + "# Step 2: Choose estimator and define settings\n", + "estimator_list = [\"lgbm_spark\"]\n", + "time_budget = 30\n", + "metric = 'r2'\n", + "task = 'regression'\n", "\n", - "# Initialize FLAML with Spark\n", + "# Step 3: Run AutoML with Spark\n", "automl = AutoML()\n", - "\n", - "# Set the Spark context\n", - "automl.set_context(spark)\n", - "\n", - "# Specify the search space for hyperparameters\n", "automl_settings = {\n", - " \"time_budget\": 60, # total running time in seconds\n", - " \"task\": \"classification\", # task type\n", - " \"log_file_name\": \"flaml.log\", # log file name\n", - " \"estimator_list\": [\"lgbm\", \"rf\", \"xgboost\"], # list of ML models\n", - " \"split_type\": \"stratified\", # data split type\n", - " \"eval_method\": \"auto\", # evaluation metric for optimization\n", - " \"n_splits\": 5, # number of splits for cross-validation\n", - " \"seed\": 42 # random seed\n", + " \"time_budget\": time_budget,\n", + " \"metric\": metric,\n", + " \"task\": task,\n", + " \"estimator_list\": estimator_list,\n", + " \"use_spark\": True,\n", "}\n", "\n", - "# Set the Spark context\n", - "automl.set_context(spark)\n", - "\n", - "# Train the model using FLAML\n", - "automl.fit(X_train=pandas_df.drop(columns=[\"target\"]), y_train=pandas_df[\"target\"], **automl_settings)\n", - "\n", - "# Get the best model found by FLAML\n", - "best_model = automl.best_model\n", - "\n", - "# Print the best model's hyperparameters and performance\n", - "print(\"Best model hyperparameters:\", best_model)\n", - "\n", - "# Terminate the Spark session\n", - "spark.stop()\n", - "\n", - "# Terminate the conversation\n", - "TERMINATE\n", + "automl.fit(dataframe=psdf, label=\"target\", **automl_settings)\n", "```\n", "\n", - "Let me know if you have any further questions!\n", + "Make sure you have Spark installed and properly configured. This code uses the `lgbm_spark` estimator for LightGBM models, runs the AutoML for a maximum of 30 seconds, and uses \"r2\" as the evaluation metric. 
The `use_spark` parameter is set to `True` to enable parallel training using Spark.\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", - "\n", - "Reply `TERMINATE` if the task is done.\n", + "Please let me know if you need any further assistance.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", "\n", "TERMINATE\n", "\n", @@ -1031,7 +1062,8 @@ } ], "source": [ - "call_rag_chat()" + "with Cache.disk():\n", + " call_rag_chat()" ] }, { @@ -1058,7 +1090,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.13" } }, "nbformat": 4, From 7603f5207b88c68d801096b0e14b88eb698d1801 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 13 Feb 2024 19:02:09 +0800 Subject: [PATCH 3/4] Reran notebook --- notebook/agentchat_groupchat_RAG.ipynb | 503 ++++++++++++++----------- 1 file changed, 290 insertions(+), 213 deletions(-) diff --git a/notebook/agentchat_groupchat_RAG.ipynb b/notebook/agentchat_groupchat_RAG.ipynb index 3ff363185796..a70195f1922b 100644 --- a/notebook/agentchat_groupchat_RAG.ipynb +++ b/notebook/agentchat_groupchat_RAG.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -116,12 +116,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "llm_config = {\n", " \"timeout\": 60,\n", + " \"temperature\": 0,\n", " \"config_list\": config_list,\n", "}\n", "\n", @@ -237,7 +238,7 @@ " str,\n", " \"Refined message which keeps the original meaning and can be used to retrieve content for code generation and question answering.\",\n", " ],\n", - " n_results: int = 3,\n", + " n_results: Annotated[int, \"number of results\"] = 3,\n", " ) -> str:\n", " boss_aid.n_results = n_results # Set the number of results to be retrieved.\n", " # Check if we need to update the context.\n", @@ -289,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -300,108 +301,120 @@ "\n", "How to use spark for parallel training in FLAML? Give me sample code.\n", "\n", - "--------------------------------------------------------------------------------\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", - "To use Spark for parallel training in FLAML, you need to install PySpark by running the following command in your terminal:\n", - "```\n", - "pip install pyspark\n", - "```\n", - "Here's a sample code that shows how to use Spark for parallel training in FLAML:\n", + "To use Spark for parallel training in FLAML, you need to set up a Spark cluster and configure FLAML to use Spark as the backend. 
Here's a sample code to demonstrate how to use Spark for parallel training in FLAML:\n", "\n", "```python\n", "from flaml import AutoML\n", "from pyspark.sql import SparkSession\n", "\n", - "# create a SparkSession\n", - "spark = SparkSession.builder.appName(\"Spark_FLAML\").getOrCreate()\n", - "\n", - "# load your data into a Spark DataFrame\n", - "data = spark.read.load('/path/to/your/data.csv', format='csv', header='true', inferSchema='true')\n", + "# Create a Spark session\n", + "spark = SparkSession.builder \\\n", + " .appName(\"FLAML with Spark\") \\\n", + " .getOrCreate()\n", "\n", - "# define the target column and other parameters\n", - "target_column = 'label'\n", - "time_budget = 600 # time_budget is in seconds\n", + "# Load your data into a Spark DataFrame\n", + "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"your_data.csv\")\n", "\n", - "# create an instance of AutoML with Spark backend\n", + "# Initialize FLAML with Spark backend\n", "automl = AutoML()\n", + "automl.initialize(spark=spark)\n", + "\n", + "# Specify the search space and other settings\n", "settings = {\n", - " \"time_budget\": time_budget,\n", - " \"task\": \"classification\",\n", - " \"log_file_name\": \"log.txt\",\n", - " \"log_training_metric\": True,\n", + " \"time_budget\": 60, # total time in seconds\n", + " \"metric\": 'accuracy',\n", + " \"task\": 'classification',\n", + " \"log_file_name\": 'flaml.log',\n", "}\n", - "automl.initialize(**settings)\n", - "automl.fit(\n", - " X_train=data.drop(target_column, axis=1),\n", - " y_train=data[target_column],\n", - " estimator_list=['lgbm', 'rf', 'xgboost', 'catboost'],\n", - " task='classification',\n", - " time_budget=time_budget,\n", - " spark=spark, # pass the SparkSession to FLAML\n", - ")\n", "\n", - "# get the best model\n", - "best_model = automl.model\n", + "# Train and tune the model using FLAML\n", + "automl.fit(data=data, **settings)\n", "\n", - "# stop the SparkSession\n", - "spark.stop()\n", + "# Get the best model and its hyperparameters\n", + "best_model = automl.best_model\n", + "best_config = automl.best_config\n", "\n", - "# finally, print the best model found by FLAML\n", - "print(best_model)\n", + "# Print the best model and its hyperparameters\n", + "print(\"Best model:\", best_model)\n", + "print(\"Best hyperparameters:\", best_config)\n", "\n", - "# terminate\n", - "TERMINATE\n", + "# Terminate the Spark session\n", + "spark.stop()\n", "```\n", "\n", - "In this example, we first create a SparkSession and load our data into a Spark DataFrame. We then define the target column and other parameters such as `time_budget`. We create an instance of `AutoML` with `Spark` backend and pass the `SparkSession` to FLAML. Then, we call the `fit` method of the `AutoML` object to fit various estimators on our training data in a parallel way using Spark. Finally, we obtain the best model and stop the SparkSession.\n", + "Make sure to replace `\"your_data.csv\"` with the path to your actual data file. Adjust the `settings` dictionary according to your requirements.\n", + "\n", + "This code initializes a Spark session, loads the data into a Spark DataFrame, and then uses FLAML's `AutoML` class to train and tune a model in parallel using Spark. Finally, it prints the best model and its hyperparameters.\n", "\n", - "I hope this helps! 
Let me know if you have any other questions.\n", + "Remember to install FLAML and PySpark before running this code.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "To use Spark for parallel training in FLAML, you need to set up a Spark cluster and configure FLAML to use Spark as the backend. Here's a sample code to demonstrate how to use Spark for parallel training in FLAML:\n", "\n", - "Great explanation! I have no further questions. TERMINATE.\n", + "```python\n", + "from flaml import AutoML\n", + "from pyspark.sql import SparkSession\n", "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GroupChat select_speaker failed to resolve the next speaker's name. This is because the speaker selection OAI call returned:\n", - "The role to play now depends on the situation, as there is no specific request or question from the previous role to continue the conversation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mBoss\u001b[0m (to chat_manager):\n", + "# Create a Spark session\n", + "spark = SparkSession.builder \\\n", + " .appName(\"FLAML with Spark\") \\\n", + " .getOrCreate()\n", "\n", - "Reply `TERMINATE` if the task is done.\n", + "# Load your data into a Spark DataFrame\n", + "data = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"your_data.csv\")\n", "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GroupChat select_speaker failed to resolve the next speaker's name. This is because the speaker selection OAI call returned:\n", - "No role to play at the moment. The conversation is done.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", + "# Initialize FLAML with Spark backend\n", + "automl = AutoML()\n", + "automl.initialize(spark=spark)\n", "\n", - "TERMINATE\n", + "# Specify the search space and other settings\n", + "settings = {\n", + " \"time_budget\": 60, # total time in seconds\n", + " \"metric\": 'accuracy',\n", + " \"task\": 'classification',\n", + " \"log_file_name\": 'flaml.log',\n", + "}\n", + "\n", + "# Train and tune the model using FLAML\n", + "automl.fit(data=data, **settings)\n", + "\n", + "# Get the best model and its hyperparameters\n", + "best_model = automl.best_model\n", + "best_config = automl.best_config\n", + "\n", + "# Print the best model and its hyperparameters\n", + "print(\"Best model:\", best_model)\n", + "print(\"Best hyperparameters:\", best_config)\n", + "\n", + "# Terminate the Spark session\n", + "spark.stop()\n", + "```\n", + "\n", + "Make sure to replace `\"your_data.csv\"` with the path to your actual data file. Adjust the `settings` dictionary according to your requirements.\n", + "\n", + "This code initializes a Spark session, loads the data into a Spark DataFrame, and then uses FLAML's `AutoML` class to train and tune a model in parallel using Spark. Finally, it prints the best model and its hyperparameters.\n", + "\n", + "Remember to install FLAML and PySpark before running this code.\n", "\n", "--------------------------------------------------------------------------------\n", + "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "\n", + "Looks good to me! 
The code demonstrates how to use Spark for parallel training in FLAML. It initializes a Spark session, loads the data into a Spark DataFrame, and then uses FLAML's `AutoML` class to train and tune a model in parallel using Spark. Finally, it prints the best model and its hyperparameters. Just make sure to replace `\"your_data.csv\"` with the actual path to the data file and adjust the `settings` dictionary as needed. \n", + "\n", + "If there are no further questions, I will terminate.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", + "\n", "TERMINATE\n", "\n", "--------------------------------------------------------------------------------\n" @@ -424,14 +437,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Trying to create collection.\n", "doc_ids: [['doc_0', 'doc_1', 'doc_122']]\n", "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", @@ -594,152 +606,256 @@ "\n", "\n", "\n", - "--------------------------------------------------------------------------------\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", - "UPDATE CONTEXT. Please provide more context or clarify your question.\n", + "To use Spark for parallel training in FLAML, you can follow these steps:\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "1. Prepare your data in the required format using the `to_pandas_on_spark` function from the `flaml.automl.spark.utils` module. This function converts your data into a pandas-on-spark dataframe, which is required by Spark estimators. Here is an example code snippet:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", "\n", - "Sorry about that. Let me clarify my previous question.\n", + "# Creating a dictionary\n", + "data = {\n", + " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]\n", + "}\n", "\n", - "How can we use Spark for parallel training in FLAML? Can you provide some sample code?\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "```\n", "\n", - "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "2. Format your data appropriately for Spark ML models. Use the `VectorAssembler` from `pyspark.ml.feature` to merge all feature columns into a single vector column. 
Here is an example:\n", "\n", - "- Use Spark ML estimators for AutoML.\n", - "- Use Spark to run training in parallel Spark jobs.\n", + "```python\n", + "from pyspark.ml.feature import VectorAssembler\n", "\n", - "To perform AutoML on Spark data, you need to first organize your data in the required format for Spark estimators. They require data in pandas-on-spark (psdf) format and [VectorAssembler](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column. Then you can use the `spark_lgbm` estimator in your estimator list for FLAML AutoML. See an example below.\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "```\n", + "\n", + "3. Use the Spark ML models in FLAML's AutoML. Include the models you want to try in the `estimator_list` argument to `flaml.AutoML()`. FLAML will start trying configurations for these models. Here is an example code snippet:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"],\n", + " \"task\": \"regression\"\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings\n", + ")\n", + "```\n", "\n", - "First, use `to_pandas_on_spark` to convert your data into pandas-on-spark format. Here is an example:\n", + "4. To enable parallel Spark jobs during parallel tuning, set the `use_spark` parameter to `True`. FLAML will dispatch your job to the distributed Spark backend using `joblib-spark`. Here is an example code snippet:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings\n", + ")\n", + "```\n", + "\n", + "Please note that you should not set `use_spark` to `True` when applying AutoML and Tuning for Spark Data, as SparkML models will be used for Spark Data in AutoML and Tuning.\n", + "\n", + "Let me know if you need anything else.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "To use Spark for parallel training in FLAML, you can follow these steps:\n", + "\n", + "1. Prepare your data in the required format using the `to_pandas_on_spark` function from the `flaml.automl.spark.utils` module. This function converts your data into a pandas-on-spark dataframe, which is required by Spark estimators. 
Here is an example code snippet:\n", "\n", "```python\n", "import pandas as pd\n", "from flaml.automl.spark.utils import to_pandas_on_spark\n", "\n", - "data = {\"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000]}\n", + "# Creating a dictionary\n", + "data = {\n", + " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]\n", + "}\n", + "\n", + "# Creating a pandas DataFrame\n", "dataframe = pd.DataFrame(data)\n", "label = \"Price\"\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", "psdf = to_pandas_on_spark(dataframe)\n", "```\n", "\n", - "To use Spark ML models, you need to format your data appropriately.\n", + "2. Format your data appropriately for Spark ML models. Use the `VectorAssembler` from `pyspark.ml.feature` to merge all feature columns into a single vector column. Here is an example:\n", "\n", "```python\n", - "from pyspark.ml.feature.scaler import StandardScaler\n", "from pyspark.ml.feature import VectorAssembler\n", "\n", "columns = psdf.columns\n", "feature_cols = [col for col in columns if col != label]\n", "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "data = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\", label]\n", - "data.show()\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", "```\n", "\n", - "You can then use FLAML's `AutoML` class to tune hyperparameters and find the best model.\n", + "3. Use the Spark ML models in FLAML's AutoML. Include the models you want to try in the `estimator_list` argument to `flaml.AutoML()`. FLAML will start trying configurations for these models. Here is an example code snippet:\n", "\n", "```python\n", "import flaml\n", + "\n", "automl = flaml.AutoML()\n", "settings = {\n", " \"time_budget\": 30,\n", " \"metric\": \"r2\",\n", " \"estimator_list\": [\"lgbm_spark\"],\n", - " \"task\": \"regression\",\n", + " \"task\": \"regression\"\n", "}\n", + "\n", "automl.fit(\n", - " psdf=data,\n", + " dataframe=psdf,\n", " label=label,\n", - " **settings,\n", + " **settings\n", ")\n", "```\n", "\n", - "To use Spark for parallel training Spark jobs, you set the `use_spark` flag in `Hyperparameter-Tuner` or in `AutoML` to True. See example below:\n", + "4. To enable parallel Spark jobs during parallel tuning, set the `use_spark` parameter to `True`. FLAML will dispatch your job to the distributed Spark backend using `joblib-spark`. 
Here is an example code snippet:\n", "\n", "```python\n", - "from sklearn.datasets import load_diabetes\n", - "from flaml import AutoML\n", - "from flaml.data import convert_diabetes_to_spark\n", - "from pyspark.sql.session import SparkSession\n", - "import joblib\n", - "\n", - "data = load_diabetes(as_frame=True, return_X_y=False)\n", - "label = \"target\"\n", - "\n", - "spark = SparkSession.builder.master(\"local[4]\").appName(\"flaml-example\").getOrCreate()\n", - "ps_df = convert_diabetes_to_spark(spark, data, label)\n", + "import flaml\n", "\n", - "n_trials = 10\n", - "automl = AutoML()\n", + "automl_experiment = flaml.AutoML()\n", "automl_settings = {\n", " \"time_budget\": 30,\n", " \"metric\": \"r2\",\n", " \"task\": \"regression\",\n", - " \"n_jobs\": -1, # Run wherever we have capacity\n", - " \"use_spark\": True, # Parallel training in Spark workers\n", - " \"n_trials\": n_trials,\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True\n", "}\n", "\n", - "with joblib.parallel_backend(\"spark\", batch_size=1):\n", - " result = automl.fit(\n", - " psdf=ps_df,\n", - " label=label,\n", - " **automl_settings,\n", - " )\n", - "\n", - "print(f\"Best ML learner: {result['best_estimator']}\")\n", + "automl.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings\n", + ")\n", "```\n", "\n", - "Here we have used `joblib.parallel_backend` to specify the parallel backend (Spark). We have also set `use_spark` to True in `automl_settings` and passed it to `automl.fit()`. Spark related arguments have been set in this dict.\n", + "Please note that you should not set `use_spark` to `True` when applying AutoML and Tuning for Spark Data, as SparkML models will be used for Spark Data in AutoML and Tuning.\n", "\n", - "I hope this helps!\n", + "Let me know if you need anything else.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", + "Here is the sample code to use Spark for parallel training in FLAML:\n", "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "from pyspark.ml.feature import VectorAssembler\n", + "import flaml\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", + "# Step 1: Prepare your data in the required format\n", + "data = {\n", + " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]\n", + "}\n", "\n", - "Great job! That was a comprehensive answer to the question. No further action required.\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "psdf = to_pandas_on_spark(dataframe)\n", "\n", - "Thank you! 
Don't hesitate to ask if you have any further questions.\n", + "# Step 2: Format your data for Spark ML models\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "# Step 3: Use Spark ML models in FLAML's AutoML\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"],\n", + " \"task\": \"regression\"\n", + "}\n", "\n", - "You're welcome! I'm always here to help if you have any other questions or concerns.\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings\n", + ")\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", + "# Step 4: Enable parallel Spark jobs during parallel tuning\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True\n", + "}\n", "\n", + "automl.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings\n", + ")\n", + "```\n", "\n", + "Let me know if you need anything else.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", + "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", "\n", - "Thanks for your cooperation. I will close this task now.\n", + "The code you provided is correct and follows the guidelines for using Spark for parallel training in FLAML. It includes the necessary steps to prepare the data, format it for Spark ML models, and use Spark ML models in FLAML's AutoML. It also demonstrates how to enable parallel Spark jobs during parallel tuning.\n", + "\n", + "Great job! You can now terminate the conversation.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", + "\u001b[33mBoss_Assistant\u001b[0m (to chat_manager):\n", + "\n", "\n", - "You're welcome! Feel free to reach out if you need more help in the future. Have a great day!\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", - "Thank you! 
Have a great day ahead!\n", + "TERMINATE\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -762,7 +878,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -789,7 +905,6 @@ "--------------------------------------------------------------------------------\n", "\u001b[35m\n", ">>>>>>>> EXECUTING FUNCTION retrieve_content...\u001b[0m\n", - "Trying to create collection.\n", "doc_ids: [['doc_0', 'doc_1', 'doc_122']]\n", "\u001b[32mAdding doc_id doc_0 to context.\u001b[0m\n", "\u001b[32mAdding doc_id doc_1 to context.\u001b[0m\n", @@ -957,30 +1072,30 @@ "--------------------------------------------------------------------------------\n", "\u001b[33mProduct_Manager\u001b[0m (to chat_manager):\n", "\n", - "To use Spark for parallel training in FLAML, you need to follow these steps:\n", + "To use Spark for parallel training in FLAML, you can follow these steps:\n", "\n", - "1. Prepare your data in the required format as Spark data. FLAML provides a utility function `to_pandas_on_spark` to convert your pandas or PySpark data into a pandas-on-spark dataframe. Here is an example:\n", + "1. Prepare your data in the required format using Spark data. You can use the `to_pandas_on_spark` function from the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark dataframe.\n", "\n", "```python\n", "import pandas as pd\n", "from flaml.automl.spark.utils import to_pandas_on_spark\n", "\n", - "# Create a pandas DataFrame\n", - "dataframe = pd.DataFrame({\n", - " \"feature1\": [1, 2, 3, 4, 5],\n", - " \"feature2\": [0.1, 0.2, 0.3, 0.4, 0.5],\n", - " \"target\": [10, 20, 30, 40, 50]\n", - "})\n", + "# Creating a dictionary\n", + "data = {\n", + " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000]\n", + "}\n", + "\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", "\n", "# Convert to pandas-on-spark dataframe\n", "psdf = to_pandas_on_spark(dataframe)\n", "```\n", "\n", - "2. Choose the Spark ML estimator you want to use for training. FLAML integrates Spark ML estimators for AutoML. For example, you can use the `lgbm_spark` estimator for LightGBM models. Import the estimator class and create an instance. \n", - "\n", - "3. Define the settings for the AutoML experiment, including the time budget, metric, task type, and the list of estimators to try. \n", - "\n", - "4. Create an instance of `flaml.AutoML` and call the `fit` method, passing the pandas-on-spark dataframe, label column name, and the settings. Here is an example:\n", + "2. Use the Spark ML estimators provided by FLAML. You can include the models you want to try in the `estimator_list` argument of the `flaml.AutoML` class. 
FLAML will start trying configurations for these models.\n", "\n", "```python\n", "import flaml\n", @@ -989,71 +1104,33 @@ "settings = {\n", " \"time_budget\": 30,\n", " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"],\n", + " \"estimator_list\": [\"lgbm_spark\"], # Optional: specify the Spark estimator\n", " \"task\": \"regression\",\n", "}\n", "\n", "automl.fit(\n", " dataframe=psdf,\n", - " label=\"target\",\n", + " label=label,\n", " **settings,\n", ")\n", "```\n", "\n", - "This code will run the AutoML experiment using Spark for parallel training with the specified Spark ML estimator.\n", - "\n", - "You can find example notebooks on how to integrate Spark with FLAML in the FLAML GitHub repository:\n", - "\n", - "- [Integrating FLAML with Spark for AutoML](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", - "- [Using Spark Estimators with FLAML](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", - "\n", - "Please let me know if you need any further assistance.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", - "\n", - "Here is the sample code to use Spark for parallel training in FLAML:\n", + "3. Enable parallel Spark jobs by setting the `use_spark` parameter to `True` in the `fit` method. This will dispatch the job to the distributed Spark backend using `joblib-spark`.\n", "\n", "```python\n", - "import pandas as pd\n", - "from flaml.automl import AutoML\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "\n", - "# Step 1: Prepare data\n", - "data = {\n", - " \"feature1\": [1, 2, 3, 4, 5],\n", - " \"feature2\": [0.1, 0.2, 0.3, 0.4, 0.5],\n", - " \"target\": [10, 20, 30, 40, 50]\n", - "}\n", - "\n", - "dataframe = pd.DataFrame(data)\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "\n", - "# Step 2: Choose estimator and define settings\n", - "estimator_list = [\"lgbm_spark\"]\n", - "time_budget = 30\n", - "metric = 'r2'\n", - "task = 'regression'\n", - "\n", - "# Step 3: Run AutoML with Spark\n", - "automl = AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": time_budget,\n", - " \"metric\": metric,\n", - " \"task\": task,\n", - " \"estimator_list\": estimator_list,\n", - " \"use_spark\": True,\n", - "}\n", - "\n", - "automl.fit(dataframe=psdf, label=\"target\", **automl_settings)\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " use_spark=True,\n", + ")\n", "```\n", "\n", - "Make sure you have Spark installed and properly configured. This code uses the `lgbm_spark` estimator for LightGBM models, runs the AutoML for a maximum of 30 seconds, and uses \"r2\" as the evaluation metric. 
The `use_spark` parameter is set to `True` to enable parallel training using Spark.\n", + "Note: Make sure you have Spark installed and configured properly before running the code.\n", "\n", "Please let me know if you need any further assistance.\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33mCode_Reviewer\u001b[0m (to chat_manager):\n", + "\u001b[33mSenior_Python_Engineer\u001b[0m (to chat_manager):\n", "\n", "TERMINATE\n", "\n", From 34c6000e276c90f577ed364e986425bec23145da Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Thu, 15 Feb 2024 12:36:19 +0800 Subject: [PATCH 4/4] Fix usage of cache --- notebook/agentchat_function_call.ipynb | 3 ++- notebook/agentchat_function_call_async.ipynb | 6 ++++-- ...gentchat_function_call_currency_calculator.ipynb | 11 ++++++----- notebook/agentchat_groupchat_RAG.ipynb | 12 ++++-------- test/agentchat/contrib/test_web_surfer.py | 13 ++++++------- 5 files changed, 22 insertions(+), 23 deletions(-) diff --git a/notebook/agentchat_function_call.ipynb b/notebook/agentchat_function_call.ipynb index 888206401fa1..834f77772de4 100644 --- a/notebook/agentchat_function_call.ipynb +++ b/notebook/agentchat_function_call.ipynb @@ -298,11 +298,12 @@ } ], "source": [ - "with Cache.disk():\n", + "with Cache.disk() as cache:\n", " # start the conversation\n", " user_proxy.initiate_chat(\n", " chatbot,\n", " message=\"Draw two agents chatting with each other with an example dialog. Don't add plt.show().\",\n", + " cache=cache,\n", " )" ] }, diff --git a/notebook/agentchat_function_call_async.ipynb b/notebook/agentchat_function_call_async.ipynb index 962708dff118..e7da183f5431 100644 --- a/notebook/agentchat_function_call_async.ipynb +++ b/notebook/agentchat_function_call_async.ipynb @@ -202,10 +202,11 @@ } ], "source": [ - "with Cache.disk():\n", + "with Cache.disk() as cache:\n", " await user_proxy.a_initiate_chat( # noqa: F704\n", " coder,\n", " message=\"Create a timer for 5 seconds and then a stopwatch for 5 seconds.\",\n", + " cache=cache,\n", " )" ] }, @@ -347,10 +348,11 @@ "3) when 1 and 2 are done, terminate the group chat\n", "\"\"\"\n", "\n", - "with Cache.disk():\n", + "with Cache.disk() as cache:\n", " await user_proxy.a_initiate_chat( # noqa: F704\n", " manager,\n", " message=message,\n", + " cache=cache,\n", " )" ] }, diff --git a/notebook/agentchat_function_call_currency_calculator.ipynb b/notebook/agentchat_function_call_currency_calculator.ipynb index d498e020e37e..798432e0f489 100644 --- a/notebook/agentchat_function_call_currency_calculator.ipynb +++ b/notebook/agentchat_function_call_currency_calculator.ipynb @@ -300,10 +300,10 @@ } ], "source": [ - "with Cache.disk():\n", + "with Cache.disk() as cache:\n", " # start the conversation\n", " res = user_proxy.initiate_chat(\n", - " chatbot, message=\"How much is 123.45 USD in EUR?\", summary_method=\"reflection_with_llm\"\n", + " chatbot, message=\"How much is 123.45 USD in EUR?\", summary_method=\"reflection_with_llm\", cache=cache\n", " )" ] }, @@ -485,10 +485,10 @@ } ], "source": [ - "with Cache.disk():\n", + "with Cache.disk() as cache:\n", " # start the conversation\n", " res = user_proxy.initiate_chat(\n", - " chatbot, message=\"How much is 112.23 Euros in US Dollars?\", summary_method=\"reflection_with_llm\"\n", + " chatbot, message=\"How much is 112.23 Euros in US Dollars?\", summary_method=\"reflection_with_llm\", cache=cache\n", " )" ] }, @@ -563,11 +563,12 @@ } ], "source": [ - "with Cache.disk():\n", + "with Cache.disk() as 
cache:\n", " # start the conversation\n", " res = user_proxy.initiate_chat(\n", " chatbot,\n", " message=\"How much is 123.45 US Dollars in Euros?\",\n", + " cache=cache,\n", " )" ] }, diff --git a/notebook/agentchat_groupchat_RAG.ipynb b/notebook/agentchat_groupchat_RAG.ipynb index 1220d0537241..6e06a0ec23a4 100644 --- a/notebook/agentchat_groupchat_RAG.ipynb +++ b/notebook/agentchat_groupchat_RAG.ipynb @@ -63,7 +63,6 @@ "import chromadb\n", "\n", "import autogen\n", - "from autogen.cache import Cache\n", "from autogen import AssistantAgent\n", "from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent\n", "\n", @@ -394,8 +393,7 @@ } ], "source": [ - "with Cache.disk():\n", - " norag_chat()" + "norag_chat()" ] }, { @@ -834,8 +832,7 @@ } ], "source": [ - "with Cache.disk():\n", - " rag_chat()\n", + "rag_chat()\n", "# type exit to terminate the chat" ] }, @@ -1111,8 +1108,7 @@ } ], "source": [ - "with Cache.disk():\n", - " call_rag_chat()" + "call_rag_chat()" ] }, { @@ -1139,7 +1135,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/test/agentchat/contrib/test_web_surfer.py b/test/agentchat/contrib/test_web_surfer.py index aa28a540dae5..d5dae0beb1cc 100644 --- a/test/agentchat/contrib/test_web_surfer.py +++ b/test/agentchat/contrib/test_web_surfer.py @@ -135,17 +135,16 @@ def test_web_surfer_oai() -> None: is_termination_msg=lambda x: True, ) - with Cache.disk(): - # Make some requests that should test function calling - user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") + # Make some requests that should test function calling + user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") - user_proxy.initiate_chat(web_surfer, message="Please scroll down.") + user_proxy.initiate_chat(web_surfer, message="Please scroll down.") - user_proxy.initiate_chat(web_surfer, message="Please scroll up.") + user_proxy.initiate_chat(web_surfer, message="Please scroll up.") - user_proxy.initiate_chat(web_surfer, message="When was it founded?") + user_proxy.initiate_chat(web_surfer, message="When was it founded?") - user_proxy.initiate_chat(web_surfer, message="What's this page about?") + user_proxy.initiate_chat(web_surfer, message="What's this page about?") @pytest.mark.skipif(