From ab3ff53cc5dfa638c8ef7731fbe0d33f505802f3 Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Tue, 4 Jun 2024 09:46:14 -0500 Subject: [PATCH 01/14] Calculate the dimension size based off model chosen. --- .../agentchat/contrib/vectordb/pgvectordb.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index 38507cb7998e..9eb527755de3 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -526,12 +526,15 @@ def delete_collection(self, collection_name: Optional[str] = None) -> None: cursor.execute(f"DROP TABLE IF EXISTS {self.name}") cursor.close() - def create_collection(self, collection_name: Optional[str] = None) -> None: + def create_collection( + self, collection_name: Optional[str] = None, dimension: Optional[Union[str, int]] = 384 + ) -> None: """ Create a new collection. Args: collection_name (Optional[str]): The name of the new collection. + dimension (Optional[Union[str, int]]): The dimension size of the sentence embedding model Returns: None @@ -541,7 +544,7 @@ def create_collection(self, collection_name: Optional[str] = None) -> None: cursor = self.client.cursor() cursor.execute( f"CREATE TABLE {self.name} (" - f"documents text, id CHAR(8) PRIMARY KEY, metadatas JSONB, embedding vector(384));" + f"documents text, id CHAR(8) PRIMARY KEY, metadatas JSONB, embedding vector({dimension}));" f"CREATE INDEX " f'ON {self.name} USING hnsw (embedding vector_l2_ops) WITH (m = {self.metadata["hnsw:M"]}, ' f'ef_construction = {self.metadata["hnsw:construction_ef"]});' @@ -618,6 +621,14 @@ def __init__( self.embedding_function = ( SentenceTransformer(self.model_name) if embedding_function is None else embedding_function ) + # This will get the model dimension size by computing the embeddings dimensions + sentences = [ + "The weather is lovely today in paradise.", + "It's so sunny outside in the garden!", + "He reached his max potential within his lifetime.", + ] + embeddings = self.embedding_function.encode(sentences) + self.dimension = embeddings.shape[1] except Exception as e: logger.error( f"Validate the model name entered: {self.model_name} " @@ -741,7 +752,7 @@ def create_collection( model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) - collection.create_collection(collection_name=collection_name) + collection.create_collection(collection_name=collection_name, dimension=self.dimension) return collection elif overwrite: self.delete_collection(collection_name) @@ -754,7 +765,7 @@ def create_collection( model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) - collection.create_collection(collection_name=collection_name) + collection.create_collection(collection_name=collection_name, dimension=self.dimension) return collection elif get_or_create: return collection @@ -768,7 +779,7 @@ def create_collection( model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) - collection.create_collection(collection_name=collection_name) + collection.create_collection(collection_name=collection_name, dimension=self.dimension) return collection else: raise ValueError(f"Collection {collection_name} already exists.") From eb89774be7f702b6a4df42d6f9824bf6a698ba29 Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Tue, 4 Jun 2024 16:03:09 -0500 Subject: [PATCH 02/14] Added example docstring. 
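For context: the dimension inference that the new docstring example relies on
(added in the previous patch) can be reproduced standalone. A minimal sketch,
assuming the sentence-transformers package is installed; "all-MiniLM-L6-v2" is
only an example model, and 384 is the width that particular model produces:

```python
from sentence_transformers import SentenceTransformer

# Encode a probe sentence and read the embedding width off the result array.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(["The weather is lovely today in paradise."])
dimension = embeddings.shape[1]  # 384 for all-MiniLM-L6-v2

# The collection's pgvector column is then declared with a matching size,
# e.g. "embedding vector(384)", instead of a hard-coded default.
print(dimension)
```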
--- .../agentchat/contrib/vectordb/pgvectordb.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index 9eb527755de3..860e31400124 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -605,6 +605,37 @@ def __init__( Returns: None + + Example: + RetrieveUserProxyAgent( + name="ragproxyagent", + human_input_mode="NEVER", + max_consecutive_auto_reply=3, + retrieve_config={ + "task": "code", + "docs_path": [ + "https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md", + "https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md", + "https://raw.githubusercontent.com/Knuckles-Team/geniusbot/main/README.md", + "https://raw.githubusercontent.com/Knuckles-Team/repository-manager/main/README.md", + "https://raw.githubusercontent.com/Knuckles-Team/gitlab-api/main/README.md", + "https://raw.githubusercontent.com/Knuckles-Team/media-downloader/main/README.md", + os.path.join(os.path.abspath(""), "..", "website", "docs"), + ], + "custom_text_types": ["non-existent-type"], + "chunk_token_size": 2000, + "model": config_list[0]["model"], + "vector_db": "pgvector", # PGVector database + "collection_name": "test_collection", + "db_config": { + "connection_string": "postgresql://postgres:postgres@localhost:5432/postgres", + }, + "embedding_function": "all-distilroberta-v1", + "get_or_create": True, # set to False if you don't want to reuse an existing collection + "overwrite": False, # set to True if you want to overwrite an existing collection + }, + code_execution_config=False, # set to False if you don't want to execute the code + ) """ self.client = self.establish_connection( conn=conn, From 172bfbc5f30e4ff3fd7305029957444dc5aafae4 Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Tue, 4 Jun 2024 17:20:17 -0500 Subject: [PATCH 03/14] Validated working notebook with sentence models of different dimensions. --- .../agentchat/contrib/vectordb/pgvectordb.py | 22 +- .../agentchat_pgvector_RetrieveChat.ipynb | 363 +++--------------- 2 files changed, 78 insertions(+), 307 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index 860e31400124..7f42c60ea33e 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -80,6 +80,14 @@ def __init__( self.metadata = metadata if metadata else {"hnsw:space": "ip", "hnsw:construction_ef": 32, "hnsw:M": 16} self.documents = "" self.get_or_create = get_or_create + # This will get the model dimension size by computing the embeddings dimensions + sentences = [ + "The weather is lovely today in paradise.", + "It's so sunny outside in the garden!", + "He reached his max potential within his lifetime.", + ] + embeddings = self.embedding_function.encode(sentences) + self.dimension = embeddings.shape[1] def set_collection_name(self, collection_name) -> str: name = re.sub("-", "_", collection_name) @@ -304,7 +312,7 @@ def get( ) except (psycopg.errors.UndefinedTable, psycopg.errors.UndefinedColumn) as e: logger.info(f"Error executing select on non-existent table: {self.name}. Creating it instead. 
Error: {e}") - self.create_collection(collection_name=self.name) + self.create_collection(collection_name=self.name, dimension=self.dimension) logger.info(f"Created table {self.name}") cursor.close() @@ -541,10 +549,14 @@ def create_collection( """ if collection_name: self.name = collection_name + + if dimension: + self.dimension = dimension + cursor = self.client.cursor() cursor.execute( f"CREATE TABLE {self.name} (" - f"documents text, id CHAR(8) PRIMARY KEY, metadatas JSONB, embedding vector({dimension}));" + f"documents text, id CHAR(8) PRIMARY KEY, metadatas JSONB, embedding vector({self.dimension}));" f"CREATE INDEX " f'ON {self.name} USING hnsw (embedding vector_l2_ops) WITH (m = {self.metadata["hnsw:M"]}, ' f'ef_construction = {self.metadata["hnsw:construction_ef"]});' @@ -783,7 +795,7 @@ def create_collection( model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) - collection.create_collection(collection_name=collection_name, dimension=self.dimension) + collection.create_collection(collection_name=collection_name) return collection elif overwrite: self.delete_collection(collection_name) @@ -796,7 +808,7 @@ def create_collection( model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) - collection.create_collection(collection_name=collection_name, dimension=self.dimension) + collection.create_collection(collection_name=collection_name) return collection elif get_or_create: return collection @@ -810,7 +822,7 @@ def create_collection( model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) - collection.create_collection(collection_name=collection_name, dimension=self.dimension) + collection.create_collection(collection_name=collection_name) return collection else: raise ValueError(f"Collection {collection_name} already exists.") diff --git a/notebook/agentchat_pgvector_RetrieveChat.ipynb b/notebook/agentchat_pgvector_RetrieveChat.ipynb index 9b037b7c468d..f1862d09a857 100644 --- a/notebook/agentchat_pgvector_RetrieveChat.ipynb +++ b/notebook/agentchat_pgvector_RetrieveChat.ipynb @@ -72,14 +72,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "models to use: ['gpt-35-turbo', 'gpt4-1106-preview', 'gpt-35-turbo-0613']\n" + "models to use: ['Meta-Llama-3-8B-Instruct-imatrix', 'gpt-3.5-turbo-0125', 'gpt-35-turbo']\n" ] } ], @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -145,7 +145,7 @@ "output_type": "stream", "text": [ "Accepted file formats for `docs_path`:\n", - "['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']\n" + "['doc', 'pptx', 'org', 'log', 'rst', 'html', 'rtf', 'epub', 'xlsx', 'txt', 'htm', 'odt', 'json', 'md', 'tsv', 'ppt', 'jsonl', 'csv', 'yaml', 'msg', 'docx', 'yml', 'xml', 'pdf']\n" ] } ], @@ -156,17 +156,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 27, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/workspace/anaconda3/envs/autogen/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "/workspace/anaconda3/envs/autogen/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", - " warnings.warn(\n" + "[autogen.oai.client: 06-04 17:15:23] {129} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" ] } ], @@ -185,6 +182,9 @@ "# Optionally create psycopg conn object\n", "# conn = psycopg.connect(conninfo=\"postgresql://postgres:postgres@localhost:5432/postgres\", autocommit=True)\n", "\n", + "# Optionally create embedding function object\n", + "# sentence_transformer_ef = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", + "\n", "# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n", "# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n", "# `docs_path` is the path to the docs directory. It can also be the path to a single file, or the url to a single file. By default,\n", @@ -218,11 +218,12 @@ " # \"dbname\": \"postgres\", # Optional vector database name\n", " # \"username\": \"postgres\", # Optional vector database username\n", " # \"password\": \"postgres\", # Optional vector database password\n", - " \"model_name\": \"all-MiniLM-L6-v2\", # Sentence embedding model from https://huggingface.co/models?library=sentence-transformers or https://www.sbert.net/docs/pretrained_models.html\n", + " \"model_name\": \"all-distilroberta-v1\", # Sentence embedding model from https://huggingface.co/models?library=sentence-transformers or https://www.sbert.net/docs/pretrained_models.html\n", " # \"conn\": conn, # Optional - conn object to connect to database\n", " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": False, # set to True if you want to overwrite an existing collection\n", + " # \"embedding_function\": sentence_transformer_ef,\n", " },\n", " code_execution_config=False, # set to False if you don't want to execute the code\n", ")" @@ -244,40 +245,35 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2024-05-23 08:48:18,875 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `flaml_collection`.\u001b[0m\n" + "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Trying to create collection.\n" + "VectorDB returns doc_ids: [['bdfbc921', '7968cf3c']]\n", + "\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2024-05-23 08:48:19,975 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", - "2024-05-23 08:48:19,977 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Error executing select on non-existent table: flaml_collection. Creating it instead. 
Error: relation \"flaml_collection\" does not exist\n", - "LINE 1: SELECT id, metadatas, documents, embedding FROM flaml_collec...\n", - " ^\u001b[0m\n", - "2024-05-23 08:48:19,996 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Created table flaml_collection\u001b[0m\n" + "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "VectorDB returns doc_ids: [['bdfbc921', '7968cf3c']]\n", - "\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n", "\u001b[32mAdding content of doc 7968cf3c to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", @@ -540,271 +536,23 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[32mAdding content of doc 7968cf3c to context.\u001b[0m\n", - "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", - "\n", - "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", - "context provided by the user.\n", - "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", - "For code generation, you must obey the following rules:\n", - "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", - "Rule 2. You must follow the formats below to write your code:\n", - "```language\n", - "# your code\n", - "```\n", - "\n", - "User's question is: How can I use FLAML to perform a classification task and use spark to do parallel training. Train for 30 seconds and force cancel jobs if time limit is reached.\n", - "\n", - "Context is: # Integrate - Spark\n", - "\n", - "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", - "\n", - "- Use Spark ML estimators for AutoML.\n", - "- Use Spark to run training in parallel spark jobs.\n", - "\n", - "## Spark ML Estimators\n", - "\n", - "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", - "\n", - "### Data\n", - "\n", - "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", - "\n", - "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", - "\n", - "This function also accepts optional arguments `index_col` and `default_index_type`.\n", - "\n", - "- `index_col` is the column name to use as the index, default is None.\n", - "- `default_index_type` is the default index type, default is \"distributed-sequence\". 
More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", - "\n", - "Here is an example code snippet for Spark Data:\n", - "\n", - "```python\n", - "import pandas as pd\n", - "from flaml.automl.spark.utils import to_pandas_on_spark\n", - "\n", - "# Creating a dictionary\n", - "data = {\n", - " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", - " \"Age_Years\": [20, 15, 10, 7, 25],\n", - " \"Price\": [100000, 200000, 300000, 240000, 120000],\n", - "}\n", - "\n", - "# Creating a pandas DataFrame\n", - "dataframe = pd.DataFrame(data)\n", - "label = \"Price\"\n", - "\n", - "# Convert to pandas-on-spark dataframe\n", - "psdf = to_pandas_on_spark(dataframe)\n", - "```\n", - "\n", - "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", - "\n", - "Here is an example of how to use it:\n", - "\n", - "```python\n", - "from pyspark.ml.feature import VectorAssembler\n", - "\n", - "columns = psdf.columns\n", - "feature_cols = [col for col in columns if col != label]\n", - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", - "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", - "```\n", - "\n", - "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", - "\n", - "### Estimators\n", - "\n", - "#### Model List\n", - "\n", - "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", - "\n", - "#### Usage\n", - "\n", - "First, prepare your data in the required format as described in the previous section.\n", - "\n", - "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", - "\n", - "Here is an example code snippet using SparkML models in AutoML:\n", - "\n", - "```python\n", - "import flaml\n", - "\n", - "# prepare your data in pandas-on-spark format as we previously mentioned\n", - "\n", - "automl = flaml.AutoML()\n", - "settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", - " \"task\": \"regression\",\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=psdf,\n", - " label=label,\n", - " **settings,\n", - ")\n", - "```\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", - "\n", - "## Parallel Spark Jobs\n", - "\n", - "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. 
FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", - "\n", - "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", - "\n", - "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", - "\n", - "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", - "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", - "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", - "\n", - "An example code snippet for using parallel Spark jobs:\n", - "\n", - "```python\n", - "import flaml\n", - "\n", - "automl_experiment = flaml.AutoML()\n", - "automl_settings = {\n", - " \"time_budget\": 30,\n", - " \"metric\": \"r2\",\n", - " \"task\": \"regression\",\n", - " \"n_concurrent_trials\": 2,\n", - " \"use_spark\": True,\n", - " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", - "}\n", - "\n", - "automl.fit(\n", - " dataframe=dataframe,\n", - " label=label,\n", - " **automl_settings,\n", - ")\n", - "```\n", - "\n", - "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", - "# Research\n", - "\n", - "For technical details, please check our research publications.\n", - "\n", - "- [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021flaml,\n", - " title={FLAML: A Fast and Lightweight AutoML Library},\n", - " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", - " year={2021},\n", - " booktitle={MLSys},\n", - "}\n", - "```\n", - "\n", - "- [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. 
AAAI 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021cfo,\n", - " title={Frugal Optimization for Cost-related Hyperparameters},\n", - " author={Qingyun Wu and Chi Wang and Silu Huang},\n", - " year={2021},\n", - " booktitle={AAAI},\n", - "}\n", - "```\n", - "\n", - "- [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021blendsearch,\n", - " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", - " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", - " year={2021},\n", - " booktitle={ICLR},\n", - "}\n", - "```\n", - "\n", - "- [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{liuwang2021hpolm,\n", - " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", - " author={Susan Xueqing Liu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ACL},\n", - "}\n", - "```\n", - "\n", - "- [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021chacha,\n", - " title={ChaCha for Online AutoML},\n", - " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", - " year={2021},\n", - " booktitle={ICML},\n", - "}\n", - "```\n", - "\n", - "- [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", - "\n", - "```bibtex\n", - "@inproceedings{wuwang2021fairautoml,\n", - " title={Fair AutoML},\n", - " author={Qingyun Wu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ArXiv preprint arXiv:2111.06495},\n", - "}\n", - "```\n", - "\n", - "- [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", - "\n", - "```bibtex\n", - "@inproceedings{kayaliwang2022default,\n", - " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", - " author={Moe Kayali and Chi Wang},\n", - " year={2022},\n", - " booktitle={ArXiv preprint arXiv:2202.09927},\n", - "}\n", - "```\n", - "\n", - "- [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", - "\n", - "```bibtex\n", - "@inproceedings{zhang2023targeted,\n", - " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", - " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", - " booktitle={International Conference on Learning Representations},\n", - " year={2023},\n", - " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", - "}\n", - "```\n", - "\n", - "- [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. 
ArXiv preprint arXiv:2303.04673 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2023EcoOptiGen,\n", - " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", - " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2303.04673},\n", - "}\n", - "```\n", - "\n", - "- [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2023empirical,\n", - " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", - " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2306.01337},\n", - "}\n", - "```\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", + "It seems like you're sharing research papers related to the FLAML (Fast and Lightweight AutoML Library) project, which is a machine learning library developed by Microsoft Research.\n", "\n", + "The papers cover various topics such as:\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "1. **Frugal Optimization for Cost-related Hyperparameters**: This paper proposes a frugal optimization method for hyperparameter tuning in machine learning models.\n", + "2. **Economical Hyperparameter Optimization With Blended Search Strategy**: This paper presents a blended search strategy for economical hyperparameter optimization, which combines multiple optimization methods to achieve better results.\n", + "3. **An Empirical Study on Hyperparameter Optimization for Fine- Tuning Pre-trained Language Models**: This paper investigates the effectiveness of different hyperparameter optimization methods for fine-tuning pre-trained language models.\n", + "4. **ChaCha for Online AutoML**: This paper introduces ChaCha, a online autoML framework that can adapt to changing environments and optimize model performance in real-time.\n", + "5. **Fair AutoML**: This paper proposes a fair AutoML method that aims to reduce bias and improve fairness in machine learning models.\n", + "6. **Mining Robust Default Configurations for Resource-constrained AutoML**: This paper presents a method for mining robust default configurations for resource-constrained autoML, which can be useful for deploying AI models in real-world scenarios.\n", + "7. **Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives**: This paper proposes a targeted hyperparameter optimization method that can handle multiple objectives and lexicographic preferences.\n", + "8. **Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference**: This paper investigates cost-effective hyperparameter optimization methods for large language model generation inference, which is an important problem in natural language processing.\n", + "9. 
**An Empirical Study on Challenging Math Problem Solving with GPT-4**: This paper presents an empirical study on challenging math problem solving using GPT-4, a powerful AI model.\n", "\n", - "To use FLAML to perform a classification task and use Spark to do parallel training, you need to use the Spark ML estimators for AutoML. First, you need to prepare your data in the required format as described in the previous section. FLAML provides a convenient function \"to_pandas_on_spark\" to convert your data into a pandas-on-spark dataframe/series, which Spark estimators require. After that, use the pandas-on-spark data like non-spark data and pass them using X_train, y_train or dataframe, label. Finally, configure FLAML to use Spark as the parallel backend during parallel tuning by setting the use_spark to true. An example code snippet is provided in the context above.\n", + "These papers demonstrate the capabilities of FLAML and its potential applications in various fields, including natural language processing, computer vision, and reinforcement learning.\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -840,15 +588,35 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 29, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "VectorDB returns doc_ids: [['7968cf3c', 'bdfbc921']]\n", - "\u001b[32mAdding content of doc 7968cf3c to context.\u001b[0m\n", + "\u001b[32mAdding content of doc 7968cf3c to context.\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", @@ -1110,19 +878,17 @@ "\n", "\n", "\n", - "--------------------------------------------------------------------------------\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "--------------------------------------------------------------------------------\n", "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", "\n", - "The authors of FLAML are Chi Wang, Qingyun Wu, Markus Weimer, and Erkang Zhu.\n", + "This is a comprehensive guide to using FLAML (Fast and Lightweight AutoML Library) with Spark data. It covers the following topics:\n", "\n", - "--------------------------------------------------------------------------------\n", - "The authors of FLAML are Chi Wang, Qingyun Wu, Markus Weimer, and Erkang Zhu.\n", + "1. **Pandas-on-Spark**: How to prepare your data in a format that can be used by FLAML, specifically converting it to a pandas DataFrame on Spark.\n", + "2. **Spark ML models**: How to use Spark-based machine learning models, such as LightGBM, with FLAML.\n", + "3. **Estimators**: How to specify the models you want to try in AutoML using the `estimator_list` argument.\n", + "4. **Parallel Spark jobs**: How to activate Spark as a parallel backend for Hyperparameter Tuning and AutoML, using `use_spark`, `n_concurrent_trials`, and `force_cancel` arguments.\n", + "\n", + "The guide provides code snippets and references to official documentation for each topic, making it easy to follow along and implement FLAML with Spark data. 
It also includes links to notebooks that demonstrate the usage of FLAML with Spark data in Colab.\n", "\n", "--------------------------------------------------------------------------------\n" ] @@ -1135,13 +901,6 @@ "qa_problem = \"Who is the author of FLAML?\"\n", "chat_result = ragproxyagent.initiate_chat(assistant, message=ragproxyagent.message_generator, problem=qa_problem)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1166,7 +925,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.11" }, "skip_test": "Requires interactive usage" }, From 6d6300b816dcac94069dfeeddf89343c44981ed7 Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Thu, 6 Jun 2024 02:06:03 -0500 Subject: [PATCH 04/14] Validated removal of model_name working. --- .../agentchat/contrib/vectordb/pgvectordb.py | 74 ++++++------------- .../agentchat_pgvector_RetrieveChat.ipynb | 33 ++++++--- 2 files changed, 47 insertions(+), 60 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index 7f42c60ea33e..11810f8f15c3 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -32,10 +32,11 @@ class Collection: client: The PGVector client. collection_name (str): The name of the collection. Default is "documents". embedding_function (Callable): The embedding function used to generate the vector representation. + Default is None. SentenceTransformer("all-MiniLM-L6-v2") will be used when None. + Models can be chosen from: + https://huggingface.co/models?library=sentence-transformers metadata (Optional[dict]): The metadata of the collection. get_or_create (Optional): The flag indicating whether to get or create the collection. - model_name: (Optional str) | Sentence embedding model to use. Models can be chosen from: - https://huggingface.co/models?library=sentence-transformers """ def __init__( @@ -45,7 +46,6 @@ def __init__( embedding_function: Callable = None, metadata=None, get_or_create=None, - model_name="all-MiniLM-L6-v2", ): """ Initialize the Collection object. @@ -56,38 +56,33 @@ def __init__( embedding_function: The embedding function used to generate the vector representation. metadata: The metadata of the collection. get_or_create: The flag indicating whether to get or create the collection. - model_name: | Sentence embedding model to use. 
Models can be chosen from: - https://huggingface.co/models?library=sentence-transformers Returns: None """ self.client = client - self.embedding_function = embedding_function - self.model_name = model_name self.name = self.set_collection_name(collection_name) self.require_embeddings_or_documents = False self.ids = [] - try: - self.embedding_function = ( - SentenceTransformer(self.model_name) if embedding_function is None else embedding_function - ) - except Exception as e: - logger.error( - f"Validate the model name entered: {self.model_name} " - f"from https://huggingface.co/models?library=sentence-transformers\nError: {e}" - ) - raise e + if embedding_function: + self.embedding_function = embedding_function + else: + self.embedding_function = SentenceTransformer("all-MiniLM-L6-v2") self.metadata = metadata if metadata else {"hnsw:space": "ip", "hnsw:construction_ef": 32, "hnsw:M": 16} self.documents = "" self.get_or_create = get_or_create # This will get the model dimension size by computing the embeddings dimensions sentences = [ "The weather is lovely today in paradise.", - "It's so sunny outside in the garden!", - "He reached his max potential within his lifetime.", ] embeddings = self.embedding_function.encode(sentences) - self.dimension = embeddings.shape[1] + if hasattr(embeddings, "shape"): + self.dimension = embeddings.shape[1] + else: + logger.error( + "Dimension of embeddings was not calculated successfully. " + "Was the 'shape' attribute missing in the embedding function?" + ) + self.dimension = 384 def set_collection_name(self, collection_name) -> str: name = re.sub("-", "_", collection_name) @@ -588,7 +583,6 @@ def __init__( connect_timeout: Optional[int] = 10, embedding_function: Callable = None, metadata: Optional[dict] = None, - model_name: Optional[str] = "all-MiniLM-L6-v2", ) -> None: """ Initialize the vector database. @@ -606,15 +600,14 @@ def __init__( username: str | The database username to use. Default is None. password: str | The database user password to use. Default is None. connect_timeout: int | The timeout to set for the connection. Default is 10. - embedding_function: Callable | The embedding function used to generate the vector representation - of the documents. Default is None. + embedding_function (Callable): The embedding function used to generate the vector representation. + Default is None. SentenceTransformer("all-MiniLM-L6-v2") will be used when None. + Models can be chosen from: + https://huggingface.co/models?library=sentence-transformers metadata: dict | The metadata of the vector database. Default is None. If None, it will use this setting: {"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 16}. Creates Index on table using hnsw (embedding vector_l2_ops) WITH (m = hnsw:M) ef_construction = "hnsw:construction_ef". For more info: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw - model_name: str | Sentence embedding model to use. 
Models can be chosen from: - https://huggingface.co/models?library=sentence-transformers - Returns: None @@ -642,7 +635,7 @@ def __init__( "db_config": { "connection_string": "postgresql://postgres:postgres@localhost:5432/postgres", }, - "embedding_function": "all-distilroberta-v1", + "embedding_function": SentenceTransformer("all-distilroberta-v1"), "get_or_create": True, # set to False if you don't want to reuse an existing collection "overwrite": False, # set to True if you want to overwrite an existing collection }, @@ -659,25 +652,10 @@ def __init__( password=password, connect_timeout=connect_timeout, ) - self.model_name = model_name - try: - self.embedding_function = ( - SentenceTransformer(self.model_name) if embedding_function is None else embedding_function - ) - # This will get the model dimension size by computing the embeddings dimensions - sentences = [ - "The weather is lovely today in paradise.", - "It's so sunny outside in the garden!", - "He reached his max potential within his lifetime.", - ] - embeddings = self.embedding_function.encode(sentences) - self.dimension = embeddings.shape[1] - except Exception as e: - logger.error( - f"Validate the model name entered: {self.model_name} " - f"from https://huggingface.co/models?library=sentence-transformers\nError: {e}" - ) - raise e + if embedding_function: + self.embedding_function = embedding_function + else: + self.embedding_function = SentenceTransformer("all-MiniLM-L6-v2") self.metadata = metadata register_vector(self.client) self.active_collection = None @@ -792,7 +770,6 @@ def create_collection( embedding_function=self.embedding_function, get_or_create=get_or_create, metadata=self.metadata, - model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) collection.create_collection(collection_name=collection_name) @@ -805,7 +782,6 @@ def create_collection( embedding_function=self.embedding_function, get_or_create=get_or_create, metadata=self.metadata, - model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) collection.create_collection(collection_name=collection_name) @@ -819,7 +795,6 @@ def create_collection( embedding_function=self.embedding_function, get_or_create=get_or_create, metadata=self.metadata, - model_name=self.model_name, ) collection.set_collection_name(collection_name=collection_name) collection.create_collection(collection_name=collection_name) @@ -851,7 +826,6 @@ def get_collection(self, collection_name: str = None) -> Collection: client=self.client, collection_name=collection_name, embedding_function=self.embedding_function, - model_name=self.model_name, ) return self.active_collection diff --git a/notebook/agentchat_pgvector_RetrieveChat.ipynb b/notebook/agentchat_pgvector_RetrieveChat.ipynb index f1862d09a857..631deeb9b273 100644 --- a/notebook/agentchat_pgvector_RetrieveChat.ipynb +++ b/notebook/agentchat_pgvector_RetrieveChat.ipynb @@ -72,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -89,6 +89,7 @@ "\n", "import chromadb\n", "import psycopg\n", + "from sentence_transformers import SentenceTransformer\n", "\n", "import autogen\n", "from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent\n", @@ -137,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -145,7 +146,7 @@ "output_type": "stream", "text": [ "Accepted file formats for `docs_path`:\n", - "['doc', 
'pptx', 'org', 'log', 'rst', 'html', 'rtf', 'epub', 'xlsx', 'txt', 'htm', 'odt', 'json', 'md', 'tsv', 'ppt', 'jsonl', 'csv', 'yaml', 'msg', 'docx', 'yml', 'xml', 'pdf']\n" + "['yml', 'doc', 'xlsx', 'csv', 'epub', 'htm', 'msg', 'json', 'rst', 'log', 'yaml', 'txt', 'tsv', 'docx', 'jsonl', 'ppt', 'org', 'md', 'xml', 'pptx', 'pdf', 'html', 'rtf', 'odt']\n" ] } ], @@ -156,14 +157,14 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[autogen.oai.client: 06-04 17:15:23] {129} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" + "[autogen.oai.client: 06-06 02:04:46] {129} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" ] } ], @@ -183,7 +184,7 @@ "# conn = psycopg.connect(conninfo=\"postgresql://postgres:postgres@localhost:5432/postgres\", autocommit=True)\n", "\n", "# Optionally create embedding function object\n", - "# sentence_transformer_ef = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", + "sentence_transformer_ef = SentenceTransformer(\"all-distilroberta-v1\")\n", "\n", "# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n", "# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. We set it to \"NEVER\" here.\n", @@ -218,12 +219,11 @@ " # \"dbname\": \"postgres\", # Optional vector database name\n", " # \"username\": \"postgres\", # Optional vector database username\n", " # \"password\": \"postgres\", # Optional vector database password\n", - " \"model_name\": \"all-distilroberta-v1\", # Sentence embedding model from https://huggingface.co/models?library=sentence-transformers or https://www.sbert.net/docs/pretrained_models.html\n", " # \"conn\": conn, # Optional - conn object to connect to database\n", " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": False, # set to True if you want to overwrite an existing collection\n", - " # \"embedding_function\": sentence_transformer_ef,\n", + " \"embedding_function\": sentence_transformer_ef, # If left out SentenceTransformer(\"all-MiniLM-L6-v2\") will be used\n", " },\n", " code_execution_config=False, # set to False if you don't want to execute the code\n", ")" @@ -245,13 +245,26 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 4, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trying to create collection.\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ + "2024-06-06 02:04:56,914 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `flaml_collection`.\u001b[0m\n", + "2024-06-06 02:04:58,013 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "2024-06-06 02:04:58,017 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Error executing select on non-existent table: flaml_collection. Creating it instead. Error: relation \"flaml_collection\" does not exist\n", + "LINE 1: SELECT id, metadatas, documents, embedding FROM flaml_collec...\n", + " ^\u001b[0m\n", + "2024-06-06 02:04:58,036 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Created table flaml_collection\u001b[0m\n", "Model Meta-Llama-3-8B-Instruct-imatrix not found. 
Using cl100k_base encoding.\n" ] }, @@ -588,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 5, "metadata": {}, "outputs": [ { From 8cb8ffb2e342314d002c7e76132a8123d813ab9e Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Thu, 6 Jun 2024 02:13:37 -0500 Subject: [PATCH 05/14] Second example uses conn object. --- .../agentchat_pgvector_RetrieveChat.ipynb | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/notebook/agentchat_pgvector_RetrieveChat.ipynb b/notebook/agentchat_pgvector_RetrieveChat.ipynb index 631deeb9b273..369caf063f75 100644 --- a/notebook/agentchat_pgvector_RetrieveChat.ipynb +++ b/notebook/agentchat_pgvector_RetrieveChat.ipynb @@ -601,13 +601,21 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trying to create collection.\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ + "2024-06-06 02:13:07,077 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" ] }, @@ -911,6 +919,41 @@ "# reset the assistant. Always reset the assistant before starting a new conversation.\n", "assistant.reset()\n", "\n", + "# Optionally create psycopg conn object\n", + "conn = psycopg.connect(conninfo=\"postgresql://postgres:postgres@localhost:5432/postgres\", autocommit=True)\n", + "\n", + "ragproxyagent = RetrieveUserProxyAgent(\n", + " name=\"ragproxyagent\",\n", + " human_input_mode=\"NEVER\",\n", + " max_consecutive_auto_reply=1,\n", + " retrieve_config={\n", + " \"task\": \"code\",\n", + " \"docs_path\": [\n", + " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md\",\n", + " \"https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md\",\n", + " os.path.join(os.path.abspath(\"\"), \"..\", \"website\", \"docs\"),\n", + " ],\n", + " \"custom_text_types\": [\"non-existent-type\"],\n", + " \"chunk_token_size\": 2000,\n", + " \"model\": config_list[0][\"model\"],\n", + " \"vector_db\": \"pgvector\", # PGVector database\n", + " \"collection_name\": \"flaml_collection\",\n", + " \"db_config\": {\n", + " # \"connection_string\": \"postgresql://postgres:postgres@localhost:5432/postgres\", # Optional - connect to an external vector database\n", + " # \"host\": \"postgres\", # Optional vector database host\n", + " # \"port\": 5432, # Optional vector database port\n", + " # \"dbname\": \"postgres\", # Optional vector database name\n", + " # \"username\": \"postgres\", # Optional vector database username\n", + " # \"password\": \"postgres\", # Optional vector database password\n", + " \"conn\": conn, # Optional - conn object to connect to database\n", + " },\n", + " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", + " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", + " # \"embedding_function\": sentence_transformer_ef, # If left out SentenceTransformer(\"all-MiniLM-L6-v2\") will be used\n", + " },\n", + " code_execution_config=False, # set to False if you don't want to execute the code\n", + ")\n", + "\n", "qa_problem = \"Who is the author of FLAML?\"\n", "chat_result = ragproxyagent.initiate_chat(assistant, message=ragproxyagent.message_generator, problem=qa_problem)" ] From 2b91610cb4883e67ed5fb395ca3fbdf82009f5d3 Mon Sep 17 
00:00:00 2001 From: knucklessg1 Date: Fri, 7 Jun 2024 08:26:45 -0500 Subject: [PATCH 06/14] embedding_function no longer directly references .encode --- .../agentchat/contrib/vectordb/pgvectordb.py | 20 +++++++++---------- .../agentchat_pgvector_RetrieveChat.ipynb | 5 ++--- .../test_pgvector_retrievechat.py | 2 +- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index 11810f8f15c3..aefbe870f9e1 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -74,7 +74,7 @@ def __init__( sentences = [ "The weather is lovely today in paradise.", ] - embeddings = self.embedding_function.encode(sentences) + embeddings = self.embedding_function(sentences) if hasattr(embeddings, "shape"): self.dimension = embeddings.shape[1] else: @@ -118,14 +118,14 @@ def add(self, ids: List[ItemID], documents: List, embeddings: List = None, metad elif metadatas is not None: for doc_id, metadata, document in zip(ids, metadatas, documents): metadata = re.sub("'", '"', str(metadata)) - embedding = self.embedding_function.encode(document) + embedding = self.embedding_function(document) sql_values.append((doc_id, metadata, embedding, document)) sql_string = ( f"INSERT INTO {self.name} (id, metadatas, embedding, documents)\n" f"VALUES (%s, %s, %s, %s);\n" ) else: for doc_id, document in zip(ids, documents): - embedding = self.embedding_function.encode(document) + embedding = self.embedding_function(document) sql_values.append((doc_id, document, embedding)) sql_string = f"INSERT INTO {self.name} (id, documents, embedding)\n" f"VALUES (%s, %s, %s);\n" logger.debug(f"Add SQL String:\n{sql_string}\n{sql_values}") @@ -169,7 +169,7 @@ def upsert(self, ids: List[ItemID], documents: List, embeddings: List = None, me elif metadatas is not None: for doc_id, metadata, document in zip(ids, metadatas, documents): metadata = re.sub("'", '"', str(metadata)) - embedding = self.embedding_function.encode(document) + embedding = self.embedding_function(document) sql_values.append((doc_id, metadata, embedding, document, metadata, document, embedding)) sql_string = ( f"INSERT INTO {self.name} (id, metadatas, embedding, documents)\n" @@ -179,7 +179,7 @@ def upsert(self, ids: List[ItemID], documents: List, embeddings: List = None, me ) else: for doc_id, document in zip(ids, documents): - embedding = self.embedding_function.encode(document) + embedding = self.embedding_function(document) sql_values.append((doc_id, document, embedding, document)) sql_string = ( f"INSERT INTO {self.name} (id, documents, embedding)\n" @@ -422,7 +422,7 @@ def query( cursor = self.client.cursor() results = [] for query_text in query_texts: - vector = self.embedding_function.encode(query_text, convert_to_tensor=False).tolist() + vector = self.embedding_function(query_text, convert_to_tensor=False).tolist() if distance_type.lower() == "cosine": index_function = "<=>" elif distance_type.lower() == "euclidean": @@ -600,8 +600,8 @@ def __init__( username: str | The database username to use. Default is None. password: str | The database user password to use. Default is None. connect_timeout: int | The timeout to set for the connection. Default is 10. - embedding_function (Callable): The embedding function used to generate the vector representation. - Default is None. SentenceTransformer("all-MiniLM-L6-v2") will be used when None. 
+ embedding_function: Callable | The embedding function used to generate the vector representation. + Default is None. SentenceTransformer("all-MiniLM-L6-v2").encode will be used when None. Models can be chosen from: https://huggingface.co/models?library=sentence-transformers metadata: dict | The metadata of the vector database. Default is None. If None, it will use this @@ -635,7 +635,7 @@ def __init__( "db_config": { "connection_string": "postgresql://postgres:postgres@localhost:5432/postgres", }, - "embedding_function": SentenceTransformer("all-distilroberta-v1"), + "embedding_function": SentenceTransformer("all-distilroberta-v1").encode, "get_or_create": True, # set to False if you don't want to reuse an existing collection "overwrite": False, # set to True if you want to overwrite an existing collection }, @@ -655,7 +655,7 @@ def __init__( if embedding_function: self.embedding_function = embedding_function else: - self.embedding_function = SentenceTransformer("all-MiniLM-L6-v2") + self.embedding_function = SentenceTransformer("all-MiniLM-L6-v2").encode self.metadata = metadata register_vector(self.client) self.active_collection = None diff --git a/notebook/agentchat_pgvector_RetrieveChat.ipynb b/notebook/agentchat_pgvector_RetrieveChat.ipynb index 369caf063f75..5846c2c420c0 100644 --- a/notebook/agentchat_pgvector_RetrieveChat.ipynb +++ b/notebook/agentchat_pgvector_RetrieveChat.ipynb @@ -184,7 +184,7 @@ "# conn = psycopg.connect(conninfo=\"postgresql://postgres:postgres@localhost:5432/postgres\", autocommit=True)\n", "\n", "# Optionally create embedding function object\n", - "sentence_transformer_ef = SentenceTransformer(\"all-distilroberta-v1\")\n", + "sentence_transformer_ef = SentenceTransformer(\"all-distilroberta-v1\").encode\n", "\n", "# 2. create the RetrieveUserProxyAgent instance named \"ragproxyagent\"\n", "# By default, the human_input_mode is \"ALWAYS\", which means the agent will ask for human input at every step. 
We set it to \"NEVER\" here.\n", @@ -223,7 +223,7 @@ " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": False, # set to True if you want to overwrite an existing collection\n", - " \"embedding_function\": sentence_transformer_ef, # If left out SentenceTransformer(\"all-MiniLM-L6-v2\") will be used\n", + " \"embedding_function\": sentence_transformer_ef, # If left out SentenceTransformer(\"all-MiniLM-L6-v2\").encode will be used\n", " },\n", " code_execution_config=False, # set to False if you don't want to execute the code\n", ")" @@ -949,7 +949,6 @@ " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", - " # \"embedding_function\": sentence_transformer_ef, # If left out SentenceTransformer(\"all-MiniLM-L6-v2\") will be used\n", " },\n", " code_execution_config=False, # set to False if you don't want to execute the code\n", ")\n", diff --git a/test/agentchat/contrib/retrievechat/test_pgvector_retrievechat.py b/test/agentchat/contrib/retrievechat/test_pgvector_retrievechat.py index b104f25af767..ca24f952f76d 100644 --- a/test/agentchat/contrib/retrievechat/test_pgvector_retrievechat.py +++ b/test/agentchat/contrib/retrievechat/test_pgvector_retrievechat.py @@ -56,7 +56,7 @@ def test_retrievechat(): }, ) - sentence_transformer_ef = SentenceTransformer("all-MiniLM-L6-v2") + sentence_transformer_ef = SentenceTransformer("all-MiniLM-L6-v2").encode ragproxyagent = RetrieveUserProxyAgent( name="ragproxyagent", human_input_mode="NEVER", From d487acff60dfd0943bb26598d6d620f63177bcdf Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Fri, 7 Jun 2024 08:57:44 -0500 Subject: [PATCH 07/14] Fixed pre-commit issue. --- notebook/agentchat_pgvector_RetrieveChat.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebook/agentchat_pgvector_RetrieveChat.ipynb b/notebook/agentchat_pgvector_RetrieveChat.ipynb index 5846c2c420c0..86c7ad6bfd68 100644 --- a/notebook/agentchat_pgvector_RetrieveChat.ipynb +++ b/notebook/agentchat_pgvector_RetrieveChat.ipynb @@ -945,7 +945,7 @@ " # \"dbname\": \"postgres\", # Optional vector database name\n", " # \"username\": \"postgres\", # Optional vector database username\n", " # \"password\": \"postgres\", # Optional vector database password\n", - " \"conn\": conn, # Optional - conn object to connect to database\n", + " \"conn\": conn, # Optional - conn object to connect to database\n", " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", From c0427072c2f21192b9dee857944f75bbcb12921e Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Fri, 7 Jun 2024 09:42:56 -0500 Subject: [PATCH 08/14] Use try/except to raise error when shape is not found in embedding function. 
--- autogen/agentchat/contrib/vectordb/pgvectordb.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index aefbe870f9e1..ba503ed20bab 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -75,14 +75,15 @@ def __init__( "The weather is lovely today in paradise.", ] embeddings = self.embedding_function(sentences) - if hasattr(embeddings, "shape"): + try: self.dimension = embeddings.shape[1] - else: - logger.error( + except Exception as e: + self.dimension = 384 + raise Exception( "Dimension of embeddings was not calculated successfully. " - "Was the 'shape' attribute missing in the embedding function?" + "Was the 'shape' attribute missing in the embedding function?\n" + f"Error: {e}" ) - self.dimension = 384 def set_collection_name(self, collection_name) -> str: name = re.sub("-", "_", collection_name) From 72e2e420c7b3fb88bceb2a90c2d0ff28f50a1c0f Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Fri, 7 Jun 2024 09:43:06 -0500 Subject: [PATCH 09/14] Re-ran notebook. --- .../agentchat_pgvector_RetrieveChat.ipynb | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/notebook/agentchat_pgvector_RetrieveChat.ipynb b/notebook/agentchat_pgvector_RetrieveChat.ipynb index 86c7ad6bfd68..09b49c45e2fa 100644 --- a/notebook/agentchat_pgvector_RetrieveChat.ipynb +++ b/notebook/agentchat_pgvector_RetrieveChat.ipynb @@ -146,7 +146,7 @@ "output_type": "stream", "text": [ "Accepted file formats for `docs_path`:\n", - "['yml', 'doc', 'xlsx', 'csv', 'epub', 'htm', 'msg', 'json', 'rst', 'log', 'yaml', 'txt', 'tsv', 'docx', 'jsonl', 'ppt', 'org', 'md', 'xml', 'pptx', 'pdf', 'html', 'rtf', 'odt']\n" + "['epub', 'csv', 'tsv', 'docx', 'msg', 'rst', 'htm', 'yml', 'txt', 'pdf', 'html', 'md', 'org', 'rtf', 'xml', 'odt', 'pptx', 'log', 'xlsx', 'jsonl', 'json', 'ppt', 'yaml', 'doc']\n" ] } ], @@ -157,14 +157,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[autogen.oai.client: 06-06 02:04:46] {129} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" + "[autogen.oai.client: 06-07 09:40:29] {129} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n" ] } ], @@ -245,9 +245,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-07 09:40:33,394 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `flaml_collection`.\u001b[0m\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -259,12 +266,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-06-06 02:04:56,914 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `flaml_collection`.\u001b[0m\n", - "2024-06-06 02:04:58,013 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", - "2024-06-06 02:04:58,017 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Error executing select on non-existent table: flaml_collection. Creating it instead. 
Error: relation \"flaml_collection\" does not exist\n", + "2024-06-07 09:40:33,648 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "2024-06-07 09:40:33,652 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Error executing select on non-existent table: flaml_collection. Creating it instead. Error: relation \"flaml_collection\" does not exist\n", "LINE 1: SELECT id, metadatas, documents, embedding FROM flaml_collec...\n", " ^\u001b[0m\n", - "2024-06-06 02:04:58,036 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Created table flaml_collection\u001b[0m\n", + "2024-06-07 09:40:33,674 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Created table flaml_collection\u001b[0m\n", "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" ] }, @@ -601,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -615,7 +621,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-06-06 02:13:07,077 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "2024-06-07 09:40:49,415 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" ] }, From 97cf23e7ae27b0530e7861fd22f851e963b856dd Mon Sep 17 00:00:00 2001 From: Audel Rouhi Date: Mon, 10 Jun 2024 08:59:43 -0500 Subject: [PATCH 10/14] Update autogen/agentchat/contrib/vectordb/pgvectordb.py Co-authored-by: Li Jiang --- autogen/agentchat/contrib/vectordb/pgvectordb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index ba503ed20bab..8f12666da167 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -66,7 +66,7 @@ def __init__( if embedding_function: self.embedding_function = embedding_function else: - self.embedding_function = SentenceTransformer("all-MiniLM-L6-v2") + self.embedding_function = SentenceTransformer("all-MiniLM-L6-v2").encode self.metadata = metadata if metadata else {"hnsw:space": "ip", "hnsw:construction_ef": 32, "hnsw:M": 16} self.documents = "" self.get_or_create = get_or_create From 4c0fdc7ada790ff0800d25bdda73b7c49f43cffd Mon Sep 17 00:00:00 2001 From: Audel Rouhi Date: Mon, 10 Jun 2024 08:59:57 -0500 Subject: [PATCH 11/14] Update autogen/agentchat/contrib/vectordb/pgvectordb.py Co-authored-by: Li Jiang --- autogen/agentchat/contrib/vectordb/pgvectordb.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index 8f12666da167..fde0bc753897 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -75,15 +75,7 @@ def __init__( "The weather is lovely today in paradise.", ] embeddings = self.embedding_function(sentences) - try: - self.dimension = embeddings.shape[1] - except Exception as e: - self.dimension = 384 - raise Exception( - "Dimension of embeddings was not calculated successfully. 
" - "Was the 'shape' attribute missing in the embedding function?\n" - f"Error: {e}" - ) + self.dimension = len(embeddings[0]) def set_collection_name(self, collection_name) -> str: name = re.sub("-", "_", collection_name) From e53d5b9e8346db8fd7caad8cd0f6032ee26d35cf Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Mon, 10 Jun 2024 23:03:20 -0500 Subject: [PATCH 12/14] Added .encode --- autogen/agentchat/contrib/vectordb/pgvectordb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index fde0bc753897..42826b04329e 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -32,7 +32,7 @@ class Collection: client: The PGVector client. collection_name (str): The name of the collection. Default is "documents". embedding_function (Callable): The embedding function used to generate the vector representation. - Default is None. SentenceTransformer("all-MiniLM-L6-v2") will be used when None. + Default is None. SentenceTransformer("all-MiniLM-L6-v2").encode will be used when None. Models can be chosen from: https://huggingface.co/models?library=sentence-transformers metadata (Optional[dict]): The metadata of the collection. From 2d25617c204d231255583e48b73041b424ae693e Mon Sep 17 00:00:00 2001 From: knucklessg1 Date: Mon, 10 Jun 2024 23:04:02 -0500 Subject: [PATCH 13/14] Removed example comment. --- .../agentchat/contrib/vectordb/pgvectordb.py | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py index 42826b04329e..ea9dcec99562 100644 --- a/autogen/agentchat/contrib/vectordb/pgvectordb.py +++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py @@ -603,37 +603,6 @@ def __init__( For more info: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw Returns: None - - Example: - RetrieveUserProxyAgent( - name="ragproxyagent", - human_input_mode="NEVER", - max_consecutive_auto_reply=3, - retrieve_config={ - "task": "code", - "docs_path": [ - "https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Examples/Integrate%20-%20Spark.md", - "https://raw.githubusercontent.com/microsoft/FLAML/main/website/docs/Research.md", - "https://raw.githubusercontent.com/Knuckles-Team/geniusbot/main/README.md", - "https://raw.githubusercontent.com/Knuckles-Team/repository-manager/main/README.md", - "https://raw.githubusercontent.com/Knuckles-Team/gitlab-api/main/README.md", - "https://raw.githubusercontent.com/Knuckles-Team/media-downloader/main/README.md", - os.path.join(os.path.abspath(""), "..", "website", "docs"), - ], - "custom_text_types": ["non-existent-type"], - "chunk_token_size": 2000, - "model": config_list[0]["model"], - "vector_db": "pgvector", # PGVector database - "collection_name": "test_collection", - "db_config": { - "connection_string": "postgresql://postgres:postgres@localhost:5432/postgres", - }, - "embedding_function": SentenceTransformer("all-distilroberta-v1").encode, - "get_or_create": True, # set to False if you don't want to reuse an existing collection - "overwrite": False, # set to True if you want to overwrite an existing collection - }, - code_execution_config=False, # set to False if you don't want to execute the code - ) """ self.client = self.establish_connection( conn=conn, From f1862ed0925d512461b8b010e08ac5b2d4c18d0b Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 11 Jun 
2024 20:09:05 +0800
Subject: [PATCH 14/14] Fix overwrite not working with an existing collection
 when the custom embedding function's dimension differs from the default

Defaulting `dimension` to None lets create_collection keep the dimension
already inferred from the embedding function, falling back to 384 only when
no inferred dimension is available.

---
 .../agentchat/contrib/vectordb/pgvectordb.py  |    4 +-
 .../agentchat_pgvector_RetrieveChat.ipynb     | 1019 +++++++++++++----
 2 files changed, 785 insertions(+), 238 deletions(-)

diff --git a/autogen/agentchat/contrib/vectordb/pgvectordb.py b/autogen/agentchat/contrib/vectordb/pgvectordb.py
index ea9dcec99562..ac86802b6723 100644
--- a/autogen/agentchat/contrib/vectordb/pgvectordb.py
+++ b/autogen/agentchat/contrib/vectordb/pgvectordb.py
@@ -523,7 +523,7 @@ def delete_collection(self, collection_name: Optional[str] = None) -> None:
         cursor.close()

     def create_collection(
-        self, collection_name: Optional[str] = None, dimension: Optional[Union[str, int]] = 384
+        self, collection_name: Optional[str] = None, dimension: Optional[Union[str, int]] = None
     ) -> None:
         """
         Create a new collection.

         Args:
@@ -540,6 +540,8 @@ def create_collection(

         if dimension:
             self.dimension = dimension
+        elif self.dimension is None:
+            self.dimension = 384

         cursor = self.client.cursor()
         cursor.execute(
diff --git a/notebook/agentchat_pgvector_RetrieveChat.ipynb b/notebook/agentchat_pgvector_RetrieveChat.ipynb
index 09b49c45e2fa..1a8d70e29654 100644
--- a/notebook/agentchat_pgvector_RetrieveChat.ipynb
+++ b/notebook/agentchat_pgvector_RetrieveChat.ipynb
@@ -79,7 +79,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "models to use: ['Meta-Llama-3-8B-Instruct-imatrix', 'gpt-3.5-turbo-0125', 'gpt-35-turbo']\n"
+ "models to use: ['gpt4-1106-preview', 'gpt-4o', 'gpt-35-turbo', 'gpt-35-turbo-0613']\n"
 ]
 }
 ],
@@ -115,7 +115,10 @@
 " \"api_key\": \"...\",\n",
 " },\n",
 "]\n",
- "\n",
+ "config_list = autogen.config_list_from_json(\n",
+ " \"OAI_CONFIG_LIST\",\n",
+ " file_location=\".\",\n",
+ ")\n",
 "assert len(config_list) > 0\n",
 "print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])"
 ]
@@ -146,7 +149,7 @@
 "output_type": "stream",
 "text": [
 "Accepted file formats for `docs_path`:\n",
- "['epub', 'csv', 'tsv', 'docx', 'msg', 'rst', 'htm', 'yml', 'txt', 'pdf', 'html', 'md', 'org', 'rtf', 'xml', 'odt', 'pptx', 'log', 'xlsx', 'jsonl', 'json', 'ppt', 'yaml', 'doc']\n"
+ "['yaml', 'ppt', 'rst', 'jsonl', 'xml', 'txt', 'yml', 'log', 'rtf', 'msg', 'xlsx', 'htm', 'pdf', 'org', 'pptx', 'md', 'docx', 'epub', 'tsv', 'csv', 'html', 'doc', 'odt', 'json']\n"
 ]
 }
 ],
@@ -157,14 +160,15 @@
 {
 "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 3,
 "metadata": {},
 "outputs": [
 {
- "name": "stdout",
+ "name": "stderr",
 "output_type": "stream",
 "text": [
- "[autogen.oai.client: 06-07 09:40:29] {129} WARNING - The API key specified is not a valid OpenAI format; it won't work with the OpenAI-hosted model.\n"
+ "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated.
Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n" ] } ], @@ -222,7 +226,7 @@ " # \"conn\": conn, # Optional - conn object to connect to database\n", " },\n", " \"get_or_create\": True, # set to False if you don't want to reuse an existing collection\n", - " \"overwrite\": False, # set to True if you want to overwrite an existing collection\n", + " \"overwrite\": True, # set to True if you want to overwrite an existing collection\n", " \"embedding_function\": sentence_transformer_ef, # If left out SentenceTransformer(\"all-MiniLM-L6-v2\").encode will be used\n", " },\n", " code_execution_config=False, # set to False if you don't want to execute the code\n", @@ -245,16 +249,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-06-07 09:40:33,394 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `flaml_collection`.\u001b[0m\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -266,12 +263,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-06-07 09:40:33,648 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", - "2024-06-07 09:40:33,652 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Error executing select on non-existent table: flaml_collection. Creating it instead. Error: relation \"flaml_collection\" does not exist\n", - "LINE 1: SELECT id, metadatas, documents, embedding FROM flaml_collec...\n", - " ^\u001b[0m\n", - "2024-06-07 09:40:33,674 - autogen.agentchat.contrib.vectordb.pgvectordb - INFO - Created table flaml_collection\u001b[0m\n", - "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" + "2024-06-11 19:57:44,122 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "Model gpt4-1106-preview not found. Using cl100k_base encoding.\n" ] }, { @@ -286,7 +279,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" + "Model gpt4-1106-preview not found. Using cl100k_base encoding.\n" ] }, { @@ -555,96 +548,6 @@ "\n", "\n", "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", - "\n", - "It seems like you're sharing research papers related to the FLAML (Fast and Lightweight AutoML Library) project, which is a machine learning library developed by Microsoft Research.\n", - "\n", - "The papers cover various topics such as:\n", - "\n", - "1. **Frugal Optimization for Cost-related Hyperparameters**: This paper proposes a frugal optimization method for hyperparameter tuning in machine learning models.\n", - "2. **Economical Hyperparameter Optimization With Blended Search Strategy**: This paper presents a blended search strategy for economical hyperparameter optimization, which combines multiple optimization methods to achieve better results.\n", - "3. **An Empirical Study on Hyperparameter Optimization for Fine- Tuning Pre-trained Language Models**: This paper investigates the effectiveness of different hyperparameter optimization methods for fine-tuning pre-trained language models.\n", - "4. 
**ChaCha for Online AutoML**: This paper introduces ChaCha, a online autoML framework that can adapt to changing environments and optimize model performance in real-time.\n", - "5. **Fair AutoML**: This paper proposes a fair AutoML method that aims to reduce bias and improve fairness in machine learning models.\n", - "6. **Mining Robust Default Configurations for Resource-constrained AutoML**: This paper presents a method for mining robust default configurations for resource-constrained autoML, which can be useful for deploying AI models in real-world scenarios.\n", - "7. **Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives**: This paper proposes a targeted hyperparameter optimization method that can handle multiple objectives and lexicographic preferences.\n", - "8. **Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference**: This paper investigates cost-effective hyperparameter optimization methods for large language model generation inference, which is an important problem in natural language processing.\n", - "9. **An Empirical Study on Challenging Math Problem Solving with GPT-4**: This paper presents an empirical study on challenging math problem solving using GPT-4, a powerful AI model.\n", - "\n", - "These papers demonstrate the capabilities of FLAML and its potential applications in various fields, including natural language processing, computer vision, and reinforcement learning.\n", - "\n", - "--------------------------------------------------------------------------------\n" - ] - } - ], - "source": [ - "# reset the assistant. Always reset the assistant before starting a new conversation.\n", - "assistant.reset()\n", - "\n", - "# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.\n", - "# the assistant receives the message and generates a response. The response will be sent back to the ragproxyagent for processing.\n", - "# The conversation continues until the termination condition is met, in RetrieveChat, the termination condition when no human-in-loop is no code block detected.\n", - "# With human-in-loop, the conversation will continue until the user says \"exit\".\n", - "code_problem = \"How can I use FLAML to perform a classification task and use spark to do parallel training. Train for 30 seconds and force cancel jobs if time limit is reached.\"\n", - "chat_result = ragproxyagent.initiate_chat(\n", - " assistant, message=ragproxyagent.message_generator, problem=code_problem, search_string=\"spark\"\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Example 2\n", - "\n", - "[Back to top](#table-of-contents)\n", - "\n", - "Use RetrieveChat to answer a question that is not related to code generation.\n", - "\n", - "Problem: Who is the author of FLAML?" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trying to create collection.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-06-07 09:40:49,415 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", - "Model Meta-Llama-3-8B-Instruct-imatrix not found. 
Using cl100k_base encoding.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "VectorDB returns doc_ids: [['7968cf3c', 'bdfbc921']]\n", - "\u001b[32mAdding content of doc 7968cf3c to context.\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Model Meta-Llama-3-8B-Instruct-imatrix not found. Using cl100k_base encoding.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n", "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", "\n", "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", @@ -657,123 +560,9 @@ "# your code\n", "```\n", "\n", - "User's question is: Who is the author of FLAML?\n", - "\n", - "Context is: # Research\n", - "\n", - "For technical details, please check our research publications.\n", - "\n", - "- [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021flaml,\n", - " title={FLAML: A Fast and Lightweight AutoML Library},\n", - " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", - " year={2021},\n", - " booktitle={MLSys},\n", - "}\n", - "```\n", - "\n", - "- [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021cfo,\n", - " title={Frugal Optimization for Cost-related Hyperparameters},\n", - " author={Qingyun Wu and Chi Wang and Silu Huang},\n", - " year={2021},\n", - " booktitle={AAAI},\n", - "}\n", - "```\n", - "\n", - "- [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2021blendsearch,\n", - " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", - " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", - " year={2021},\n", - " booktitle={ICLR},\n", - "}\n", - "```\n", - "\n", - "- [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{liuwang2021hpolm,\n", - " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", - " author={Susan Xueqing Liu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ACL},\n", - "}\n", - "```\n", - "\n", - "- [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", - "\n", - "```bibtex\n", - "@inproceedings{wu2021chacha,\n", - " title={ChaCha for Online AutoML},\n", - " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", - " year={2021},\n", - " booktitle={ICML},\n", - "}\n", - "```\n", - "\n", - "- [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2111.06495 (2021).\n", - "\n", - "```bibtex\n", - "@inproceedings{wuwang2021fairautoml,\n", - " title={Fair AutoML},\n", - " author={Qingyun Wu and Chi Wang},\n", - " year={2021},\n", - " booktitle={ArXiv preprint arXiv:2111.06495},\n", - "}\n", - "```\n", - "\n", - "- [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", - "\n", - "```bibtex\n", - "@inproceedings{kayaliwang2022default,\n", - " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", - " author={Moe Kayali and Chi Wang},\n", - " year={2022},\n", - " booktitle={ArXiv preprint arXiv:2202.09927},\n", - "}\n", - "```\n", - "\n", - "- [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", - "\n", - "```bibtex\n", - "@inproceedings{zhang2023targeted,\n", - " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", - " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", - " booktitle={International Conference on Learning Representations},\n", - " year={2023},\n", - " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", - "}\n", - "```\n", - "\n", - "- [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", - "\n", - "```bibtex\n", - "@inproceedings{wang2023EcoOptiGen,\n", - " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", - " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2303.04673},\n", - "}\n", - "```\n", - "\n", - "- [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", + "User's question is: How can I use FLAML to perform a classification task and use spark to do parallel training. Train for 30 seconds and force cancel jobs if time limit is reached.\n", "\n", - "```bibtex\n", - "@inproceedings{wu2023empirical,\n", - " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", - " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", - " year={2023},\n", - " booktitle={ArXiv preprint arXiv:2306.01337},\n", - "}\n", - "```\n", - "# Integrate - Spark\n", + "Context is: # Integrate - Spark\n", "\n", "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", "\n", @@ -902,20 +691,776 @@ "```\n", "\n", "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", + "# Research\n", "\n", + "For technical details, please check our research publications.\n", "\n", + "- [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). 
Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "```bibtex\n", + "@inproceedings{wang2021flaml,\n", + " title={FLAML: A Fast and Lightweight AutoML Library},\n", + " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", + " year={2021},\n", + " booktitle={MLSys},\n", + "}\n", + "```\n", + "\n", + "- [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021cfo,\n", + " title={Frugal Optimization for Cost-related Hyperparameters},\n", + " author={Qingyun Wu and Chi Wang and Silu Huang},\n", + " year={2021},\n", + " booktitle={AAAI},\n", + "}\n", + "```\n", + "\n", + "- [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", "\n", - "This is a comprehensive guide to using FLAML (Fast and Lightweight AutoML Library) with Spark data. It covers the following topics:\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", + "```\n", "\n", - "1. **Pandas-on-Spark**: How to prepare your data in a format that can be used by FLAML, specifically converting it to a pandas DataFrame on Spark.\n", - "2. **Spark ML models**: How to use Spark-based machine learning models, such as LightGBM, with FLAML.\n", - "3. **Estimators**: How to specify the models you want to try in AutoML using the `estimator_list` argument.\n", - "4. **Parallel Spark jobs**: How to activate Spark as a parallel backend for Hyperparameter Tuning and AutoML, using `use_spark`, `n_concurrent_trials`, and `force_cancel` arguments.\n", + "- [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", + "}\n", + "```\n", + "\n", + "- [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2111.06495 (2021).\n", + "\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", + "}\n", + "```\n", + "\n", + "- [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", + "\n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", + "}\n", + "```\n", + "\n", + "- [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", + "\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", + "}\n", + "```\n", + "\n", + "- [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. 
ArXiv preprint arXiv:2306.01337 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", + "```\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "Based on the provided context which details the integration of Spark with FLAML for distributed training, and the requirement to perform a classification task with parallel training in Spark, here's a code snippet that configures FLAML to train a classification model for 30 seconds and cancels the jobs if the time limit is reached.\n", + "\n", + "```python\n", + "from flaml import AutoML\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "import pandas as pd\n", + "\n", + "# Your pandas DataFrame 'data' goes here\n", + "# Assuming 'data' is already a pandas DataFrame with appropriate data for classification\n", + "# and 'label_column' is the name of the column that we want to predict.\n", + "\n", + "# First, convert your pandas DataFrame to a pandas-on-spark DataFrame\n", + "psdf = to_pandas_on_spark(data)\n", + "\n", + "# Now, we prepare the settings for the AutoML training with Spark\n", + "automl_settings = {\n", + " \"time_budget\": 30, # Train for 30 seconds\n", + " \"metric\": \"accuracy\", # Assuming you want to use accuracy as the metric\n", + " \"task\": \"classification\",\n", + " \"n_concurrent_trials\": 2, # Adjust the number of concurrent trials depending on your cluster setup\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True, # Force cancel jobs if time limit is reached\n", + "}\n", + "\n", + "# Create an AutoML instance\n", + "automl = AutoML()\n", + "\n", + "# Run the AutoML search\n", + "# You need to replace 'psdf' with your actual pandas-on-spark DataFrame variable\n", + "# and 'label_column' with the name of your label column\n", + "automl.fit(dataframe=psdf, label=label_column, **automl_settings)\n", + "```\n", + "\n", + "This code snippet assumes that the `data` variable contains the pandas DataFrame you want to classify and that `label_column` is the name of the target variable for the classification task. Make sure to replace 'data' and 'label_column' with your actual data and label column name before running this code.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33massistant\u001b[0m (to ragproxyagent):\n", + "\n", + "UPDATE CONTEXT\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "# reset the assistant. Always reset the assistant before starting a new conversation.\n", + "assistant.reset()\n", + "\n", + "# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.\n", + "# the assistant receives the message and generates a response. 
The response will be sent back to the ragproxyagent for processing.\n", + "# The conversation continues until the termination condition is met, in RetrieveChat, the termination condition when no human-in-loop is no code block detected.\n", + "# With human-in-loop, the conversation will continue until the user says \"exit\".\n", + "code_problem = \"How can I use FLAML to perform a classification task and use spark to do parallel training. Train for 30 seconds and force cancel jobs if time limit is reached.\"\n", + "chat_result = ragproxyagent.initiate_chat(\n", + " assistant, message=ragproxyagent.message_generator, problem=code_problem, search_string=\"spark\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 2\n", + "\n", + "[Back to top](#table-of-contents)\n", + "\n", + "Use RetrieveChat to answer a question that is not related to code generation.\n", + "\n", + "Problem: Who is the author of FLAML?" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/lijiang1/anaconda3/envs/autogen/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trying to create collection.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-11 19:58:21,076 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n", + "Model gpt4-1106-preview not found. Using cl100k_base encoding.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "VectorDB returns doc_ids: [['7968cf3c', 'bdfbc921']]\n", + "\u001b[32mAdding content of doc 7968cf3c to context.\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Model gpt4-1106-preview not found. Using cl100k_base encoding.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32mAdding content of doc bdfbc921 to context.\u001b[0m\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", + "context provided by the user.\n", + "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", + "For code generation, you must obey the following rules:\n", + "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", + "Rule 2. You must follow the formats below to write your code:\n", + "```language\n", + "# your code\n", + "```\n", + "\n", + "User's question is: Who is the author of FLAML?\n", + "\n", + "Context is: # Research\n", + "\n", + "For technical details, please check our research publications.\n", + "\n", + "- [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. 
MLSys 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021flaml,\n", + " title={FLAML: A Fast and Lightweight AutoML Library},\n", + " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", + " year={2021},\n", + " booktitle={MLSys},\n", + "}\n", + "```\n", + "\n", + "- [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021cfo,\n", + " title={Frugal Optimization for Cost-related Hyperparameters},\n", + " author={Qingyun Wu and Chi Wang and Silu Huang},\n", + " year={2021},\n", + " booktitle={AAAI},\n", + "}\n", + "```\n", + "\n", + "- [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", + "```\n", + "\n", + "- [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", + "}\n", + "```\n", + "\n", + "- [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", + "\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", + "}\n", + "```\n", + "\n", + "- [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. ArXiv preprint arXiv:2202.09927 (2022).\n", + "\n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", + "}\n", + "```\n", + "\n", + "- [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. 
ICLR 2023 (notable-top-5%).\n", + "\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", + "}\n", + "```\n", + "\n", + "- [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", + "```\n", + "# Integrate - Spark\n", + "\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", + "\n", + "## Spark ML Estimators\n", + "\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", + "\n", + "### Data\n", + "\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", + "\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. If you pass in a `pyspark.pandas.Dataframe`, it will not make any changes.\n", + "\n", + "This function also accepts optional arguments `index_col` and `default_index_type`.\n", + "\n", + "- `index_col` is the column name to use as the index, default is None.\n", + "- `default_index_type` is the default index type, default is \"distributed-sequence\". 
More info about default index type could be found on Spark official [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type)\n", + "\n", + "Here is an example code snippet for Spark Data:\n", + "\n", + "```python\n", + "import pandas as pd\n", + "from flaml.automl.spark.utils import to_pandas_on_spark\n", + "\n", + "# Creating a dictionary\n", + "data = {\n", + " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n", + " \"Age_Years\": [20, 15, 10, 7, 25],\n", + " \"Price\": [100000, 200000, 300000, 240000, 120000],\n", + "}\n", + "\n", + "# Creating a pandas DataFrame\n", + "dataframe = pd.DataFrame(data)\n", + "label = \"Price\"\n", + "\n", + "# Convert to pandas-on-spark dataframe\n", + "psdf = to_pandas_on_spark(dataframe)\n", + "```\n", + "\n", + "To use Spark ML models you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n", + "\n", + "Here is an example of how to use it:\n", + "\n", + "```python\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "columns = psdf.columns\n", + "feature_cols = [col for col in columns if col != label]\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n", + "```\n", + "\n", + "Later in conducting the experiment, use your pandas-on-spark data like non-spark data and pass them using `X_train, y_train` or `dataframe, label`.\n", + "\n", + "### Estimators\n", + "\n", + "#### Model List\n", + "\n", + "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n", + "\n", + "#### Usage\n", + "\n", + "First, prepare your data in the required format as described in the previous section.\n", + "\n", + "By including the models you intend to try in the `estimators_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n", + "\n", + "Here is an example code snippet using SparkML models in AutoML:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "# prepare your data in pandas-on-spark format as we previously mentioned\n", + "\n", + "automl = flaml.AutoML()\n", + "settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n", + " \"task\": \"regression\",\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=psdf,\n", + " label=label,\n", + " **settings,\n", + ")\n", + "```\n", + "\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n", + "\n", + "## Parallel Spark Jobs\n", + "\n", + "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting the `use_spark` to `true`. 
FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n", + "\n", + "Please note that you should not set `use_spark` to `true` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n", + "\n", + "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n", + "\n", + "- `use_spark`: boolean, default=False | Whether to use spark to run the training in parallel spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n", + "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performes parallel tuning.\n", + "- `force_cancel`: boolean, default=False | Whether to forcely cancel Spark jobs if the search time exceeded the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n", + "\n", + "An example code snippet for using parallel Spark jobs:\n", + "\n", + "```python\n", + "import flaml\n", + "\n", + "automl_experiment = flaml.AutoML()\n", + "automl_settings = {\n", + " \"time_budget\": 30,\n", + " \"metric\": \"r2\",\n", + " \"task\": \"regression\",\n", + " \"n_concurrent_trials\": 2,\n", + " \"use_spark\": True,\n", + " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n", + "}\n", + "\n", + "automl.fit(\n", + " dataframe=dataframe,\n", + " label=label,\n", + " **automl_settings,\n", + ")\n", + "```\n", + "\n", + "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mragproxyagent\u001b[0m (to assistant):\n", + "\n", + "You're a retrieve augmented coding assistant. You answer user's questions based on your own knowledge and the\n", + "context provided by the user.\n", + "If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.\n", + "For code generation, you must obey the following rules:\n", + "Rule 1. You MUST NOT install any packages because all the packages needed are already installed.\n", + "Rule 2. 
You must follow the formats below to write your code:\n", + "```language\n", + "# your code\n", + "```\n", + "\n", + "User's question is: Who is the author of FLAML?\n", + "\n", + "Context is: # Research\n", + "\n", + "For technical details, please check our research publications.\n", + "\n", + "- [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021flaml,\n", + " title={FLAML: A Fast and Lightweight AutoML Library},\n", + " author={Chi Wang and Qingyun Wu and Markus Weimer and Erkang Zhu},\n", + " year={2021},\n", + " booktitle={MLSys},\n", + "}\n", + "```\n", + "\n", + "- [Frugal Optimization for Cost-related Hyperparameters](https://arxiv.org/abs/2005.01571). Qingyun Wu, Chi Wang, Silu Huang. AAAI 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021cfo,\n", + " title={Frugal Optimization for Cost-related Hyperparameters},\n", + " author={Qingyun Wu and Chi Wang and Silu Huang},\n", + " year={2021},\n", + " booktitle={AAAI},\n", + "}\n", + "```\n", + "\n", + "- [Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/). Chi Wang, Qingyun Wu, Silu Huang, Amin Saied. ICLR 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2021blendsearch,\n", + " title={Economical Hyperparameter Optimization With Blended Search Strategy},\n", + " author={Chi Wang and Qingyun Wu and Silu Huang and Amin Saied},\n", + " year={2021},\n", + " booktitle={ICLR},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models](https://aclanthology.org/2021.acl-long.178.pdf). Susan Xueqing Liu, Chi Wang. ACL 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{liuwang2021hpolm,\n", + " title={An Empirical Study on Hyperparameter Optimization for Fine-Tuning Pre-trained Language Models},\n", + " author={Susan Xueqing Liu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ACL},\n", + "}\n", + "```\n", + "\n", + "- [ChaCha for Online AutoML](https://www.microsoft.com/en-us/research/publication/chacha-for-online-automl/). Qingyun Wu, Chi Wang, John Langford, Paul Mineiro and Marco Rossi. ICML 2021.\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2021chacha,\n", + " title={ChaCha for Online AutoML},\n", + " author={Qingyun Wu and Chi Wang and John Langford and Paul Mineiro and Marco Rossi},\n", + " year={2021},\n", + " booktitle={ICML},\n", + "}\n", + "```\n", + "\n", + "- [Fair AutoML](https://arxiv.org/abs/2111.06495). Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2111.06495 (2021).\n", + "\n", + "```bibtex\n", + "@inproceedings{wuwang2021fairautoml,\n", + " title={Fair AutoML},\n", + " author={Qingyun Wu and Chi Wang},\n", + " year={2021},\n", + " booktitle={ArXiv preprint arXiv:2111.06495},\n", + "}\n", + "```\n", + "\n", + "- [Mining Robust Default Configurations for Resource-constrained AutoML](https://arxiv.org/abs/2202.09927). Moe Kayali, Chi Wang. 
ArXiv preprint arXiv:2202.09927 (2022).\n", + "\n", + "```bibtex\n", + "@inproceedings{kayaliwang2022default,\n", + " title={Mining Robust Default Configurations for Resource-constrained AutoML},\n", + " author={Moe Kayali and Chi Wang},\n", + " year={2022},\n", + " booktitle={ArXiv preprint arXiv:2202.09927},\n", + "}\n", + "```\n", + "\n", + "- [Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives](https://openreview.net/forum?id=0Ij9_q567Ma). Shaokun Zhang, Feiran Jia, Chi Wang, Qingyun Wu. ICLR 2023 (notable-top-5%).\n", + "\n", + "```bibtex\n", + "@inproceedings{zhang2023targeted,\n", + " title={Targeted Hyperparameter Optimization with Lexicographic Preferences Over Multiple Objectives},\n", + " author={Shaokun Zhang and Feiran Jia and Chi Wang and Qingyun Wu},\n", + " booktitle={International Conference on Learning Representations},\n", + " year={2023},\n", + " url={https://openreview.net/forum?id=0Ij9_q567Ma},\n", + "}\n", + "```\n", + "\n", + "- [Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference](https://arxiv.org/abs/2303.04673). Chi Wang, Susan Xueqing Liu, Ahmed H. Awadallah. ArXiv preprint arXiv:2303.04673 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wang2023EcoOptiGen,\n", + " title={Cost-Effective Hyperparameter Optimization for Large Language Model Generation Inference},\n", + " author={Chi Wang and Susan Xueqing Liu and Ahmed H. Awadallah},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2303.04673},\n", + "}\n", + "```\n", + "\n", + "- [An Empirical Study on Challenging Math Problem Solving with GPT-4](https://arxiv.org/abs/2306.01337). Yiran Wu, Feiran Jia, Shaokun Zhang, Hangyu Li, Erkang Zhu, Yue Wang, Yin Tat Lee, Richard Peng, Qingyun Wu, Chi Wang. ArXiv preprint arXiv:2306.01337 (2023).\n", + "\n", + "```bibtex\n", + "@inproceedings{wu2023empirical,\n", + " title={An Empirical Study on Challenging Math Problem Solving with GPT-4},\n", + " author={Yiran Wu and Feiran Jia and Shaokun Zhang and Hangyu Li and Erkang Zhu and Yue Wang and Yin Tat Lee and Richard Peng and Qingyun Wu and Chi Wang},\n", + " year={2023},\n", + " booktitle={ArXiv preprint arXiv:2306.01337},\n", + "}\n", + "```\n", + "# Integrate - Spark\n", + "\n", + "FLAML has integrated Spark for distributed training. There are two main aspects of integration with Spark:\n", + "\n", + "- Use Spark ML estimators for AutoML.\n", + "- Use Spark to run training in parallel spark jobs.\n", + "\n", + "## Spark ML Estimators\n", + "\n", + "FLAML integrates estimators based on Spark ML models. These models are trained in parallel using Spark, so we called them Spark estimators. To use these models, you first need to organize your data in the required format.\n", + "\n", + "### Data\n", + "\n", + "For Spark estimators, AutoML only consumes Spark data. FLAML provides a convenient function `to_pandas_on_spark` in the `flaml.automl.spark.utils` module to convert your data into a pandas-on-spark (`pyspark.pandas`) dataframe/series, which Spark estimators require.\n", + "\n", + "This utility function takes data in the form of a `pandas.Dataframe` or `pyspark.sql.Dataframe` and converts it into a pandas-on-spark dataframe. It also takes `pandas.Series` or `pyspark.sql.Dataframe` and converts it into a [pandas-on-spark](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html) series. 
+ "\n",
+ "This function also accepts optional arguments `index_col` and `default_index_type`.\n",
+ "\n",
+ "- `index_col` is the column name to use as the index, default is None.\n",
+ "- `default_index_type` is the default index type, default is \"distributed-sequence\". More info about the default index type can be found in the official Spark [documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type).\n",
+ "\n",
+ "Here is an example code snippet for Spark Data:\n",
+ "\n",
+ "```python\n",
+ "import pandas as pd\n",
+ "from flaml.automl.spark.utils import to_pandas_on_spark\n",
+ "\n",
+ "# Creating a dictionary\n",
+ "data = {\n",
+ " \"Square_Feet\": [800, 1200, 1800, 1500, 850],\n",
+ " \"Age_Years\": [20, 15, 10, 7, 25],\n",
+ " \"Price\": [100000, 200000, 300000, 240000, 120000],\n",
+ "}\n",
+ "\n",
+ "# Creating a pandas DataFrame\n",
+ "dataframe = pd.DataFrame(data)\n",
+ "label = \"Price\"\n",
+ "\n",
+ "# Convert to pandas-on-spark dataframe\n",
+ "psdf = to_pandas_on_spark(dataframe)\n",
+ "```\n",
+ "\n",
+ "To use Spark ML models, you need to format your data appropriately. Specifically, use [`VectorAssembler`](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.VectorAssembler.html) to merge all feature columns into a single vector column.\n",
+ "\n",
+ "Here is an example of how to use it:\n",
+ "\n",
+ "```python\n",
+ "from pyspark.ml.feature import VectorAssembler\n",
+ "\n",
+ "columns = psdf.columns\n",
+ "feature_cols = [col for col in columns if col != label]\n",
+ "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
+ "psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
+ "```\n",
+ "\n",
+ "Later, when running the experiment, use your pandas-on-spark data like non-Spark data and pass it in using `X_train, y_train` or `dataframe, label`.\n",
+ "\n",
+ "### Estimators\n",
+ "\n",
+ "#### Model List\n",
+ "\n",
+ "- `lgbm_spark`: The class for fine-tuning Spark version LightGBM models, using [SynapseML](https://microsoft.github.io/SynapseML/docs/features/lightgbm/about/) API.\n",
+ "\n",
+ "#### Usage\n",
+ "\n",
+ "First, prepare your data in the required format as described in the previous section.\n",
+ "\n",
+ "By including the models you intend to try in the `estimator_list` argument to `flaml.automl`, FLAML will start trying configurations for these models. If your input is Spark data, FLAML will also use estimators with the `_spark` postfix by default, even if you haven't specified them.\n",
+ "\n",
+ "Here is an example code snippet using SparkML models in AutoML:\n",
+ "\n",
+ "```python\n",
+ "import flaml\n",
+ "\n",
+ "# prepare your data in pandas-on-spark format as we previously mentioned\n",
+ "\n",
+ "automl = flaml.AutoML()\n",
+ "settings = {\n",
+ " \"time_budget\": 30,\n",
+ " \"metric\": \"r2\",\n",
+ " \"estimator_list\": [\"lgbm_spark\"], # this setting is optional\n",
+ " \"task\": \"regression\",\n",
+ "}\n",
+ "\n",
+ "automl.fit(\n",
+ " dataframe=psdf,\n",
+ " label=label,\n",
+ " **settings,\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/automl_bankrupt_synapseml.ipynb)\n",
+ "\n",
+ "## Parallel Spark Jobs\n",
+ "\n",
+ "You can activate Spark as the parallel backend during parallel tuning in both [AutoML](/docs/Use-Cases/Task-Oriented-AutoML#parallel-tuning) and [Hyperparameter Tuning](/docs/Use-Cases/Tune-User-Defined-Function#parallel-tuning), by setting `use_spark` to `True`. FLAML will dispatch your job to the distributed Spark backend using [`joblib-spark`](https://github.com/joblib/joblib-spark).\n",
+ "\n",
+ "Please note that you should not set `use_spark` to `True` when applying AutoML and Tuning for Spark Data. This is because only SparkML models will be used for Spark Data in AutoML and Tuning. As SparkML models run in parallel, there is no need to distribute them with `use_spark` again.\n",
+ "\n",
+ "All the Spark-related arguments are stated below. These arguments are available in both Hyperparameter Tuning and AutoML:\n",
+ "\n",
+ "- `use_spark`: boolean, default=False | Whether to use Spark to run the training in parallel Spark jobs. This can be used to accelerate training on large models and large datasets, but will incur more overhead in time and thus slow down training in some cases. GPU training is not supported yet when use_spark is True. For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`; see the sketch after this list. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`.\n",
+ "- `n_concurrent_trials`: int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, FLAML performs parallel tuning.\n",
+ "- `force_cancel`: boolean, default=False | Whether to forcibly cancel Spark jobs if the search time exceeds the time budget. Spark jobs include parallel tuning jobs and Spark-based model training jobs.\n",
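+ "\n",
+ "As a minimal sketch of the `FLAML_MAX_CONCURRENT` override mentioned above (the variable name comes from the note in the list; the value `4` is illustrative, not from the FLAML docs):\n",
+ "\n",
+ "```python\n",
+ "import os\n",
+ "\n",
+ "# Illustrative: allow up to 4 concurrent trials even if fewer executors are detected\n",
+ "# (e.g., local mode). The effective concurrency remains min(n_concurrent_trials, num_executors),\n",
+ "# with FLAML_MAX_CONCURRENT overriding the detected num_executors.\n",
+ "os.environ[\"FLAML_MAX_CONCURRENT\"] = \"4\"\n",
+ "```\n",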
+ "\n",
+ "An example code snippet for using parallel Spark jobs:\n",
+ "\n",
+ "```python\n",
+ "import flaml\n",
+ "\n",
+ "automl_experiment = flaml.AutoML()\n",
+ "automl_settings = {\n",
+ " \"time_budget\": 30,\n",
+ " \"metric\": \"r2\",\n",
+ " \"task\": \"regression\",\n",
+ " \"n_concurrent_trials\": 2,\n",
+ " \"use_spark\": True,\n",
+ " \"force_cancel\": True, # Activating the force_cancel option can immediately halt Spark jobs once they exceed the allocated time_budget.\n",
+ "}\n",
+ "\n",
+ "automl_experiment.fit(\n",
+ " dataframe=dataframe,\n",
+ " label=label,\n",
+ " **automl_settings,\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_spark.ipynb)\n",
+ "\n",
+ "\n",
+ "\n",
+ "--------------------------------------------------------------------------------\n",
+ "\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
 "\n",
- "The guide provides code snippets and references to official documentation for each topic, making it easy to follow along and implement FLAML with Spark data. It also includes links to notebooks that demonstrate the usage of FLAML with Spark data in Colab.\n",
+ "The authors of FLAML are Chi Wang, Qingyun Wu, Markus Weimer, and Erkang Zhu.\n",
 "\n",
 "--------------------------------------------------------------------------------\n"
 ]
 },
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.10.11"
+ "version": "3.10.13"
 },
 "skip_test": "Requires interactive usage"
 },