Fix #3281 (#3282)

Merged: 2 commits, Aug 6, 2024
4 changes: 2 additions & 2 deletions autogen/agentchat/contrib/retrieve_user_proxy_agent.py
@@ -519,7 +519,7 @@ def _generate_retrieve_user_reply(
self.problem, self.n_results * (2 * _tmp_retrieve_count + 1), self._search_string
)
doc_contents = self._get_context(self._results)
- if doc_contents:
+ if doc_contents or self.n_results * (2 * _tmp_retrieve_count + 1) >= len(self._results[0]):
break
elif update_context_case2:
# Use the current intermediate info as the query text to retrieve docs, and each time we append the top similar
@@ -531,7 +531,7 @@ def _generate_retrieve_user_reply(
)
self._get_context(self._results)
doc_contents = "\n".join(self._doc_contents) # + "\n" + "\n".join(self._intermediate_answers)
- if doc_contents:
+ if doc_contents or self.n_results * (2 * _tmp_retrieve_count + 1) >= len(self._results[0]):
break

self.clear_history()
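What the change does: `_generate_retrieve_user_reply` retries retrieval with a growing request size (`n_results * (2 * _tmp_retrieve_count + 1)`, i.e. 3x, 5x, 7x, ... of `n_results` on each retry) whenever `_get_context` yields no new document content. Before this patch, the loop could spin through every retry even when the collection had already been exhausted, which is consistent with the duplicated `TERMINATE` round and the extra "Number of requested results ..." warnings removed from the notebook output below. The added clause breaks out as soon as the request already covers everything the index returned. A minimal sketch of the bounded loop, with `retrieve` and `build_context` as stand-ins for the agent's `retrieve_docs` and `_get_context` (not the library's actual code):

```python
from typing import Callable, List


def retrieve_with_bound(
    retrieve: Callable[[str, int], List[str]],
    build_context: Callable[[List[str]], str],
    problem: str,
    n_results: int,
    max_tries: int = 5,
) -> str:
    """Retry retrieval with a growing request size, stopping early once
    the index has nothing more to give."""
    doc_contents = ""
    for attempt in range(1, max_tries + 1):
        requested = n_results * (2 * attempt + 1)  # 3x, 5x, 7x, ... of n_results
        results = retrieve(problem, requested)  # stores clamp this to the index size
        # build_context may return "" even when results are non-empty, e.g.
        # when every returned doc is already part of the chat context.
        doc_contents = build_context(results)
        # The patched condition: once we have asked for at least as many
        # documents as the store returned, a larger request cannot surface
        # anything new, so stop retrying instead of spinning to max_tries.
        if doc_contents or requested >= len(results):
            break
    return doc_contents
```

This matches the notebook logs below: requests for 100, 140, and 180 results against a 2-document index previously retried to no effect, and those warning lines disappear from the updated output.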
135 changes: 54 additions & 81 deletions notebook/agentchat_RetrieveChat.ipynb
@@ -48,14 +48,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"models to use: ['gpt-3.5-turbo-0125']\n"
"models to use: ['gpt-35-turbo', 'gpt4-1106-preview', 'gpt-4o']\n"
]
}
],
@@ -73,9 +73,7 @@
"# a vector database instance\n",
"from autogen.retrieve_utils import TEXT_FORMATS\n",
"\n",
"config_list = [\n",
" {\"model\": \"gpt-3.5-turbo-0125\", \"api_key\": \"<YOUR_API_KEY>\", \"api_type\": \"openai\"},\n",
"]\n",
"config_list = autogen.config_list_from_json(\"OAI_CONFIG_LIST\")\n",
"\n",
"assert len(config_list) > 0\n",
"print(\"models to use: \", [config_list[i][\"model\"] for i in range(len(config_list))])"
@@ -107,7 +105,7 @@
"output_type": "stream",
"text": [
"Accepted file formats for `docs_path`:\n",
"['odt', 'xml', 'pdf', 'docx', 'html', 'md', 'htm', 'csv', 'rst', 'org', 'ppt', 'doc', 'log', 'json', 'epub', 'jsonl', 'pptx', 'yml', 'xlsx', 'tsv', 'txt', 'yaml', 'msg', 'rtf']\n"
"['txt', 'json', 'csv', 'tsv', 'md', 'html', 'htm', 'rtf', 'rst', 'jsonl', 'log', 'xml', 'yaml', 'yml', 'pdf']\n"
]
}
],
@@ -120,7 +118,16 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/anaconda3/envs/autogen312/lib/python3.12/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" from tqdm.autonotebook import tqdm, trange\n"
]
}
],
"source": [
"# 1. create an RetrieveAssistantAgent instance named \"assistant\"\n",
"assistant = RetrieveAssistantAgent(\n",
@@ -160,6 +167,7 @@
" # \"client\": chromadb.PersistentClient(path=\"/tmp/chromadb\"), # deprecated, use \"vector_db\" instead\n",
" \"vector_db\": \"chroma\", # to use the deprecated `client` parameter, set to None and uncomment the line above\n",
" \"overwrite\": False, # set to True if you want to overwrite an existing collection\n",
" \"get_or_create\": True, # set to False if don't want to reuse an existing collection\n",
" },\n",
" code_execution_config=False, # set to False if you don't want to execute the code\n",
")"
@@ -188,7 +196,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-07 17:30:56,955 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `autogen-docs`.\u001b[0m\n"
"2024-08-02 06:30:11,303 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - \u001b[32mUse the existing collection `autogen-docs`.\u001b[0m\n",
"2024-08-02 06:30:11,485 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n"
]
},
{
@@ -202,7 +211,6 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2024-04-07 17:30:59,609 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 2 chunks.\u001b[0m\n",
"Number of requested results 20 is greater than number of elements in index 2, updating n_results = 2\n"
]
},
@@ -361,65 +369,53 @@
"--------------------------------------------------------------------------------\n",
"\u001b[33massistant\u001b[0m (to ragproxyagent):\n",
"\n",
"To perform a classification task using FLAML and use Spark to do parallel training for 30 seconds and force cancel jobs if the time limit is reached, you can follow these steps:\n",
"\n",
"1. First, convert your data into Spark dataframe format using `to_pandas_on_spark` function from `flaml.automl.spark.utils` module.\n",
"2. Then, format your data for use SparkML models by using `VectorAssembler`.\n",
"3. Define your AutoML settings, including the `metric`, `time_budget`, and `task`.\n",
"4. Use `AutoML` from `flaml` to run AutoML with SparkML models by setting `use_spark` to `true`, and `estimator_list` to a list of spark-based estimators, like `[\"lgbm_spark\"]`.\n",
"5. Set `n_concurrent_trials` to the desired number of parallel jobs and `force_cancel` to `True` to cancel the jobs if the time limit is reached.\n",
"\n",
"Here's an example code snippet for performing classification using FLAML and Spark:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"import flaml\n",
"from flaml.automl.spark.utils import to_pandas_on_spark\n",
"from pyspark.ml.feature import VectorAssembler\n",
"import flaml\n",
"import pandas as pd\n",
"\n",
"# Creating a dictionary\n",
"# Example Data (Please provide real data in practice)\n",
"data = {\n",
" \"sepal_length\": [5.1, 4.9, 4.7, 4.6, 5.0],\n",
" \"sepal_width\": [3.5, 3.0, 3.2, 3.1, 3.6],\n",
" \"petal_length\": [1.4, 1.4, 1.3, 1.5, 1.4],\n",
" \"petal_width\": [0.2, 0.2, 0.2, 0.2, 0.2],\n",
" \"species\": [\"setosa\", \"setosa\", \"setosa\", \"setosa\", \"setosa\"]\n",
" \"feature1\": [0, 1, 2, 3, 4],\n",
" \"feature2\": [1, 2, 3, 4, 5],\n",
" # ... add all features you need for your classification\n",
" \"label\": ['a', 'b', 'a', 'a', 'b'], # assuming binary classification with labels 'a' and 'b'\n",
"}\n",
"\n",
"# Creating a pandas DataFrame\n",
"dataframe = pd.DataFrame(data)\n",
"label = \"species\"\n",
"# Convert to Pandas DataFrame\n",
"pdf = pd.DataFrame(data)\n",
"\n",
"# Convert to pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(dataframe)\n",
"# Generate pandas-on-spark dataframe\n",
"psdf = to_pandas_on_spark(pdf)\n",
"\n",
"# Format data for SparkML models\n",
"columns = psdf.columns\n",
"feature_cols = [col for col in columns if col != label]\n",
"# Organize data into feature vectors and labels\n",
"label_col = \"label\"\n",
"feature_cols = [col for col in psdf.columns if col != label_col]\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\"]\n",
"\n",
"# Define AutoML settings\n",
"settings = {\n",
"# Apply the transformation\n",
"psdf = featurizer.transform(psdf.to_spark(index_col=\"index\"))[\"index\", \"features\", label_col]\n",
"\n",
"# Prepare AutoML settings\n",
"automl_settings = {\n",
" \"time_budget\": 30,\n",
" \"metric\": \"accuracy\",\n",
" \"metric\": \"accuracy\", # Change this to a classification metric you prefer\n",
" \"task\": \"classification\",\n",
" \"n_concurrent_trials\": 2, # Or other number that fits your Spark cluster configuration\n",
" \"use_spark\": True,\n",
" \"force_cancel\": True, # Enable force cancel to obey the time constraint\n",
" \"estimator_list\": [\"lgbm_spark\"], # Specify SparkML estimators you want to try\n",
"}\n",
"\n",
"# Use AutoML with SparkML models and parallel jobs\n",
"# Create an AutoML instance\n",
"automl = flaml.AutoML()\n",
"automl.fit(\n",
" dataframe=psdf,\n",
" label=label,\n",
" estimator_list=[\"lgbm_spark\"],\n",
" use_spark=True,\n",
" n_concurrent_trials=2,\n",
" force_cancel=True,\n",
" **settings,\n",
")\n",
"```\n",
"\n",
"Note that the above code assumes the data is small enough to train within 30 seconds. If you have a larger dataset, you may need to increase the `time_budget` and adjust the number of parallel jobs accordingly.\n",
"# Run the AutoML search\n",
"automl.fit(dataframe=psdf, label=label_col, **automl_settings)\n",
"``` \n",
"\n",
"Remember to replace the example data with your real dataset and choose an appropriate metric for your classification task. You'll also need a configured and running Spark environment to utilize the \"use_spark\" feature.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
@@ -439,25 +435,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Number of requested results 60 is greater than number of elements in index 2, updating n_results = 2\n",
"Number of requested results 100 is greater than number of elements in index 2, updating n_results = 2\n",
"Number of requested results 140 is greater than number of elements in index 2, updating n_results = 2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"VectorDB returns doc_ids: [['bdfbc921']]\n",
"VectorDB returns doc_ids: [['bdfbc921']]\n",
"VectorDB returns doc_ids: [['bdfbc921']]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Number of requested results 180 is greater than number of elements in index 2, updating n_results = 2\n"
"Number of requested results 60 is greater than number of elements in index 2, updating n_results = 2\n"
]
},
{
@@ -470,18 +448,13 @@
"\n",
"TERMINATE\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[33mragproxyagent\u001b[0m (to assistant):\n",
"\n",
"TERMINATE\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"data": {
"text/plain": [
"ChatResult(chat_id=None, chat_history=[{'content': 'TERMINATE', 'role': 'assistant'}], summary='', cost=({'total_cost': 0.007691, 'gpt-35-turbo': {'cost': 0.007691, 'prompt_tokens': 4242, 'completion_tokens': 664, 'total_tokens': 4906}}, {'total_cost': 0}), human_input=[])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
@@ -2836,7 +2809,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.12.4"
},
"skip_test": "Requires interactive usage"
},