From b8304c213090feaf887282b3f9adad71b34e7379 Mon Sep 17 00:00:00 2001 From: Eric Zhu Date: Tue, 26 Mar 2024 13:48:57 -0700 Subject: [PATCH 1/5] Add web scrapping notebook --- .../agentchat_webscrapping_with_apify.ipynb | 329 ++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 notebook/agentchat_webscrapping_with_apify.ipynb diff --git a/notebook/agentchat_webscrapping_with_apify.ipynb b/notebook/agentchat_webscrapping_with_apify.ipynb new file mode 100644 index 000000000000..3a2f2e162dcc --- /dev/null +++ b/notebook/agentchat_webscrapping_with_apify.ipynb @@ -0,0 +1,329 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Web Scrapping using Apify Tools" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install -qqq pyautogen apify-client" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "config_list = [\n", + " {\"model\": \"gpt-4\", \"api_key\": os.getenv(\"OPENAI_API_KEY\")},\n", + "]\n", + "\n", + "apify_api_key = os.getenv(\"APIFY_API_KEY\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from typing_extensions import Annotated\n", + "from apify_client import ApifyClient\n", + "\n", + "def scrape_page(url: Annotated[str, \"The URL of the web page to scrape\"]) -> Annotated[str, \"Scraped content\"]:\n", + " # Initialize the ApifyClient with your API token\n", + " client = ApifyClient(token=apify_api_key)\n", + "\n", + " # Prepare the Actor input\n", + " run_input = {\n", + " \"startUrls\": [{\"url\": url}],\n", + " \"useSitemaps\": False,\n", + " \"crawlerType\": \"playwright:firefox\",\n", + " \"includeUrlGlobs\": [],\n", + " \"excludeUrlGlobs\": [],\n", + " \"ignoreCanonicalUrl\": False,\n", + " \"maxCrawlDepth\": 0,\n", + " \"maxCrawlPages\": 1,\n", + " \"initialConcurrency\": 0,\n", + " \"maxConcurrency\": 200,\n", + " \"initialCookies\": [],\n", + " \"proxyConfiguration\": {\"useApifyProxy\": True},\n", + " \"maxSessionRotations\": 10,\n", + " \"maxRequestRetries\": 5,\n", + " \"requestTimeoutSecs\": 60,\n", + " \"dynamicContentWaitSecs\": 10,\n", + " \"maxScrollHeightPixels\": 5000,\n", + " \"removeElementsCssSelector\": \"\"\"nav, footer, script, style, noscript, svg,\n", + " [role=\\\"alert\\\"],\n", + " [role=\\\"banner\\\"],\n", + " [role=\\\"dialog\\\"],\n", + " [role=\\\"alertdialog\\\"],\n", + " [role=\\\"region\\\"][aria-label*=\\\"skip\\\" i],\n", + " [aria-modal=\\\"true\\\"]\"\"\",\n", + " \"removeCookieWarnings\": True,\n", + " \"clickElementsCssSelector\": '[aria-expanded=\"false\"]',\n", + " \"htmlTransformer\": \"readableText\",\n", + " \"readableTextCharThreshold\": 100,\n", + " \"aggressivePrune\": False,\n", + " \"debugMode\": True,\n", + " \"debugLog\": True,\n", + " \"saveHtml\": True,\n", + " \"saveMarkdown\": True,\n", + " \"saveFiles\": False,\n", + " \"saveScreenshots\": False,\n", + " \"maxResults\": 9999999,\n", + " \"clientSideMinChangePercentage\": 15,\n", + " \"renderingTypeDetectionPercentage\": 10,\n", + " }\n", + "\n", + " # Run the Actor and wait for it to finish\n", + " run = client.actor(\"aYG0l9s7dbB7j3gbS\").call(run_input=run_input)\n", + "\n", + " # Fetch and print Actor results from the run's dataset (if there are any)\n", + " text_data = \"\"\n", + " for item in client.dataset(run[\"defaultDatasetId\"]).iterate_items():\n", + " text_data += item.get(\"text\", \"\") + 
\"\\n\"\n", + "\n", + " average_token = 0.75\n", + " max_tokens = 20000 # slightly less than max to be safe 32k\n", + " text_data = text_data[: int(average_token * max_tokens)]\n", + " return text_data" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "from autogen import ConversableAgent, register_function\n", + "\n", + "# Create web scrapper agent.\n", + "scraper_agent = ConversableAgent(\n", + " \"WebScraper\",\n", + " llm_config={\"config_list\": config_list},\n", + " system_message=\"You are a web scrapper and you can scrape any web page using the tools provided. \"\n", + " \"Returns 'TERMINATE' when the scraping is done.\",\n", + ")\n", + "\n", + "# Create user proxy agent.\n", + "user_proxy_agent = ConversableAgent(\n", + " \"UserProxy\",\n", + " llm_config=False, # No LLM for this agent.\n", + " human_input_mode=\"NEVER\",\n", + " code_execution_config=False, # No code execution for this agent.\n", + " is_termination_msg=lambda x: x.get(\"content\", \"\") is not None and \"terminate\" in x[\"content\"].lower(),\n", + " default_auto_reply=\"Please continue if not finished, otherwise return 'TERMINATE'.\",\n", + ")\n", + "\n", + "# Register the function with the agents.\n", + "register_function(\n", + " scrape_page, \n", + " caller=scraper_agent, \n", + " executor=user_proxy_agent, \n", + " name=\"scrape_page\", \n", + " description=\"Scrape a web page and return the content.\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", + "\n", + "Can you scrape agentops.ai for me?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n", + "\n", + "\u001b[32m***** Suggested tool call (call_0qok2jvCxOfv7HOA0oxPWneM): scrape_page *****\u001b[0m\n", + "Arguments: \n", + "{\n", + "\"url\": \"https://www.agentops.ai\"\n", + "}\n", + "\u001b[32m****************************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION scrape_page...\u001b[0m\n", + "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", + "\n", + "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", + "\n", + "\u001b[32m***** Response from calling tool (call_0qok2jvCxOfv7HOA0oxPWneM) *****\u001b[0m\n", + "START NOW\n", + "Take your business to the next level with our features \n", + "AI Agents Suck.\n", + "We're Fixing That. \n", + "Build compliant AI agents with observability, evals, and replay analytics. No more black boxes and prompt guessing.\n", + "New! Introducing AgentOps\n", + "Three Lines of Code. Unlimited Testing. \n", + "Instant Testing + Debugging = Compliant AI Agents That Work\n", + "5\n", + "# Beginning of program's code (i.e. 
main.py, __init__.py)\n", + "6\n", + "ao_client = agentops.Client()\n", + "9\n", + "# (optional: record specific functions)\n", + "10\n", + "@ao_client.record_action('sample function being record')\n", + "11\n", + "def sample_function(...):\n", + "15\n", + "ao_client.end_session('Success')\n", + "Prototype to Production\n", + "Generous free limits, upgrade only when you need it.\n", + "\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n", + "\n", + "Sure, here's the information from the website agentops.ai:\n", + "\n", + "- Their main value proposition is to fix bad AI Agents and replace black boxes and prompt guessing with compliant, observable AI agents that come with evals and replay analytics.\n", + "- Their latest product is AgentOps. The simple and instant testing & debugging offered promises better-performing compliant AI agents.\n", + "- Integration is easy with just three lines of code.\n", + "- They let you record specific functions.\n", + "- They provide generous free limits and you only need to upgrade when necessary.\n", + "\n", + "Here's a sample of their code:\n", + "```python\n", + "ao_client = agentops.Client()\n", + "\n", + "# optional: record specific functions\n", + "@ao_client.record_action('sample function being record')\n", + "def sample_function(...):\n", + " ...\n", + "\n", + "ao_client.end_session('Success')\n", + "```\n", + "This code is for sample usage of their libraries/functions.\n", + "\n", + "Let me know if you need more specific details.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", + "\n", + "Please continue if not finished, otherwise return 'TERMINATE'.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "chat_result = user_proxy_agent.initiate_chat(\n", + " scraper_agent,\n", + " message=\"Can you scrape agentops.ai for me?\",\n", + " summary_method=\"reflection_with_llm\",\n", + " summary_args={\n", + " \"summary_prompt\": \"\"\"Summarize the scraped content and format summary EXACTLY as follows: \n", + "---\n", + "*Company name*:\n", + "`Acme Corp`\n", + "---\n", + "*Website*:\n", + "`acmecorp.com`\n", + "---\n", + "*Description*:\n", + "`Company that does things.`\n", + "---\n", + "*Tags*:\n", + "`Manufacturing. Retail. E-commerce.`\n", + "---\n", + "*Takeaways*:\n", + "`Provides shareholders with value by selling products.`\n", + "---\n", + "*Questions*:\n", + "`What products do they sell? How do they make money? What is their market share?`\n", + "---\n", + "\"\"\"\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---\n", + "*Company name*:\n", + "`AgentOps`\n", + "---\n", + "*Website*:\n", + "`agentops.ai`\n", + "---\n", + "*Description*:\n", + "`Company that aims to improve AI agents. 
They offer observed and evaluable AI agents with replay analytics as an alternative to black box models and blind prompting.`\n", + "---\n", + "*Tags*:\n", + "`Artificial Intelligence, AI agents, Observability, Analytics.`\n", + "---\n", + "*Takeaways*:\n", + "`Their product, AgentOps, allows for easy and instant testing and debugging of AI agents. Integration is as simple as writing three lines of code. They also provide generous free limits and mandate upgrades only when necessary.`\n", + "---\n", + "*Questions*:\n", + "`What differentiates AgentOps from other, similar products? How does their pricing scale with usage? What are the details of their \"generous free limits\"?`\n", + "---\n" + ] + } + ], + "source": [ + "print(chat_result.summary)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autogen", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 2d687a02bcb42c1f665e50cf0357ea0406f70993 Mon Sep 17 00:00:00 2001 From: Eric Zhu Date: Tue, 26 Mar 2024 13:54:49 -0700 Subject: [PATCH 2/5] formatting --- notebook/agentchat_webscrapping_with_apify.ipynb | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/notebook/agentchat_webscrapping_with_apify.ipynb b/notebook/agentchat_webscrapping_with_apify.ipynb index 3a2f2e162dcc..8d5c6cb9fe50 100644 --- a/notebook/agentchat_webscrapping_with_apify.ipynb +++ b/notebook/agentchat_webscrapping_with_apify.ipynb @@ -40,6 +40,7 @@ "from typing_extensions import Annotated\n", "from apify_client import ApifyClient\n", "\n", + "\n", "def scrape_page(url: Annotated[str, \"The URL of the web page to scrape\"]) -> Annotated[str, \"Scraped content\"]:\n", " # Initialize the ApifyClient with your API token\n", " client = ApifyClient(token=apify_api_key)\n", @@ -119,19 +120,19 @@ "# Create user proxy agent.\n", "user_proxy_agent = ConversableAgent(\n", " \"UserProxy\",\n", - " llm_config=False, # No LLM for this agent.\n", + " llm_config=False, # No LLM for this agent.\n", " human_input_mode=\"NEVER\",\n", - " code_execution_config=False, # No code execution for this agent.\n", + " code_execution_config=False, # No code execution for this agent.\n", " is_termination_msg=lambda x: x.get(\"content\", \"\") is not None and \"terminate\" in x[\"content\"].lower(),\n", " default_auto_reply=\"Please continue if not finished, otherwise return 'TERMINATE'.\",\n", ")\n", "\n", "# Register the function with the agents.\n", "register_function(\n", - " scrape_page, \n", - " caller=scraper_agent, \n", - " executor=user_proxy_agent, \n", - " name=\"scrape_page\", \n", + " scrape_page,\n", + " caller=scraper_agent,\n", + " executor=user_proxy_agent,\n", + " name=\"scrape_page\",\n", " description=\"Scrape a web page and return the content.\",\n", ")" ] From 2f8b6e6e31fd1723bbe954b61947100eb770c6ee Mon Sep 17 00:00:00 2001 From: Eric Zhu Date: Wed, 27 Mar 2024 10:27:42 -0700 Subject: [PATCH 3/5] tags --- .../agentchat_webscrapping_with_apify.ipynb | 665 +++++++++--------- 1 file changed, 337 insertions(+), 328 deletions(-) diff --git a/notebook/agentchat_webscrapping_with_apify.ipynb b/notebook/agentchat_webscrapping_with_apify.ipynb index 8d5c6cb9fe50..b48df8e765c8 100644 --- a/notebook/agentchat_webscrapping_with_apify.ipynb +++ 
b/notebook/agentchat_webscrapping_with_apify.ipynb @@ -1,330 +1,339 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Web Scrapping using Apify Tools" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "! pip install -qqq pyautogen apify-client" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "config_list = [\n", - " {\"model\": \"gpt-4\", \"api_key\": os.getenv(\"OPENAI_API_KEY\")},\n", - "]\n", - "\n", - "apify_api_key = os.getenv(\"APIFY_API_KEY\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from typing_extensions import Annotated\n", - "from apify_client import ApifyClient\n", - "\n", - "\n", - "def scrape_page(url: Annotated[str, \"The URL of the web page to scrape\"]) -> Annotated[str, \"Scraped content\"]:\n", - " # Initialize the ApifyClient with your API token\n", - " client = ApifyClient(token=apify_api_key)\n", - "\n", - " # Prepare the Actor input\n", - " run_input = {\n", - " \"startUrls\": [{\"url\": url}],\n", - " \"useSitemaps\": False,\n", - " \"crawlerType\": \"playwright:firefox\",\n", - " \"includeUrlGlobs\": [],\n", - " \"excludeUrlGlobs\": [],\n", - " \"ignoreCanonicalUrl\": False,\n", - " \"maxCrawlDepth\": 0,\n", - " \"maxCrawlPages\": 1,\n", - " \"initialConcurrency\": 0,\n", - " \"maxConcurrency\": 200,\n", - " \"initialCookies\": [],\n", - " \"proxyConfiguration\": {\"useApifyProxy\": True},\n", - " \"maxSessionRotations\": 10,\n", - " \"maxRequestRetries\": 5,\n", - " \"requestTimeoutSecs\": 60,\n", - " \"dynamicContentWaitSecs\": 10,\n", - " \"maxScrollHeightPixels\": 5000,\n", - " \"removeElementsCssSelector\": \"\"\"nav, footer, script, style, noscript, svg,\n", - " [role=\\\"alert\\\"],\n", - " [role=\\\"banner\\\"],\n", - " [role=\\\"dialog\\\"],\n", - " [role=\\\"alertdialog\\\"],\n", - " [role=\\\"region\\\"][aria-label*=\\\"skip\\\" i],\n", - " [aria-modal=\\\"true\\\"]\"\"\",\n", - " \"removeCookieWarnings\": True,\n", - " \"clickElementsCssSelector\": '[aria-expanded=\"false\"]',\n", - " \"htmlTransformer\": \"readableText\",\n", - " \"readableTextCharThreshold\": 100,\n", - " \"aggressivePrune\": False,\n", - " \"debugMode\": True,\n", - " \"debugLog\": True,\n", - " \"saveHtml\": True,\n", - " \"saveMarkdown\": True,\n", - " \"saveFiles\": False,\n", - " \"saveScreenshots\": False,\n", - " \"maxResults\": 9999999,\n", - " \"clientSideMinChangePercentage\": 15,\n", - " \"renderingTypeDetectionPercentage\": 10,\n", - " }\n", - "\n", - " # Run the Actor and wait for it to finish\n", - " run = client.actor(\"aYG0l9s7dbB7j3gbS\").call(run_input=run_input)\n", - "\n", - " # Fetch and print Actor results from the run's dataset (if there are any)\n", - " text_data = \"\"\n", - " for item in client.dataset(run[\"defaultDatasetId\"]).iterate_items():\n", - " text_data += item.get(\"text\", \"\") + \"\\n\"\n", - "\n", - " average_token = 0.75\n", - " max_tokens = 20000 # slightly less than max to be safe 32k\n", - " text_data = text_data[: int(average_token * max_tokens)]\n", - " return text_data" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "from autogen import ConversableAgent, register_function\n", - "\n", - "# Create web scrapper agent.\n", - "scraper_agent = ConversableAgent(\n", - " \"WebScraper\",\n", - " llm_config={\"config_list\": 
config_list},\n", - " system_message=\"You are a web scrapper and you can scrape any web page using the tools provided. \"\n", - " \"Returns 'TERMINATE' when the scraping is done.\",\n", - ")\n", - "\n", - "# Create user proxy agent.\n", - "user_proxy_agent = ConversableAgent(\n", - " \"UserProxy\",\n", - " llm_config=False, # No LLM for this agent.\n", - " human_input_mode=\"NEVER\",\n", - " code_execution_config=False, # No code execution for this agent.\n", - " is_termination_msg=lambda x: x.get(\"content\", \"\") is not None and \"terminate\" in x[\"content\"].lower(),\n", - " default_auto_reply=\"Please continue if not finished, otherwise return 'TERMINATE'.\",\n", - ")\n", - "\n", - "# Register the function with the agents.\n", - "register_function(\n", - " scrape_page,\n", - " caller=scraper_agent,\n", - " executor=user_proxy_agent,\n", - " name=\"scrape_page\",\n", - " description=\"Scrape a web page and return the content.\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", - "\n", - "Can you scrape agentops.ai for me?\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n", - "\n", - "\u001b[32m***** Suggested tool call (call_0qok2jvCxOfv7HOA0oxPWneM): scrape_page *****\u001b[0m\n", - "Arguments: \n", - "{\n", - "\"url\": \"https://www.agentops.ai\"\n", - "}\n", - "\u001b[32m****************************************************************************\u001b[0m\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[35m\n", - ">>>>>>>> EXECUTING FUNCTION scrape_page...\u001b[0m\n", - "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", - "\n", - "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", - "\n", - "\u001b[32m***** Response from calling tool (call_0qok2jvCxOfv7HOA0oxPWneM) *****\u001b[0m\n", - "START NOW\n", - "Take your business to the next level with our features \n", - "AI Agents Suck.\n", - "We're Fixing That. \n", - "Build compliant AI agents with observability, evals, and replay analytics. No more black boxes and prompt guessing.\n", - "New! Introducing AgentOps\n", - "Three Lines of Code. Unlimited Testing. \n", - "Instant Testing + Debugging = Compliant AI Agents That Work\n", - "5\n", - "# Beginning of program's code (i.e. 
main.py, __init__.py)\n", - "6\n", - "ao_client = agentops.Client()\n", - "9\n", - "# (optional: record specific functions)\n", - "10\n", - "@ao_client.record_action('sample function being record')\n", - "11\n", - "def sample_function(...):\n", - "15\n", - "ao_client.end_session('Success')\n", - "Prototype to Production\n", - "Generous free limits, upgrade only when you need it.\n", - "\n", - "\u001b[32m**********************************************************************\u001b[0m\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n", - "\n", - "Sure, here's the information from the website agentops.ai:\n", - "\n", - "- Their main value proposition is to fix bad AI Agents and replace black boxes and prompt guessing with compliant, observable AI agents that come with evals and replay analytics.\n", - "- Their latest product is AgentOps. The simple and instant testing & debugging offered promises better-performing compliant AI agents.\n", - "- Integration is easy with just three lines of code.\n", - "- They let you record specific functions.\n", - "- They provide generous free limits and you only need to upgrade when necessary.\n", - "\n", - "Here's a sample of their code:\n", - "```python\n", - "ao_client = agentops.Client()\n", - "\n", - "# optional: record specific functions\n", - "@ao_client.record_action('sample function being record')\n", - "def sample_function(...):\n", - " ...\n", - "\n", - "ao_client.end_session('Success')\n", - "```\n", - "This code is for sample usage of their libraries/functions.\n", - "\n", - "Let me know if you need more specific details.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", - "\n", - "Please continue if not finished, otherwise return 'TERMINATE'.\n", - "\n", - "--------------------------------------------------------------------------------\n", - "\u001b[31m\n", - ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", - "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n", - "\n", - "TERMINATE\n", - "\n", - "--------------------------------------------------------------------------------\n" - ] - } - ], - "source": [ - "chat_result = user_proxy_agent.initiate_chat(\n", - " scraper_agent,\n", - " message=\"Can you scrape agentops.ai for me?\",\n", - " summary_method=\"reflection_with_llm\",\n", - " summary_args={\n", - " \"summary_prompt\": \"\"\"Summarize the scraped content and format summary EXACTLY as follows: \n", - "---\n", - "*Company name*:\n", - "`Acme Corp`\n", - "---\n", - "*Website*:\n", - "`acmecorp.com`\n", - "---\n", - "*Description*:\n", - "`Company that does things.`\n", - "---\n", - "*Tags*:\n", - "`Manufacturing. Retail. E-commerce.`\n", - "---\n", - "*Takeaways*:\n", - "`Provides shareholders with value by selling products.`\n", - "---\n", - "*Questions*:\n", - "`What products do they sell? How do they make money? What is their market share?`\n", - "---\n", - "\"\"\"\n", - " },\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---\n", - "*Company name*:\n", - "`AgentOps`\n", - "---\n", - "*Website*:\n", - "`agentops.ai`\n", - "---\n", - "*Description*:\n", - "`Company that aims to improve AI agents. 
They offer observed and evaluable AI agents with replay analytics as an alternative to black box models and blind prompting.`\n", - "---\n", - "*Tags*:\n", - "`Artificial Intelligence, AI agents, Observability, Analytics.`\n", - "---\n", - "*Takeaways*:\n", - "`Their product, AgentOps, allows for easy and instant testing and debugging of AI agents. Integration is as simple as writing three lines of code. They also provide generous free limits and mandate upgrades only when necessary.`\n", - "---\n", - "*Questions*:\n", - "`What differentiates AgentOps from other, similar products? How does their pricing scale with usage? What are the details of their \"generous free limits\"?`\n", - "---\n" - ] - } - ], - "source": [ - "print(chat_result.summary)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "autogen", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Web Scrapping using Apify Tools" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install -qqq pyautogen apify-client" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "config_list = [\n", + " {\"model\": \"gpt-4\", \"api_key\": os.getenv(\"OPENAI_API_KEY\")},\n", + "]\n", + "\n", + "apify_api_key = os.getenv(\"APIFY_API_KEY\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from typing_extensions import Annotated\n", + "from apify_client import ApifyClient\n", + "\n", + "\n", + "def scrape_page(url: Annotated[str, \"The URL of the web page to scrape\"]) -> Annotated[str, \"Scraped content\"]:\n", + " # Initialize the ApifyClient with your API token\n", + " client = ApifyClient(token=apify_api_key)\n", + "\n", + " # Prepare the Actor input\n", + " run_input = {\n", + " \"startUrls\": [{\"url\": url}],\n", + " \"useSitemaps\": False,\n", + " \"crawlerType\": \"playwright:firefox\",\n", + " \"includeUrlGlobs\": [],\n", + " \"excludeUrlGlobs\": [],\n", + " \"ignoreCanonicalUrl\": False,\n", + " \"maxCrawlDepth\": 0,\n", + " \"maxCrawlPages\": 1,\n", + " \"initialConcurrency\": 0,\n", + " \"maxConcurrency\": 200,\n", + " \"initialCookies\": [],\n", + " \"proxyConfiguration\": {\"useApifyProxy\": True},\n", + " \"maxSessionRotations\": 10,\n", + " \"maxRequestRetries\": 5,\n", + " \"requestTimeoutSecs\": 60,\n", + " \"dynamicContentWaitSecs\": 10,\n", + " \"maxScrollHeightPixels\": 5000,\n", + " \"removeElementsCssSelector\": \"\"\"nav, footer, script, style, noscript, svg,\n", + " [role=\\\"alert\\\"],\n", + " [role=\\\"banner\\\"],\n", + " [role=\\\"dialog\\\"],\n", + " [role=\\\"alertdialog\\\"],\n", + " [role=\\\"region\\\"][aria-label*=\\\"skip\\\" i],\n", + " [aria-modal=\\\"true\\\"]\"\"\",\n", + " \"removeCookieWarnings\": True,\n", + " \"clickElementsCssSelector\": '[aria-expanded=\"false\"]',\n", + " \"htmlTransformer\": \"readableText\",\n", + " \"readableTextCharThreshold\": 100,\n", + " \"aggressivePrune\": False,\n", + " \"debugMode\": True,\n", + " \"debugLog\": True,\n", + " \"saveHtml\": True,\n", + " \"saveMarkdown\": 
True,\n",
+    "        \"saveFiles\": False,\n",
+    "        \"saveScreenshots\": False,\n",
+    "        \"maxResults\": 9999999,\n",
+    "        \"clientSideMinChangePercentage\": 15,\n",
+    "        \"renderingTypeDetectionPercentage\": 10,\n",
+    "    }\n",
+    "\n",
+    "    # Run the Actor and wait for it to finish\n",
+    "    run = client.actor(\"aYG0l9s7dbB7j3gbS\").call(run_input=run_input)\n",
+    "\n",
+    "    # Fetch the Actor results from the run's dataset (if there are any)\n",
+    "    text_data = \"\"\n",
+    "    for item in client.dataset(run[\"defaultDatasetId\"]).iterate_items():\n",
+    "        text_data += item.get(\"text\", \"\") + \"\\n\"\n",
+    "\n",
+    "    average_token = 0.75\n",
+    "    max_tokens = 20000  # stay slightly below the 32k-token context limit, to be safe\n",
+    "    text_data = text_data[: int(average_token * max_tokens)]\n",
+    "    return text_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from autogen import ConversableAgent, register_function\n",
+    "\n",
+    "# Create the web scraper agent.\n",
+    "scraper_agent = ConversableAgent(\n",
+    "    \"WebScraper\",\n",
+    "    llm_config={\"config_list\": config_list},\n",
+    "    system_message=\"You are a web scraper and you can scrape any web page using the tools provided. \"\n",
+    "    \"Return 'TERMINATE' when the scraping is done.\",\n",
+    ")\n",
+    "\n",
+    "# Create the user proxy agent.\n",
+    "user_proxy_agent = ConversableAgent(\n",
+    "    \"UserProxy\",\n",
+    "    llm_config=False,  # No LLM for this agent.\n",
+    "    human_input_mode=\"NEVER\",\n",
+    "    code_execution_config=False,  # No code execution for this agent.\n",
+    "    is_termination_msg=lambda x: x.get(\"content\", \"\") is not None and \"terminate\" in x[\"content\"].lower(),\n",
+    "    default_auto_reply=\"Please continue if not finished, otherwise return 'TERMINATE'.\",\n",
+    ")\n",
+    "\n",
+    "# Register the function with the agents.\n",
+    "register_function(\n",
+    "    scrape_page,\n",
+    "    caller=scraper_agent,\n",
+    "    executor=user_proxy_agent,\n",
+    "    name=\"scrape_page\",\n",
+    "    description=\"Scrape a web page and return the content.\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n",
+      "\n",
+      "Can you scrape agentops.ai for me?\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "\u001b[31m\n",
+      ">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
+      "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n",
+      "\n",
+      "\u001b[32m***** Suggested tool call (call_0qok2jvCxOfv7HOA0oxPWneM): scrape_page *****\u001b[0m\n",
+      "Arguments: \n",
+      "{\n",
+      "\"url\": \"https://www.agentops.ai\"\n",
+      "}\n",
+      "\u001b[32m****************************************************************************\u001b[0m\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "\u001b[35m\n",
+      ">>>>>>>> EXECUTING FUNCTION scrape_page...\u001b[0m\n",
+      "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n",
+      "\n",
+      "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n",
+      "\n",
+      "\u001b[32m***** Response from calling tool (call_0qok2jvCxOfv7HOA0oxPWneM) *****\u001b[0m\n",
+      "START NOW\n",
+      "Take your business to the next level with our features \n",
+      "AI Agents Suck.\n",
+      "We're Fixing That. \n",
+      "Build compliant AI agents with observability, evals, and replay analytics. No more black boxes and prompt guessing.\n",
+      "New! Introducing AgentOps\n",
+      "Three Lines of Code. Unlimited Testing. 
\n", + "Instant Testing + Debugging = Compliant AI Agents That Work\n", + "5\n", + "# Beginning of program's code (i.e. main.py, __init__.py)\n", + "6\n", + "ao_client = agentops.Client()\n", + "9\n", + "# (optional: record specific functions)\n", + "10\n", + "@ao_client.record_action('sample function being record')\n", + "11\n", + "def sample_function(...):\n", + "15\n", + "ao_client.end_session('Success')\n", + "Prototype to Production\n", + "Generous free limits, upgrade only when you need it.\n", + "\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n", + "\n", + "Sure, here's the information from the website agentops.ai:\n", + "\n", + "- Their main value proposition is to fix bad AI Agents and replace black boxes and prompt guessing with compliant, observable AI agents that come with evals and replay analytics.\n", + "- Their latest product is AgentOps. The simple and instant testing & debugging offered promises better-performing compliant AI agents.\n", + "- Integration is easy with just three lines of code.\n", + "- They let you record specific functions.\n", + "- They provide generous free limits and you only need to upgrade when necessary.\n", + "\n", + "Here's a sample of their code:\n", + "```python\n", + "ao_client = agentops.Client()\n", + "\n", + "# optional: record specific functions\n", + "@ao_client.record_action('sample function being record')\n", + "def sample_function(...):\n", + " ...\n", + "\n", + "ao_client.end_session('Success')\n", + "```\n", + "This code is for sample usage of their libraries/functions.\n", + "\n", + "Let me know if you need more specific details.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[33mUserProxy\u001b[0m (to WebScraper):\n", + "\n", + "Please continue if not finished, otherwise return 'TERMINATE'.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[31m\n", + ">>>>>>>> USING AUTO REPLY...\u001b[0m\n", + "\u001b[33mWebScraper\u001b[0m (to UserProxy):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "chat_result = user_proxy_agent.initiate_chat(\n", + " scraper_agent,\n", + " message=\"Can you scrape agentops.ai for me?\",\n", + " summary_method=\"reflection_with_llm\",\n", + " summary_args={\n", + " \"summary_prompt\": \"\"\"Summarize the scraped content and format summary EXACTLY as follows:\n", + "---\n", + "*Company name*:\n", + "`Acme Corp`\n", + "---\n", + "*Website*:\n", + "`acmecorp.com`\n", + "---\n", + "*Description*:\n", + "`Company that does things.`\n", + "---\n", + "*Tags*:\n", + "`Manufacturing. Retail. E-commerce.`\n", + "---\n", + "*Takeaways*:\n", + "`Provides shareholders with value by selling products.`\n", + "---\n", + "*Questions*:\n", + "`What products do they sell? How do they make money? 
What is their market share?`\n",
+    "---\n",
+    "\"\"\"\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---\n",
+      "*Company name*:\n",
+      "`AgentOps`\n",
+      "---\n",
+      "*Website*:\n",
+      "`agentops.ai`\n",
+      "---\n",
+      "*Description*:\n",
+      "`Company that aims to improve AI agents. They offer observed and evaluable AI agents with replay analytics as an alternative to black box models and blind prompting.`\n",
+      "---\n",
+      "*Tags*:\n",
+      "`Artificial Intelligence, AI agents, Observability, Analytics.`\n",
+      "---\n",
+      "*Takeaways*:\n",
+      "`Their product, AgentOps, allows for easy and instant testing and debugging of AI agents. Integration is as simple as writing three lines of code. They also provide generous free limits and mandate upgrades only when necessary.`\n",
+      "---\n",
+      "*Questions*:\n",
+      "`What differentiates AgentOps from other, similar products? How does their pricing scale with usage? What are the details of their \"generous free limits\"?`\n",
+      "---\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(chat_result.summary)"
+   ]
+  }
+ ],
+ "metadata": {
+  "front_matter": {
+   "title": "Web Scraper Agent using Apify Tools",
+   "description": "Scraping web pages and summarizing the content using agents with tools.",
+   "tags": [
+    "web-scrapping",
+    "apify",
+    "tool-use"
+   ]
+  },
+  "kernelspec": {
+   "display_name": "autogen",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
 }

From 41eca60f44d986a138476490e1e1bbf2797c711d Mon Sep 17 00:00:00 2001
From: Eric Zhu
Date: Thu, 28 Mar 2024 09:56:04 -0700
Subject: [PATCH 4/5] Update

---
 ...=> agentchat_webscraping_with_apify.ipynb} | 56 ++++++++++++++++++-
 website/docs/Examples.md                      |  1 +
 2 files changed, 54 insertions(+), 3 deletions(-)
 rename notebook/{agentchat_webscrapping_with_apify.ipynb => agentchat_webscraping_with_apify.ipynb} (89%)

diff --git a/notebook/agentchat_webscrapping_with_apify.ipynb b/notebook/agentchat_webscraping_with_apify.ipynb
similarity index 89%
rename from notebook/agentchat_webscrapping_with_apify.ipynb
rename to notebook/agentchat_webscraping_with_apify.ipynb
index b48df8e765c8..a8d0c68867e7 100644
--- a/notebook/agentchat_webscrapping_with_apify.ipynb
+++ b/notebook/agentchat_webscraping_with_apify.ipynb
@@ -4,7 +4,17 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Web Scrapping using Apify Tools"
+    "# Web Scraping using Apify Tools\n",
+    "\n",
+    "This notebook shows how to use Apify tools with AutoGen agents to\n",
+    "scrape data from a website and format the output."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First we need to install the Apify SDK and the AutoGen library."
    ]
   },
   {
@@ -16,6 +26,13 @@
    "! pip install -qqq pyautogen apify-client"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Setting up the LLM configuration and the Apify API key is also required."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 11,
@@ -31,6 +48,14 @@
    "apify_api_key = os.getenv(\"APIFY_API_KEY\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's define the tool for scraping data from the website using an Apify Actor.\n",
+    "Read more about tool use in this [tutorial chapter](/docs/tutorial/tool-use)."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 12,
@@ -101,6 +126,13 @@
    "    return text_data"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create the agents and register the tool."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 34,
@@ -137,6 +169,17 @@
    ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Start the conversation for scraping web data. We use the\n",
+    "`reflection_with_llm` summary method\n",
+    "to format the output into the desired structure.\n",
+    "The summary method is called after the conversation is completed,\n",
+    "and it is given the complete history of the conversation."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 36,
@@ -270,6 +313,13 @@
    ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output is stored in the summary."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 37,
@@ -308,13 +358,13 @@
  ],
  "metadata": {
   "front_matter": {
-   "title": "Web Scraper Agent using Apify Tools",
    "description": "Scraping web pages and summarizing the content using agents with tools.",
    "tags": [
     "web-scrapping",
     "apify",
-    "tool-use"
-   ]
+    "tool-use"
+   ],
+   "title": "Web Scraper Agent using Apify Tools"
   },
   "kernelspec": {
    "display_name": "autogen",
diff --git a/website/docs/Examples.md b/website/docs/Examples.md
index 155921719e8e..3da637d85eed 100644
--- a/website/docs/Examples.md
+++ b/website/docs/Examples.md
@@ -54,6 +54,7 @@ Links to notebook examples:
 - Constrained Responses via Guidance - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_guidance.ipynb)
 - Browse the Web with Agents - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_surfer.ipynb)
 - **SQL**: Natural Language Text to SQL Query using the [Spider](https://yale-lily.github.io/spider) Text-to-SQL Benchmark - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_sql_spider.ipynb)
+- **Web Scraping**: Web Scraping with Apify - [View Notebook](/docs/notebooks/agentchat_webscraping_with_apify)

 ### Human Involvement

From 1c3d83bab09610d9c3de0817300f276be08922be Mon Sep 17 00:00:00 2001
From: Eric Zhu
Date: Mon, 1 Apr 2024 09:08:33 -0700
Subject: [PATCH 5/5] update

---
 notebook/agentchat_webscraping_with_apify.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/notebook/agentchat_webscraping_with_apify.ipynb b/notebook/agentchat_webscraping_with_apify.ipynb
index a8d0c68867e7..ad80f0d960ce 100644
--- a/notebook/agentchat_webscraping_with_apify.ipynb
+++ b/notebook/agentchat_webscraping_with_apify.ipynb
@@ -360,9 +360,9 @@
   "front_matter": {
    "description": "Scraping web pages and summarizing the content using agents with tools.",
    "tags": [
-    "web-scrapping",
+    "web scraping",
     "apify",
-    "tool-use"
+    "tool use"
    ],
    "title": "Web Scraper Agent using Apify Tools"
   },