From b8e400d2b2b4924a3e6e2c716e0c9bd9f51d55a0 Mon Sep 17 00:00:00 2001 From: Vijay Ramesh Date: Sun, 4 Feb 2024 16:37:24 -0800 Subject: [PATCH] Add headless browser to the WebSurferAgent, closes #1481 --- autogen/agentchat/contrib/web_surfer.py | 28 ++-- autogen/browser_utils.py | 150 ++++++++++++++++++++-- setup.py | 2 +- test/agentchat/contrib/test_web_surfer.py | 68 +++++++--- 4 files changed, 208 insertions(+), 40 deletions(-) diff --git a/autogen/agentchat/contrib/web_surfer.py b/autogen/agentchat/contrib/web_surfer.py index 4877a4d0949d..d9178d0c6c11 100644 --- a/autogen/agentchat/contrib/web_surfer.py +++ b/autogen/agentchat/contrib/web_surfer.py @@ -1,21 +1,23 @@ -import json +import copy import copy import logging import re -from dataclasses import dataclass -from typing import Dict, List, Optional, Union, Callable, Literal, Tuple -from autogen import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper -from autogen.browser_utils import SimpleTextBrowser -from autogen.code_utils import content_str from datetime import datetime -from autogen.token_count_utils import count_token, get_max_token_limit +from typing import Dict, List, Optional, Union, Callable, Literal, Tuple + +from autogen import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, OpenAIWrapper +from autogen.browser_utils import SimpleTextBrowser, HeadlessChromeBrowser from autogen.oai.openai_utils import filter_config +from autogen.token_count_utils import count_token, get_max_token_limit logger = logging.getLogger(__name__) class WebSurferAgent(ConversableAgent): - """(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages.""" + """(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages. + Defaults to a simple text-based browser. + Can be configured to use a headless Chrome browser by providing a browser_config dictionary with the key "headless" set to True. + """ DEFAULT_PROMPT = ( "You are a helpful AI assistant with access to a web browser (via the provided functions). In fact, YOU ARE THE ONLY MEMBER OF YOUR PARTY WITH ACCESS TO A WEB BROWSER, so please help out where you can by performing web searches, navigating pages, and reporting what you find. Today's date is " @@ -84,7 +86,11 @@ def __init__( if browser_config is None: self.browser = SimpleTextBrowser() else: - self.browser = SimpleTextBrowser(**browser_config) + headless = browser_config.pop("headless") + if headless: + self.browser = HeadlessChromeBrowser(**browser_config) + else: + self.browser = SimpleTextBrowser(**browser_config) # Create a copy of the llm_config for the inner monologue agents to use, and set them up with function calling if llm_config is None: # Nothing to copy @@ -214,7 +220,7 @@ def _browser_state(): current_page = self.browser.viewport_current_page total_pages = len(self.browser.viewport_pages) - header += f"Viewport position: Showing page {current_page+1} of {total_pages}.\n" + header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n" return (header, self.browser.viewport) def _informational_search(query): @@ -225,7 +231,7 @@ def _informational_search(query): def _navigational_search(query): self.browser.visit_page(f"bing: {query}") - # Extract the first linl + # Extract the first link m = re.search(r"\[.*?\]\((http.*?)\)", self.browser.page_content) if m: self.browser.visit_page(m.group(1)) diff --git a/autogen/browser_utils.py b/autogen/browser_utils.py index 68e39e4ac8e6..dbfaf27c6133 100644 --- a/autogen/browser_utils.py +++ b/autogen/browser_utils.py @@ -1,15 +1,16 @@ -import json +import io +import mimetypes import os -import requests import re -import markdownify -import io import uuid -import mimetypes from urllib.parse import urljoin, urlparse + +import markdownify +import requests from bs4 import BeautifulSoup -from dataclasses import dataclass -from typing import Dict, List, Optional, Union, Callable, Literal, Tuple +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.by import By # Optional PDF support IS_PDF_CAPABLE = False @@ -27,8 +28,57 @@ except ModuleNotFoundError: pass +from abc import ABC, abstractmethod +from typing import Optional, Union, Dict + + +class AbstractBrowser(ABC): + """An abstract class for a web browser.""" + + @abstractmethod + def __init__( + self, + start_page: Optional[str] = "about:blank", + viewport_size: Optional[int] = 1024 * 8, + downloads_folder: Optional[Union[str, None]] = None, + bing_api_key: Optional[Union[str, None]] = None, + request_kwargs: Optional[Union[Dict, None]] = None, + ): + pass + + @property + @abstractmethod + def address(self) -> str: + pass + + @abstractmethod + def set_address(self, uri_or_path): + pass + + @property + @abstractmethod + def viewport(self) -> str: + pass + + @property + @abstractmethod + def page_content(self) -> str: + pass + + @abstractmethod + def page_down(self): + pass + + @abstractmethod + def page_up(self): + pass + + @abstractmethod + def visit_page(self, path_or_uri): + pass + -class SimpleTextBrowser: +class SimpleTextBrowser(AbstractBrowser): """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use.""" def __init__( @@ -281,3 +331,87 @@ def _fetch_page(self, url): except requests.exceptions.RequestException as e: self.page_title = "Error" self._set_page_content(str(e)) + + +class HeadlessChromeBrowser(AbstractBrowser): + """(In preview) A Selenium powered headless Chrome browser. Suitable for Agentic use.""" + + def __init__( + self, + start_page: Optional[str] = "about:blank", + viewport_size: Optional[int] = 1024 * 8, + downloads_folder: Optional[Union[str, None]] = None, + bing_api_key: Optional[Union[str, None]] = None, + request_kwargs: Optional[Union[Dict, None]] = None, + ): + self.start_page = start_page + self.driver = None + self.viewport_size = viewport_size # Applies only to the standard uri types + self.downloads_folder = downloads_folder + self.history = list() + self.page_title = None + self.viewport_current_page = 0 + self.viewport_pages = list() + self.bing_api_key = bing_api_key + self.request_kwargs = request_kwargs + + self._start_browser() + + def _start_browser(self): + chrome_options = Options() + chrome_options.add_argument("--headless") + self.driver = webdriver.Chrome(options=chrome_options) + self.driver.get(self.start_page) + + @property + def address(self) -> str: + return self.driver.current_url + + def set_address(self, uri_or_path): + if uri_or_path.startswith("bing:"): + self._bing_search(uri_or_path[len("bing:") :].strip()) + else: + self.driver.get(uri_or_path) + + @property + def viewport(self) -> str: + # returns the content of the current viewport + return self.page_content + + @property + def page_content(self) -> str: + html = self.driver.execute_script("return document.body.innerHTML;") + return self._process_html(html) + + def _process_html(self, html: str) -> str: + """Process the raw HTML content and return the processed text.""" + soup = BeautifulSoup(html, "html.parser") + + # Remove javascript and style blocks + for script in soup(["script", "style"]): + script.extract() + + # Convert to text + text = soup.get_text() + + # Remove excessive blank lines + text = re.sub(r"\n{2,}", "\n\n", text).strip() + + return text + + def _bing_search(self, query): + self.driver.get("https://www.bing.com") + + search_bar = self.driver.find_element(By.NAME, "q") + search_bar.clear() + search_bar.send_keys(query) + search_bar.submit() + + def page_down(self): + self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + + def page_up(self): + self.driver.execute_script("window.scrollTo(0, 0);") + + def visit_page(self, path_or_uri): + self.set_address(path_or_uri) diff --git a/setup.py b/setup.py index 3dbdfad05159..fef74a632dfb 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ "teachable": ["chromadb"], "lmm": ["replicate", "pillow"], "graphs": ["networkx~=3.2.1", "matplotlib~=3.8.1"], - "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate"], + "websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate", "selenium"], "redis": ["redis"], }, classifiers=[ diff --git a/test/agentchat/contrib/test_web_surfer.py b/test/agentchat/contrib/test_web_surfer.py index 307abefecbca..1c1b54c17a8b 100644 --- a/test/agentchat/contrib/test_web_surfer.py +++ b/test/agentchat/contrib/test_web_surfer.py @@ -110,32 +110,34 @@ def test_web_surfer_oai(): assert len(llm_config["config_list"]) > 0 assert len(summarizer_llm_config["config_list"]) > 0 - page_size = 4096 - web_surfer = WebSurferAgent( - "web_surfer", - llm_config=llm_config, - summarizer_llm_config=summarizer_llm_config, - browser_config={"viewport_size": page_size}, - ) + # run the test with both text and headless browsers + for useHeadlessBrowser in [False, True]: + page_size = 4096 + web_surfer = WebSurferAgent( + "web_surfer", + llm_config=llm_config, + summarizer_llm_config=summarizer_llm_config, + browser_config={"viewport_size": page_size, "headless": useHeadlessBrowser}, + ) - user_proxy = UserProxyAgent( - "user_proxy", - human_input_mode="NEVER", - code_execution_config=False, - default_auto_reply="", - is_termination_msg=lambda x: True, - ) + user_proxy = UserProxyAgent( + "user_proxy", + human_input_mode="NEVER", + code_execution_config=False, + default_auto_reply="", + is_termination_msg=lambda x: True, + ) - # Make some requests that should test function calling - user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") + # Make some requests that should test function calling + user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'") - user_proxy.initiate_chat(web_surfer, message="Please scroll down.") + user_proxy.initiate_chat(web_surfer, message="Please scroll down.") - user_proxy.initiate_chat(web_surfer, message="Please scroll up.") + user_proxy.initiate_chat(web_surfer, message="Please scroll up.") - user_proxy.initiate_chat(web_surfer, message="When was it founded?") + user_proxy.initiate_chat(web_surfer, message="When was it founded?") - user_proxy.initiate_chat(web_surfer, message="What's this page about?") + user_proxy.initiate_chat(web_surfer, message="What's this page about?") @pytest.mark.skipif( @@ -165,6 +167,32 @@ def test_web_surfer_bing(): assert "Address: https://en.wikipedia.org/wiki/" in response +@pytest.mark.skipif( + skip_bing, + reason="do not run if bing api key is not available", +) +def test_web_surfer_headless_bing(): + page_size = 4096 + web_surfer = WebSurferAgent( + "web_surfer", + llm_config=False, + browser_config={"viewport_size": page_size, "headless": True}, + ) + + # Sneak a peak at the function map, allowing us to call the functions for testing here + function_map = web_surfer._user_proxy._function_map + + # Test informational queries + response = function_map["informational_web_search"](BING_QUERY) + assert "Address: https://www.bing.com/search?q=Microsoft&form=QBLH" in response + assert "Microsoft – Cloud, Computers, Apps & Gaming" in response + + # Test informational queries + response = function_map["navigational_web_search"](BING_QUERY + " Wikipedia") + assert "Address: https://www.bing.com/search?q=Microsoft+Wikipedia&form=QBLH" in response + assert "Microsoft - Wikipedia" in response + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_web_surfer()