Feat/headless browser (retargeted) (#1832)

* Add headless browser to the WebSurferAgent, closes #1481 * replace soup.get_text() with markdownify.MarkdownConverter().convert_soup(soup) * import HeadlessChromeBrowser * implicitly wait for 10s * increase max. wait time to 99s * fix: trim trailing whitespace * test: fix headless tests * better bing query search * docs: add example 3 for headless option --------- Co-authored-by: Vijay Ramesh <[email protected]>
microsoft · Mar 2, 2024 · 96683ee · 96683ee
1 parent 0a49f2a
commit 96683ee
Show file tree

Hide file tree

Showing 12 changed files with 1,039 additions and 202 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -40,6 +40,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip wheel
           pip install -e .
+          pip install -e .[test,websurfer]
           python -c "import autogen"
           pip install pytest mock
       - name: Install optional dependencies for code executors

diff --git a/autogen/agentchat/contrib/web_surfer.py b/autogen/agentchat/contrib/web_surfer.py
@@ -1,12 +1,11 @@
-import json
 import copy
 import logging
 import re
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Union, Callable, Literal, Tuple
 from typing_extensions import Annotated
 from ... import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper
-from ...browser_utils import SimpleTextBrowser
+from ...browser_utils import SimpleTextBrowser, HeadlessChromeBrowser
 from ...code_utils import content_str
 from datetime import datetime
 from ...token_count_utils import count_token, get_max_token_limit
@@ -16,7 +15,10 @@
 
 
 class WebSurferAgent(ConversableAgent):
-    """(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages."""
+    """(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages.
+    Defaults to a simple text-based browser.
+    Can be configured to use a headless Chrome browser by providing a browser_config dictionary with the key "headless" set to True.
+    """
 
     DEFAULT_PROMPT = (
         "You are a helpful AI assistant with access to a web browser (via the provided functions). In fact, YOU ARE THE ONLY MEMBER OF YOUR PARTY WITH ACCESS TO A WEB BROWSER, so please help out where you can by performing web searches, navigating pages, and reporting what you find. Today's date is "
@@ -56,8 +58,12 @@ def __init__(
         self._create_summarizer_client(summarizer_llm_config, llm_config)
 
         # Create the browser
-        self.browser = SimpleTextBrowser(**(browser_config if browser_config else {}))
-
+        headless = browser_config.pop("headless", False)
+        self.browser = (
+            SimpleTextBrowser(**(browser_config if browser_config else {}))
+            if not headless
+            else HeadlessChromeBrowser(**browser_config)
+        )
         inner_llm_config = copy.deepcopy(llm_config)
 
         # Set up the inner monologue
@@ -124,7 +130,7 @@ def _browser_state() -> Tuple[str, str]:
             current_page = self.browser.viewport_current_page
             total_pages = len(self.browser.viewport_pages)
 
-            header += f"Viewport position: Showing page {current_page+1} of {total_pages}.\n"
+            header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
             return (header, self.browser.viewport)
 
         @self._user_proxy.register_for_execution()
@@ -145,7 +151,7 @@ def _informational_search(query: Annotated[str, "The informational web search qu
         def _navigational_search(query: Annotated[str, "The navigational web search query to perform."]) -> str:
             self.browser.visit_page(f"bing: {query}")
 
-            # Extract the first linl
+            # Extract the first link
             m = re.search(r"\[.*?\]\((http.*?)\)", self.browser.page_content)
             if m:
                 self.browser.visit_page(m.group(1))

diff --git a/autogen/browser_utils/__init__.py b/autogen/browser_utils/__init__.py
@@ -0,0 +1,7 @@
+from .simple_text_browser import SimpleTextBrowser
+from .headless_chrome_browser import HeadlessChromeBrowser
+
+__all__ = (
+    "SimpleTextBrowser",
+    "HeadlessChromeBrowser",
+)
diff --git a/autogen/browser_utils/abstract_browser.py b/autogen/browser_utils/abstract_browser.py
@@ -0,0 +1,48 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Union, Dict
+
+
+class AbstractBrowser(ABC):
+    """An abstract class for a web browser."""
+
+    @abstractmethod
+    def __init__(
+        self,
+        start_page: Optional[str] = "about:blank",
+        viewport_size: Optional[int] = 1024 * 8,
+        downloads_folder: Optional[Union[str, None]] = None,
+        bing_api_key: Optional[Union[str, None]] = None,
+        request_kwargs: Optional[Union[Dict, None]] = None,
+    ):
+        pass
+
+    @property
+    @abstractmethod
+    def address(self) -> str:
+        pass
+
+    @abstractmethod
+    def set_address(self, uri_or_path):
+        pass
+
+    @property
+    @abstractmethod
+    def viewport(self) -> str:
+        pass
+
+    @property
+    @abstractmethod
+    def page_content(self) -> str:
+        pass
+
+    @abstractmethod
+    def page_down(self):
+        pass
+
+    @abstractmethod
+    def page_up(self):
+        pass
+
+    @abstractmethod
+    def visit_page(self, path_or_uri):
+        pass
diff --git a/autogen/browser_utils/headless_chrome_browser.py b/autogen/browser_utils/headless_chrome_browser.py
@@ -0,0 +1,150 @@
+import re
+
+from bs4 import BeautifulSoup
+import markdownify
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from typing import Optional, Union, Dict
+
+from autogen.browser_utils.abstract_browser import AbstractBrowser
+
+# Optional PDF support
+IS_PDF_CAPABLE = False
+try:
+    import pdfminer
+    import pdfminer.high_level
+
+    IS_PDF_CAPABLE = True
+except ModuleNotFoundError:
+    pass
+
+# Other optional dependencies
+try:
+    import pathvalidate
+except ModuleNotFoundError:
+    pass
+
+
+class HeadlessChromeBrowser(AbstractBrowser):
+    """(In preview) A Selenium powered headless Chrome browser. Suitable for Agentic use."""
+
+    def __init__(
+        self,
+        start_page: Optional[str] = "about:blank",
+        viewport_size: Optional[int] = 1024 * 8,
+        downloads_folder: Optional[Union[str, None]] = None,
+        bing_api_key: Optional[Union[str, None]] = None,
+        request_kwargs: Optional[Union[Dict, None]] = None,
+    ):
+        self.start_page = start_page
+        self.driver = None
+        self.viewport_size = viewport_size  # Applies only to the standard uri types
+        self.downloads_folder = downloads_folder
+        self.history = list()
+        self.page_title = None
+        self.viewport_current_page = 0
+        self.viewport_pages = list()
+        self.bing_api_key = bing_api_key
+        self.request_kwargs = request_kwargs
+        self._page_content = ""
+
+        self._start_browser()
+
+    def _start_browser(self):
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        self.driver = webdriver.Chrome(options=chrome_options)
+        self.driver.implicitly_wait(99)
+        self.driver.get(self.start_page)
+
+    @property
+    def address(self) -> str:
+        return self.driver.current_url
+
+    def set_address(self, uri_or_path):
+        if uri_or_path.startswith("bing:"):
+            self._bing_search(uri_or_path[len("bing:") :].strip())
+        else:
+            self.driver.get(uri_or_path)
+
+    @property
+    def viewport(self) -> str:
+        """Return the content of the current viewport."""
+        if not self.viewport_pages:
+            return ""
+        bounds = self.viewport_pages[self.viewport_current_page]
+        return self._page_content[bounds[0] : bounds[1]]
+
+    @property
+    def page_content(self) -> str:
+        """Return the full contents of the current page."""
+        return self._page_content
+
+    def _set_page_content(self, content) -> str:
+        """Sets the text content of the current page."""
+        self._page_content = content
+        self._split_pages()
+        if self.viewport_current_page >= len(self.viewport_pages):
+            self.viewport_current_page = len(self.viewport_pages) - 1
+
+    def _split_pages(self):
+        # Split only regular pages
+        if not self.address.startswith("http:") and not self.address.startswith("https:"):
+            return
+
+        # Handle empty pages
+        if len(self._page_content) == 0:
+            self.viewport_pages = [(0, 0)]
+            return
+
+        # Break the viewport into pages
+        self.viewport_pages = []
+        start_idx = 0
+        while start_idx < len(self._page_content):
+            end_idx = min(start_idx + self.viewport_size, len(self._page_content))
+            self.viewport_pages.append((start_idx, end_idx))
+            start_idx = end_idx
+
+    def _process_html(self, html: str, is_search: bool) -> str:
+        """Process the raw HTML content and return the processed text."""
+        soup = BeautifulSoup(html, "html.parser")
+
+        # Remove javascript and style blocks
+        for script in soup(["script", "style"]):
+            script.extract()
+
+        # Convert to text
+        converter = markdownify.MarkdownConverter()
+        text = converter.convert_soup(soup) if not is_search else converter.convert_soup(soup.find("main"))
+
+        # Remove excessive blank lines
+        text = re.sub(r"\n{2,}", "\n\n", text).strip()
+
+        return text
+
+    def _bing_search(self, query):
+        self.driver.get("https://www.bing.com")
+
+        search_bar = self.driver.find_element(By.NAME, "q")
+        search_bar.clear()
+        search_bar.send_keys(query)
+        search_bar.submit()
+
+    def page_down(self):
+        """Move the viewport one page down."""
+        if self.viewport_current_page < len(self.viewport_pages) - 1:
+            self.viewport_current_page += 1
+
+    def page_up(self):
+        """Move the viewport one page up."""
+        if self.viewport_current_page > 0:
+            self.viewport_current_page -= 1
+
+    def visit_page(self, path_or_uri):
+        """Update the address, visit the page, and return the content of the viewport."""
+        is_search = path_or_uri.startswith("bing:")
+        self.set_address(path_or_uri)
+        html = self.driver.execute_script("return document.body.innerHTML;")
+        self._set_page_content(self._process_html(html, is_search))
+        return self.viewport