Skip to content

Commit

Permalink
Feat/headless browser (retargeted) (#1832)
Browse files Browse the repository at this point in the history
* Add headless browser to the WebSurferAgent, closes #1481

* replace soup.get_text() with markdownify.MarkdownConverter().convert_soup(soup)

* import HeadlessChromeBrowser

* implicitly wait for 10s

* increase max. wait time to 99s

* fix: trim trailing whitespace

* test: fix headless tests

* better bing query search

* docs: add example 3 for headless option

---------

Co-authored-by: Vijay Ramesh <[email protected]>
  • Loading branch information
INF800 and vijaykramesh authored Mar 2, 2024
1 parent 0a49f2a commit 96683ee
Show file tree
Hide file tree
Showing 12 changed files with 1,039 additions and 202 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ jobs:
run: |
python -m pip install --upgrade pip wheel
pip install -e .
pip install -e .[test,websurfer]
python -c "import autogen"
pip install pytest mock
- name: Install optional dependencies for code executors
Expand Down
20 changes: 13 additions & 7 deletions autogen/agentchat/contrib/web_surfer.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import json
import copy
import logging
import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Union, Callable, Literal, Tuple
from typing_extensions import Annotated
from ... import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper
from ...browser_utils import SimpleTextBrowser
from ...browser_utils import SimpleTextBrowser, HeadlessChromeBrowser
from ...code_utils import content_str
from datetime import datetime
from ...token_count_utils import count_token, get_max_token_limit
Expand All @@ -16,7 +15,10 @@


class WebSurferAgent(ConversableAgent):
"""(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages."""
"""(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages.
Defaults to a simple text-based browser.
Can be configured to use a headless Chrome browser by providing a browser_config dictionary with the key "headless" set to True.
"""

DEFAULT_PROMPT = (
"You are a helpful AI assistant with access to a web browser (via the provided functions). In fact, YOU ARE THE ONLY MEMBER OF YOUR PARTY WITH ACCESS TO A WEB BROWSER, so please help out where you can by performing web searches, navigating pages, and reporting what you find. Today's date is "
Expand Down Expand Up @@ -56,8 +58,12 @@ def __init__(
self._create_summarizer_client(summarizer_llm_config, llm_config)

# Create the browser
self.browser = SimpleTextBrowser(**(browser_config if browser_config else {}))

headless = browser_config.pop("headless", False)
self.browser = (
SimpleTextBrowser(**(browser_config if browser_config else {}))
if not headless
else HeadlessChromeBrowser(**browser_config)
)
inner_llm_config = copy.deepcopy(llm_config)

# Set up the inner monologue
Expand Down Expand Up @@ -124,7 +130,7 @@ def _browser_state() -> Tuple[str, str]:
current_page = self.browser.viewport_current_page
total_pages = len(self.browser.viewport_pages)

header += f"Viewport position: Showing page {current_page+1} of {total_pages}.\n"
header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
return (header, self.browser.viewport)

@self._user_proxy.register_for_execution()
Expand All @@ -145,7 +151,7 @@ def _informational_search(query: Annotated[str, "The informational web search qu
def _navigational_search(query: Annotated[str, "The navigational web search query to perform."]) -> str:
self.browser.visit_page(f"bing: {query}")

# Extract the first linl
# Extract the first link
m = re.search(r"\[.*?\]\((http.*?)\)", self.browser.page_content)
if m:
self.browser.visit_page(m.group(1))
Expand Down
7 changes: 7 additions & 0 deletions autogen/browser_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .simple_text_browser import SimpleTextBrowser
from .headless_chrome_browser import HeadlessChromeBrowser

__all__ = (
"SimpleTextBrowser",
"HeadlessChromeBrowser",
)
48 changes: 48 additions & 0 deletions autogen/browser_utils/abstract_browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from abc import ABC, abstractmethod
from typing import Optional, Union, Dict


class AbstractBrowser(ABC):
    """Interface that every browser backend has to implement.

    All members below are abstract, so the class cannot be instantiated
    directly; a concrete browser must override each of them.
    """

    @abstractmethod
    def __init__(
        self,
        start_page: Optional[str] = "about:blank",
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[Union[str, None]] = None,
        bing_api_key: Optional[Union[str, None]] = None,
        request_kwargs: Optional[Union[Dict, None]] = None,
    ):
        """Declare the constructor arguments shared by browser implementations."""

    @property
    @abstractmethod
    def address(self) -> str:
        """Address of the page the browser is currently on."""

    @abstractmethod
    def set_address(self, uri_or_path):
        """Point the browser at ``uri_or_path``."""

    @property
    @abstractmethod
    def viewport(self) -> str:
        """Content of the current viewport."""

    @property
    @abstractmethod
    def page_content(self) -> str:
        """Full content of the current page."""

    @abstractmethod
    def page_down(self):
        """Move the viewport one page down."""

    @abstractmethod
    def page_up(self):
        """Move the viewport one page up."""

    @abstractmethod
    def visit_page(self, path_or_uri):
        """Visit ``path_or_uri``."""
150 changes: 150 additions & 0 deletions autogen/browser_utils/headless_chrome_browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import re

from bs4 import BeautifulSoup
import markdownify
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from typing import Optional, Union, Dict

from autogen.browser_utils.abstract_browser import AbstractBrowser

# Optional PDF support
# IS_PDF_CAPABLE starts out False and is flipped to True only when both
# pdfminer imports below succeed; a missing package is silently tolerated.
IS_PDF_CAPABLE = False
try:
    import pdfminer
    import pdfminer.high_level

    IS_PDF_CAPABLE = True
except ModuleNotFoundError:
    pass

# Other optional dependencies
# NOTE(review): pathvalidate is imported on a best-effort basis; nothing in the
# visible part of this module references it — confirm whether it is still needed.
try:
    import pathvalidate
except ModuleNotFoundError:
    pass


class HeadlessChromeBrowser(AbstractBrowser):
    """(In preview) A Selenium powered headless Chrome browser. Suitable for Agentic use.

    The browser renders a page in headless Chrome, converts the body HTML to
    Markdown, and exposes that text in fixed-size slices ("viewport pages")
    that can be stepped through with ``page_down`` / ``page_up``.

    Call ``close()`` when the browser is no longer needed so the underlying
    Chrome process is shut down.
    """

    def __init__(
        self,
        start_page: Optional[str] = "about:blank",
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[Union[str, None]] = None,
        bing_api_key: Optional[Union[str, None]] = None,
        request_kwargs: Optional[Union[Dict, None]] = None,
    ):
        """Create the browser and load ``start_page``.

        Args:
            start_page: Address loaded right after the driver starts. A falsy
                value falls back to ``"about:blank"``.
            viewport_size: Maximum number of characters per viewport page.
                ``None`` falls back to the default of ``1024 * 8``.
            downloads_folder: Stored on the instance; not otherwise used by
                this class.
            bing_api_key: Stored on the instance; not otherwise used by this
                class (searches go through the Bing web page).
            request_kwargs: Stored on the instance; not otherwise used by this
                class.

        Raises:
            ValueError: If ``viewport_size`` is not positive.
        """
        if viewport_size is None:
            viewport_size = 1024 * 8
        if viewport_size <= 0:
            # A non-positive size would make the page-splitting loop never advance.
            raise ValueError("viewport_size must be a positive integer")

        self.start_page = start_page if start_page else "about:blank"
        self.driver = None
        self.viewport_size = viewport_size  # Applies only to the standard uri types
        self.downloads_folder = downloads_folder
        self.history = list()
        self.page_title = None
        self.viewport_current_page = 0
        self.viewport_pages = list()
        self.bing_api_key = bing_api_key
        self.request_kwargs = request_kwargs
        self._page_content = ""

        self._start_browser()

    def _start_browser(self):
        """Launch headless Chrome and navigate to the start page."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)
        try:
            # Element lookups may wait up to 99 seconds before giving up.
            self.driver.implicitly_wait(99)
            self.driver.get(self.start_page)
        except Exception:
            # Do not leave an orphaned Chrome process behind if start-up fails.
            self.close()
            raise

    def close(self):
        """Quit the underlying Chrome driver, if one is running."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    @property
    def address(self) -> str:
        """Return the URL the driver is currently on."""
        return self.driver.current_url

    def set_address(self, uri_or_path):
        """Navigate the driver to ``uri_or_path``.

        An address starting with ``bing:`` is treated as a search query and is
        submitted through the Bing home page; anything else is loaded directly.
        This only navigates; the cached page text is refreshed by ``visit_page``.
        """
        if uri_or_path.startswith("bing:"):
            self._bing_search(uri_or_path[len("bing:") :].strip())
        else:
            self.driver.get(uri_or_path)

    @property
    def viewport(self) -> str:
        """Return the content of the current viewport."""
        if not self.viewport_pages:
            return ""
        bounds = self.viewport_pages[self.viewport_current_page]
        return self._page_content[bounds[0] : bounds[1]]

    @property
    def page_content(self) -> str:
        """Return the full contents of the current page."""
        return self._page_content

    def _set_page_content(self, content) -> None:
        """Sets the text content of the current page and recomputes the viewport pages."""
        self._page_content = content
        self._split_pages()
        # Keep the current page index inside the new page list (never below 0).
        last_page = len(self.viewport_pages) - 1
        if self.viewport_current_page > last_page:
            self.viewport_current_page = max(0, last_page)

    def _split_pages(self):
        """Recompute ``viewport_pages`` as (start, end) offsets into the page text."""
        # Split only regular pages; everything else is shown as a single page.
        # The page list must still be reset here, otherwise bounds computed for
        # the previously visited page would be applied to the new content.
        if not self.address.startswith(("http:", "https:")):
            self.viewport_pages = [(0, len(self._page_content))]
            return

        # Handle empty pages
        if len(self._page_content) == 0:
            self.viewport_pages = [(0, 0)]
            return

        # Break the viewport into pages
        self.viewport_pages = []
        start_idx = 0
        while start_idx < len(self._page_content):
            end_idx = min(start_idx + self.viewport_size, len(self._page_content))
            self.viewport_pages.append((start_idx, end_idx))
            start_idx = end_idx

    def _process_html(self, html: str, is_search: bool) -> str:
        """Process the raw HTML content and return the processed text.

        Args:
            html: HTML markup to convert.
            is_search: When True, only the ``<main>`` element is converted; if
                the page has no ``<main>`` element the whole document is used.

        Returns:
            The Markdown text with runs of blank lines collapsed and
            surrounding whitespace stripped.
        """
        soup = BeautifulSoup(html, "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Pick the subtree to convert; ``find`` returns None when no <main> exists.
        root = soup
        if is_search:
            main = soup.find("main")
            if main is not None:
                root = main

        # Convert to text
        converter = markdownify.MarkdownConverter()
        text = converter.convert_soup(root)

        # Remove excessive blank lines
        text = re.sub(r"\n{2,}", "\n\n", text).strip()

        return text

    def _bing_search(self, query):
        """Open the Bing home page and submit ``query`` through its search box."""
        self.driver.get("https://www.bing.com")

        search_bar = self.driver.find_element(By.NAME, "q")
        search_bar.clear()
        search_bar.send_keys(query)
        search_bar.submit()

    def page_down(self):
        """Move the viewport one page down."""
        if self.viewport_current_page < len(self.viewport_pages) - 1:
            self.viewport_current_page += 1

    def page_up(self):
        """Move the viewport one page up."""
        if self.viewport_current_page > 0:
            self.viewport_current_page -= 1

    def visit_page(self, path_or_uri):
        """Update the address, visit the page, and return the content of the viewport."""
        is_search = path_or_uri.startswith("bing:")
        self.set_address(path_or_uri)
        # A freshly visited page is always shown from its first viewport page,
        # regardless of how far the previous page had been scrolled.
        self.viewport_current_page = 0
        html = self.driver.execute_script("return document.body.innerHTML;")
        self._set_page_content(self._process_html(html or "", is_search))
        return self.viewport
Loading

0 comments on commit 96683ee

Please sign in to comment.