-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feat/headless browser (retargeted) (#1832)
* Add headless browser to the WebSurferAgent, closes #1481 * replace soup.get_text() with markdownify.MarkdownConverter().convert_soup(soup) * import HeadlessChromeBrowser * implicitly wait for 10s * increase max. wait time to 99s * fix: trim trailing whitespace * test: fix headless tests * better bing query search * docs: add example 3 for headless option --------- Co-authored-by: Vijay Ramesh <[email protected]>
- Loading branch information
1 parent
0a49f2a
commit 96683ee
Showing
12 changed files
with
1,039 additions
and
202 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Re-export the concrete browser implementations at package level.
from .simple_text_browser import SimpleTextBrowser
from .headless_chrome_browser import HeadlessChromeBrowser

# Names exposed by ``from <package> import *``.
__all__ = (
    "SimpleTextBrowser",
    "HeadlessChromeBrowser",
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import Optional, Union, Dict | ||
|
||
|
||
class AbstractBrowser(ABC):
    """An abstract class for a web browser.

    Concrete browsers must implement every member below. ``address``,
    ``viewport`` and ``page_content`` are read-only properties; the remaining
    members are regular methods.
    """

    @abstractmethod
    def __init__(
        self,
        start_page: Optional[str] = "about:blank",
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[str] = None,
        bing_api_key: Optional[str] = None,
        request_kwargs: Optional[Dict] = None,
    ):
        """Create the browser.

        Args:
            start_page: Address to open first.
            viewport_size: Size of one viewport page.
                NOTE(review): presumably measured in characters of page
                text - confirm against the concrete implementations.
            downloads_folder: Folder for downloaded files, if any.
            bing_api_key: Bing API key, if any.
            request_kwargs: Extra keyword arguments for outgoing requests.
                NOTE(review): how (or whether) these three options are
                used is decided by each implementation.
        """
        pass

    @property
    @abstractmethod
    def address(self) -> str:
        """Return the address of the current page."""
        pass

    @abstractmethod
    def set_address(self, uri_or_path: str) -> None:
        """Point the browser at ``uri_or_path``."""
        pass

    @property
    @abstractmethod
    def viewport(self) -> str:
        """Return the content of the current viewport."""
        pass

    @property
    @abstractmethod
    def page_content(self) -> str:
        """Return the full contents of the current page."""
        pass

    @abstractmethod
    def page_down(self) -> None:
        """Move the viewport one page down."""
        pass

    @abstractmethod
    def page_up(self) -> None:
        """Move the viewport one page up."""
        pass

    @abstractmethod
    def visit_page(self, path_or_uri: str) -> str:
        """Visit ``path_or_uri`` and return the content of the viewport."""
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
import re | ||
|
||
from bs4 import BeautifulSoup | ||
import markdownify | ||
from selenium import webdriver | ||
from selenium.webdriver.chrome.options import Options | ||
from selenium.webdriver.common.by import By | ||
from typing import Optional, Union, Dict | ||
|
||
from autogen.browser_utils.abstract_browser import AbstractBrowser | ||
|
||
# Optional PDF support: IS_PDF_CAPABLE stays False unless pdfminer imports cleanly.
IS_PDF_CAPABLE = False
try:
    import pdfminer
    import pdfminer.high_level

    IS_PDF_CAPABLE = True
except ModuleNotFoundError:
    pass

# Other optional dependencies.
# NOTE: when pathvalidate is missing, the name ``pathvalidate`` is simply left
# undefined in this module; there is no companion capability flag.
try:
    import pathvalidate
except ModuleNotFoundError:
    pass
|
||
|
||
class HeadlessChromeBrowser(AbstractBrowser):
    """(In preview) A Selenium powered headless Chrome browser. Suitable for Agentic use.

    Pages are loaded in headless Chrome, converted to Markdown text and then
    cut into fixed-size "viewport" pages that can be stepped through with
    ``page_down`` / ``page_up``.

    The instance owns a Chrome driver. Call ``close()`` (or use the instance
    as a context manager) to shut the driver down when finished.
    """

    # Value handed to ``driver.implicitly_wait``: the number of seconds the
    # driver keeps retrying an element lookup before giving up.
    IMPLICIT_WAIT_SECONDS = 99

    def __init__(
        self,
        start_page: Optional[str] = "about:blank",
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[str] = None,
        bing_api_key: Optional[str] = None,
        request_kwargs: Optional[Dict] = None,
    ):
        """Store the configuration and start the headless browser.

        Args:
            start_page: Address loaded as soon as the driver is up.
            viewport_size: Maximum number of characters of page text in one
                viewport page. Applies only to http(s) pages.
            downloads_folder: Stored on the instance; not read by any method
                of this class.
            bing_api_key: Stored on the instance; not read by any method of
                this class (searches go through the Bing web page).
            request_kwargs: Stored on the instance; not read by any method of
                this class.
        """
        self.start_page = start_page
        self.driver = None
        self.viewport_size = viewport_size  # Applies only to the standard uri types
        self.downloads_folder = downloads_folder
        self.history = []
        self.page_title = None
        self.viewport_current_page = 0
        self.viewport_pages = []  # list of (start, end) offsets into _page_content
        self.bing_api_key = bing_api_key
        self.request_kwargs = request_kwargs
        self._page_content = ""

        self._start_browser()

    def __enter__(self):
        """Return the browser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Shut the driver down when the ``with`` block ends."""
        self.close()

    def close(self) -> None:
        """Quit the Chrome driver, if one is running. Safe to call repeatedly."""
        driver, self.driver = self.driver, None
        if driver is not None:
            driver.quit()

    def _start_browser(self) -> None:
        """Launch headless Chrome and open ``start_page``."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
        self.driver.get(self.start_page)

    @property
    def address(self) -> str:
        """Return the URL the driver is currently on."""
        return self.driver.current_url

    def set_address(self, uri_or_path: str) -> None:
        """Navigate to ``uri_or_path``.

        An address starting with ``bing:`` is treated as a search query: the
        text after the prefix is submitted to Bing. Anything else is handed
        to the driver as-is.
        """
        if uri_or_path.startswith("bing:"):
            self._bing_search(uri_or_path[len("bing:") :].strip())
        else:
            self.driver.get(uri_or_path)
        # A freshly loaded page is always read from its first viewport page;
        # keeping the previous index would open the new page part-way through.
        self.viewport_current_page = 0

    @property
    def viewport(self) -> str:
        """Return the content of the current viewport."""
        if not self.viewport_pages:
            return ""
        start, end = self.viewport_pages[self.viewport_current_page]
        return self._page_content[start:end]

    @property
    def page_content(self) -> str:
        """Return the full contents of the current page."""
        return self._page_content

    def _set_page_content(self, content: str) -> None:
        """Set the text content of the current page and rebuild the viewport pages."""
        self._page_content = content
        self._split_pages()
        # Keep the index inside [0, last page]; it must never go negative,
        # because a negative index would silently select pages from the end.
        last_page = len(self.viewport_pages) - 1
        self.viewport_current_page = max(0, min(self.viewport_current_page, last_page))

    def _split_pages(self) -> None:
        """Recompute ``viewport_pages`` for the current content.

        Only http(s) pages are cut into ``viewport_size`` chunks. Any other
        address, an empty page, or a missing/non-positive ``viewport_size``
        yields a single page spanning the whole content, so the page bounds
        always describe the content that is currently loaded.
        """
        content_length = len(self._page_content)
        page_size = self.viewport_size
        is_regular_page = self.address.startswith(("http:", "https:"))

        if not is_regular_page or content_length == 0 or not page_size or page_size < 1:
            self.viewport_pages = [(0, content_length)]
            return

        # Break the content into consecutive, non-overlapping pages.
        self.viewport_pages = [
            (start, min(start + page_size, content_length))
            for start in range(0, content_length, page_size)
        ]

    def _process_html(self, html: str, is_search: bool) -> str:
        """Process the raw HTML content and return the processed text.

        Args:
            html: HTML markup of the page body; ``None`` is treated as empty.
            is_search: When true, only the page's ``main`` element is
                converted, if the page has one.

        Returns:
            Markdown text with runs of blank lines collapsed and surrounding
            whitespace stripped.
        """
        soup = BeautifulSoup(html or "", "html.parser")

        # Remove javascript and style blocks
        for tag in soup(["script", "style"]):
            tag.extract()

        # For search pages convert only the main element; fall back to the
        # whole document when it is absent instead of passing None along.
        root = soup
        if is_search:
            main_element = soup.find("main")
            if main_element is not None:
                root = main_element

        # Convert to text
        text = markdownify.MarkdownConverter().convert_soup(root)

        # Remove excessive blank lines
        return re.sub(r"\n{2,}", "\n\n", text).strip()

    def _bing_search(self, query: str) -> None:
        """Open bing.com and submit ``query`` through its search box."""
        self.driver.get("https://www.bing.com")

        search_bar = self.driver.find_element(By.NAME, "q")
        search_bar.clear()
        search_bar.send_keys(query)
        search_bar.submit()

    def page_down(self) -> None:
        """Move the viewport one page down."""
        if self.viewport_current_page < len(self.viewport_pages) - 1:
            self.viewport_current_page += 1

    def page_up(self) -> None:
        """Move the viewport one page up."""
        if self.viewport_current_page > 0:
            self.viewport_current_page -= 1

    def visit_page(self, path_or_uri: str) -> str:
        """Update the address, visit the page, and return the content of the viewport."""
        is_search = path_or_uri.startswith("bing:")
        self.set_address(path_or_uri)
        html = self.driver.execute_script("return document.body.innerHTML;")
        self._set_page_content(self._process_html(html, is_search))
        return self.viewport
Oops, something went wrong.