Parse Any HTML-ish Style Tags (microsoft#2046)

WaelKarkoub · BeibinLi · sonichi · web-flow · commit 8a8a2b648cad · 2024-03-26T18:46:44.000Z
* tried implementing my own regex

* improves tests

* finally works

* removes prints

* fixed test

* adds start and end

* delete unused imports

* refactored to use new tool

* significantly improved algo

* tag content -> tag attr

* fix tests + adds new field

* return full match

* return remove start and end

* update docstrings

* update docstrings

* update docstrings

---------

Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
diff --git a/autogen/agentchat/contrib/img_utils.py b/autogen/agentchat/contrib/img_utils.py
@@ -1,14 +1,15 @@
 import base64
 import copy
-import mimetypes
 import os
 import re
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 import requests
 from PIL import Image
 
+from autogen.agentchat import utils
+
 
 def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
     """
@@ -179,13 +180,9 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
     last_index = 0
     image_count = 0
 
-    # Regular expression pattern for matching <img ...> tags
-    img_tag_pattern = re.compile(r"<img ([^>]+)>")
-
     # Find all image tags
-    for match in img_tag_pattern.finditer(prompt):
-        image_location = match.group(1)
-
+    for parsed_tag in utils.parse_tags_from_content("img", prompt):
+        image_location = parsed_tag["attr"]["src"]
         try:
             if img_format == "pil":
                 img_data = get_pil_image(image_location)
@@ -202,12 +199,12 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
             continue
 
         # Add text before this image tag to output list
-        output.append({"type": "text", "text": prompt[last_index : match.start()]})
+        output.append({"type": "text", "text": prompt[last_index : parsed_tag["match"].start()]})
 
         # Add image data to output list
         output.append({"type": "image_url", "image_url": {"url": img_data}})
 
-        last_index = match.end()
+        last_index = parsed_tag["match"].end()
         image_count += 1
 
     # Add remaining text to output list
diff --git a/autogen/agentchat/utils.py b/autogen/agentchat/utils.py
@@ -1,4 +1,6 @@
-from typing import Any, List, Dict, Tuple, Callable
+import re
+from typing import Any, Callable, Dict, List, Tuple, Union
+
 from .agent import Agent
 
 
@@ -76,3 +78,108 @@ def aggregate_summary(usage_summary: Dict[str, Any], agent_summary: Dict[str, An
             aggregate_summary(actual_usage_summary, agent.client.actual_usage_summary)
 
     return total_usage_summary, actual_usage_summary
+
+
+def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+    """Parses HTML style tags from message contents.
+
+    The parsing is done by looking for patterns in the text that match the format of HTML tags. The tag to be parsed is
+    specified as an argument to the function. The function looks for this tag in the text and extracts its content. The
+    content of a tag is everything that is inside the tag, between the opening and closing angle brackets. The content
+    can be a single string or a set of attribute-value pairs.
+
+    Examples:
+        <img http://example.com/image.png> -> [{"tag": "img", "attr": {"src": "http://example.com/image.png"}, "match": re.Match}]
+        <audio text="Hello I'm a robot" prompt="whisper"> ->
+                [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}, "match": re.Match}]
+
+    Args:
+        tag (str): The HTML style tag to be parsed.
+        content (Union[str, List[Dict[str, Any]]]): The message content to parse. Can be a string or a list of content
+            items. In a list, only items whose "type" is "text" are parsed (using their "text" value); all other
+            items are skipped.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries, where each dictionary represents a parsed tag. Each dictionary
+            contains three key-value pairs: 'tag' which is the tag name, 'attr' which is a dictionary of the parsed
+            attributes, and 'match' which is a regular expression match object. For list content, the positions
+            reported by 'match' are relative to the text of the item in which the tag was found.
+
+    Raises:
+        ValueError: If the content is not a string or a list.
+    """
+    results = []
+    if isinstance(content, str):
+        results.extend(_parse_tags_from_text(tag, content))
+    # Handles case for multimodal messages.
+    elif isinstance(content, list):
+        for item in content:
+            if item.get("type") == "text":
+                results.extend(_parse_tags_from_text(tag, item["text"]))
+    else:
+        raise ValueError(f"content must be str or list, but got {type(content)}")
+
+    return results
+
+
+def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, Any]]:
+    """Finds every ``<tag ...>`` occurrence in a string and parses its attributes.
+
+    A tag is only recognised when the tag name is followed by a space and some content; the content runs up to the
+    first closing angle bracket on the same line.
+
+    Args:
+        tag (str): The tag name to look for. It is matched literally.
+        text (str): The text to search.
+
+    Returns:
+        List[Dict[str, Any]]: One dictionary per occurrence, in order of appearance, with the keys 'tag' (the tag
+            name), 'attr' (the parsed attributes) and 'match' (the regular expression match object).
+    """
+    # Escape the tag so that regex metacharacters in the name cannot alter the pattern.
+    pattern = re.compile(f"<{re.escape(tag)} (.*?)>")
+
+    results = []
+    for match in pattern.finditer(text):
+        tag_attr = match.group(1).strip()
+        attr = _parse_attributes_from_tags(tag_attr)
+
+        results.append({"tag": tag, "attr": attr, "match": match})
+    return results
+
+
+def _parse_attributes_from_tags(tag_content: str) -> Dict[str, str]:
+    """Parses the text found inside a tag into a dictionary of attributes.
+
+    Quoted ``key='value'`` / ``key="value"`` pairs become entries of the result. Anything else (for example a bare
+    URL, even one containing ``=``) is treated as the source of the tag and collected, space separated, under "src".
+
+    Args:
+        tag_content (str): The text between the tag name and the closing angle bracket.
+
+    Returns:
+        Dict[str, str]: The parsed attributes.
+    """
+    # Split on spaces first, then glue back together the pieces of quoted values that contained spaces.
+    attrs = re.findall(r"([^ ]+)", tag_content)
+    reconstructed_attrs = _reconstruct_attributes(attrs)
+
+    def _append_src_value(content, value):
+        if "src" in content:
+            content["src"] += f" {value}"
+        else:
+            content["src"] = value
+
+    content = {}
+    for attr in reconstructed_attrs:
+        key, separator, value = attr.partition("=")
+        if not separator or not value.startswith(("'", '"')):
+            # Not a quoted key-value pair: keep the whole token as (part of) the source.
+            _append_src_value(content, attr)
+            continue
+
+        quote = value[0]
+        value = value[1:]
+        # Only strip a closing quote that is really there, so an unterminated value keeps its last character.
+        if value.endswith(quote):
+            value = value[:-1]
+        content[key] = value
+
+    return content
+
+
+def _reconstruct_attributes(attrs: List[str]) -> List[str]:
+    """Reconstructs attributes from a list of strings where some attributes may be split across multiple elements.
+
+    An element starts a new attribute when it looks like ``key='...`` or ``key="...``. Every other element is
+    appended, separated by a space, to the element before it. A leading element that does not look like an
+    attribute is kept as its own entry.
+
+    Args:
+        attrs (List[str]): The space separated pieces of a tag's content.
+
+    Returns:
+        List[str]: The pieces merged back into whole attributes.
+    """
+
+    def is_attr(attr: str) -> bool:
+        _, separator, value = attr.partition("=")
+        return bool(separator) and value.startswith(("'", '"'))
+
+    reconstructed = []
+    for attr in attrs:
+        if reconstructed and not is_attr(attr):
+            # Continuation of the previous element (e.g. a quoted value that contained spaces).
+            reconstructed[-1] += f" {attr}"
+        else:
+            reconstructed.append(attr)
+    return reconstructed
diff --git a/test/agentchat/test_agentchat_utils.py b/test/agentchat/test_agentchat_utils.py
@@ -0,0 +1,76 @@
+from typing import Dict, List, Union
+from autogen import agentchat
+import pytest
+
+# Table of tag parsing cases. Each case pairs an input "message" with the "expected" list of parsed tags;
+# the "match" key of every parsed tag is removed before the comparison, so it is not listed here.
+TAG_PARSING_TESTS = [
+    {
+        "message": "Hello agent, can you take a look at this image <img http://example.com/image.png>",
+        "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
+    },
+    {
+        "message": "Can you transcribe this audio? <audio http://example.com/au=dio.mp3>",
+        "expected": [{"tag": "audio", "attr": {"src": "http://example.com/au=dio.mp3"}}],
+    },
+    {
+        "message": "Can you describe what's in this image <img url='http://example.com/=image.png'>",
+        "expected": [{"tag": "img", "attr": {"url": "http://example.com/=image.png"}}],
+    },
+    {
+        "message": "Can you describe what's in this image <img http://example.com/image.png> and transcribe this audio? <audio http://example.com/audio.mp3>",
+        "expected": [
+            {"tag": "img", "attr": {"src": "http://example.com/image.png"}},
+            {"tag": "audio", "attr": {"src": "http://example.com/audio.mp3"}},
+        ],
+    },
+    {
+        "message": "Can you generate this audio? <audio text='Hello I'm a robot' prompt='whisper'>",
+        "expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
+    },
+    {
+        "message": "Can you describe what's in this image <img http://example.com/image.png width='100'> and this image <img http://hello.com/image=.png>?",
+        "expected": [
+            {"tag": "img", "attr": {"src": "http://example.com/image.png", "width": "100"}},
+            {"tag": "img", "attr": {"src": "http://hello.com/image=.png"}},
+        ],
+    },
+    {
+        "message": "Text with no tags",
+        "expected": [],
+    },
+]
+
+
+def _delete_unused_keys(d: Dict) -> None:
+    """Drops the "match" entry from ``d`` in place, if it has one."""
+    d.pop("match", None)
+
+
+@pytest.mark.parametrize("test_case", TAG_PARSING_TESTS)
+def test_tag_parsing(test_case: Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
+    """Check parse_tags_from_content against one case, for both string and list content."""
+    message = test_case["message"]
+    expected = test_case["expected"]
+    tags = ["img", "audio", "random"]
+
+    # The same message is parsed first as a plain string, then wrapped as a multimodal content list.
+    for content in (message, [{"type": "text", "text": message}]):
+        result = []
+        for tag in tags:
+            parsed_tags = agentchat.utils.parse_tags_from_content(tag, content)
+            # Match objects cannot be compared against the expected literals, so drop them first.
+            for item in parsed_tags:
+                _delete_unused_keys(item)
+
+            result.extend(parsed_tags)
+        assert result == expected
+
+
+# When the file is executed as a script, only the first case of TAG_PARSING_TESTS is run.
+if __name__ == "__main__":
+    test_tag_parsing(TAG_PARSING_TESTS[0])