Skip to content

Commit 66d96dd

Browse files
WaelKarkoubBeibinLisonichi
authored
Parse Any HTML-esh Style Tags (#2046)
* tried implementing my own regex * improves tests * finally works * removes prints * fixed test * adds start and end * delete unused imports * refactored to use new tool * significantly improved algo * tag content -> tag attr * fix tests + adds new field * return full match * return remove start and end * update docstrings * update docstrings * update docstrings --------- Co-authored-by: Beibin Li <[email protected]> Co-authored-by: Chi Wang <[email protected]>
1 parent 59a7790 commit 66d96dd

File tree

3 files changed

+191
-11
lines changed

3 files changed

+191
-11
lines changed

autogen/agentchat/contrib/img_utils.py

+7-10
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
import base64
22
import copy
3-
import mimetypes
43
import os
54
import re
65
from io import BytesIO
7-
from typing import Any, Dict, List, Optional, Tuple, Union
6+
from typing import Dict, List, Tuple, Union
87

98
import requests
109
from PIL import Image
1110

11+
from autogen.agentchat import utils
12+
1213

1314
def get_pil_image(image_file: Union[str, Image.Image]) -> Image.Image:
1415
"""
@@ -179,13 +180,9 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
179180
last_index = 0
180181
image_count = 0
181182

182-
# Regular expression pattern for matching <img ...> tags
183-
img_tag_pattern = re.compile(r"<img ([^>]+)>")
184-
185183
# Find all image tags
186-
for match in img_tag_pattern.finditer(prompt):
187-
image_location = match.group(1)
188-
184+
for parsed_tag in utils.parse_tags_from_content("img", prompt):
185+
image_location = parsed_tag["attr"]["src"]
189186
try:
190187
if img_format == "pil":
191188
img_data = get_pil_image(image_location)
@@ -202,12 +199,12 @@ def gpt4v_formatter(prompt: str, img_format: str = "uri") -> List[Union[str, dic
202199
continue
203200

204201
# Add text before this image tag to output list
205-
output.append({"type": "text", "text": prompt[last_index : match.start()]})
202+
output.append({"type": "text", "text": prompt[last_index : parsed_tag["match"].start()]})
206203

207204
# Add image data to output list
208205
output.append({"type": "image_url", "image_url": {"url": img_data}})
209206

210-
last_index = match.end()
207+
last_index = parsed_tag["match"].end()
211208
image_count += 1
212209

213210
# Add remaining text to output list

autogen/agentchat/utils.py

+108-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
from typing import Any, List, Dict, Tuple, Callable
1+
import re
2+
from typing import Any, Callable, Dict, List, Tuple, Union
3+
24
from .agent import Agent
35

46

@@ -76,3 +78,108 @@ def aggregate_summary(usage_summary: Dict[str, Any], agent_summary: Dict[str, An
7678
aggregate_summary(actual_usage_summary, agent.client.actual_usage_summary)
7779

7880
return total_usage_summary, actual_usage_summary
81+
82+
83+
def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Parse HTML-style tags from message content.

    Every occurrence of ``<tag ...>`` in the text is located and the part between the tag name and the closing
    angle bracket is parsed into attributes. Quoted ``key='value'`` / ``key="value"`` pairs become entries of the
    attribute dictionary; any other text inside the tag is collected under the ``"src"`` key.

    Examples:
        <img http://example.com/image.png> ->
            [{"tag": "img", "attr": {"src": "http://example.com/image.png"}, "match": re.Match}]
        <audio text="Hello I'm a robot" prompt="whisper"> ->
            [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}, "match": re.Match}]

    Args:
        tag (str): The HTML-style tag name to look for (e.g. ``"img"``).
        content (Union[str, List[Dict[str, Any]]]): The message content to parse. Either a plain string or a list of
            content items (multimodal message); only items whose ``"type"`` is ``"text"`` are parsed, all other
            items are ignored.

    Returns:
        List[Dict[str, Any]]: One dictionary per tag found, in order of appearance. Each dictionary has three keys:
            ``"tag"`` (the tag name), ``"attr"`` (a dictionary of the parsed attributes) and ``"match"`` (the regular
            expression match object for the whole tag).

    Raises:
        ValueError: If the content is neither a string nor a list.
    """
    if isinstance(content, str):
        return list(_parse_tags_from_text(tag, content))

    # Multimodal messages: a list of content items, of which only the text items can carry tags.
    if isinstance(content, list):
        results = []
        for item in content:
            if item.get("type") == "text":
                results.extend(_parse_tags_from_text(tag, item["text"]))
        return results

    raise ValueError(f"content must be str or list, but got {type(content)}")
121+
122+
123+
def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, Any]]:
    """Find every ``<tag ...>`` occurrence in ``text`` and parse its attributes.

    Args:
        tag (str): The tag name to search for. It is matched literally.
        text (str): The text to scan.

    Returns:
        List[Dict[str, Any]]: One dictionary per match with the keys ``"tag"`` (the tag name), ``"attr"`` (the
            parsed attributes) and ``"match"`` (the ``re.Match`` object of the whole tag).
    """
    # Escape the tag name so regex metacharacters in it (".", "(", "+", ...) are matched literally instead of
    # silently changing the pattern or raising re.error. The non-greedy group stops at the first ">".
    pattern = re.compile(f"<{re.escape(tag)} (.*?)>")

    return [
        {"tag": tag, "attr": _parse_attributes_from_tags(match.group(1).strip()), "match": match}
        for match in pattern.finditer(text)
    ]
133+
134+
135+
def _parse_attributes_from_tags(tag_content: str) -> Dict[str, str]:
    """Parse the inside of a tag (everything after the tag name) into an attribute dictionary.

    The content is split on spaces and the pieces are regrouped by ``_reconstruct_attributes`` so that quoted
    values containing spaces stay together. Each regrouped piece is then handled as follows:

    * ``key='value'`` or ``key="value"`` -> stored as ``{key: value}`` with the surrounding quotes removed.
    * anything else (no ``=``, or an unquoted value such as a URL containing ``=``) -> appended, space separated,
      to the ``"src"`` entry.

    Args:
        tag_content (str): The raw text found between the tag name and the closing ``>``.

    Returns:
        Dict[str, str]: The parsed attributes.
    """
    tokens = re.findall(r"([^ ]+)", tag_content)
    parsed: Dict[str, str] = {}

    def _append_src_value(value: str) -> None:
        # Unkeyed pieces accumulate under "src", joined by single spaces.
        parsed["src"] = f"{parsed['src']} {value}" if "src" in parsed else value

    for attr in _reconstruct_attributes(tokens):
        key, sep, value = attr.partition("=")
        if not sep or not value.startswith(("'", '"')):
            # Not a quoted key/value pair: keep the whole piece (including any "=") as part of the source.
            _append_src_value(attr)
            continue

        # Drop the opening quote; drop the final character only when it really is the matching closing quote,
        # so an unterminated value such as src='abc does not lose its last character.
        quote = value[0]
        if len(value) > 1 and value.endswith(quote):
            parsed[key] = value[1:-1]
        else:
            parsed[key] = value[1:]

    return parsed
159+
160+
161+
def _reconstruct_attributes(attrs: List[str]) -> List[str]:
    """Rejoin space-split tokens so that each returned element is one complete attribute.

    A token that looks like the start of a quoted attribute (``key='...`` or ``key="...``) opens a new element.
    Any other token is glued, with a single space, onto the previous element; if there is no previous element
    yet, it becomes the first one.
    """

    def opens_quoted_attr(token: str) -> bool:
        # True for tokens of the form key=<quote>..., i.e. an "=" whose right-hand side starts with a quote.
        _, sep, value = token.partition("=")
        return bool(sep) and value.startswith(("'", '"'))

    merged: List[str] = []
    for token in attrs:
        if merged and not opens_quoted_attr(token):
            merged[-1] = f"{merged[-1]} {token}"
        else:
            merged.append(token)
    return merged
+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from typing import Dict, List, Union
2+
from autogen import agentchat
3+
import pytest
4+
5+
# Each case pairs a raw message with the tags expected from it, listed in the order the test collects them
# (all "img" tags first, then "audio", then "random"). The "match" key is stripped before comparison.
TAG_PARSING_TESTS = [
    # Bare URL inside the tag is stored under "src".
    {
        "message": "Hello agent, can you take a look at this image <img http://example.com/image.png>",
        "expected": [{"tag": "img", "attr": {"src": "http://example.com/image.png"}}],
    },
    # An "=" inside an unquoted URL must not be treated as a key/value separator.
    {
        "message": "Can you transcribe this audio? <audio http://example.com/au=dio.mp3>",
        "expected": [{"tag": "audio", "attr": {"src": "http://example.com/au=dio.mp3"}}],
    },
    # Quoted key/value attribute whose value itself contains "=".
    {
        "message": "Can you describe what's in this image <img url='http://example.com/=image.png'>",
        "expected": [{"tag": "img", "attr": {"url": "http://example.com/=image.png"}}],
    },
    # Two different tags in one message.
    {
        "message": "Can you describe what's in this image <img http://example.com/image.png> and transcribe this audio? <audio http://example.com/audio.mp3>",
        "expected": [
            {"tag": "img", "attr": {"src": "http://example.com/image.png"}},
            {"tag": "audio", "attr": {"src": "http://example.com/audio.mp3"}},
        ],
    },
    # Quoted value containing spaces and an apostrophe, followed by a second attribute.
    {
        "message": "Can you generate this audio? <audio text='Hello I'm a robot' prompt='whisper'>",
        "expected": [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}}],
    },
    # Bare URL combined with a quoted attribute, plus a second tag of the same kind.
    {
        "message": "Can you describe what's in this image <img http://example.com/image.png width='100'> and this image <img http://hello.com/image=.png>?",
        "expected": [
            {"tag": "img", "attr": {"src": "http://example.com/image.png", "width": "100"}},
            {"tag": "img", "attr": {"src": "http://hello.com/image=.png"}},
        ],
    },
    # No tags at all.
    {
        "message": "Text with no tags",
        "expected": [],
    },
]
41+
42+
43+
def _delete_unused_keys(d: Dict) -> None:
    """Remove the ``"match"`` entry from ``d`` in place, if present.

    The match object is dropped so parsed tags can be compared against plain expected dictionaries.
    """
    d.pop("match", None)
46+
47+
48+
@pytest.mark.parametrize("test_case", TAG_PARSING_TESTS)
def test_tag_parsing(test_case: Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]) -> None:
    """Check parse_tags_from_content against one case, for both supported content shapes.

    The message is parsed once as a plain string and once wrapped as a single multimodal text item; both must
    yield the expected tags (compared without the "match" objects).
    """
    message = test_case["message"]
    expected = test_case["expected"]

    # Same message, first as raw text, then as a one-item multimodal content list.
    for content in (message, [{"type": "text", "text": message}]):
        collected = []
        for tag in ("img", "audio", "random"):
            for parsed in agentchat.utils.parse_tags_from_content(tag, content):
                _delete_unused_keys(parsed)
                collected.append(parsed)
        assert collected == expected
73+
74+
75+
if __name__ == "__main__":
    # Allow running this file directly without pytest: exercise every case, not just the first one.
    for _case in TAG_PARSING_TESTS:
        test_tag_parsing(_case)

0 commit comments

Comments
 (0)