|
1 |
| -from typing import Any, List, Dict, Tuple, Callable |
| 1 | +import re |
| 2 | +from typing import Any, Callable, Dict, List, Tuple, Union |
| 3 | + |
2 | 4 | from .agent import Agent
|
3 | 5 |
|
4 | 6 |
|
@@ -76,3 +78,108 @@ def aggregate_summary(usage_summary: Dict[str, Any], agent_summary: Dict[str, An
|
76 | 78 | aggregate_summary(actual_usage_summary, agent.client.actual_usage_summary)
|
77 | 79 |
|
78 | 80 | return total_usage_summary, actual_usage_summary
|
| 81 | + |
| 82 | + |
def parse_tags_from_content(tag: str, content: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Parses HTML style tags from message contents.

    The parsing is done by looking for patterns in the text that match the format of HTML tags. The tag to be parsed is
    specified as an argument to the function. The function looks for this tag in the text and extracts its content. The
    content of a tag is everything that is inside the tag, between the opening and closing angle brackets. The content
    can be a single string or a set of attribute-value pairs.

    Examples:
        <img http://example.com/image.png> -> [{"tag": "img", "attr": {"src": "http://example.com/image.png"}, "match": re.Match}]
        <audio text="Hello I'm a robot" prompt="whisper"> ->
                [{"tag": "audio", "attr": {"text": "Hello I'm a robot", "prompt": "whisper"}, "match": re.Match}]

    Args:
        tag (str): The HTML style tag to be parsed.
        content (Union[str, List[Dict[str, Any]]]): The message content to parse. Can be a string or a list of content
            items. For a list, only items whose "type" is "text" are parsed (using their "text" value); every other
            item is ignored.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, where each dictionary represents a parsed tag. Each dictionary
            contains three key-value pairs: 'tag' which is the tag name, 'attr' which is a dictionary of the parsed
            attributes, and 'match' which is the regular expression match object for that tag.

    Raises:
        ValueError: If the content is not a string or a list.
    """
    if isinstance(content, str):
        return _parse_tags_from_text(tag, content)

    # Handles case for multimodal messages: a list of typed content items.
    if isinstance(content, list):
        results: List[Dict[str, Any]] = []
        for item in content:
            if item.get("type") == "text":
                results.extend(_parse_tags_from_text(tag, item["text"]))
        return results

    raise ValueError(f"content must be str or list, but got {type(content)}")
| 121 | + |
| 122 | + |
def _parse_tags_from_text(tag: str, text: str) -> List[Dict[str, Any]]:
    """Finds every ``<tag ...>`` occurrence in a string and parses its attributes.

    A tag is matched when ``<``, the tag name and a single space are followed by the shortest run of characters
    (not spanning a newline) that ends at the next ``>``.

    Args:
        tag (str): The tag name to look for. It is matched literally.
        text (str): The text to search.

    Returns:
        List[Dict[str, Any]]: One dictionary per match, in order of appearance, with the keys 'tag' (the tag name),
            'attr' (the parsed attribute dictionary) and 'match' (the ``re.Match`` object).
    """
    # Escape the tag name so that regex metacharacters in it (e.g. "." or "+") cannot alter the pattern.
    pattern = re.compile(f"<{re.escape(tag)} (.*?)>")

    return [
        {"tag": tag, "attr": _parse_attributes_from_tags(match.group(1).strip()), "match": match}
        for match in pattern.finditer(text)
    ]
| 133 | + |
| 134 | + |
def _parse_attributes_from_tags(tag_content: str) -> Dict[str, str]:
    """Parses the inside of a tag into a dictionary of attributes.

    The content is split on spaces and the pieces are re-joined by ``_reconstruct_attributes``. Each resulting piece
    is then handled as follows:

    * ``key="value"`` or ``key='value'`` is stored under ``key`` with the surrounding quotes removed.
    * Anything else (a bare value, or ``key=value`` without an opening quote, such as a URL containing ``=``) is
      collected under the ``"src"`` key; several such pieces are joined with a single space.

    Args:
        tag_content (str): The text between the tag name and the closing ``>``.

    Returns:
        Dict[str, str]: The parsed attributes. If a key occurs more than once, the last occurrence wins.
    """
    tokens = re.findall(r"([^ ]+)", tag_content)
    parsed: Dict[str, str] = {}

    def _append_src_value(value: str) -> None:
        # Unnamed values accumulate under "src", separated by a space.
        parsed["src"] = f"{parsed['src']} {value}" if "src" in parsed else value

    for attr in _reconstruct_attributes(tokens):
        key, separator, value = attr.partition("=")
        if separator and value.startswith(("'", '"')):
            # Drop the opening quote; drop the last character only when it really is the matching closing quote,
            # so an unterminated value such as key="abc does not lose its final character.
            unquoted = value[1:]
            if unquoted.endswith(value[0]):
                unquoted = unquoted[:-1]
            parsed[key] = unquoted
        else:
            _append_src_value(attr)

    return parsed
| 159 | + |
| 160 | + |
| 161 | +def _reconstruct_attributes(attrs: List[str]) -> List[str]: |
| 162 | + """Reconstructs attributes from a list of strings where some attributes may be split across multiple elements.""" |
| 163 | + |
| 164 | + def is_attr(attr: str) -> bool: |
| 165 | + if "=" in attr: |
| 166 | + _, value = attr.split("=", 1) |
| 167 | + if value.startswith("'") or value.startswith('"'): |
| 168 | + return True |
| 169 | + return False |
| 170 | + |
| 171 | + reconstructed = [] |
| 172 | + found_attr = False |
| 173 | + for attr in attrs: |
| 174 | + if is_attr(attr): |
| 175 | + reconstructed.append(attr) |
| 176 | + found_attr = True |
| 177 | + else: |
| 178 | + if found_attr: |
| 179 | + reconstructed[-1] += f" {attr}" |
| 180 | + found_attr = True |
| 181 | + elif reconstructed: |
| 182 | + reconstructed[-1] += f" {attr}" |
| 183 | + else: |
| 184 | + reconstructed.append(attr) |
| 185 | + return reconstructed |
0 commit comments