Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

image alt support #3797

Merged
merged 3 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.16.7-dev0

### Enhancements
- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. The parameter can be set to `to_text` to extract the alt text of `<img>` HTML tags as element text.

### Features

### Fixes

## 0.16.6

### Enhancements
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
<header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
<img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
<img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
</header>
</div>
</body>
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
[
("html_files/example.html", "unstructured_json_output/example.json"),
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
(
"html_files/example_with_alternative_text.html",
"unstructured_json_output/example_with_alternative_text.json",
),
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
(
"html_files/example_with_inline_fields.html",
Expand All @@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
html_file_path = Path(__file__).parent / html_file_path
json_file_path = Path(__file__).parent / json_file_path

expected_json_elements = elements_from_json(str(json_file_path))
html_code = html_file_path.read_text()

predicted_elements = partition_html(
text=html_code, html_parser_version="v2", unique_element_ids=True
)

assert len(expected_json_elements) == len(predicted_elements)

for i in range(len(expected_json_elements)):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
[
{
"element_id": "3a6b156a81764e17be128264241f8136",
"metadata": {
"category_depth": 0,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "897a8a47377c4ad6aab839a929879537",
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"metadata": {
"category_depth": 1,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
},
"text": "New York logo",
"type": "Image"
},
{
"element_id": "40c32fd8-9a02-42b8-a587-884293881090",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
},
"text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
"type": "Image"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
assert len(unstructured_elements) == 2
assert isinstance(unstructured_elements[0], Text)
assert isinstance(unstructured_elements[1], NarrativeText)


def test_alternate_text_from_image_is_passed():
    """The alt text of an <img> nested in a table cell is present in the element text.

    The page is parsed into an ontology and converted to unstructured elements; two
    elements are expected and the second one must carry the image's alt text.
    """
    # language=HTML
    input_html = """
<div class="Page">
<table>
<tr>
<td rowspan="2">Example image nested in the table:</td>
<td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
</tr>
</table>
</div>
"""
    page = parse_html_to_ontology(input_html)
    unstructured_elements = ontology_to_unstructured_elements(page)
    assert len(unstructured_elements) == 2
    assert "ALT TEXT Logo" in unstructured_elements[1].text
47 changes: 47 additions & 0 deletions test_unstructured/partition/html/test_partition_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from unstructured.partition.html import partition_html


def test_alternative_image_text_can_be_included():
    """Image alt text appears in v2 output only when `image_alt_mode` is "to_text"."""
    # language=HTML
    html = """
<div class="Page">
<img src="my-logo.png" alt="ALT TEXT Logo"/>
</div>
"""
    alt_text = "ALT TEXT Logo"
    # -- arguments shared by both partitioning runs; only the alt mode differs --
    v2_kwargs = {"text": html, "html_parser_version": "v2"}

    # -- exactly two elements are expected; the second one is the image --
    _, image_with_alt = partition_html(image_alt_mode="to_text", **v2_kwargs)
    assert alt_text in image_with_alt.text

    _, image_without_alt = partition_html(image_alt_mode=None, **v2_kwargs)
    assert alt_text not in image_without_alt.text


def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
    """Alt text of an image inside a paragraph follows `image_alt_mode` as well."""
    # language=HTML
    html = """
<div class="Page">
<p class="Paragraph">
<img src="my-logo.png" alt="ALT TEXT Logo"/>
</p>
</div>
"""
    alt_text = "ALT TEXT Logo"
    # -- arguments shared by both partitioning runs; only the alt mode differs --
    v2_kwargs = {"text": html, "html_parser_version": "v2"}

    # -- exactly two elements are expected; the second one is the paragraph --
    _, paragraph_with_alt = partition_html(image_alt_mode="to_text", **v2_kwargs)
    assert alt_text in paragraph_with_alt.text

    _, paragraph_without_alt = partition_html(image_alt_mode=None, **v2_kwargs)
    assert alt_text not in paragraph_without_alt.text
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.6" # pragma: no cover
__version__ = "0.16.7-dev0" # pragma: no cover
26 changes: 21 additions & 5 deletions unstructured/documents/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,27 @@ def to_html(self, add_children=True) -> str:

return result_html

def to_text(self, add_children=True) -> str:
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
"""
Returns the text representation of the element.

Args:
add_children: If True, the text of the children will be included.
Otherwise, element is represented as single self-closing tag.
add_img_alt_text: If True, the alt text of the image will be included.
"""
if self.children and add_children:
children_text = " ".join(child.to_text().strip() for child in self.children)
children_text = " ".join(
child.to_text(add_children, add_img_alt_text).strip() for child in self.children
)
return children_text
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()

text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()

if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
text += f" {self.additional_attributes.get('alt', '')}"

return text.strip()

def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
Expand Down Expand Up @@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["input"], frozen=True)

def to_text(self, add_children=True) -> str:
text = super().to_text()
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
text = super().to_text(add_children, add_img_alt_text)
value = self.additional_attributes.get("value", "")
if not value:
return text
Expand Down
21 changes: 20 additions & 1 deletion unstructured/partition/html/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element
from unstructured.documents.ontology import Page
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
Expand All @@ -36,6 +37,7 @@ def partition_html(
skip_headers_and_footers: bool = False,
detection_origin: Optional[str] = None,
html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
**kwargs: Any,
) -> list[Element]:
"""Partitions an HTML document into its constituent elements.
Expand Down Expand Up @@ -65,6 +67,9 @@ def partition_html(
html_parser_version (Literal['v1', 'v2']):
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
use the ontology schema to parse the HTML document.

image_alt_mode (Optional[Literal['to_text']]):
When set to 'to_text' (the default), the v2 parser will include the alternative text of
images in the output. When None, the alternative text is not included.
"""
# -- parser rejects an empty str, nip that edge-case in the bud here --
if text is not None and text.strip() == "" and not file and not filename and not url:
Expand All @@ -81,6 +86,7 @@ def partition_html(
skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin,
html_parser_version=html_parser_version,
image_alt_mode=image_alt_mode,
)

return list(_HtmlPartitioner.iter_elements(opts))
Expand All @@ -102,6 +108,7 @@ def __init__(
skip_headers_and_footers: bool,
detection_origin: str | None,
html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
):
self._file_path = file_path
self._file = file
Expand All @@ -113,6 +120,7 @@ def __init__(
self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin
self._html_parser_version = html_parser_version
self._image_alt_mode = image_alt_mode

@lazyproperty
def detection_origin(self) -> str | None:
Expand Down Expand Up @@ -172,6 +180,11 @@ def html_parser_version(self) -> Literal["v1", "v2"]:
"""When html_parser_version=='v2', HTML elements follow ontology schema."""
return self._html_parser_version

@lazyproperty
def add_img_alt_text(self) -> bool:
    """Whether the alternative text of images should be included in the output.

    True only when the `image_alt_mode` option passed at construction equals "to_text";
    any other value, including None, gives False.
    """
    alt_mode = self._image_alt_mode
    return alt_mode == "to_text"


class _HtmlPartitioner:
"""Partition HTML document into document-elements."""
Expand Down Expand Up @@ -239,5 +252,11 @@ def _from_ontology(self) -> List[Element]:
"""Convert ontology elements represented in HTML to a list of unstructured elements."""
html_text = self._opts.html_text
ontology = parse_html_to_ontology(html_text)
unstructured_elements = ontology_to_unstructured_elements(ontology)
unstructured_elements = ontology_to_unstructured_elements(
ontology, add_img_alt_text=self._opts.add_img_alt_text
)
return unstructured_elements


# NOTE(review): when the module is run as a script this only instantiates `Page` and
# discards the result. It looks like a leftover debugging stub (possibly kept so the
# `Page` import is not reported as unused) -- confirm, and if so remove it together
# with that import.
if __name__ == "__main__":
    Page()
10 changes: 6 additions & 4 deletions unstructured/partition/html/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
page_number: int = None,
depth: int = 0,
filename: str | None = None,
add_img_alt_text: bool = True,
) -> list[elements.Element]:
"""
Converts an OntologyElement object to a list of unstructured Element objects.
Expand All @@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
parent_id (str, optional): The ID of the parent element. Defaults to None.
page_number (int, optional): The page number of the element. Defaults to None.
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.

filename (str, optional): The name of the file the element comes from. Defaults to None.
add_img_alt_text (bool): Whether to include the alternative text of images
in the output. Defaults to True.
Returns:
list[Element]: A list of unstructured Element objects.
"""
Expand Down Expand Up @@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
page_number=page_number,
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
filename=filename,
add_img_alt_text=add_img_alt_text,
)
children += child

Expand All @@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
else:
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
html_code_of_ontology_element = ontology_element.to_html()
element_text = ontology_element.to_text()
element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)

unstructured_element = element_class(
text=element_text,
Expand Down Expand Up @@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
Args:
html_code (str): The HTML code to be parsed.
Parsing HTML will start from <div class="Page">.

Returns:
OntologyElement: The parsed Element object.

Expand Down Expand Up @@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
Args:
soup (Tag): The BeautifulSoup Tag object to be converted.
recursion_depth (int): Flag to control limit of recursion depth.

Returns:
OntologyElement: The converted OntologyElement object.
"""
Expand Down
Loading