diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py index 009fccad..d839aa86 100644 --- a/docling_core/experimental/idoctags.py +++ b/docling_core/experimental/idoctags.py @@ -1,10 +1,8 @@ """Define classes for DocTags serialization.""" import copy -import html import re from enum import Enum -from html.parser import HTMLParser from itertools import groupby from typing import Any, ClassVar, Final, Optional, cast from xml.dom.minidom import Element, Text, parseString @@ -373,8 +371,9 @@ class IDocTagsToken(str, Enum): INLINE = "inline" # Formatting - BOLD = "bold" # instead of "strong" - ITALIC = "italic" # instead of "em" + BOLD = "bold" + ITALIC = "italic" + UNDERLINE = "underline" STRIKETHROUGH = "strikethrough" SUPERSCRIPT = "superscript" SUBSCRIPT = "subscript" @@ -981,6 +980,15 @@ class IDocTagsSerializationMode(str, Enum): LLM_FRIENDLY = "llm_friendly" +class EscapeMode(str, Enum): + """XML escape mode for IDocTags output.""" + + CDATA_ALWAYS = "cdata_always" # wrap all text in CDATA + CDATA_WHEN_NEEDED = ( + "cdata_when_needed" # wrap text in CDATA only if it contains special characters + ) + + class IDocTagsParams(CommonParams): """IDocTags-specific serialization parameters independent of DocTags.""" @@ -1002,7 +1010,7 @@ class IDocTagsParams(CommonParams): # Expand self-closing forms of non-self-closing tokens after pretty-printing preserve_empty_non_selfclosing: bool = True # XML compliance: escape special characters in text content - xml_compliant: bool = False + escape_mode: EscapeMode = EscapeMode.CDATA_WHEN_NEEDED def _get_delim(*, params: IDocTagsParams) -> str: @@ -1014,84 +1022,13 @@ def _get_delim(*, params: IDocTagsParams) -> str: raise RuntimeError(f"Unknown IDocTags mode: {params.mode}") -class _WhitelistHTMLParser(HTMLParser): - """XML-safe sanitizer that preserves only specific IDocTags formatting and content tags. - - Preserves these tags (attributes are stripped): - bold, italic, strikethrough, superscript, subscript, inline, text, code, formula, facets. - All other tags are escaped literally. - """ - - # Allowed formatting and content tags - _ALLOWED = { - IDocTagsToken.BOLD.value, - IDocTagsToken.ITALIC.value, - IDocTagsToken.STRIKETHROUGH.value, - IDocTagsToken.SUPERSCRIPT.value, - IDocTagsToken.SUBSCRIPT.value, - IDocTagsToken.INLINE.value, - IDocTagsToken.TEXT.value, - IDocTagsToken.CODE.value, - IDocTagsToken.FORMULA.value, - IDocTagsToken.FACETS.value, - } - - def __init__(self): - super().__init__(convert_charrefs=False) - self.out = [] - - def handle_starttag(self, tag, attrs): - if tag in self._ALLOWED: - self.out.append(f"<{tag}>") - else: - # Escape disallowed tags literally - self.out.append(html.escape(self.get_starttag_text(), quote=False)) - - def handle_endtag(self, tag): - if tag in self._ALLOWED: - self.out.append(f"") - else: - self.out.append(html.escape(f"", quote=False)) - - def handle_startendtag(self, tag, attrs): - if tag in self._ALLOWED: - self.out.append(f"<{tag}>") - else: - self.out.append(html.escape(self.get_starttag_text(), quote=False)) - - def handle_data(self, data): - self.out.append(html.escape(data, quote=False)) - - def handle_entityref(self, name): - self.out.append(f"&{name};") - - def handle_charref(self, name): - self.out.append(f"&#{name};") - - def handle_comment(self, data): - self.out.append(html.escape(f"", quote=False)) - - -# def _escape_xml_text(text: str, xml_compliant: bool) -> str: -# """Escape XML special characters if xml_compliant is enabled.""" -# if xml_compliant: -# return html.escape(text, quote=False) -# return text - - -def _escape_xml_text(text: str, xml_compliant: bool) -> str: - """Escape text for XML while optionally preserving specific IDocTags formatting tags. - - If xml_compliant=True, preserves only these tags (attributes stripped): - bold, italic, strikethrough, superscript, subscript, inline, text, code, formula, facets. - All other tags are escaped. If xml_compliant=False, returns text unchanged. - """ - if not xml_compliant: - return text - parser = _WhitelistHTMLParser() - parser.feed(text) - parser.close() - return "".join(parser.out) +def _escape_text(text: str, escape_mode: EscapeMode) -> str: + if escape_mode == EscapeMode.CDATA_ALWAYS or ( + escape_mode == EscapeMode.CDATA_WHEN_NEEDED + and any(c in text for c in ['"', "'", "&", "<", ">"]) + ): + return f"" + return text class IDocTagsListSerializer(BaseModel, BaseListSerializer): @@ -1333,6 +1270,12 @@ def _serialize_single_item( elif isinstance(item, ListItem): tok = IDocTagsToken.LIST_TEXT wrap_open_token = f"<{tok.value}>" + elif isinstance(item, CodeItem): + tok = IDocTagsToken.CODE + if item.code_language != CodeLanguageLabel.UNKNOWN: + wrap_open_token = f'<{tok.value} {IDocTagsAttributeKey.CLASS.value}="{item.code_language.value}">' + else: + wrap_open_token = f"<{tok.value}>" elif ( isinstance(item, TextItem) and item.label == DocItemLabel.CHECKBOX_SELECTED ): @@ -1402,44 +1345,26 @@ def _serialize_single_item( hyperlink=item.hyperlink, ) else: + text_part = _escape_text(item.text, params.escape_mode) text_part = doc_serializer.post_process( - text=item.text, + text=text_part, formatting=item.formatting, hyperlink=item.hyperlink, ) - # For code blocks, preserve language using a lightweight facets marker - # e.g., language=python before the code content. - if isinstance(item, CodeItem): - # lang = getattr(item.code_language, "value", str(item.code_language)) - if item.code_language != CodeLanguageLabel.UNKNOWN: - parts.append( - _wrap( - # text=f"language={lang.lower()}", - text=item.code_language.value, - wrap_tag=IDocTagsToken.FACETS.value, - ) - ) - # Keep the textual code content as-is (no stripping) - else: - text_part = text_part.strip() - - # Apply XML escaping if xml_compliant is enabled - text_part = _escape_xml_text(text_part, params.xml_compliant) - if text_part: parts.append(text_part) if params.add_caption and isinstance(item, FloatingItem): cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text if cap_text: - cap_text = _escape_xml_text(cap_text, params.xml_compliant) + cap_text = _escape_text(cap_text, params.escape_mode) parts.append(cap_text) if params.add_footnote and isinstance(item, FloatingItem): ftn_text = doc_serializer.serialize_footnotes(item=item, **kwargs).text if ftn_text: - ftn_text = _escape_xml_text(ftn_text, params.xml_compliant) + ftn_text = _escape_text(ftn_text, params.escape_mode) parts.append(ftn_text) text_res = "".join(parts) @@ -1496,12 +1421,12 @@ def _serialize_meta_field( if name == MetaFieldName.SUMMARY and isinstance( field_val, SummaryMetaField ): - escaped_text = _escape_xml_text(field_val.text, params.xml_compliant) + escaped_text = _escape_text(field_val.text, params.escape_mode) txt = f"{escaped_text}" elif name == MetaFieldName.DESCRIPTION and isinstance( field_val, DescriptionMetaField ): - escaped_text = _escape_xml_text(field_val.text, params.xml_compliant) + escaped_text = _escape_text(field_val.text, params.escape_mode) txt = f"{escaped_text}" elif name == MetaFieldName.CLASSIFICATION and isinstance( field_val, PictureClassificationMetaField @@ -1509,12 +1434,12 @@ def _serialize_meta_field( class_name = self._humanize_text( field_val.get_main_prediction().class_name ) - escaped_class_name = _escape_xml_text(class_name, params.xml_compliant) + escaped_class_name = _escape_text(class_name, params.escape_mode) txt = f"{escaped_class_name}" elif name == MetaFieldName.MOLECULE and isinstance( field_val, MoleculeMetaField ): - escaped_smi = _escape_xml_text(field_val.smi, params.xml_compliant) + escaped_smi = _escape_text(field_val.smi, params.escape_mode) txt = f"{escaped_smi}" elif name == MetaFieldName.TABULAR_CHART and isinstance( field_val, TabularChartMetaField @@ -1524,9 +1449,7 @@ def _serialize_meta_field( # elif tmp := str(field_val or ""): # txt = tmp elif name not in {v.value for v in MetaFieldName}: - escaped_text = _escape_xml_text( - str(field_val or ""), params.xml_compliant - ) + escaped_text = _escape_text(str(field_val or ""), params.escape_mode) txt = _wrap(text=escaped_text, wrap_tag=name) return txt return None @@ -1744,9 +1667,7 @@ def _emit_otsl( parts.append(cell_loc) if params.add_content: # Apply XML escaping to table cell content - escaped_content = _escape_xml_text( - content, params.xml_compliant - ) + escaped_content = _escape_text(content, params.escape_mode) parts.append(escaped_content) else: parts.append( @@ -2139,6 +2060,11 @@ def serialize_italic(self, text: str, **kwargs: Any) -> str: """Apply IDocTags-specific italic serialization.""" return _wrap(text=text, wrap_tag=IDocTagsToken.ITALIC.value) + @override + def serialize_underline(self, text: str, **kwargs: Any) -> str: + """Apply IDocTags-specific underline serialization.""" + return _wrap(text=text, wrap_tag=IDocTagsToken.UNDERLINE.value) + @override def serialize_strikethrough(self, text: str, **kwargs: Any) -> str: """Apply IDocTags-specific strikethrough serialization.""" @@ -2341,7 +2267,12 @@ def _extract_code_content_and_language( self, el: Element ) -> tuple[str, CodeLanguageLabel]: """Extract code content and language from a element.""" - lang_label = CodeLanguageLabel.UNKNOWN + try: + lang_label = CodeLanguageLabel( + el.getAttribute(IDocTagsAttributeKey.CLASS.value) + ) + except ValueError: + lang_label = CodeLanguageLabel.UNKNOWN parts: list[str] = [] for node in el.childNodes: if isinstance(node, Text): @@ -2349,36 +2280,9 @@ def _extract_code_content_and_language( parts.append(node.data) elif isinstance(node, Element): nm_child = node.tagName - if nm_child == IDocTagsToken.FACETS.value: - language_text = self._get_text(node).strip() - try: - lang_label = next( - lbl - for lbl in CodeLanguageLabel - if lbl.value == language_text - ) - except StopIteration: - lang_label = CodeLanguageLabel.UNKNOWN - - """ - facets_text = self._get_text(node).strip() - if "=" in facets_text: - key, val = facets_text.split("=", 1) - if key.strip().lower() == "language": - val_norm = val.strip().lower() - try: - lang_label = next( - lbl - for lbl in CodeLanguageLabel - if lbl.value.lower() == val_norm - ) - except StopIteration: - lang_label = CodeLanguageLabel.UNKNOWN - """ - continue if nm_child == IDocTagsToken.LOCATION.value: continue - if nm_child == IDocTagsToken.BR.value: + elif nm_child == IDocTagsToken.BR.value: parts.append("\n") else: parts.append(self._get_text(node)) @@ -2793,11 +2697,12 @@ def _extract_text_with_formatting( # Mapping of format tags to Formatting attributes format_tags = { - IDocTagsToken.BOLD.value: "bold", - IDocTagsToken.ITALIC.value: "italic", - IDocTagsToken.STRIKETHROUGH.value: "strikethrough", - IDocTagsToken.SUPERSCRIPT.value: "superscript", - IDocTagsToken.SUBSCRIPT.value: "subscript", + IDocTagsToken.BOLD, + IDocTagsToken.ITALIC, + IDocTagsToken.STRIKETHROUGH, + IDocTagsToken.UNDERLINE, + IDocTagsToken.SUPERSCRIPT, + IDocTagsToken.SUBSCRIPT, } if tag_name in format_tags: @@ -2815,6 +2720,8 @@ def _extract_text_with_formatting( child_formatting.italic = True elif tag_name == IDocTagsToken.STRIKETHROUGH.value: child_formatting.strikethrough = True + elif tag_name == IDocTagsToken.UNDERLINE.value: + child_formatting.underline = True elif tag_name == IDocTagsToken.SUPERSCRIPT.value: child_formatting.script = Script.SUPER elif tag_name == IDocTagsToken.SUBSCRIPT.value: diff --git a/examples/convert_to_idoctags.py b/examples/convert_to_idoctags.py index 61692c0a..fac952b9 100644 --- a/examples/convert_to_idoctags.py +++ b/examples/convert_to_idoctags.py @@ -18,6 +18,7 @@ from docling_core.types.doc import DoclingDocument, ImageRef from docling_core.types.doc.base import ImageRefMode from docling_core.experimental.idoctags import ( + EscapeMode, IDocTagsSerializationMode, IDocTagsParams, IDocTagsVocabulary, @@ -28,7 +29,7 @@ import numpy as np # In order to download **before** the datasets library, run -# +# # HF_HUB_DISABLE_XET=1 hf download --repo-type dataset "{hf-repo-id}" # @@ -151,7 +152,7 @@ def _write_report(rows: list[dict[str, str]], path: Path) -> None: - Row ID - Loaded DoclingDocument - Loaded DoclingDocument Error - - Serialized IDocTags (mode, xml_compliant, content) for all combinations + - Serialized IDocTags (mode, escape_mode, content) for all combinations - Serialized HTML - Serialized HTML Error @@ -166,12 +167,12 @@ def _write_report(rows: list[dict[str, str]], path: Path) -> None: "Loaded DoclingDocument Error", ] - # Add all combinations of mode, xml_compliant, and content - for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]: - for comp in [True, False]: + # Add all combinations of mode, escape_mode, and content + for mode in IDocTagsSerializationMode: + for esc_mode in EscapeMode: for content in [True, False]: - cols.append(f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})") - cols.append(f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error") + cols.append(f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})") + cols.append(f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content}) Error") cols.extend([ "Serialized HTML", @@ -193,10 +194,10 @@ def _count_yes(key: str) -> int: ] # Add summary rows for all combinations - for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]: - for comp in [True, False]: + for mode in IDocTagsSerializationMode: + for esc_mode in EscapeMode: for content in [True, False]: - col_name = f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})" + col_name = f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})" summary_rows.append({"Metric": col_name, "Count": _count_yes(col_name)}) summary_rows.append({"Metric": "Serialized HTML", "Count": _count_yes("Serialized HTML")}) @@ -315,14 +316,14 @@ def _count_yes(key: str) -> int: "Loaded DoclingDocument": _yes(False), "Loaded DoclingDocument Error": "", "Serialized HTML": _yes(False), - "Serialized HTML Error": "", + "Serialized HTML Error": "", } - for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]: - for comp in [True, False]: + for mode in IDocTagsSerializationMode: + for esc_mode in EscapeMode: for content in [True, False]: - row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"] = _yes(False) - row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error"] = "" + row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})"] = _yes(False) + row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode}, content={content}) Error"] = "" try: doc = DoclingDocument.model_validate_json(text) @@ -338,12 +339,12 @@ def _count_yes(key: str) -> int: # Record failure outcome for this row row_result["Loaded DoclingDocument Error"] = str(exc) - for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]: - for comp in [True, False]: + for mode in IDocTagsSerializationMode: + for esc_mode in EscapeMode: for content in [True, False]: - row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"] = _yes(False) - row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error"] = "NA" - + row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})"] = _yes(False) + row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content}) Error"] = "NA" + results_rows.append(row_result) continue @@ -353,24 +354,24 @@ def _count_yes(key: str) -> int: # __.save(png_path) for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]: - for comp in [True, False]: + for esc_mode in [True, False]: for content in [True, False]: try: params_probe = IDocTagsParams() params_probe.add_content = content params_probe.mode = mode - params_probe.xml_compliant = comp + params_probe.escape_mode = esc_mode params_probe.pretty_indentation = " " if mode==IDocTagsSerializationMode.HUMAN_FRIENDLY else None iser_probe = IDocTagsDocSerializer(doc=doc, params=params_probe) _ = iser_probe.serialize().text - row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"] = _yes(True) - row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error"] = "" + row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode}, content={content})"] = _yes(True) + row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode}, content={content}) Error"] = "" except Exception as exc_: - row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"] = _yes(False) - row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error"] = str(exc_) + row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode}, content={content})"] = _yes(False) + row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode}, content={content}) Error"] = str(exc_) # Attempt HTML export (non-writing) to check serialization capability try: @@ -401,9 +402,9 @@ def _count_yes(rows: list[dict[str, str]], key: str) -> int: print(f" - Total processed: {len(results_rows)}") print(f" - Loaded DoclingDocument: {_count_yes(results_rows, 'Loaded DoclingDocument')}") for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]: - for comp in [True, False]: + for esc_mode in [True, False]: for content in [True, False]: - print(f" - Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}): {_count_yes(results_rows, f'Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})')}") + print(f" - Serialized IDocTags ({mode.value}, escape_mode={esc_mode}, content={content}): {_count_yes(results_rows, f'Serialized IDocTags ({mode.value}, escape_mode={esc_mode}, content={content})')}") print(f" - Serialized HTML: {_count_yes(results_rows, 'Serialized HTML')}") if errors: diff --git a/test/conftest.py b/test/conftest.py index 7dadfc5c..63b4aaf2 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -26,8 +26,8 @@ ) -@pytest.fixture(scope="session") -def _construct_doc() -> DoclingDocument: +# factored out of fixture to simplify IDE-level debugging +def _construct_doc_impl() -> DoclingDocument: """Fixture for a DoclingDocument to be reused across a test session.""" doc = DoclingDocument(name="Untitled 1") @@ -375,6 +375,12 @@ def _construct_doc() -> DoclingDocument: return doc +@pytest.fixture(scope="session") +def _construct_doc() -> DoclingDocument: + """Fixture for a DoclingDocument to be reused across a test session.""" + return _construct_doc_impl() + + @pytest.fixture(scope="function") def sample_doc(_construct_doc: DoclingDocument) -> DoclingDocument: """Copy of a DoclingDocument for each test function.""" diff --git a/test/data/doc/cdata_always.gt.idt.xml b/test/data/doc/cdata_always.gt.idt.xml new file mode 100644 index 00000000..251768cd --- /dev/null +++ b/test/data/doc/cdata_always.gt.idt.xml @@ -0,0 +1,169 @@ + + + + + <![CDATA[Title of the Document]]> + + + + + + + + + + + + + + + + + + + + This is the caption of table 1. + + + + + + + + + + + + + + + + + This is the caption of figure 1. + + + This is the caption of figure 2. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + & ampersands]]> + + + + + + + + + + + + + + + + 100]]> + +]]> + + + +]]> + + + + + + + + + diff --git a/test/data/doc/cdata_when_needed.gt.idt.xml b/test/data/doc/cdata_when_needed.gt.idt.xml new file mode 100644 index 00000000..3312d779 --- /dev/null +++ b/test/data/doc/cdata_when_needed.gt.idt.xml @@ -0,0 +1,179 @@ + + + item of leading list + + Title of the Document + Author 1 +Affiliation 1 + Author 2 +Affiliation 2 + 1. Introduction + This paper introduces the biggest invention ever made. ... + + list item 1 + list item 2 + list item 3 + + list item 3.a + list item 3.b + list item 3.c + + list item 3.c.i + + + list item 4 + + + This is the caption of table 1. + + + Product + + Years + + + + + 2016 + + 2017 + + + Apple + + 49823 + + 695944 + + + + + This is the caption of figure 1. + + + This is the caption of figure 2. + + + item 1 of list + + + item 1 of list after empty list + item 2 of list after empty list + + + item 1 of neighboring list + item 2 of neighboring list + + item 1 of sub list + + + Here a code snippet: + + (to be displayed inline) + + + + + Here a formula: + E=mc^2 + (to be displayed inline) + + + + + Here a code block: + + Here a formula block: + E=mc^2 + + Some formatting chops: + + bold + + + italic + + + underline + + + strikethrough + + + subscript + + + superscript + + hyperlink + + + + + + everything at the same time. + + + + + + + Item 1 in A + Item 2 in A + Item 3 in A + + Item 1 in B + Item 2 in B + + Item 1 in C + Item 2 in C + + Item 3 in B + + Item 4 in A + + + List item without parent list group + + The end. + Simple text + 4 leading spaces, 1 trailing + + + + + & ampersands]]> + Description content + + + 0 == 0 + 1 leading space, 4 trailing + + 42 == 42 + + + + + Foo + + Bar + + + + 100]]> + +]]> + + + Only + +]]> + + + + + + + + + diff --git a/test/data/doc/ddoc_0.v0.gt.idt b/test/data/doc/ddoc_0.v0.gt.idt index ba2dc6d6..a0c01bd1 100644 --- a/test/data/doc/ddoc_0.v0.gt.idt +++ b/test/data/doc/ddoc_0.v0.gt.idt @@ -73,8 +73,7 @@ - Set connect string for connecting to ndb_mgmd. Syntax: "[nodeid=id;][host=]hostname[:port]". Overrides entries in NDB_CONNECTSTRING and my.cnf. - + diff --git a/test/data/doc/dummy_doc_with_meta.gt.idt.xml b/test/data/doc/dummy_doc_with_meta.gt.idt.xml index 020e1115..17f37fea 100644 --- a/test/data/doc/dummy_doc_with_meta.gt.idt.xml +++ b/test/data/doc/dummy_doc_with_meta.gt.idt.xml @@ -23,7 +23,7 @@ ... Bar chart CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 - {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} + diff --git a/test/test_deserializer_idoctags.py b/test/test_deserializer_idoctags.py index a91b5a96..1b1191b3 100644 --- a/test/test_deserializer_idoctags.py +++ b/test/test_deserializer_idoctags.py @@ -27,18 +27,15 @@ def _serialize( add_location: bool = True, add_content: bool = True, add_table_cell_location: bool = False, - add_table_cell_text: bool = True, - xml_compliant: bool = True, ) -> str: + params = IDocTagsParams( + add_location=add_location, + add_content=add_content, + add_table_cell_location=add_table_cell_location, + ) ser = IDocTagsDocSerializer( doc=doc, - params=IDocTagsParams( - add_location=add_location, - add_content=add_content, - add_table_cell_location=add_table_cell_location, - add_table_cell_text=add_table_cell_text, - xml_compliant=xml_compliant, - ), + params=params, ) return ser.serialize().text @@ -524,7 +521,7 @@ def test_roundtrip_complex_table_with_caption_prov(): doc.add_table(data=td, caption=cap, prov=_default_prov()) - dt = _serialize(doc, add_table_cell_text=True, add_content=True) + dt = _serialize(doc, add_content=True) if DO_PRINT: print(dt) doc2 = _deserialize(dt) @@ -1063,7 +1060,7 @@ def test_roundtrip_table_with_rich_cells(): doc.add_table_cell(table_item=table, cell=rich_cell_2_1) # Serialize and deserialize - dt = _serialize(doc, add_table_cell_text=True, add_content=True) + dt = _serialize(doc, add_content=True) if DO_PRINT: print("\n", dt) doc2 = _deserialize(dt) @@ -1081,7 +1078,7 @@ def test_roundtrip_table_with_rich_cells(): assert len(rich_cells) >= 1 # At least one rich cell should be preserved # Verify round-trip serialization - dt2 = _serialize(doc2, add_table_cell_text=True, add_content=True) + dt2 = _serialize(doc2, add_content=True) if DO_PRINT: print("\ndt:", dt) print("\ndt2:", dt2) @@ -1096,11 +1093,11 @@ def test_roundtrip_table_with_rich_cells(): def test_constructed_doc(sample_doc: DoclingDocument): doc = sample_doc - dt = _serialize(doc, add_table_cell_text=True, add_content=True) + dt = _serialize(doc) doc2 = _deserialize(dt) - dt2 = _serialize(doc2, add_table_cell_text=True, add_content=True) + dt2 = _serialize(doc2) # if DO_PRINT: # print(f"--------------------------dt:\n\n{dt}\n\n") @@ -1115,12 +1112,10 @@ def test_constructed_doc(sample_doc: DoclingDocument): def test_constructed_rich_table_doc(rich_table_doc: DoclingDocument): doc = rich_table_doc - dt = _serialize(doc, add_table_cell_text=True, add_content=True, xml_compliant=True) + dt = _serialize(doc, add_content=True) doc2 = _deserialize(dt) - dt2 = _serialize( - doc2, add_table_cell_text=True, add_content=True, xml_compliant=True - ) + dt2 = _serialize(doc2, add_content=True) assert dt2 == dt diff --git a/test/test_serialization_idoctag.py b/test/test_serialization_idoctag.py index 64fe0735..0dc196ba 100644 --- a/test/test_serialization_idoctag.py +++ b/test/test_serialization_idoctag.py @@ -1,10 +1,12 @@ """Unit tests for IDocTags create_closing_token helper.""" from pathlib import Path +from test.test_serialization import verify import pytest from docling_core.experimental.idoctags import ( + EscapeMode, IDocTagsDocSerializer, IDocTagsParams, IDocTagsSerializationMode, @@ -17,8 +19,12 @@ Script, TableData, ) - -from .test_serialization import verify +from docling_core.types.doc.document import ( + DescriptionMetaField, + PictureMeta, + SummaryMetaField, +) +from docling_core.types.doc.labels import CodeLanguageLabel # =============================== # IDocTags unit-tests @@ -186,114 +192,70 @@ def test_idoctags_meta(): verify(exp_file=src.with_suffix(".gt.idt.xml"), actual=actual) -def test_xml_compliant_escaping(): - """Test that xml_compliant parameter properly escapes XML special characters.""" - doc = DoclingDocument(name="test_xml_escape") - - # Add text with XML special characters - doc.add_text( - label=DocItemLabel.TEXT, text="Text with & special chars like > and <" +def _create_escape_test_doc(inp_doc: DoclingDocument): + doc = inp_doc.model_copy(deep=True) + doc.add_text(label=DocItemLabel.TEXT, text="Simple text") + doc.add_text(label=DocItemLabel.TEXT, text=" 4 leading spaces, 1 trailing ") + doc.add_text(label=DocItemLabel.TEXT, text="Some 'single' quotes") + doc.add_text(label=DocItemLabel.TEXT, text='Some "double" quotes') + text_item = doc.add_text(label=DocItemLabel.TEXT, text="An ampersand: &") + text_item.meta = PictureMeta( + summary=SummaryMetaField(text="Summary with & ampersands"), + description=DescriptionMetaField(text="Description content"), ) - - # Add a table with special characters in cells - td = TableData(num_rows=0, num_cols=2) + doc.add_code(text="0 == 0") + doc.add_code(text=" 1 leading space, 4 trailing ") + doc.add_code(text="0 < 1") + doc.add_code(text="42 == 42", code_language=CodeLanguageLabel.PYTHON) + doc.add_code(text="42 < 1337", code_language=CodeLanguageLabel.PYTHON) + + td = TableData(num_cols=2) + td.add_row(["Foo", "Bar"]) td.add_row(["Header & Title", "Value > 100"]) td.add_row(["