diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py
index 009fccad..d839aa86 100644
--- a/docling_core/experimental/idoctags.py
+++ b/docling_core/experimental/idoctags.py
@@ -1,10 +1,8 @@
"""Define classes for DocTags serialization."""
import copy
-import html
import re
from enum import Enum
-from html.parser import HTMLParser
from itertools import groupby
from typing import Any, ClassVar, Final, Optional, cast
from xml.dom.minidom import Element, Text, parseString
@@ -373,8 +371,9 @@ class IDocTagsToken(str, Enum):
INLINE = "inline"
# Formatting
- BOLD = "bold" # instead of "strong"
- ITALIC = "italic" # instead of "em"
+ BOLD = "bold"
+ ITALIC = "italic"
+ UNDERLINE = "underline"
STRIKETHROUGH = "strikethrough"
SUPERSCRIPT = "superscript"
SUBSCRIPT = "subscript"
@@ -981,6 +980,15 @@ class IDocTagsSerializationMode(str, Enum):
LLM_FRIENDLY = "llm_friendly"
+class EscapeMode(str, Enum):
+    """Strategy for protecting text content in IDocTags XML output."""
+
+    # Wrap every text payload in a CDATA section.
+    CDATA_ALWAYS = "cdata_always"
+    # Wrap text in a CDATA section only if it contains special characters.
+    CDATA_WHEN_NEEDED = "cdata_when_needed"
+
+
class IDocTagsParams(CommonParams):
"""IDocTags-specific serialization parameters independent of DocTags."""
@@ -1002,7 +1010,7 @@ class IDocTagsParams(CommonParams):
# Expand self-closing forms of non-self-closing tokens after pretty-printing
preserve_empty_non_selfclosing: bool = True
# XML compliance: escape special characters in text content
- xml_compliant: bool = False
+ escape_mode: EscapeMode = EscapeMode.CDATA_WHEN_NEEDED
def _get_delim(*, params: IDocTagsParams) -> str:
@@ -1014,84 +1022,13 @@ def _get_delim(*, params: IDocTagsParams) -> str:
raise RuntimeError(f"Unknown IDocTags mode: {params.mode}")
-class _WhitelistHTMLParser(HTMLParser):
- """XML-safe sanitizer that preserves only specific IDocTags formatting and content tags.
-
- Preserves these tags (attributes are stripped):
- bold, italic, strikethrough, superscript, subscript, inline, text, code, formula, facets.
- All other tags are escaped literally.
- """
-
- # Allowed formatting and content tags
- _ALLOWED = {
- IDocTagsToken.BOLD.value,
- IDocTagsToken.ITALIC.value,
- IDocTagsToken.STRIKETHROUGH.value,
- IDocTagsToken.SUPERSCRIPT.value,
- IDocTagsToken.SUBSCRIPT.value,
- IDocTagsToken.INLINE.value,
- IDocTagsToken.TEXT.value,
- IDocTagsToken.CODE.value,
- IDocTagsToken.FORMULA.value,
- IDocTagsToken.FACETS.value,
- }
-
- def __init__(self):
- super().__init__(convert_charrefs=False)
- self.out = []
-
- def handle_starttag(self, tag, attrs):
- if tag in self._ALLOWED:
- self.out.append(f"<{tag}>")
- else:
- # Escape disallowed tags literally
- self.out.append(html.escape(self.get_starttag_text(), quote=False))
-
- def handle_endtag(self, tag):
- if tag in self._ALLOWED:
- self.out.append(f"{tag}>")
- else:
- self.out.append(html.escape(f"{tag}>", quote=False))
-
- def handle_startendtag(self, tag, attrs):
- if tag in self._ALLOWED:
- self.out.append(f"<{tag}>{tag}>")
- else:
- self.out.append(html.escape(self.get_starttag_text(), quote=False))
-
- def handle_data(self, data):
- self.out.append(html.escape(data, quote=False))
-
- def handle_entityref(self, name):
- self.out.append(f"&{name};")
-
- def handle_charref(self, name):
- self.out.append(f"{name};")
-
- def handle_comment(self, data):
- self.out.append(html.escape(f"", quote=False))
-
-
-# def _escape_xml_text(text: str, xml_compliant: bool) -> str:
-# """Escape XML special characters if xml_compliant is enabled."""
-# if xml_compliant:
-# return html.escape(text, quote=False)
-# return text
-
-
-def _escape_xml_text(text: str, xml_compliant: bool) -> str:
- """Escape text for XML while optionally preserving specific IDocTags formatting tags.
-
- If xml_compliant=True, preserves only these tags (attributes stripped):
- bold, italic, strikethrough, superscript, subscript, inline, text, code, formula, facets.
- All other tags are escaped. If xml_compliant=False, returns text unchanged.
- """
- if not xml_compliant:
- return text
- parser = _WhitelistHTMLParser()
- parser.feed(text)
- parser.close()
- return "".join(parser.out)
+def _escape_text(text: str, escape_mode: EscapeMode) -> str:
+    """Wrap text in a CDATA section according to the given escape mode.
+
+    Args:
+        text: Raw text content to embed in the IDocTags output.
+        escape_mode: ``CDATA_ALWAYS`` wraps every text; ``CDATA_WHEN_NEEDED``
+            wraps only text containing one of the characters ", ', &, < or >.
+
+    Returns:
+        The CDATA-wrapped text, or ``text`` unchanged when no wrapping applies.
+    """
+    needs_cdata = escape_mode == EscapeMode.CDATA_ALWAYS or (
+        escape_mode == EscapeMode.CDATA_WHEN_NEEDED
+        and any(c in text for c in ('"', "'", "&", "<", ">"))
+    )
+    if not needs_cdata:
+        return text
+    # A literal "]]>" inside the payload would terminate the section early,
+    # so split it across two adjacent CDATA sections.
+    safe_text = text.replace("]]>", "]]]]><![CDATA[>")
+    return f"<![CDATA[{safe_text}]]>"
class IDocTagsListSerializer(BaseModel, BaseListSerializer):
@@ -1333,6 +1270,12 @@ def _serialize_single_item(
elif isinstance(item, ListItem):
tok = IDocTagsToken.LIST_TEXT
wrap_open_token = f"<{tok.value}>"
+ elif isinstance(item, CodeItem):
+ tok = IDocTagsToken.CODE
+ if item.code_language != CodeLanguageLabel.UNKNOWN:
+ wrap_open_token = f'<{tok.value} {IDocTagsAttributeKey.CLASS.value}="{item.code_language.value}">'
+ else:
+ wrap_open_token = f"<{tok.value}>"
elif (
isinstance(item, TextItem) and item.label == DocItemLabel.CHECKBOX_SELECTED
):
@@ -1402,44 +1345,26 @@ def _serialize_single_item(
hyperlink=item.hyperlink,
)
else:
+ text_part = _escape_text(item.text, params.escape_mode)
text_part = doc_serializer.post_process(
- text=item.text,
+ text=text_part,
formatting=item.formatting,
hyperlink=item.hyperlink,
)
- # For code blocks, preserve language using a lightweight facets marker
- # e.g., language=python before the code content.
- if isinstance(item, CodeItem):
- # lang = getattr(item.code_language, "value", str(item.code_language))
- if item.code_language != CodeLanguageLabel.UNKNOWN:
- parts.append(
- _wrap(
- # text=f"language={lang.lower()}",
- text=item.code_language.value,
- wrap_tag=IDocTagsToken.FACETS.value,
- )
- )
- # Keep the textual code content as-is (no stripping)
- else:
- text_part = text_part.strip()
-
- # Apply XML escaping if xml_compliant is enabled
- text_part = _escape_xml_text(text_part, params.xml_compliant)
-
if text_part:
parts.append(text_part)
if params.add_caption and isinstance(item, FloatingItem):
cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
if cap_text:
- cap_text = _escape_xml_text(cap_text, params.xml_compliant)
+ cap_text = _escape_text(cap_text, params.escape_mode)
parts.append(cap_text)
if params.add_footnote and isinstance(item, FloatingItem):
ftn_text = doc_serializer.serialize_footnotes(item=item, **kwargs).text
if ftn_text:
- ftn_text = _escape_xml_text(ftn_text, params.xml_compliant)
+ ftn_text = _escape_text(ftn_text, params.escape_mode)
parts.append(ftn_text)
text_res = "".join(parts)
@@ -1496,12 +1421,12 @@ def _serialize_meta_field(
if name == MetaFieldName.SUMMARY and isinstance(
field_val, SummaryMetaField
):
- escaped_text = _escape_xml_text(field_val.text, params.xml_compliant)
+ escaped_text = _escape_text(field_val.text, params.escape_mode)
txt = f"{escaped_text}"
elif name == MetaFieldName.DESCRIPTION and isinstance(
field_val, DescriptionMetaField
):
- escaped_text = _escape_xml_text(field_val.text, params.xml_compliant)
+ escaped_text = _escape_text(field_val.text, params.escape_mode)
txt = f"{escaped_text}"
elif name == MetaFieldName.CLASSIFICATION and isinstance(
field_val, PictureClassificationMetaField
@@ -1509,12 +1434,12 @@ def _serialize_meta_field(
class_name = self._humanize_text(
field_val.get_main_prediction().class_name
)
- escaped_class_name = _escape_xml_text(class_name, params.xml_compliant)
+ escaped_class_name = _escape_text(class_name, params.escape_mode)
txt = f"{escaped_class_name}"
elif name == MetaFieldName.MOLECULE and isinstance(
field_val, MoleculeMetaField
):
- escaped_smi = _escape_xml_text(field_val.smi, params.xml_compliant)
+ escaped_smi = _escape_text(field_val.smi, params.escape_mode)
txt = f"{escaped_smi}"
elif name == MetaFieldName.TABULAR_CHART and isinstance(
field_val, TabularChartMetaField
@@ -1524,9 +1449,7 @@ def _serialize_meta_field(
# elif tmp := str(field_val or ""):
# txt = tmp
elif name not in {v.value for v in MetaFieldName}:
- escaped_text = _escape_xml_text(
- str(field_val or ""), params.xml_compliant
- )
+ escaped_text = _escape_text(str(field_val or ""), params.escape_mode)
txt = _wrap(text=escaped_text, wrap_tag=name)
return txt
return None
@@ -1744,9 +1667,7 @@ def _emit_otsl(
parts.append(cell_loc)
if params.add_content:
# Apply XML escaping to table cell content
- escaped_content = _escape_xml_text(
- content, params.xml_compliant
- )
+ escaped_content = _escape_text(content, params.escape_mode)
parts.append(escaped_content)
else:
parts.append(
@@ -2139,6 +2060,11 @@ def serialize_italic(self, text: str, **kwargs: Any) -> str:
"""Apply IDocTags-specific italic serialization."""
return _wrap(text=text, wrap_tag=IDocTagsToken.ITALIC.value)
+    @override
+    def serialize_underline(self, text: str, **kwargs: Any) -> str:
+        """Apply IDocTags underline serialization using the UNDERLINE token."""
+        underline_tag = IDocTagsToken.UNDERLINE.value
+        return _wrap(text=text, wrap_tag=underline_tag)
+
@override
def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
"""Apply IDocTags-specific strikethrough serialization."""
@@ -2341,7 +2267,12 @@ def _extract_code_content_and_language(
self, el: Element
) -> tuple[str, CodeLanguageLabel]:
"""Extract code content and language from a element."""
- lang_label = CodeLanguageLabel.UNKNOWN
+ try:
+ lang_label = CodeLanguageLabel(
+ el.getAttribute(IDocTagsAttributeKey.CLASS.value)
+ )
+ except ValueError:
+ lang_label = CodeLanguageLabel.UNKNOWN
parts: list[str] = []
for node in el.childNodes:
if isinstance(node, Text):
@@ -2349,36 +2280,9 @@ def _extract_code_content_and_language(
parts.append(node.data)
elif isinstance(node, Element):
nm_child = node.tagName
- if nm_child == IDocTagsToken.FACETS.value:
- language_text = self._get_text(node).strip()
- try:
- lang_label = next(
- lbl
- for lbl in CodeLanguageLabel
- if lbl.value == language_text
- )
- except StopIteration:
- lang_label = CodeLanguageLabel.UNKNOWN
-
- """
- facets_text = self._get_text(node).strip()
- if "=" in facets_text:
- key, val = facets_text.split("=", 1)
- if key.strip().lower() == "language":
- val_norm = val.strip().lower()
- try:
- lang_label = next(
- lbl
- for lbl in CodeLanguageLabel
- if lbl.value.lower() == val_norm
- )
- except StopIteration:
- lang_label = CodeLanguageLabel.UNKNOWN
- """
- continue
if nm_child == IDocTagsToken.LOCATION.value:
continue
- if nm_child == IDocTagsToken.BR.value:
+ elif nm_child == IDocTagsToken.BR.value:
parts.append("\n")
else:
parts.append(self._get_text(node))
@@ -2793,11 +2697,12 @@ def _extract_text_with_formatting(
# Mapping of format tags to Formatting attributes
format_tags = {
- IDocTagsToken.BOLD.value: "bold",
- IDocTagsToken.ITALIC.value: "italic",
- IDocTagsToken.STRIKETHROUGH.value: "strikethrough",
- IDocTagsToken.SUPERSCRIPT.value: "superscript",
- IDocTagsToken.SUBSCRIPT.value: "subscript",
+ IDocTagsToken.BOLD,
+ IDocTagsToken.ITALIC,
+ IDocTagsToken.STRIKETHROUGH,
+ IDocTagsToken.UNDERLINE,
+ IDocTagsToken.SUPERSCRIPT,
+ IDocTagsToken.SUBSCRIPT,
}
if tag_name in format_tags:
@@ -2815,6 +2720,8 @@ def _extract_text_with_formatting(
child_formatting.italic = True
elif tag_name == IDocTagsToken.STRIKETHROUGH.value:
child_formatting.strikethrough = True
+ elif tag_name == IDocTagsToken.UNDERLINE.value:
+ child_formatting.underline = True
elif tag_name == IDocTagsToken.SUPERSCRIPT.value:
child_formatting.script = Script.SUPER
elif tag_name == IDocTagsToken.SUBSCRIPT.value:
diff --git a/examples/convert_to_idoctags.py b/examples/convert_to_idoctags.py
index 61692c0a..fac952b9 100644
--- a/examples/convert_to_idoctags.py
+++ b/examples/convert_to_idoctags.py
@@ -18,6 +18,7 @@
from docling_core.types.doc import DoclingDocument, ImageRef
from docling_core.types.doc.base import ImageRefMode
from docling_core.experimental.idoctags import (
+ EscapeMode,
IDocTagsSerializationMode,
IDocTagsParams,
IDocTagsVocabulary,
@@ -28,7 +29,7 @@
import numpy as np
# In order to download **before** the datasets library, run
-#
+#
# HF_HUB_DISABLE_XET=1 hf download --repo-type dataset "{hf-repo-id}"
#
@@ -151,7 +152,7 @@ def _write_report(rows: list[dict[str, str]], path: Path) -> None:
- Row ID
- Loaded DoclingDocument
- Loaded DoclingDocument Error
- - Serialized IDocTags (mode, xml_compliant, content) for all combinations
+ - Serialized IDocTags (mode, escape_mode, content) for all combinations
- Serialized HTML
- Serialized HTML Error
@@ -166,12 +167,12 @@ def _write_report(rows: list[dict[str, str]], path: Path) -> None:
"Loaded DoclingDocument Error",
]
- # Add all combinations of mode, xml_compliant, and content
- for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]:
- for comp in [True, False]:
+ # Add all combinations of mode, escape_mode, and content
+ for mode in IDocTagsSerializationMode:
+ for esc_mode in EscapeMode:
for content in [True, False]:
- cols.append(f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})")
- cols.append(f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error")
+ cols.append(f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})")
+ cols.append(f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content}) Error")
cols.extend([
"Serialized HTML",
@@ -193,10 +194,10 @@ def _count_yes(key: str) -> int:
]
# Add summary rows for all combinations
- for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]:
- for comp in [True, False]:
+ for mode in IDocTagsSerializationMode:
+ for esc_mode in EscapeMode:
for content in [True, False]:
- col_name = f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"
+ col_name = f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})"
summary_rows.append({"Metric": col_name, "Count": _count_yes(col_name)})
summary_rows.append({"Metric": "Serialized HTML", "Count": _count_yes("Serialized HTML")})
@@ -315,14 +316,14 @@ def _count_yes(key: str) -> int:
"Loaded DoclingDocument": _yes(False),
"Loaded DoclingDocument Error": "",
"Serialized HTML": _yes(False),
- "Serialized HTML Error": "",
+ "Serialized HTML Error": "",
}
- for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]:
- for comp in [True, False]:
+ for mode in IDocTagsSerializationMode:
+ for esc_mode in EscapeMode:
for content in [True, False]:
- row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"] = _yes(False)
- row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error"] = ""
+ # Use esc_mode.value in both keys so they match the report column names on every Python version.
+ row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})"] = _yes(False)
+ row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content}) Error"] = ""
try:
doc = DoclingDocument.model_validate_json(text)
@@ -338,12 +339,12 @@ def _count_yes(key: str) -> int:
# Record failure outcome for this row
row_result["Loaded DoclingDocument Error"] = str(exc)
- for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]:
- for comp in [True, False]:
+ for mode in IDocTagsSerializationMode:
+ for esc_mode in EscapeMode:
for content in [True, False]:
- row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"] = _yes(False)
- row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error"] = "NA"
-
+ row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})"] = _yes(False)
+ row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content}) Error"] = "NA"
+
results_rows.append(row_result)
continue
@@ -353,24 +354,24 @@ def _count_yes(key: str) -> int:
# __.save(png_path)
for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]:
- for comp in [True, False]:
+ # Probe the real EscapeMode members (not booleans) so escape_mode holds a valid value
+ # and the result keys match the report column names.
+ for esc_mode in EscapeMode:
for content in [True, False]:
try:
params_probe = IDocTagsParams()
params_probe.add_content = content
params_probe.mode = mode
- params_probe.xml_compliant = comp
+ params_probe.escape_mode = esc_mode
params_probe.pretty_indentation = " " if mode==IDocTagsSerializationMode.HUMAN_FRIENDLY else None
iser_probe = IDocTagsDocSerializer(doc=doc, params=params_probe)
_ = iser_probe.serialize().text
- row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"] = _yes(True)
- row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error"] = ""
+ row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})"] = _yes(True)
+ row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content}) Error"] = ""
except Exception as exc_:
- row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})"] = _yes(False)
- row_result[f"Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}) Error"] = str(exc_)
+ row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})"] = _yes(False)
+ row_result[f"Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content}) Error"] = str(exc_)
# Attempt HTML export (non-writing) to check serialization capability
try:
@@ -401,9 +402,9 @@ def _count_yes(rows: list[dict[str, str]], key: str) -> int:
print(f" - Total processed: {len(results_rows)}")
print(f" - Loaded DoclingDocument: {_count_yes(results_rows, 'Loaded DoclingDocument')}")
for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]:
- for comp in [True, False]:
+ # Same EscapeMode iteration and key format as the probe loop so the lookups find the recorded results.
+ for esc_mode in EscapeMode:
for content in [True, False]:
- print(f" - Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content}): {_count_yes(results_rows, f'Serialized IDocTags ({mode.value}, xml_compliant={comp}, content={content})')}")
+ print(f" - Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content}): {_count_yes(results_rows, f'Serialized IDocTags ({mode.value}, escape_mode={esc_mode.value}, content={content})')}")
print(f" - Serialized HTML: {_count_yes(results_rows, 'Serialized HTML')}")
if errors:
diff --git a/test/conftest.py b/test/conftest.py
index 7dadfc5c..63b4aaf2 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -26,8 +26,8 @@
)
-@pytest.fixture(scope="session")
-def _construct_doc() -> DoclingDocument:
+# factored out of fixture to simplify IDE-level debugging
+def _construct_doc_impl() -> DoclingDocument:
"""Fixture for a DoclingDocument to be reused across a test session."""
doc = DoclingDocument(name="Untitled 1")
@@ -375,6 +375,12 @@ def _construct_doc() -> DoclingDocument:
return doc
+@pytest.fixture(scope="session")
+def _construct_doc() -> DoclingDocument:
+    """Session-scoped fixture returning the document built by `_construct_doc_impl`."""
+    session_doc = _construct_doc_impl()
+    return session_doc
+
+
@pytest.fixture(scope="function")
def sample_doc(_construct_doc: DoclingDocument) -> DoclingDocument:
"""Copy of a DoclingDocument for each test function."""
diff --git a/test/data/doc/cdata_always.gt.idt.xml b/test/data/doc/cdata_always.gt.idt.xml
new file mode 100644
index 00000000..251768cd
--- /dev/null
+++ b/test/data/doc/cdata_always.gt.idt.xml
@@ -0,0 +1,169 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is the caption of table 1.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ This is the caption of figure 1.
+
+
+ This is the caption of figure 2.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ & ampersands]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 100]]>
+
+]]>
+
+
+
+]]>
+
+
+
+
+
+
+
+
+
diff --git a/test/data/doc/cdata_when_needed.gt.idt.xml b/test/data/doc/cdata_when_needed.gt.idt.xml
new file mode 100644
index 00000000..3312d779
--- /dev/null
+++ b/test/data/doc/cdata_when_needed.gt.idt.xml
@@ -0,0 +1,179 @@
+
+
+ item of leading list
+
+ Title of the Document
+ Author 1
+Affiliation 1
+ Author 2
+Affiliation 2
+ 1. Introduction
+ This paper introduces the biggest invention ever made. ...
+
+ list item 1
+ list item 2
+ list item 3
+
+ list item 3.a
+ list item 3.b
+ list item 3.c
+
+ list item 3.c.i
+
+
+ list item 4
+
+
+ This is the caption of table 1.
+
+
+ Product
+
+ Years
+
+
+
+
+ 2016
+
+ 2017
+
+
+ Apple
+
+ 49823
+
+ 695944
+
+
+
+
+ This is the caption of figure 1.
+
+
+ This is the caption of figure 2.
+
+
+ item 1 of list
+
+
+ item 1 of list after empty list
+ item 2 of list after empty list
+
+
+ item 1 of neighboring list
+ item 2 of neighboring list
+
+ item 1 of sub list
+
+
+ Here a code snippet:
+
+ (to be displayed inline)
+
+
+
+
+ Here a formula:
+ E=mc^2
+ (to be displayed inline)
+
+
+
+
+ Here a code block:
+
+ Here a formula block:
+ E=mc^2
+
+ Some formatting chops:
+
+ bold
+
+
+ italic
+
+
+ underline
+
+
+ strikethrough
+
+
+ subscript
+
+
+ superscript
+
+ hyperlink
+
+
+
+
+
+ everything at the same time.
+
+
+
+
+
+
+ Item 1 in A
+ Item 2 in A
+ Item 3 in A
+
+ Item 1 in B
+ Item 2 in B
+
+ Item 1 in C
+ Item 2 in C
+
+ Item 3 in B
+
+ Item 4 in A
+
+
+ List item without parent list group
+
+ The end.
+ Simple text
+ 4 leading spaces, 1 trailing
+
+
+
+
+ & ampersands]]>
+ Description content
+
+
+ 0 == 0
+ 1 leading space, 4 trailing
+
+ 42 == 42
+
+
+
+
+ Foo
+
+ Bar
+
+
+
+ 100]]>
+
+]]>
+
+
+ Only
+
+]]>
+
+
+
+
+
+
+
+
+
diff --git a/test/data/doc/ddoc_0.v0.gt.idt b/test/data/doc/ddoc_0.v0.gt.idt
index ba2dc6d6..a0c01bd1 100644
--- a/test/data/doc/ddoc_0.v0.gt.idt
+++ b/test/data/doc/ddoc_0.v0.gt.idt
@@ -73,8 +73,7 @@
- Set connect string for connecting to ndb_mgmd. Syntax: "[nodeid=id;][host=]hostname[:port]". Overrides entries in NDB_CONNECTSTRING and my.cnf.
-
+
diff --git a/test/data/doc/dummy_doc_with_meta.gt.idt.xml b/test/data/doc/dummy_doc_with_meta.gt.idt.xml
index 020e1115..17f37fea 100644
--- a/test/data/doc/dummy_doc_with_meta.gt.idt.xml
+++ b/test/data/doc/dummy_doc_with_meta.gt.idt.xml
@@ -23,7 +23,7 @@
...
Bar chart
CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
- {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}
+
diff --git a/test/test_deserializer_idoctags.py b/test/test_deserializer_idoctags.py
index a91b5a96..1b1191b3 100644
--- a/test/test_deserializer_idoctags.py
+++ b/test/test_deserializer_idoctags.py
@@ -27,18 +27,15 @@ def _serialize(
add_location: bool = True,
add_content: bool = True,
add_table_cell_location: bool = False,
- add_table_cell_text: bool = True,
- xml_compliant: bool = True,
) -> str:
+ params = IDocTagsParams(
+ add_location=add_location,
+ add_content=add_content,
+ add_table_cell_location=add_table_cell_location,
+ )
ser = IDocTagsDocSerializer(
doc=doc,
- params=IDocTagsParams(
- add_location=add_location,
- add_content=add_content,
- add_table_cell_location=add_table_cell_location,
- add_table_cell_text=add_table_cell_text,
- xml_compliant=xml_compliant,
- ),
+ params=params,
)
return ser.serialize().text
@@ -524,7 +521,7 @@ def test_roundtrip_complex_table_with_caption_prov():
doc.add_table(data=td, caption=cap, prov=_default_prov())
- dt = _serialize(doc, add_table_cell_text=True, add_content=True)
+ dt = _serialize(doc, add_content=True)
if DO_PRINT:
print(dt)
doc2 = _deserialize(dt)
@@ -1063,7 +1060,7 @@ def test_roundtrip_table_with_rich_cells():
doc.add_table_cell(table_item=table, cell=rich_cell_2_1)
# Serialize and deserialize
- dt = _serialize(doc, add_table_cell_text=True, add_content=True)
+ dt = _serialize(doc, add_content=True)
if DO_PRINT:
print("\n", dt)
doc2 = _deserialize(dt)
@@ -1081,7 +1078,7 @@ def test_roundtrip_table_with_rich_cells():
assert len(rich_cells) >= 1 # At least one rich cell should be preserved
# Verify round-trip serialization
- dt2 = _serialize(doc2, add_table_cell_text=True, add_content=True)
+ dt2 = _serialize(doc2, add_content=True)
if DO_PRINT:
print("\ndt:", dt)
print("\ndt2:", dt2)
@@ -1096,11 +1093,11 @@ def test_roundtrip_table_with_rich_cells():
def test_constructed_doc(sample_doc: DoclingDocument):
doc = sample_doc
- dt = _serialize(doc, add_table_cell_text=True, add_content=True)
+ dt = _serialize(doc)
doc2 = _deserialize(dt)
- dt2 = _serialize(doc2, add_table_cell_text=True, add_content=True)
+ dt2 = _serialize(doc2)
# if DO_PRINT:
# print(f"--------------------------dt:\n\n{dt}\n\n")
@@ -1115,12 +1112,10 @@ def test_constructed_doc(sample_doc: DoclingDocument):
def test_constructed_rich_table_doc(rich_table_doc: DoclingDocument):
doc = rich_table_doc
- dt = _serialize(doc, add_table_cell_text=True, add_content=True, xml_compliant=True)
+ dt = _serialize(doc, add_content=True)
doc2 = _deserialize(dt)
- dt2 = _serialize(
- doc2, add_table_cell_text=True, add_content=True, xml_compliant=True
- )
+ dt2 = _serialize(doc2, add_content=True)
assert dt2 == dt
diff --git a/test/test_serialization_idoctag.py b/test/test_serialization_idoctag.py
index 64fe0735..0dc196ba 100644
--- a/test/test_serialization_idoctag.py
+++ b/test/test_serialization_idoctag.py
@@ -1,10 +1,12 @@
"""Unit tests for IDocTags create_closing_token helper."""
from pathlib import Path
+from test.test_serialization import verify
import pytest
from docling_core.experimental.idoctags import (
+ EscapeMode,
IDocTagsDocSerializer,
IDocTagsParams,
IDocTagsSerializationMode,
@@ -17,8 +19,12 @@
Script,
TableData,
)
-
-from .test_serialization import verify
+from docling_core.types.doc.document import (
+ DescriptionMetaField,
+ PictureMeta,
+ SummaryMetaField,
+)
+from docling_core.types.doc.labels import CodeLanguageLabel
# ===============================
# IDocTags unit-tests
@@ -186,114 +192,70 @@ def test_idoctags_meta():
verify(exp_file=src.with_suffix(".gt.idt.xml"), actual=actual)
-def test_xml_compliant_escaping():
- """Test that xml_compliant parameter properly escapes XML special characters."""
- doc = DoclingDocument(name="test_xml_escape")
-
- # Add text with XML special characters
- doc.add_text(
- label=DocItemLabel.TEXT, text="Text with & special chars like > and <"
+def _create_escape_test_doc(inp_doc: DoclingDocument):
+ doc = inp_doc.model_copy(deep=True)
+ doc.add_text(label=DocItemLabel.TEXT, text="Simple text")
+ doc.add_text(label=DocItemLabel.TEXT, text=" 4 leading spaces, 1 trailing ")
+ doc.add_text(label=DocItemLabel.TEXT, text="Some 'single' quotes")
+ doc.add_text(label=DocItemLabel.TEXT, text='Some "double" quotes')
+ text_item = doc.add_text(label=DocItemLabel.TEXT, text="An ampersand: &")
+ text_item.meta = PictureMeta(
+ summary=SummaryMetaField(text="Summary with & ampersands"),
+ description=DescriptionMetaField(text="Description content"),
)
-
- # Add a table with special characters in cells
- td = TableData(num_rows=0, num_cols=2)
+ doc.add_code(text="0 == 0")
+ doc.add_code(text=" 1 leading space, 4 trailing ")
+ doc.add_code(text="0 < 1")
+ doc.add_code(text="42 == 42", code_language=CodeLanguageLabel.PYTHON)
+ doc.add_code(text="42 < 1337", code_language=CodeLanguageLabel.PYTHON)
+
+ td = TableData(num_cols=2)
+ td.add_row(["Foo", "Bar"])
td.add_row(["Header & Title", "Value > 100"])
td.add_row(["