From c4bcf78378a9ec4c4a1523af114a5bca4dbb305e Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 14 Nov 2025 11:41:09 +0100
Subject: [PATCH 01/22] refactor: move WebVTT data model from docling

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py       | 416 +++++++++++++++++++++++++
 test/data/webvtt/webvtt_example_01.vtt |  42 +++
 test/data/webvtt/webvtt_example_02.vtt |  15 +
 test/data/webvtt/webvtt_example_03.vtt |  57 ++++
 test/test_webvtt.py                    | 199 ++++++++++++
 5 files changed, 729 insertions(+)
 create mode 100644 docling_core/types/doc/webvtt.py
 create mode 100644 test/data/webvtt/webvtt_example_01.vtt
 create mode 100644 test/data/webvtt/webvtt_example_02.vtt
 create mode 100644 test/data/webvtt/webvtt_example_03.vtt
 create mode 100644 test/test_webvtt.py

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
new file mode 100644
index 00000000..eccae4a6
--- /dev/null
+++ b/docling_core/types/doc/webvtt.py
@@ -0,0 +1,416 @@
+"""Models for the Docling's adoption of Web Video Text Tracks format."""
+
+import logging
+import re
+from typing import Annotated, ClassVar, Literal, Optional, Union, cast
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic.types import StringConstraints
+from typing_extensions import Self, override
+
+_log = logging.getLogger(__name__)
+
+
+class _WebVTTTimestamp(BaseModel):
+    """Model representing a WebVTT timestamp.
+
+    A WebVTT timestamp is always interpreted relative to the current playback position
+    of the media data that the WebVTT file is to be synchronized with.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")
+
+    raw: Annotated[
+        str,
+        Field(
+            description="A representation of the WebVTT Timestamp as a single string"
+        ),
+    ]
+
+    _pattern: ClassVar[re.Pattern] = re.compile(
+        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
+    )
+    _hours: int
+    _minutes: int
+    _seconds: int
+    _millis: int
+
+    @model_validator(mode="after")
+    def validate_raw(self) -> Self:
+        """Parse ``raw`` into hours, minutes, seconds and milliseconds.
+
+        Raises:
+            ValueError: If ``raw`` does not match the WebVTT timestamp syntax.
+        """
+        m = self._pattern.match(self.raw)
+        if not m:
+            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
+        # The pattern only admits two-digit minutes and seconds in 00-59, so
+        # no separate range check is needed after a successful match.
+        hours, minutes, seconds, millis = m.groups(default="0")
+        self._hours, self._minutes = int(hours), int(minutes)
+        self._seconds, self._millis = int(seconds), int(millis)
+        return self
+
+    @property
+    def seconds(self) -> float:
+        """Return the timestamp as a total number of seconds.
+
+        The whole-second part is summed as an integer before the millisecond
+        fraction is added.
+        """
+        whole = self._hours * 3600 + self._minutes * 60 + self._seconds
+        return whole + self._millis / 1000.0
+
+    @override
+    def __str__(self) -> str:
+        return self.raw
+
+
+_WebVTTCueIdentifier = Annotated[
+    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
+]
+
+
+class _WebVTTCueTimings(BaseModel):
+    """Model representating WebVTT cue timings."""
+
+    start: Annotated[
+        _WebVTTTimestamp, Field(description="Start time offset of the cue")
+    ]
+    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
+
+    @model_validator(mode="after")
+    def check_order(self) -> Self:
+        """Reject timings whose end is not strictly after the start."""
+        if self.end.seconds <= self.start.seconds:
+            raise ValueError("End timestamp must be greater than start timestamp")
+        return self
+
+    @override
+    def __str__(self):
+        return f"{self.start} --> {self.end}"
+
+
+class _WebVTTCueTextSpan(BaseModel):
+    """Model representing a WebVTT cue text span."""
+
+    text: str
+    span_type: Literal["text"] = "text"
+
+    @field_validator("text", mode="after")
+    @classmethod
+    def validate_text(cls, value: str) -> str:
+        if any(ch in value for ch in {"\n", "\r", "&", "<"}):
+            raise ValueError("Cue text span contains invalid characters")
+        if len(value) == 0:
+            raise ValueError("Cue text span cannot be empty")
+        return value
+
+    @override
+    def __str__(self):
+        return self.text
+
+
+class _WebVTTCueVoiceSpan(BaseModel):
+    """Model representing a WebVTT cue voice span."""
+
+    annotation: Annotated[
+        str,
+        Field(
+            description=(
+                "Cue span start tag annotation text representing the name of thevoice"
+            )
+        ),
+    ]
+    classes: Annotated[
+        list[str],
+        Field(description="List of classes representing the cue span's significance"),
+    ] = []
+    components: Annotated[
+        list["_WebVTTCueComponent"],
+        Field(description="The components representing the cue internal text"),
+    ] = []
+    span_type: Literal["v"] = "v"
+
+    @field_validator("annotation", mode="after")
+    @classmethod
+    def validate_annotation(cls, value: str) -> str:
+        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
+            raise ValueError(
+                "Cue span start tag annotation contains invalid characters"
+            )
+        if not value:
+            raise ValueError("Cue text span cannot be empty")
+        return value
+
+    @field_validator("classes", mode="after")
+    @classmethod
+    def validate_classes(cls, value: list[str]) -> list[str]:
+        for item in value:
+            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
+                raise ValueError(
+                    "A cue span start tag class contains invalid characters"
+                )
+            if not item:
+                raise ValueError("Cue span start tag classes cannot be empty")
+        return value
+
+    @override
+    def __str__(self):
+        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
+        inner = "".join(str(span) for span in self.components)
+        return f"<{tag} {self.annotation}>{inner}</v>"
+
+
+class _WebVTTCueClassSpan(BaseModel):
+    span_type: Literal["c"] = "c"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<c>{inner}</c>"
+
+
+class _WebVTTCueItalicSpan(BaseModel):
+    span_type: Literal["i"] = "i"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<i>{inner}</i>"
+
+
+class _WebVTTCueBoldSpan(BaseModel):
+    span_type: Literal["b"] = "b"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<b>{inner}</b>"
+
+
+class _WebVTTCueUnderlineSpan(BaseModel):
+    span_type: Literal["u"] = "u"
+    components: list["_WebVTTCueComponent"]
+
+    @override
+    def __str__(self):
+        inner = "".join(str(span) for span in self.components)
+        return f"<u>{inner}</u>"
+
+
+_WebVTTCueComponent = Annotated[
+    Union[
+        _WebVTTCueTextSpan,
+        _WebVTTCueClassSpan,
+        _WebVTTCueItalicSpan,
+        _WebVTTCueBoldSpan,
+        _WebVTTCueUnderlineSpan,
+        _WebVTTCueVoiceSpan,
+    ],
+    Field(discriminator="span_type", description="The WebVTT cue component"),
+]
+
+
+class _WebVTTCueBlock(BaseModel):
+    """Model representing a WebVTT cue block.
+
+    The optional WebVTT cue settings list is not supported.
+    The cue payload is limited to the following spans: text, class, italic, bold,
+    underline, and voice.
+    """
+
+    model_config = ConfigDict(regex_engine="python-re")
+
+    identifier: Optional[_WebVTTCueIdentifier] = Field(
+        None, description="The WebVTT cue identifier"
+    )
+    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
+    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
+
+    _pattern_block: ClassVar[re.Pattern] = re.compile(
+        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
+    )
+    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
+        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
+        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
+    )
+
+    @field_validator("payload", mode="after")
+    @classmethod
+    def validate_payload(cls, payload):
+        for voice in payload:
+            if "-->" in str(voice):
+                raise ValueError("Cue payload must not contain '-->'")
+        return payload
+
+    @classmethod
+    def parse(cls, raw: str) -> "_WebVTTCueBlock":
+        lines = raw.strip().splitlines()
+        if not lines:
+            raise ValueError("Cue block must have at least one line")
+        identifier: Optional[_WebVTTCueIdentifier] = None
+        timing_line = lines[0]
+        if "-->" not in timing_line and len(lines) > 1:
+            identifier = timing_line
+            timing_line = lines[1]
+            cue_lines = lines[2:]
+        else:
+            cue_lines = lines[1:]
+
+        if "-->" not in timing_line:
+            raise ValueError("Cue block must contain WebVTT cue timings")
+
+        start, end = [t.strip() for t in timing_line.split("-->")]
+        end = re.split(" |\t", end)[0]  # ignore the cue settings list
+        timings: _WebVTTCueTimings = _WebVTTCueTimings(
+            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
+        )
+        cue_text = " ".join(cue_lines).strip()
+        if cue_text.startswith("<v") and "</v>" not in cue_text:
+            # adding close tag for cue voice spans without end tag
+            cue_text += "</v>"
+
+        stack: list[list[_WebVTTCueComponent]] = [[]]
+        tag_stack: list[Union[str, tuple]] = []
+
+        pos = 0
+        matches = list(cls._pattern_block.finditer(cue_text))
+        i = 0
+        while i < len(matches):
+            match = matches[i]
+            if match.start() > pos:
+                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
+            tag = match.group(0)
+
+            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
+                tag_type = tag[1:2]
+                tag_stack.append(tag_type)
+                stack.append([])
+            elif tag == "</i>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueItalicSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</b>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueBoldSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</u>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
+                tag_stack.pop()
+            elif tag == "</c>":
+                children = stack.pop()
+                stack[-1].append(_WebVTTCueClassSpan(components=children))
+                tag_stack.pop()
+            elif tag.startswith("<v"):
+                tag_stack.append(("v", tag))
+                stack.append([])
+            elif tag.startswith("</v"):
+                children = stack.pop() if stack else []
+                if (
+                    tag_stack
+                    and isinstance(tag_stack[-1], tuple)
+                    and tag_stack[-1][0] == "v"
+                ):
+                    _, voice = cast(tuple, tag_stack.pop())
+                    voice_match = cls._pattern_voice_tag.match(voice)
+                    if voice_match:
+                        class_string = voice_match.group("class")
+                        annotation = voice_match.group("annotation")
+                        if annotation:
+                            classes: list[str] = []
+                            if class_string:
+                                classes = [c for c in class_string.split(".") if c]
+                            stack[-1].append(
+                                _WebVTTCueVoiceSpan(
+                                    annotation=annotation.strip(),
+                                    classes=classes,
+                                    components=children,
+                                )
+                            )
+
+            pos = match.end()
+            i += 1
+
+        if pos < len(cue_text):
+            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
+
+        return cls(
+            identifier=identifier,
+            timings=timings,
+            payload=stack[0],
+        )
+
+    def __str__(self):
+        parts = []
+        if self.identifier:
+            parts.append(f"{self.identifier}\n")
+        timings_line = str(self.timings)
+        parts.append(timings_line + "\n")
+        for idx, span in enumerate(self.payload):
+            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
+                # the end tag may be omitted for brevity
+                parts.append(str(span).removesuffix("</v>"))
+            else:
+                parts.append(str(span))
+
+        return "".join(parts)
+
+
+class _WebVTTFile(BaseModel):
+    """A model representing a WebVTT file."""
+
+    cue_blocks: list[_WebVTTCueBlock]
+
+    @staticmethod
+    def verify_signature(content: str) -> bool:
+        """Tell whether ``content`` starts with a valid WebVTT signature.
+
+        An optional leading byte order mark is ignored. The string "WEBVTT"
+        must end the content or be followed by a space, tab or line terminator.
+        """
+        content = content.removeprefix("\ufeff")
+        rest = content.removeprefix("WEBVTT")
+        return rest != content and rest[:1] in ("", " ", "\t", "\n", "\r")
+
+    @classmethod
+    def parse(cls, raw: str) -> "_WebVTTFile":
+        """Parse WebVTT content, logging and skipping cue blocks that fail.
+
+        Raises:
+            ValueError: If the content does not start with a WebVTT signature.
+        """
+        # Normalize newlines to LF
+        raw = raw.replace("\r\n", "\n").replace("\r", "\n")
+        if not cls.verify_signature(raw):
+            raise ValueError("Invalid WebVTT file signature")
+
+        # Strip "WEBVTT" header line
+        body = raw.partition("\n")[2]
+
+        # Remove NOTE/STYLE/REGION blocks, each of which runs up to a blank line
+        body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.M)
+        # Consume the keyword line first; otherwise the lazy match stops right there
+        body = re.sub(r"^(?:STYLE|REGION)[ \t]*\n(?:.+\n)*?\n", "", body, flags=re.M)
+
+        cues: list[_WebVTTCueBlock] = []
+        for block in filter(None, re.split(r"\n\s*\n", body.strip())):
+            try:
+                cues.append(_WebVTTCueBlock.parse(block))
+            except ValueError as e:
+                _log.warning("Failed to parse cue block:\n%s\n%s", block, e)
+        return cls(cue_blocks=cues)
+
+    def __iter__(self):
+        return iter(self.cue_blocks)
+
+    def __getitem__(self, idx):
+        return self.cue_blocks[idx]
+
+    def __len__(self):
+        return len(self.cue_blocks)
diff --git a/test/data/webvtt/webvtt_example_01.vtt b/test/data/webvtt/webvtt_example_01.vtt
new file mode 100644
index 00000000..333ca4a8
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_01.vtt
@@ -0,0 +1,42 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:11.000 --> 00:13.000
+<v Roger Bingham>We are in New York City
+
+00:13.000 --> 00:16.000
+<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street
+
+00:16.000 --> 00:18.000
+<v Roger Bingham>from the American Museum of Natural History
+
+00:18.000 --> 00:20.000
+<v Roger Bingham>And with me is Neil deGrasse Tyson
+
+00:20.000 --> 00:22.000
+<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium
+
+00:22.000 --> 00:24.000
+<v Roger Bingham>at the AMNH.
+
+00:24.000 --> 00:26.000
+<v Roger Bingham>Thank you for walking down here.
+
+00:27.000 --> 00:30.000
+<v Roger Bingham>And I want to do a follow-up on the last conversation we did.
+
+00:30.000 --> 00:31.500 align:right size:50%
+<v Roger Bingham>When we e-mailed—
+
+00:30.500 --> 00:32.500 align:left size:50%
+<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?
+
+00:32.000 --> 00:35.500 align:right size:50%
+<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos
+
+00:32.500 --> 00:33.500 align:left size:50%
+<v Neil deGrasse Tyson><i>Laughs</i>
+
+00:35.500 --> 00:38.000
+<v Roger Bingham>You know I’m so excited my glasses are falling off here.
diff --git a/test/data/webvtt/webvtt_example_02.vtt b/test/data/webvtt/webvtt_example_02.vtt
new file mode 100644
index 00000000..1152a1e8
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_02.vtt
@@ -0,0 +1,15 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:00.000 --> 00:02.000
+<v.first.loud Esme>It’s a blue apple tree!
+
+00:02.000 --> 00:04.000
+<v Mary>No way!
+
+00:04.000 --> 00:06.000
+<v Esme>Hee!</v> <i>laughter</i>
+
+00:06.000 --> 00:08.000
+<v.loud Mary>That’s awesome!
\ No newline at end of file
diff --git a/test/data/webvtt/webvtt_example_03.vtt b/test/data/webvtt/webvtt_example_03.vtt
new file mode 100644
index 00000000..a4dc1291
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_03.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+<v Speaker A>OK,
+I think now we should be recording</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+<v Speaker A>properly.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+<v Speaker A>Yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+<v Speaker B>I was also thinking.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+<v Speaker B>Would be maybe good to create items,</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+<v Speaker B>some metadata,
+some options that can be specific.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+<v Speaker A>Yeah,
+I mean I think you went even more than</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+<v Speaker B>But we preserved the atoms.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+<v Speaker A>than me.
+I just opened the format.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+<v Speaker A>give it a try, yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+<v Speaker B>Okay, talk to you later.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+<v Speaker A>See you.</v>
\ No newline at end of file
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
new file mode 100644
index 00000000..75f5dfc1
--- /dev/null
+++ b/test/test_webvtt.py
@@ -0,0 +1,199 @@
+# Assisted by watsonx Code Assistant
+
+
+import pytest
+from pydantic import ValidationError
+
+from docling_core.types.doc.webvtt import (
+    _WebVTTCueItalicSpan,
+    _WebVTTCueTextSpan,
+    _WebVTTCueTimings,
+    _WebVTTCueVoiceSpan,
+    _WebVTTFile,
+    _WebVTTTimestamp,
+)
+
+from .test_data_gen_flag import GEN_TEST_DATA
+
+GENERATE = GEN_TEST_DATA
+
+
+def test_vtt_cue_commponents():
+    """Test WebVTT components."""
+    valid_timestamps = [
+        "00:01:02.345",
+        "12:34:56.789",
+        "02:34.567",
+        "00:00:00.000",
+    ]
+    valid_total_seconds = [
+        1 * 60 + 2.345,
+        12 * 3600 + 34 * 60 + 56.789,
+        2 * 60 + 34.567,
+        0.0,
+    ]
+    for idx, ts in enumerate(valid_timestamps):
+        model = _WebVTTTimestamp(raw=ts)
+        assert model.seconds == valid_total_seconds[idx]
+
+    """Test invalid WebVTT timestamps."""
+    invalid_timestamps = [
+        "00:60:02.345",  # minutes > 59
+        "00:01:60.345",  # seconds > 59
+        "00:01:02.1000",  # milliseconds > 999
+        "01:02:03",  # missing milliseconds
+        "01:02",  # missing milliseconds
+        ":01:02.345",  # extra : for missing hours
+        "abc:01:02.345",  # invalid format
+    ]
+    for ts in invalid_timestamps:
+        with pytest.raises(ValidationError):
+            _WebVTTTimestamp(raw=ts)
+
+    """Test the timestamp __str__ method."""
+    model = _WebVTTTimestamp(raw="00:01:02.345")
+    assert str(model) == "00:01:02.345"
+
+    """Test valid cue timings."""
+    start = _WebVTTTimestamp(raw="00:10.005")
+    end = _WebVTTTimestamp(raw="00:14.007")
+    cue_timings = _WebVTTCueTimings(start=start, end=end)
+    assert cue_timings.start == start
+    assert cue_timings.end == end
+    assert str(cue_timings) == "00:10.005 --> 00:14.007"
+
+    """Test invalid cue timings with end timestamp before start."""
+    start = _WebVTTTimestamp(raw="00:10.700")
+    end = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(start=start, end=end)
+    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
+
+    """Test invalid cue timings with missing end."""
+    start = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(start=start)
+    assert "Field required" in str(excinfo.value)
+
+    """Test invalid cue timings with missing start."""
+    end = _WebVTTTimestamp(raw="00:10.500")
+    with pytest.raises(ValidationError) as excinfo:
+        _WebVTTCueTimings(end=end)
+    assert "Field required" in str(excinfo.value)
+
+    """Test with valid text."""
+    valid_text = "This is a valid cue text span."
+    span = _WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
+    assert str(span) == valid_text
+
+    """Test with text containing newline characters."""
+    invalid_text = "This cue text span\ncontains a newline."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with text containing ampersand."""
+    invalid_text = "This cue text span contains &."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with text containing less-than sign."""
+    invalid_text = "This cue text span contains <."
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+
+    """Test with empty text."""
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text="")
+
+    """Test that annotation validation works correctly."""
+    valid_annotation = "valid-annotation"
+    invalid_annotation = "invalid\nannotation"
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
+    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
+
+    """Test that classes validation works correctly."""
+    annotation = "speaker name"
+    valid_classes = ["class1", "class2"]
+    invalid_classes = ["class\nwith\nnewlines", ""]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
+    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
+
+    """Test that components validation works correctly."""
+    annotation = "speaker name"
+    valid_components = [_WebVTTCueTextSpan(text="random text")]
+    invalid_components = [123, "not a component"]
+    with pytest.raises(ValidationError):
+        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
+    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
+
+    """Test valid cue voice spans."""
+    cue_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        classes=["loud", "clear"],
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+
+    expected_str = "<v.loud.clear speaker>random text</v>"
+    assert str(cue_span) == expected_str
+
+    cue_span = _WebVTTCueVoiceSpan(
+        annotation="speaker",
+        components=[_WebVTTCueTextSpan(text="random text")],
+    )
+    expected_str = "<v speaker>random text</v>"
+    assert str(cue_span) == expected_str
+
+
+def test_webvtt_file():
+    """Test WebVTT files."""
+    with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    block = vtt.cue_blocks[11]
+    assert str(block.timings) == "00:32.500 --> 00:33.500"
+    assert len(block.payload) == 1
+    cue_span = block.payload[0]
+    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
+    assert cue_span.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.classes
+    assert len(cue_span.components) == 1
+    comp = cue_span.components[0]
+    assert isinstance(comp, _WebVTTCueItalicSpan)
+    assert len(comp.components) == 1
+    comp2 = comp.components[0]
+    assert isinstance(comp2, _WebVTTCueTextSpan)
+    assert comp2.text == "Laughs"
+
+    with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 4
+    reverse = (
+        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
+        "https://www.w3.org/TR/webvtt1/\n\n"
+    )
+    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
+    assert content == reverse
+
+    with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = _WebVTTFile.parse(content)
+    assert len(vtt) == 13
+    for block in vtt:
+        assert block.identifier
+    block = vtt.cue_blocks[0]
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
+    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    block = vtt.cue_blocks[2]
+    assert not isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
+    assert block.payload[0].text == "Good."

From 813d2b5b175dc5f012685a7bc202761ff29e7090 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 14 Nov 2025 14:53:05 +0100
Subject: [PATCH 02/22] fix(webvtt): deal with HTML entities in cue text spans

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py | 15 ++++++++++++++-
 test/test_webvtt.py              |  6 ++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index eccae4a6..d7cabdc3 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -98,10 +98,23 @@ class _WebVTTCueTextSpan(BaseModel):
     text: str
     span_type: Literal["text"] = "text"
 
+    _valid_entities: ClassVar[set] = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
+    _entity_pattern: ClassVar[re.Pattern] = re.compile(r"&([a-zA-Z0-9]+);")
+
     @field_validator("text", mode="after")
     @classmethod
     def validate_text(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", "<"}):
+        for match in cls._entity_pattern.finditer(value):
+            entity = match.group(1)
+            if entity not in cls._valid_entities:
+                raise ValueError(
+                    f"Cue text span contains an invalid HTML entity: &{entity};"
+                )
+        if "&" in re.sub(cls._entity_pattern, "", value):
+            raise ValueError(
+                "Found '&' not part of a valid entity in the cue text span"
+            )
+        if any(ch in value for ch in {"\n", "\r", "<"}):
             raise ValueError("Cue text span contains invalid characters")
         if len(value) == 0:
             raise ValueError("Cue text span cannot be empty")
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 75f5dfc1..ea4f2889 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -96,6 +96,12 @@ def test_vtt_cue_commponents():
     invalid_text = "This cue text span contains &."
     with pytest.raises(ValidationError):
         _WebVTTCueTextSpan(text=invalid_text)
+    invalid_text = "An invalid &foo; entity"
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+    valid_text = "My favorite book is Pride &amp; Prejudice"
+    span = _WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
 
     """Test with text containing less-than sign."""
     invalid_text = "This cue text span contains <."

From 67305d50495453146956a076024974eee21ca12b Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 17 Nov 2025 03:32:05 +0100
Subject: [PATCH 03/22] refactor(webvtt): support more WebVTT models

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py | 367 +++++++++++++++++++------------
 test/test_webvtt.py              | 137 +++++++++---
 2 files changed, 332 insertions(+), 172 deletions(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index d7cabdc3..6d60a2d8 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -2,7 +2,8 @@
 
 import logging
 import re
-from typing import Annotated, ClassVar, Literal, Optional, Union, cast
+from enum import Enum
+from typing import Annotated, ClassVar, Literal, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic.types import StringConstraints
@@ -11,8 +12,24 @@
 _log = logging.getLogger(__name__)
 
 
+_VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
+_ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);")
+_START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
+
+
+class _WebVTTLineTerminator(str, Enum):
+    CRLF = "\r\n"
+    LF = "\n"
+    CR = "\r"
+
+
+_WebVTTCueIdentifier = Annotated[
+    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
+]
+
+
 class _WebVTTTimestamp(BaseModel):
-    """Model representing a WebVTT timestamp.
+    """WebVTT timestamp.
 
     A WebVTT timestamp is always interpreted relative to the current playback position
     of the media data that the WebVTT file is to be synchronized with.
@@ -67,13 +84,8 @@ def __str__(self) -> str:
         return self.raw
 
 
-_WebVTTCueIdentifier = Annotated[
-    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
-]
-
-
 class _WebVTTCueTimings(BaseModel):
-    """Model representating WebVTT cue timings."""
+    """WebVTT cue timings."""
 
     start: Annotated[
         _WebVTTTimestamp, Field(description="Start time offset of the cue")
@@ -93,31 +105,27 @@ def __str__(self):
 
 
 class _WebVTTCueTextSpan(BaseModel):
-    """Model representing a WebVTT cue text span."""
+    """WebVTT cue text span."""
 
-    text: str
-    span_type: Literal["text"] = "text"
-
-    _valid_entities: ClassVar[set] = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
-    _entity_pattern: ClassVar[re.Pattern] = re.compile(r"&([a-zA-Z0-9]+);")
+    kind: Literal["text"] = "text"
+    text: Annotated[str, Field(description="The cue text.")]
 
     @field_validator("text", mode="after")
     @classmethod
-    def validate_text(cls, value: str) -> str:
-        for match in cls._entity_pattern.finditer(value):
+    def is_valid_text(cls, value: str) -> str:
+        for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
-            if entity not in cls._valid_entities:
+            if entity not in _VALID_ENTITIES:
                 raise ValueError(
-                    f"Cue text span contains an invalid HTML entity: &{entity};"
+                    f"Cue text contains an invalid HTML entity: &{entity};"
                 )
-        if "&" in re.sub(cls._entity_pattern, "", value):
-            raise ValueError(
-                "Found '&' not part of a valid entity in the cue text span"
-            )
+        if "&" in re.sub(_ENTITY_PATTERN, "", value):
+            raise ValueError("Found '&' not part of a valid entity in the cue text")
         if any(ch in value for ch in {"\n", "\r", "<"}):
-            raise ValueError("Cue text span contains invalid characters")
+            raise ValueError("Cue text contains invalid characters")
         if len(value) == 0:
-            raise ValueError("Cue text span cannot be empty")
+            raise ValueError("Cue text cannot be empty")
+
         return value
 
     @override
@@ -125,37 +133,48 @@ def __str__(self):
         return self.text
 
 
-class _WebVTTCueVoiceSpan(BaseModel):
-    """Model representing a WebVTT cue voice span."""
+class _WebVTTCueComponentWithTerminator(BaseModel):
+    """WebVTT caption or subtitle cue component optionally with a line terminator."""
 
-    annotation: Annotated[
-        str,
+    component: "_WebVTTCueComponent"
+    terminator: Optional[_WebVTTLineTerminator] = None
+
+    @override
+    def __str__(self):
+        return f"{self.component}{self.terminator.value if self.terminator else ''}"
+
+
+class _WebVTTCueInternalText(BaseModel):
+    """WebVTT cue internal text."""
+
+    terminator: Optional[_WebVTTLineTerminator] = None
+    components: Annotated[
+        list[_WebVTTCueComponentWithTerminator],
         Field(
             description=(
-                "Cue span start tag annotation text representing the name of thevoice"
+                "WebVTT caption or subtitle cue components representing the "
+                "cue internal text"
             )
         ),
-    ]
+    ] = []
+
+    @override
+    def __str__(self):
+        cue_str = (
+            f"{self.terminator.value if self.terminator else ''}"
+            f"{''.join(str(span) for span in self.components)}"
+        )
+        return cue_str
+
+
+class _WebVTTCueSpanStartTag(BaseModel):
+    """WebVTT cue span start tag."""
+
+    name: Annotated[_START_TAG_NAMES, Field(description="The tag name")]
     classes: Annotated[
         list[str],
         Field(description="List of classes representing the cue span's significance"),
     ] = []
-    components: Annotated[
-        list["_WebVTTCueComponent"],
-        Field(description="The components representing the cue internal text"),
-    ] = []
-    span_type: Literal["v"] = "v"
-
-    @field_validator("annotation", mode="after")
-    @classmethod
-    def validate_annotation(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
-            raise ValueError(
-                "Cue span start tag annotation contains invalid characters"
-            )
-        if not value:
-            raise ValueError("Cue text span cannot be empty")
-        return value
 
     @field_validator("classes", mode="after")
     @classmethod
@@ -169,51 +188,113 @@ def validate_classes(cls, value: list[str]) -> list[str]:
                 raise ValueError("Cue span start tag classes cannot be empty")
         return value
 
+    def _get_name_with_classes(self) -> str:
+        return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name
+
     @override
     def __str__(self):
-        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
-        inner = "".join(str(span) for span in self.components)
-        return f"<{tag} {self.annotation}>{inner}</v>"
+        return f"<{self._get_name_with_classes()}>"
 
 
-class _WebVTTCueClassSpan(BaseModel):
-    span_type: Literal["c"] = "c"
-    components: list["_WebVTTCueComponent"]
+class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag):
+    """WebVTT cue span start tag requiring an annotation."""
 
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<c>{inner}</c>"
+    annotation: Annotated[str, Field(description="Cue span start tag annotation")]
 
+    @field_validator("annotation", mode="after")
+    @classmethod
+    def is_valid_annotation(cls, value: str) -> str:
+        for match in _ENTITY_PATTERN.finditer(value):
+            entity = match.group(1)
+            if entity not in _VALID_ENTITIES:
+                raise ValueError(
+                    f"Annotation contains an invalid HTML entity: &{entity};"
+                )
+        if "&" in re.sub(_ENTITY_PATTERN, "", value):
+            raise ValueError("Found '&' not part of a valid entity in annotation")
+        if any(ch in value for ch in {"\n", "\r", ">"}):
+            raise ValueError("Annotation contains invalid characters")
+        if len(value) == 0:
+            raise ValueError("Annotation cannot be empty")
 
-class _WebVTTCueItalicSpan(BaseModel):
-    span_type: Literal["i"] = "i"
-    components: list["_WebVTTCueComponent"]
+        return value
 
     @override
     def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<i>{inner}</i>"
+        return f"<{self._get_name_with_classes()} {self.annotation}>"
 
 
-class _WebVTTCueBoldSpan(BaseModel):
-    span_type: Literal["b"] = "b"
-    components: list["_WebVTTCueComponent"]
+class _WebVTTCueComponentBase(BaseModel):
+    """WebVTT caption or subtitle cue component.
 
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<b>{inner}</b>"
+    All the WebVTT caption or subtitle cue components are represented by this class
+    except the WebVTT cue text span, which requires different definitions.
+    """
 
+    kind: Literal["c", "b", "i", "u", "v", "lang"]
+    start_tag: _WebVTTCueSpanStartTag
+    internal_text: _WebVTTCueInternalText
 
-class _WebVTTCueUnderlineSpan(BaseModel):
-    span_type: Literal["u"] = "u"
-    components: list["_WebVTTCueComponent"]
+    @model_validator(mode="after")
+    def check_tag_names_match(self) -> Self:
+        if self.kind != self.start_tag.name:
+            raise ValueError(f"The tag name of this cue component should be {self.kind}")
+        return self
 
     @override
     def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<u>{inner}</u>"
+        return f"{self.start_tag}{self.internal_text}</{self.start_tag.name}>"
+
+
+class _WebVTTCueVoiceSpan(_WebVTTCueComponentBase):
+    """WebVTT cue voice span associated with a specific voice."""
+
+    kind: Literal["v"] = "v"
+    start_tag: _WebVTTCueSpanStartTagAnnotated
+
+
+class _WebVTTCueClassSpan(_WebVTTCueComponentBase):
+    """WebVTT cue class span.
+
+    It represents a span of text and it is used to annotate parts of the cue with
+    applicable classes without implying further meaning (such as italics or bold).
+    """
+
+    kind: Literal["c"] = "c"
+    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="c")
+
+
+class _WebVTTCueItalicSpan(_WebVTTCueComponentBase):
+    """WebVTT cue italic span representing a span of italic text."""
+
+    kind: Literal["i"] = "i"
+    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="i")
+
+
+class _WebVTTCueBoldSpan(_WebVTTCueComponentBase):
+    """WebVTT cue bold span representing a span of bold text."""
+
+    kind: Literal["b"] = "b"
+    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="b")
+
+
+class _WebVTTCueUnderlineSpan(_WebVTTCueComponentBase):
+    """WebVTT cue underline span representing a span of underline text."""
+
+    kind: Literal["u"] = "u"
+    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="u")
+
+
+class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
+    """WebVTT cue language span.
+
+    It represents a span of text and it is used to annotate parts of the cue where the
+    applicable language might be different than the surrounding text's, without
+    implying further meaning (such as italics or bold).
+    """
+
+    kind: Literal["lang"] = "lang"
+    start_tag: _WebVTTCueSpanStartTagAnnotated
 
 
 _WebVTTCueComponent = Annotated[
@@ -224,8 +305,12 @@ def __str__(self):
         _WebVTTCueBoldSpan,
         _WebVTTCueUnderlineSpan,
         _WebVTTCueVoiceSpan,
+        _WebVTTCueLanguageSpan,
     ],
-    Field(discriminator="span_type", description="The WebVTT cue component"),
+    Field(
+        discriminator="kind",
+        description="The type of WebVTT caption or subtitle cue component.",
+    ),
 ]
 
 
@@ -243,14 +328,17 @@ class _WebVTTCueBlock(BaseModel):
         None, description="The WebVTT cue identifier"
     )
     timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
-    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
+    payload: Annotated[
+        list[_WebVTTCueComponentWithTerminator],
+        Field(description="The WebVTT caption or subtitle cue text"),
+    ]
 
-    _pattern_block: ClassVar[re.Pattern] = re.compile(
-        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
-    )
-    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
-        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
-        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
+    # pattern of a WebVTT cue span start/end tag
+    _pattern_tag: ClassVar[re.Pattern] = re.compile(
+        r"<(?P<end>/?)"
+        r"(?P<tag>i|b|c|u|v|lang)"
+        r"(?P<class>(?:\.[^\t\n\r &<>.]+)*)"
+        r"(?:[ \t](?P<annotation>[^\n\r&>]*))?>"
     )
 
     @field_validator("payload", mode="after")
@@ -284,74 +372,77 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
             start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
         )
         cue_text = " ".join(cue_lines).strip()
-        if cue_text.startswith("<v") and "</v>" not in cue_text:
-            # adding close tag for cue voice spans without end tag
-            cue_text += "</v>"
+        # adding close tag for cue spans without end tag
+        for omm in {"v"}:
+            if cue_text.startswith(f"<{omm}") and f"</{omm}>" not in cue_text:
+                cue_text += f"</{omm}>"
+                break
 
-        stack: list[list[_WebVTTCueComponent]] = [[]]
-        tag_stack: list[Union[str, tuple]] = []
+        stack: list[list[_WebVTTCueComponentWithTerminator]] = [[]]
+        tag_stack: list[dict] = []
 
         pos = 0
-        matches = list(cls._pattern_block.finditer(cue_text))
+        matches = list(cls._pattern_tag.finditer(cue_text))
         i = 0
         while i < len(matches):
             match = matches[i]
             if match.start() > pos:
-                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
-            tag = match.group(0)
-
-            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
-                tag_type = tag[1:2]
-                tag_stack.append(tag_type)
-                stack.append([])
-            elif tag == "</i>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueItalicSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</b>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueBoldSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</u>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</c>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueClassSpan(components=children))
-                tag_stack.pop()
-            elif tag.startswith("<v"):
-                tag_stack.append(("v", tag))
-                stack.append([])
-            elif tag.startswith("</v"):
-                children = stack.pop() if stack else []
-                if (
-                    tag_stack
-                    and isinstance(tag_stack[-1], tuple)
-                    and tag_stack[-1][0] == "v"
-                ):
-                    _, voice = cast(tuple, tag_stack.pop())
-                    voice_match = cls._pattern_voice_tag.match(voice)
-                    if voice_match:
-                        class_string = voice_match.group("class")
-                        annotation = voice_match.group("annotation")
-                        if annotation:
-                            classes: list[str] = []
-                            if class_string:
-                                classes = [c for c in class_string.split(".") if c]
-                            stack[-1].append(
-                                _WebVTTCueVoiceSpan(
-                                    annotation=annotation.strip(),
-                                    classes=classes,
-                                    components=children,
-                                )
+                stack[-1].append(
+                    _WebVTTCueComponentWithTerminator(
+                        component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()])
+                    )
+                )
+            gps = {k: (v if v else None) for k, v in match.groupdict().items()}
+
+            if gps["tag"] in {"c", "b", "i", "u", "v", "lang"}:
+                if not gps["end"]:
+                    tag_stack.append(gps)
+                    stack.append([])
+                else:
+                    children = stack.pop() if stack else []
+                    if tag_stack:
+                        closed = tag_stack.pop()
+                        if (ct := closed["tag"]) != gps["tag"]:
+                            raise ValueError(f"Incorrect end tag: {ct}")
+                        class_string = closed["class"]
+                        annotation = closed["annotation"]
+                        classes: list[str] = []
+                        if class_string:
+                            classes = [c for c in class_string.split(".") if c]
+                        st = (
+                            _WebVTTCueSpanStartTagAnnotated(
+                                name=ct, classes=classes, annotation=annotation.strip()
                             )
+                            if annotation
+                            else _WebVTTCueSpanStartTag(name=ct, classes=classes)
+                        )
+                        it = _WebVTTCueInternalText(components=children)
+                        cp: _WebVTTCueComponent
+                        if ct == "c":
+                            cp = _WebVTTCueClassSpan(start_tag=st, internal_text=it)
+                        elif ct == "b":
+                            cp = _WebVTTCueBoldSpan(start_tag=st, internal_text=it)
+                        elif ct == "i":
+                            cp = _WebVTTCueItalicSpan(start_tag=st, internal_text=it)
+                        elif ct == "u":
+                            cp = _WebVTTCueUnderlineSpan(start_tag=st, internal_text=it)
+                        elif ct == "lang":
+                            cp = _WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
+                        elif ct == "v":
+                            cp = _WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
+                        stack[-1].append(
+                            _WebVTTCueComponentWithTerminator(component=cp)
+                        )
 
             pos = match.end()
             i += 1
 
         if pos < len(cue_text):
-            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
+            stack[-1].append(
+                _WebVTTCueComponentWithTerminator(
+                    component=_WebVTTCueTextSpan(text=cue_text[pos:])
+                )
+            )
 
         return cls(
             identifier=identifier,
@@ -366,13 +457,13 @@ def __str__(self):
         timings_line = str(self.timings)
         parts.append(timings_line + "\n")
         for idx, span in enumerate(self.payload):
-            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
+            if idx == 0 and len(self.payload) == 1 and span.component.kind == "v":
                 # the end tag may be omitted for brevity
                 parts.append(str(span).removesuffix("</v>"))
             else:
                 parts.append(str(span))
 
-        return "".join(parts)
+        return "".join(parts) + "\n"
 
 
 class _WebVTTFile(BaseModel):
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index ea4f2889..b4d408cb 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -1,11 +1,20 @@
-# Assisted by watsonx Code Assistant
+"""Test the data model for WebVTT files.
 
+Assisted by watsonx Code Assistant.
+Examples extracted from https://www.w3.org/TR/webvtt1/
+Copyright © 2019 World Wide Web Consortium.
+"""
 
 import pytest
 from pydantic import ValidationError
 
 from docling_core.types.doc.webvtt import (
+    _WebVTTCueBlock,
+    _WebVTTCueComponentWithTerminator,
+    _WebVTTCueInternalText,
     _WebVTTCueItalicSpan,
+    _WebVTTCueLanguageSpan,
+    _WebVTTCueSpanStartTagAnnotated,
     _WebVTTCueTextSpan,
     _WebVTTCueTimings,
     _WebVTTCueVoiceSpan,
@@ -18,7 +27,7 @@
 GENERATE = GEN_TEST_DATA
 
 
-def test_vtt_cue_commponents():
+def test_vtt_cue_commponents() -> None:
     """Test WebVTT components."""
     valid_timestamps = [
         "00:01:02.345",
@@ -72,13 +81,13 @@ def test_vtt_cue_commponents():
     """Test invalid cue timings with missing end."""
     start = _WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start)
+        _WebVTTCueTimings(start=start)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test invalid cue timings with missing start."""
     end = _WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(end=end)
+        _WebVTTCueTimings(end=end)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test with valid text."""
@@ -116,44 +125,105 @@ def test_vtt_cue_commponents():
     valid_annotation = "valid-annotation"
     invalid_annotation = "invalid\nannotation"
     with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
-    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
+        _WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation)
+    assert _WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation)
 
     """Test that classes validation works correctly."""
     annotation = "speaker name"
     valid_classes = ["class1", "class2"]
     invalid_classes = ["class\nwith\nnewlines", ""]
     with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
-    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
+        _WebVTTCueSpanStartTagAnnotated(
+            name="v", annotation=annotation, classes=invalid_classes
+        )
+    assert _WebVTTCueSpanStartTagAnnotated(
+        name="v", annotation=annotation, classes=valid_classes
+    )
 
     """Test that components validation works correctly."""
     annotation = "speaker name"
-    valid_components = [_WebVTTCueTextSpan(text="random text")]
+    valid_components = [
+        _WebVTTCueComponentWithTerminator(
+            component=_WebVTTCueTextSpan(text="random text")
+        )
+    ]
     invalid_components = [123, "not a component"]
     with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
-    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
+        _WebVTTCueInternalText(components=invalid_components)
+    assert _WebVTTCueInternalText(components=valid_components)
 
     """Test valid cue voice spans."""
     cue_span = _WebVTTCueVoiceSpan(
-        annotation="speaker",
-        classes=["loud", "clear"],
-        components=[_WebVTTCueTextSpan(text="random text")],
+        start_tag=_WebVTTCueSpanStartTagAnnotated(
+            name="v", annotation="speaker", classes=["loud", "clear"]
+        ),
+        internal_text=_WebVTTCueInternalText(
+            components=[
+                _WebVTTCueComponentWithTerminator(
+                    component=_WebVTTCueTextSpan(text="random text")
+                )
+            ]
+        ),
     )
-
     expected_str = "<v.loud.clear speaker>random text</v>"
     assert str(cue_span) == expected_str
 
     cue_span = _WebVTTCueVoiceSpan(
-        annotation="speaker",
-        components=[_WebVTTCueTextSpan(text="random text")],
+        start_tag=_WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"),
+        internal_text=_WebVTTCueInternalText(
+            components=[
+                _WebVTTCueComponentWithTerminator(
+                    component=_WebVTTCueTextSpan(text="random text")
+                )
+            ]
+        ),
     )
     expected_str = "<v speaker>random text</v>"
     assert str(cue_span) == expected_str
 
 
-def test_webvtt_file():
+def test_webvttcueblock_parse() -> None:
+    """Test the method parse of _WebVTTCueBlock class."""
+    raw: str = (
+        "04:02.500 --> 04:05.000\n" "J’ai commencé le basket à l'âge de 13, 14 ans\n"
+    )
+    block: _WebVTTCueBlock = _WebVTTCueBlock.parse(raw)
+    assert str(block.timings) == "04:02.500 --> 04:05.000"
+    assert len(block.payload) == 1
+    assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert (
+        block.payload[0].component.text
+        == "J’ai commencé le basket à l'âge de 13, 14 ans"
+    )
+    assert raw == str(block)
+
+    raw = (
+        "04:05.001 --> 04:07.800\n"
+        "Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier\n"
+    )
+    block = _WebVTTCueBlock.parse(raw)
+    assert str(block.timings) == "04:05.001 --> 04:07.800"
+    assert len(block.payload) == 3
+    assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert block.payload[0].component.text == "Sur les "
+    assert isinstance(block.payload[1], _WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[1].component, _WebVTTCueItalicSpan)
+    assert len(block.payload[1].component.internal_text.components) == 1
+    lang_span = block.payload[1].component.internal_text.components[0].component
+    assert isinstance(lang_span, _WebVTTCueLanguageSpan)
+    assert isinstance(
+        lang_span.internal_text.components[0].component, _WebVTTCueTextSpan
+    )
+    assert lang_span.internal_text.components[0].component.text == "playground"
+    assert isinstance(block.payload[2], _WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[2].component, _WebVTTCueTextSpan)
+    assert block.payload[2].component.text == ", ici à Montpellier"
+    assert raw == str(block)
+
+
+def test_webvtt_file() -> None:
     """Test WebVTT files."""
     with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
         content = f.read()
@@ -163,16 +233,16 @@ def test_webvtt_file():
     assert str(block.timings) == "00:32.500 --> 00:33.500"
     assert len(block.payload) == 1
     cue_span = block.payload[0]
-    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
-    assert cue_span.annotation == "Neil deGrasse Tyson"
-    assert not cue_span.classes
-    assert len(cue_span.components) == 1
-    comp = cue_span.components[0]
-    assert isinstance(comp, _WebVTTCueItalicSpan)
-    assert len(comp.components) == 1
-    comp2 = comp.components[0]
-    assert isinstance(comp2, _WebVTTCueTextSpan)
-    assert comp2.text == "Laughs"
+    assert isinstance(cue_span.component, _WebVTTCueVoiceSpan)
+    assert cue_span.component.start_tag.annotation == "Neil deGrasse Tyson"
+    assert not cue_span.component.start_tag.classes
+    assert len(cue_span.component.internal_text.components) == 1
+    comp = cue_span.component.internal_text.components[0]
+    assert isinstance(comp.component, _WebVTTCueItalicSpan)
+    assert len(comp.component.internal_text.components) == 1
+    comp2 = comp.component.internal_text.components[0]
+    assert isinstance(comp2.component, _WebVTTCueTextSpan)
+    assert comp2.component.text == "Laughs"
 
     with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
         content = f.read()
@@ -182,8 +252,8 @@ def test_webvtt_file():
         "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
         "https://www.w3.org/TR/webvtt1/\n\n"
     )
-    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
-    assert content == reverse
+    reverse += "\n".join([str(block) for block in vtt.cue_blocks])
+    assert content == reverse.rstrip()
 
     with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
         content = f.read()
@@ -195,11 +265,10 @@ def test_webvtt_file():
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
     assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
+    assert isinstance(block.payload[0].component, _WebVTTCueVoiceSpan)
     block = vtt.cue_blocks[2]
-    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
     assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
-    assert block.payload[0].text == "Good."
+    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert block.payload[0].component.text == "Good."

From a9ff665e1b3e99eb918e2c7968daf7477ef20f72 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Thu, 27 Nov 2025 18:58:35 +0100
Subject: [PATCH 04/22] refactor(DoclingDocument): create a new provenance
 model for media file types

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/azure.py   |  17 +-
 docling_core/transforms/serializer/common.py  |  20 +-
 docling_core/transforms/serializer/doctags.py |  14 +-
 .../visualizer/key_value_visualizer.py        |  13 +-
 .../visualizer/layout_visualizer.py           |  16 +-
 .../visualizer/reading_order_visualizer.py    |   3 +-
 .../transforms/visualizer/table_visualizer.py |  11 +-
 docling_core/types/doc/__init__.py            |   1 +
 docling_core/types/doc/document.py            | 196 ++++++++++-----
 docling_core/types/doc/webvtt.py              |  73 ++----
 docling_core/utils/legacy.py                  |   8 +-
 docs/DoclingDocument.json                     | 229 ++++++++++++++++--
 12 files changed, 439 insertions(+), 162 deletions(-)

diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py
index 385aca6a..ed91aee2 100644
--- a/docling_core/transforms/serializer/azure.py
+++ b/docling_core/transforms/serializer/azure.py
@@ -44,9 +44,10 @@
     DocSerializer,
     create_ser_result,
 )
-from docling_core.types.doc.base import CoordOrigin
-from docling_core.types.doc.document import (
+from docling_core.types.doc import (
+    CoordOrigin,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
     FormItem,
     InlineGroup,
@@ -54,12 +55,12 @@
     ListGroup,
     NodeItem,
     PictureItem,
+    ProvenanceItem,
     RefItem,
     RichTableCell,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel
 
 
 def _bbox_to_polygon_coords(
@@ -76,7 +77,7 @@ def _bbox_to_polygon_coords(
 
 def _bbox_to_polygon_for_item(doc: DoclingDocument, item: DocItem) -> Optional[list[float]]:
     """Compute a TOPLEFT-origin polygon for the first provenance of the item."""
-    if not item.prov:
+    if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
         return None
 
     prov = item.prov[0]
@@ -187,7 +188,7 @@ def serialize(
 
         # Lists may be represented either as TextItem(ListItem) or via groups;
         # we treat any TextItem as a paragraph-like entry.
-        if item.prov:
+        if item.prov and isinstance(item.prov[0], ProvenanceItem):
             prov = item.prov[0]
             page_no = prov.page_no
             polygon = _bbox_to_polygon_for_item(doc, item)
@@ -237,7 +238,7 @@ def serialize(
     ) -> SerializationResult:
         assert isinstance(doc_serializer, AzureDocSerializer)
 
-        if not item.prov:
+        if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
             return create_ser_result()
 
         prov = item.prov[0]
@@ -308,7 +309,7 @@ def serialize(
     ) -> SerializationResult:
         assert isinstance(doc_serializer, AzureDocSerializer)
 
-        if not item.prov:
+        if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
             return create_ser_result()
 
         prov = item.prov[0]
@@ -324,7 +325,7 @@ def serialize(
         for foot_ref in item.footnotes:
             if isinstance(foot_ref, RefItem):
                 tgt = foot_ref.resolve(doc)
-                if isinstance(tgt, TextItem) and tgt.prov:
+                if isinstance(tgt, TextItem) and tgt.prov and isinstance(tgt.prov[0], ProvenanceItem):
                     f_poly = _bbox_to_polygon_for_item(doc, tgt)
                     if f_poly is not None:
                         foots.append(
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index 3a8ad71c..c9c497f4 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -35,11 +35,11 @@
     SerializationResult,
     Span,
 )
-from docling_core.types.doc.document import (
-    DOCUMENT_TOKENS_EXPORT_LABELS,
+from docling_core.types.doc import (
     ContentLayer,
     DescriptionAnnotation,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
     FloatingItem,
     Formatting,
@@ -52,12 +52,13 @@
     PictureDataType,
     PictureItem,
     PictureMoleculeData,
+    ProvenanceItem,
     Script,
     TableAnnotationType,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS
 
 _DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
 _DEFAULT_LAYERS = set(ContentLayer)
@@ -108,7 +109,7 @@ def _iterate_items(
                     add_page_breaks=add_page_breaks,
                     visited=my_visited,
                 ):
-                    if isinstance(it, DocItem) and it.prov:
+                    if isinstance(it, DocItem) and it.prov and isinstance(it.prov[0], ProvenanceItem):
                         page_no = it.prov[0].page_no
                         if prev_page_nr is not None and page_no > prev_page_nr:
                             yield (
@@ -120,7 +121,7 @@ def _iterate_items(
                                 lvl,
                             )
                         break
-            elif isinstance(item, DocItem) and item.prov:
+            elif isinstance(item, DocItem) and item.prov and isinstance(item.prov[0], ProvenanceItem):
                 page_no = item.prov[0].page_no
                 if prev_page_nr is None or page_no > prev_page_nr:
                     if prev_page_nr is not None:  # close previous range
@@ -301,7 +302,13 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
                             or item.content_layer not in params.layers
                             or (
                                 params.pages is not None
-                                and ((not item.prov) or item.prov[0].page_no not in params.pages)
+                                and (
+                                    (not item.prov)
+                                    or (
+                                        isinstance(item.prov[0], ProvenanceItem)
+                                        and item.prov[0].page_no not in params.pages
+                                    )
+                                )
                             )
                         )
                     )
@@ -671,6 +678,7 @@ def _get_applicable_pages(self) -> Optional[list[int]]:
             if (
                 isinstance(item, DocItem)
                 and item.prov
+                and isinstance(item.prov[0], ProvenanceItem)
                 and (self.params.pages is None or item.prov[0].page_no in self.params.pages)
                 and ix >= self.params.start_idx
                 and ix < self.params.stop_idx
diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py
index e5672638..16549652 100644
--- a/docling_core/transforms/serializer/doctags.py
+++ b/docling_core/transforms/serializer/doctags.py
@@ -26,11 +26,13 @@
     _should_use_legacy_annotations,
     create_ser_result,
 )
-from docling_core.types.doc.base import BoundingBox
 from docling_core.types.doc.document import (
+    BoundingBox,
     CodeItem,
     DocItem,
+    DocItemLabel,
     DoclingDocument,
+    DocumentToken,
     FloatingItem,
     FormItem,
     GroupItem,
@@ -40,6 +42,7 @@
     ListItem,
     NodeItem,
     PictureClassificationData,
+    PictureClassificationLabel,
     PictureItem,
     PictureMoleculeData,
     PictureTabularChartData,
@@ -47,10 +50,9 @@
     SectionHeaderItem,
     TableData,
     TableItem,
+    TableToken,
     TextItem,
 )
-from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
-from docling_core.types.doc.tokens import DocumentToken, TableToken
 
 
 def _wrap(text: str, wrap_tag: str) -> str:
@@ -343,7 +345,7 @@ def serialize(
         results: list[SerializationResult] = []
 
         page_no = 1
-        if len(item.prov) > 0:
+        if len(item.prov) > 0 and isinstance(item.prov[0], ProvenanceItem):
             page_no = item.prov[0].page_no
 
         if params.add_location:
@@ -361,7 +363,7 @@ def serialize(
 
         for cell in item.graph.cells:
             cell_txt = ""
-            if cell.prov is not None:
+            if cell.prov is not None and isinstance(cell.prov, ProvenanceItem):
                 if len(doc.pages.keys()):
                     page_w, page_h = doc.pages[page_no].size.as_tuple()
                     cell_txt += DocumentToken.get_location(
@@ -469,7 +471,7 @@ def _get_inline_location_tags(
         doc_items: list[DocItem] = []
         for it, _ in doc.iterate_items(root=item):
             if isinstance(it, DocItem):
-                for prov in it.prov:
+                for prov in (im for im in it.prov if isinstance(im, ProvenanceItem)):
                     boxes.append(prov.bbox)
                     doc_items.append(it)
         if prov is None:
diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py
index 5ed7b843..e2b10264 100644
--- a/docling_core/transforms/visualizer/key_value_visualizer.py
+++ b/docling_core/transforms/visualizer/key_value_visualizer.py
@@ -16,8 +16,13 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc.document import ContentLayer, DoclingDocument
-from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
+from docling_core.types.doc import (
+    ContentLayer,
+    DoclingDocument,
+    GraphCellLabel,
+    GraphLinkLabel,
+    ProvenanceItem,
+)
 
 # ---------------------------------------------------------------------------
 # Helper functions / constants
@@ -82,7 +87,7 @@ def _draw_key_value_layer(
             # First draw cells (rectangles + optional labels)
             # ------------------------------------------------------------------
             for cell in cell_dict.values():
-                if cell.prov is None or cell.prov.page_no != page_no:
+                if cell.prov is None or not isinstance(cell.prov, ProvenanceItem) or cell.prov.page_no != page_no:
                     continue  # skip cells not on this page or without bbox
 
                 tl_bbox = cell.prov.bbox.to_top_left_origin(page_height=doc.pages[page_no].size.height)
@@ -149,6 +154,8 @@ def _draw_key_value_layer(
                 if (
                     src_cell.prov is None
                     or tgt_cell.prov is None
+                    or not isinstance(src_cell.prov, ProvenanceItem)
+                    or not isinstance(tgt_cell.prov, ProvenanceItem)
                     or src_cell.prov.page_no != page_no
                     or tgt_cell.prov.page_no != page_no
                 ):
diff --git a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py
index 369a7b38..8ac6bf81 100644
--- a/docling_core/transforms/visualizer/layout_visualizer.py
+++ b/docling_core/transforms/visualizer/layout_visualizer.py
@@ -10,10 +10,16 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc import DocItemLabel
-from docling_core.types.doc.base import CoordOrigin
-from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc import (
+    BoundingRectangle,
+    ContentLayer,
+    CoordOrigin,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    ProvenanceItem,
+    TextCell,
+)
 
 
 class _TLBoundingRectangle(BoundingRectangle):
@@ -173,7 +179,7 @@ def _draw_doc_layout(
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
-            for prov in elem.prov:
+            for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)):
                 page_nr = prov.page_no
 
                 if page_nr in my_images:
diff --git a/docling_core/transforms/visualizer/reading_order_visualizer.py b/docling_core/transforms/visualizer/reading_order_visualizer.py
index 60874333..27583613 100644
--- a/docling_core/transforms/visualizer/reading_order_visualizer.py
+++ b/docling_core/transforms/visualizer/reading_order_visualizer.py
@@ -14,6 +14,7 @@
     DocItem,
     DoclingDocument,
     PictureItem,
+    ProvenanceItem,
 )
 
 
@@ -130,7 +131,7 @@ def _draw_doc_reading_order(
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
-            for prov in elem.prov:
+            for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)):
                 page_no = prov.page_no
                 image = my_images.get(page_no)
 
diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py
index ba80cda3..787a1bff 100644
--- a/docling_core/transforms/visualizer/table_visualizer.py
+++ b/docling_core/transforms/visualizer/table_visualizer.py
@@ -10,7 +10,12 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem
+from docling_core.types.doc import (
+    ContentLayer,
+    DoclingDocument,
+    ProvenanceItem,
+    TableItem,
+)
 
 _log = logging.getLogger(__name__)
 
@@ -190,10 +195,10 @@ def _draw_doc_tables(
                 image = pil_img.copy()
                 my_images[page_nr] = image
 
-        for idx, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)):
+        for _, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)):
             if not isinstance(elem, TableItem):
                 continue
-            if len(elem.prov) == 0:
+            if len(elem.prov) == 0 or not isinstance(elem.prov[0], ProvenanceItem):
                 continue  # Skip elements without provenances
 
             if len(elem.prov) == 1:
diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py
index 3c699f89..f0e0e92d 100644
--- a/docling_core/types/doc/__init__.py
+++ b/docling_core/types/doc/__init__.py
@@ -61,6 +61,7 @@
     Script,
     SectionHeaderItem,
     SummaryMetaField,
+    TableAnnotationType,
     TableCell,
     TableData,
     TableItem,
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 2f18c0f2..898be029 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -32,10 +32,12 @@
     AnyUrl,
     BaseModel,
     ConfigDict,
+    Discriminator,
     Field,
     FieldSerializationInfo,
     SerializerFunctionWrapHandler,
     StringConstraints,
+    Tag,
     computed_field,
     field_serializer,
     field_validator,
@@ -65,6 +67,7 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
+from docling_core.types.doc.webvtt import _WebVTTTimestamp
 
 _logger = logging.getLogger(__name__)
 
@@ -1185,11 +1188,81 @@ def from_multipage_doctags_and_images(
 
 
 class ProvenanceItem(BaseModel):
-    """ProvenanceItem."""
+    """Provenance information for elements extracted from a textual document.
 
-    page_no: int
-    bbox: BoundingBox
-    charspan: tuple[int, int]
+    A `ProvenanceItem` object acts as a lightweight pointer back into the original
+    document for an extracted element. It applies to documents with an explicit
+    or implicit layout, such as PDF, HTML, docx, or pptx.
+    """
+
+    page_no: Annotated[int, Field(description="Page number")]
+    bbox: Annotated[BoundingBox, Field(description="Bounding box")]
+    charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")]
+
+
+class ProvenanceTrack(BaseModel):
+    """Provenance information for elements extracted from media assets.
+
+    A `ProvenanceTrack` instance describes a cue in a text track associated with a
+    media element (audio, video, subtitles, screen recordings, ...).
+    """
+
+    start_time: Annotated[
+        _WebVTTTimestamp,
+        Field(
+            examples=["00:11.000", "00:00:06.500", "01:28:34.300"],  # [hh:]mm:ss.ttt
+            description="Start time offset of the track cue",
+        ),
+    ]
+    end_time: Annotated[
+        _WebVTTTimestamp,
+        Field(
+            examples=["00:12.000", "00:00:08.200", "01:29:30.100"],  # [hh:]mm:ss.ttt
+            description="End time offset of the track cue",
+        ),
+    ]
+    identifier: Optional[str] = Field(
+        None,
+        examples=["test", "123", "b72d946"],
+        description="An identifier of the cue",
+    )
+    voice: Optional[str] = Field(
+        None,
+        examples=["Mary", "Fred", "Name Surname"],
+        description="The cue voice (speaker)",
+    )
+    language: Optional[str] = Field(
+        None,
+        examples=["en", "en-GB", "fr-CA"],
+        description="Language of the cue in BCP 47 language tag format",
+    )
+    classes: Optional[list[str]] = Field(
+        None,
+        min_length=1,
+        examples=[["first", "loud", "yellow"]],  # each example is a full list value
+        description="Classes for describing the cue significance",
+    )
+
+
+def get_provenance_discriminator_value(v: Any) -> str:
+    """Pick the union tag ("item" or "track") for a provenance input.
+
+    Args:
+        v: Either dict or model input.
+
+    Returns:
+        "item" if `v` has a `bbox`, `page_no` or `charspan` key/attribute, else "track".
+    """
+    # Dict input is probed by key, any other input by attribute.
+    probe = v.__contains__ if isinstance(v, dict) else (lambda name: hasattr(v, name))
+    layout_fields = ("bbox", "page_no", "charspan")
+    return "item" if any(map(probe, layout_fields)) else "track"
+
+
+ProvenanceType = Annotated[  # tagged union over the two provenance models
+    Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]],
+    Discriminator(get_provenance_discriminator_value),  # callable returns "item" or "track"
+]
 
 
 class ContentLayer(str, Enum):
@@ -1498,7 +1571,7 @@ class DocItem(NodeItem):  # Base type for any element that carries content, can
     """DocItem."""
 
     label: DocItemLabel
-    prov: list[ProvenanceItem] = []
+    prov: list[ProvenanceType] = []
     comments: list[FineRef] = []  # References to comment items annotating this content
 
     @model_serializer(mode="wrap")
@@ -1523,7 +1596,7 @@ def get_location_tokens(
             return ""
 
         location = ""
-        for prov in self.prov:
+        for prov in (item for item in self.prov if isinstance(item, ProvenanceItem)):
             page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
 
             loc_str = DocumentToken.get_location(
@@ -1545,10 +1618,13 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL
         if a valid image of the page containing this DocItem is not available
         in doc.
         """
-        if not len(self.prov):
+        if not self.prov or prov_index >= len(self.prov):
+            return None
+        prov = self.prov[prov_index]
+        if not isinstance(prov, ProvenanceItem):
             return None
 
-        page = doc.pages.get(self.prov[prov_index].page_no)
+        page = doc.pages.get(prov.page_no)
         if page is None or page.size is None or page.image is None:
             return None
 
@@ -1556,9 +1632,9 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL
         if not page_image:
             return None
         crop_bbox = (
-            self.prov[prov_index]
-            .bbox.to_top_left_origin(page_height=page.size.height)
-            .scale_to_size(old_size=page.size, new_size=page.image.size)
+            prov.bbox.to_top_left_origin(page_height=page.size.height).scale_to_size(
+                old_size=page.size, new_size=page.image.size
+            )
             # .scaled(scale=page_image.height / page.size.height)
         )
         return page_image.crop(crop_bbox.as_tuple())
@@ -2229,7 +2305,7 @@ def export_to_otsl(
             return ""
 
         page_no = 0
-        if len(self.prov) > 0:
+        if len(self.prov) > 0 and isinstance(self.prov[0], ProvenanceItem):
             page_no = self.prov[0].page_no
 
         for i in range(nrows):
@@ -2359,7 +2435,7 @@ class GraphCell(BaseModel):
     text: str  # sanitized text
     orig: str  # text as seen on document
 
-    prov: Optional[ProvenanceItem] = None
+    prov: Optional[ProvenanceType] = None
 
     # in case you have a text, table or picture item
     item_ref: Optional[RefItem] = None
@@ -3008,7 +3084,7 @@ def add_list_item(
         enumerated: bool = False,
         marker: Optional[str] = None,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3019,7 +3095,7 @@ def add_list_item(
         :param label: str:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
 
         """
@@ -3060,7 +3136,7 @@ def add_text(
         label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3071,7 +3147,7 @@ def add_text(
         :param label: str:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
 
         """
@@ -3197,7 +3273,7 @@ def add_table(
         self,
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
@@ -3207,7 +3283,7 @@ def add_table(
 
         :param data: TableData:
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
 
@@ -3243,7 +3319,7 @@ def add_picture(
         annotations: Optional[list[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
     ):
@@ -3252,7 +3328,7 @@ def add_picture(
         :param data: Optional[list[PictureData]]: (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3284,7 +3360,7 @@ def add_title(
         self,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3295,7 +3371,7 @@ def add_title(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3330,7 +3406,7 @@ def add_code(
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3343,7 +3419,7 @@ def add_code(
         :param orig: Optional[str]:  (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3381,7 +3457,7 @@ def add_formula(
         self,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3392,7 +3468,7 @@ def add_formula(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3426,7 +3502,7 @@ def add_heading(
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3438,7 +3514,7 @@ def add_heading(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3471,13 +3547,13 @@ def add_heading(
     def add_key_values(
         self,
         graph: GraphData,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
     ):
         """add_key_values.
 
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3502,13 +3578,13 @@ def add_key_values(
     def add_form(
         self,
         graph: GraphData,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         parent: Optional[NodeItem] = None,
     ):
         """add_form.
 
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3697,7 +3773,7 @@ def insert_list_item(
         enumerated: bool = False,
         marker: Optional[str] = None,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3710,7 +3786,7 @@ def insert_list_item(
         :param enumerated: bool:  (Default value = False)
         :param marker: Optional[str]:  (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3769,7 +3845,7 @@ def insert_text(
         label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3781,7 +3857,7 @@ def insert_text(
         :param label: DocItemLabel:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3881,7 +3957,7 @@ def insert_table(
         sibling: NodeItem,
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
         annotations: Optional[list[TableAnnotationType]] = None,
@@ -3892,7 +3968,7 @@ def insert_table(
         :param sibling: NodeItem:
         :param data: TableData:
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param annotations: Optional[list[TableAnnotationType]]: (Default value = None)
@@ -3929,7 +4005,7 @@ def insert_picture(
         annotations: Optional[list[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         after: bool = True,
     ) -> PictureItem:
@@ -3939,7 +4015,7 @@ def insert_picture(
         :param annotations: Optional[list[PictureDataType]]: (Default value = None)
         :param image: Optional[ImageRef]:  (Default value = None)
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
@@ -3973,7 +4049,7 @@ def insert_title(
         sibling: NodeItem,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3984,7 +4060,7 @@ def insert_title(
         :param sibling: NodeItem:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4024,7 +4100,7 @@ def insert_code(
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4037,7 +4113,7 @@ def insert_code(
         :param code_language: Optional[str]: (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4079,7 +4155,7 @@ def insert_formula(
         sibling: NodeItem,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4090,7 +4166,7 @@ def insert_formula(
         :param sibling: NodeItem:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4129,7 +4205,7 @@ def insert_heading(
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4141,7 +4217,7 @@ def insert_heading(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4179,14 +4255,14 @@ def insert_key_values(
         self,
         sibling: NodeItem,
         graph: GraphData,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         after: bool = True,
     ) -> KeyValueItem:
         """Creates a new KeyValueItem item and inserts it into the document.
 
         :param sibling: NodeItem:
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
         :returns: KeyValueItem: The newly created KeyValueItem item.
@@ -4208,14 +4284,14 @@ def insert_form(
         self,
         sibling: NodeItem,
         graph: GraphData,
-        prov: Optional[ProvenanceItem] = None,
+        prov: Optional[ProvenanceType] = None,
         after: bool = True,
     ) -> FormItem:
         """Creates a new FormItem item and inserts it into the document.
 
         :param sibling: NodeItem:
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceItem]:  (Default value = None)
+        :param prov: Optional[ProvenanceType]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
         :returns: FormItem: The newly created FormItem item.
@@ -4552,7 +4628,10 @@ def _iterate_items_with_stack(
             (not isinstance(root, GroupItem) or with_groups)
             and (
                 not isinstance(root, DocItem)
-                or (page_nrs is None or any(prov.page_no in page_nrs for prov in root.prov))
+                or (
+                    page_nrs is None
+                    or any(prov.page_no in page_nrs for prov in root.prov if isinstance(prov, ProvenanceItem))
+                )
             )
             and root.content_layer in my_layers
         )
@@ -4655,7 +4734,7 @@ def _with_pictures_refs(
         image_dir.mkdir(parents=True, exist_ok=True)
 
         if image_dir.is_dir():
-            for item, level in result.iterate_items(page_no=page_no, with_groups=False):
+            for item, _ in result.iterate_items(page_no=page_no, with_groups=False):
                 if isinstance(item, PictureItem):
                     img = item.get_image(doc=self)
                     if img is not None:
@@ -4674,10 +4753,11 @@ def _with_pictures_refs(
                             else:
                                 obj_path = loc_path
 
-                            if item.image is None:
+                            if item.image is None and isinstance(item.prov[0], ProvenanceItem):
                                 scale = img.size[0] / item.prov[0].bbox.width
                                 item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale))
-                            item.image.uri = Path(obj_path)
+                            elif item.image is not None:
+                                item.image.uri = Path(obj_path)
 
                         # if item.image._pil is not None:
                         #    item.image._pil.close()
@@ -6079,7 +6159,7 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) ->
                     if isinstance(new_item, DocItem):
                         # update page numbers
                         # NOTE other prov sources (e.g. GraphCell) currently not covered
-                        for prov in new_item.prov:
+                        for prov in (item for item in new_item.prov if isinstance(item, ProvenanceItem)):
                             prov.page_no += page_delta
 
                     if item.parent:
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 6d60a2d8..bddd6140 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -23,9 +23,7 @@ class _WebVTTLineTerminator(str, Enum):
     CR = "\r"
 
 
-_WebVTTCueIdentifier = Annotated[
-    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
-]
+_WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
 
 
 class _WebVTTTimestamp(BaseModel):
@@ -39,14 +37,10 @@ class _WebVTTTimestamp(BaseModel):
 
     raw: Annotated[
         str,
-        Field(
-            description="A representation of the WebVTT Timestamp as a single string"
-        ),
+        Field(description="A representation of the WebVTT Timestamp as a single string"),
     ]
 
-    _pattern: ClassVar[re.Pattern] = re.compile(
-        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
-    )
+    _pattern: ClassVar[re.Pattern] = re.compile(r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$")
     _hours: int
     _minutes: int
     _seconds: int
@@ -72,12 +66,7 @@ def validate_raw(self) -> Self:
     @property
     def seconds(self) -> float:
         """A representation of the WebVTT Timestamp in seconds."""
-        return (
-            self._hours * 3600
-            + self._minutes * 60
-            + self._seconds
-            + self._millis / 1000.0
-        )
+        return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0
 
     @override
     def __str__(self) -> str:
@@ -87,9 +76,7 @@ def __str__(self) -> str:
 class _WebVTTCueTimings(BaseModel):
     """WebVTT cue timings."""
 
-    start: Annotated[
-        _WebVTTTimestamp, Field(description="Start time offset of the cue")
-    ]
+    start: Annotated[_WebVTTTimestamp, Field(description="Start time offset of the cue")]
     end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
 
     @model_validator(mode="after")
@@ -116,9 +103,7 @@ def is_valid_text(cls, value: str) -> str:
         for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
             if entity not in _VALID_ENTITIES:
-                raise ValueError(
-                    f"Cue text contains an invalid HTML entity: &{entity};"
-                )
+                raise ValueError(f"Cue text contains an invalid HTML entity: &{entity};")
         if "&" in re.sub(_ENTITY_PATTERN, "", value):
             raise ValueError("Found '&' not part of a valid entity in the cue text")
         if any(ch in value for ch in {"\n", "\r", "<"}):
@@ -150,20 +135,12 @@ class _WebVTTCueInternalText(BaseModel):
     terminator: Optional[_WebVTTLineTerminator] = None
     components: Annotated[
         list[_WebVTTCueComponentWithTerminator],
-        Field(
-            description=(
-                "WebVTT caption or subtitle cue components representing the "
-                "cue internal text"
-            )
-        ),
+        Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
     ] = []
 
     @override
     def __str__(self):
-        cue_str = (
-            f"{self.terminator.value if self.terminator else ''}"
-            f"{''.join(str(span) for span in self.components)}"
-        )
+        cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}"
         return cue_str
 
 
@@ -181,9 +158,7 @@ class _WebVTTCueSpanStartTag(BaseModel):
     def validate_classes(cls, value: list[str]) -> list[str]:
         for item in value:
             if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
-                raise ValueError(
-                    "A cue span start tag class contains invalid characters"
-                )
+                raise ValueError("A cue span start tag class contains invalid characters")
             if not item:
                 raise ValueError("Cue span start tag classes cannot be empty")
         return value
@@ -207,9 +182,7 @@ def is_valid_annotation(cls, value: str) -> str:
         for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
             if entity not in _VALID_ENTITIES:
-                raise ValueError(
-                    f"Annotation contains an invalid HTML entity: &{entity};"
-                )
+                raise ValueError(f"Annotation contains an invalid HTML entity: &{entity};")
         if "&" in re.sub(_ENTITY_PATTERN, "", value):
             raise ValueError("Found '&' not part of a valid entity in annotation")
         if any(ch in value for ch in {"\n", "\r", ">"}):
@@ -324,9 +297,7 @@ class _WebVTTCueBlock(BaseModel):
 
     model_config = ConfigDict(regex_engine="python-re")
 
-    identifier: Optional[_WebVTTCueIdentifier] = Field(
-        None, description="The WebVTT cue identifier"
-    )
+    identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
     timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
         list[_WebVTTCueComponentWithTerminator],
@@ -368,9 +339,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
 
         start, end = [t.strip() for t in timing_line.split("-->")]
         end = re.split(" |\t", end)[0]  # ignore the cue settings list
-        timings: _WebVTTCueTimings = _WebVTTCueTimings(
-            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
-        )
+        timings: _WebVTTCueTimings = _WebVTTCueTimings(start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end))
         cue_text = " ".join(cue_lines).strip()
         # adding close tag for cue spans without end tag
         for omm in {"v"}:
@@ -388,9 +357,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
             match = matches[i]
             if match.start() > pos:
                 stack[-1].append(
-                    _WebVTTCueComponentWithTerminator(
-                        component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()])
-                    )
+                    _WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
                 )
             gps = {k: (v if v else None) for k, v in match.groupdict().items()}
 
@@ -410,9 +377,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                         if class_string:
                             classes = [c for c in class_string.split(".") if c]
                         st = (
-                            _WebVTTCueSpanStartTagAnnotated(
-                                name=ct, classes=classes, annotation=annotation.strip()
-                            )
+                            _WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, annotation=annotation.strip())
                             if annotation
                             else _WebVTTCueSpanStartTag(name=ct, classes=classes)
                         )
@@ -430,19 +395,13 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                             cp = _WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
                         elif ct == "v":
                             cp = _WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
-                        stack[-1].append(
-                            _WebVTTCueComponentWithTerminator(component=cp)
-                        )
+                        stack[-1].append(_WebVTTCueComponentWithTerminator(component=cp))
 
             pos = match.end()
             i += 1
 
         if pos < len(cue_text):
-            stack[-1].append(
-                _WebVTTCueComponentWithTerminator(
-                    component=_WebVTTCueTextSpan(text=cue_text[pos:])
-                )
-            )
+            stack[-1].append(_WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos:])))
 
         return cls(
             identifier=identifier,
diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py
index 04761799..5ebac4be 100644
--- a/docling_core/utils/legacy.py
+++ b/docling_core/utils/legacy.py
@@ -7,20 +7,23 @@
 
 from docling_core.types.doc import (
     BoundingBox,
+    ContentLayer,
     CoordOrigin,
     DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
+    GroupItem,
+    ListItem,
     PictureItem,
     ProvenanceItem,
     SectionHeaderItem,
     Size,
     TableCell,
+    TableData,
     TableItem,
     TextItem,
 )
-from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
 from docling_core.types.legacy_doc.base import (
     BaseCell,
     BaseText,
@@ -162,6 +165,7 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
                         span=[0, len(item.text)],
                     )
                     for p in item.prov
+                    if isinstance(p, ProvenanceItem)
                 ]
                 main_text.append(
                     BaseText(
@@ -283,6 +287,7 @@ def _make_spans(cell: TableCell, table_item: TableItem):
                                 span=[0, 0],
                             )
                             for p in item.prov
+                            if isinstance(p, ProvenanceItem)
                         ],
                     )
                 )
@@ -310,6 +315,7 @@ def _make_spans(cell: TableCell, table_item: TableItem):
                                 span=[0, len(caption)],
                             )
                             for p in item.prov
+                            if isinstance(p, ProvenanceItem)
                         ],
                         obj_type=doc_item_label_to_legacy_type(item.label),
                         text=caption,
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index 03b7d8cd..eca74ef4 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -233,7 +233,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -651,7 +658,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -793,7 +807,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -873,13 +894,21 @@
         "prov": {
           "anyOf": [
             {
-              "$ref": "#/$defs/ProvenanceItem"
+              "oneOf": [
+                {
+                  "$ref": "#/$defs/ProvenanceItem"
+                },
+                {
+                  "$ref": "#/$defs/ProvenanceTrack"
+                }
+              ]
             },
             {
               "type": "null"
             }
           ],
-          "default": null
+          "default": null,
+          "title": "Prov"
         },
         "item_ref": {
           "anyOf": [
@@ -1198,7 +1227,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -1370,7 +1406,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -1746,7 +1789,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2139,16 +2189,19 @@
       "type": "object"
     },
     "ProvenanceItem": {
-      "description": "ProvenanceItem.",
+      "description": "Provenance information for elements extracted from a textual document.\n\nA `ProvenanceItem` object acts as a lightweight pointer back into the original\ndocument for an extracted element. It applies to documents with an explicit\nor implicit layout, such as PDF, HTML, docx, or pptx.",
       "properties": {
         "page_no": {
+          "description": "Page number",
           "title": "Page No",
           "type": "integer"
         },
         "bbox": {
-          "$ref": "#/$defs/BoundingBox"
+          "$ref": "#/$defs/BoundingBox",
+          "description": "Bounding box"
         },
         "charspan": {
+          "description": "Character span (0-indexed)",
           "maxItems": 2,
           "minItems": 2,
           "prefixItems": [
@@ -2171,6 +2224,111 @@
       "title": "ProvenanceItem",
       "type": "object"
     },
+    "ProvenanceTrack": {
+      "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).",
+      "properties": {
+        "start_time": {
+          "$ref": "#/$defs/_WebVTTTimestamp",
+          "description": "Start time offset of the track cue",
+          "examples": [
+            "00.11.000",
+            "00:00:06.500",
+            "01:28:34.300"
+          ]
+        },
+        "end_time": {
+          "$ref": "#/$defs/_WebVTTTimestamp",
+          "description": "End time offset of the track cue",
+          "examples": [
+            "00.12.000",
+            "00:00:08.200",
+            "01:29:30.100"
+          ]
+        },
+        "identifier": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "An identifier of the cue",
+          "examples": [
+            "test",
+            "123",
+            "b72d946"
+          ],
+          "title": "Identifier"
+        },
+        "voice": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "The cue voice (speaker)",
+          "examples": [
+            "Mary",
+            "Fred",
+            "Name Surname"
+          ],
+          "title": "Voice"
+        },
+        "language": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Language of the cue in BCP 47 language tag format",
+          "examples": [
+            "en",
+            "en-GB",
+            "fr-CA"
+          ],
+          "title": "Language"
+        },
+        "classes": {
+          "anyOf": [
+            {
+              "items": {
+                "type": "string"
+              },
+              "minItems": 1,
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Classes for describing the cue significance",
+          "examples": [
+            "first",
+            "loud",
+            "yellow"
+          ],
+          "title": "Classes"
+        }
+      },
+      "required": [
+        "start_time",
+        "end_time"
+      ],
+      "title": "ProvenanceTrack",
+      "type": "object"
+    },
     "RefItem": {
       "description": "RefItem.",
       "properties": {
@@ -2327,7 +2485,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2622,7 +2787,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2827,7 +2999,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2939,7 +3118,14 @@
         "prov": {
           "default": [],
           "items": {
-            "$ref": "#/$defs/ProvenanceItem"
+            "oneOf": [
+              {
+                "$ref": "#/$defs/ProvenanceItem"
+              },
+              {
+                "$ref": "#/$defs/ProvenanceTrack"
+              }
+            ]
           },
           "title": "Prov",
           "type": "array"
@@ -2997,6 +3183,21 @@
       ],
       "title": "TitleItem",
       "type": "object"
+    },
+    "_WebVTTTimestamp": {
+      "description": "WebVTT timestamp.\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.",
+      "properties": {
+        "raw": {
+          "description": "A representation of the WebVTT Timestamp as a single string",
+          "title": "Raw",
+          "type": "string"
+        }
+      },
+      "required": [
+        "raw"
+      ],
+      "title": "_WebVTTTimestamp",
+      "type": "object"
     }
   },
   "description": "DoclingDocument.",

From 110c319a40a83998fa14f61a0dfe6218a8e16b18 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Thu, 4 Dec 2025 14:49:53 +0100
Subject: [PATCH 05/22] refactor(webvtt): make WebVTTTimestamp public

Since WebVTTTimestamp is used in DoclingDocument, the class should be public.
Strengthen validation of cue language start tag annotation.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/__init__.py |  1 +
 docling_core/types/doc/document.py |  6 ++--
 docling_core/types/doc/webvtt.py   | 50 ++++++++++++++++++++++++------
 docs/DoclingDocument.json          | 10 +++---
 test/test_webvtt.py                | 31 ++++++++++++------
 5 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py
index f0e0e92d..d8ddd0b4 100644
--- a/docling_core/types/doc/__init__.py
+++ b/docling_core/types/doc/__init__.py
@@ -56,6 +56,7 @@
     PictureStackedBarChartData,
     PictureTabularChartData,
     ProvenanceItem,
+    ProvenanceTrack,
     RefItem,
     RichTableCell,
     Script,
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 898be029..000d0811 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -67,7 +67,7 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
-from docling_core.types.doc.webvtt import _WebVTTTimestamp
+from docling_core.types.doc.webvtt import WebVTTTimestamp
 
 _logger = logging.getLogger(__name__)
 
@@ -1208,14 +1208,14 @@ class ProvenanceTrack(BaseModel):
     """
 
     start_time: Annotated[
-        _WebVTTTimestamp,
+        WebVTTTimestamp,
         Field(
             examples=["00.11.000", "00:00:06.500", "01:28:34.300"],
             description="Start time offset of the track cue",
         ),
     ]
     end_time: Annotated[
-        _WebVTTTimestamp,
+        WebVTTTimestamp,
         Field(
             examples=["00.12.000", "00:00:08.200", "01:29:30.100"],
             description="End time offset of the track cue",
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index bddd6140..f6a6ea73 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -26,9 +26,18 @@ class _WebVTTLineTerminator(str, Enum):
 _WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
 
 
-class _WebVTTTimestamp(BaseModel):
+class WebVTTTimestamp(BaseModel):
     """WebVTT timestamp.
 
+    The timestamp is a string consisting of the following components in the given order:
+
+    - hours (optional, required if non-zero): two or more digits, followed by a colon character (:)
+    - minutes: two digits between 0 and 59
+    - a colon character (:)
+    - seconds: two digits between 0 and 59
+    - a full stop character (.)
+    - thousandths of a second: three digits
+
     A WebVTT timestamp is always interpreted relative to the current playback position
     of the media data that the WebVTT file is to be synchronized with.
     """
@@ -48,6 +57,7 @@ class _WebVTTTimestamp(BaseModel):
 
     @model_validator(mode="after")
     def validate_raw(self) -> Self:
+        """Validate the WebVTT timestamp as a string."""
         m = self._pattern.match(self.raw)
         if not m:
             raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
@@ -70,14 +80,15 @@ def seconds(self) -> float:
 
     @override
     def __str__(self) -> str:
+        """Return a string representation of a WebVTT timestamp."""
         return self.raw
 
 
 class _WebVTTCueTimings(BaseModel):
     """WebVTT cue timings."""
 
-    start: Annotated[_WebVTTTimestamp, Field(description="Start time offset of the cue")]
-    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
+    start: Annotated[WebVTTTimestamp, Field(description="Start time offset of the cue")]
+    end: Annotated[WebVTTTimestamp, Field(description="End time offset of the cue")]
 
     @model_validator(mode="after")
     def check_order(self) -> Self:
@@ -197,6 +208,21 @@ def __str__(self):
         return f"<{self._get_name_with_classes()} {self.annotation}>"
 
 
+class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated):
+    _pattern: ClassVar[re.Pattern] = re.compile(r"^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$", re.IGNORECASE)
+
+    name: Literal["lang"] = Field("lang", description="The tag name")
+
+    @field_validator("annotation", mode="after")
+    @classmethod
+    @override
+    def is_valid_annotation(cls, value: str) -> str:
+        if cls._pattern.match(value):
+            return value
+        else:
+            raise ValueError("Annotation should be in BCP 47 language tag format")
+
+
 class _WebVTTCueComponentBase(BaseModel):
     """WebVTT caption or subtitle cue component.
 
@@ -267,7 +293,7 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
     """
 
     kind: Literal["lang"] = "lang"
-    start_tag: _WebVTTCueSpanStartTagAnnotated
+    start_tag: _WebVTTCueLanguageSpanStartTag
 
 
 _WebVTTCueComponent = Annotated[
@@ -339,7 +365,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
 
         start, end = [t.strip() for t in timing_line.split("-->")]
         end = re.split(" |\t", end)[0]  # ignore the cue settings list
-        timings: _WebVTTCueTimings = _WebVTTCueTimings(start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end))
+        timings: _WebVTTCueTimings = _WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
         cue_text = " ".join(cue_lines).strip()
         # adding close tag for cue spans without end tag
         for omm in {"v"}:
@@ -376,11 +402,15 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                         classes: list[str] = []
                         if class_string:
                             classes = [c for c in class_string.split(".") if c]
-                        st = (
-                            _WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, annotation=annotation.strip())
-                            if annotation
-                            else _WebVTTCueSpanStartTag(name=ct, classes=classes)
-                        )
+                        st: _WebVTTCueSpanStartTag
+                        if annotation and ct == "lang":
+                            st = _WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip())
+                        elif annotation:
+                            st = _WebVTTCueSpanStartTagAnnotated(
+                                name=ct, classes=classes, annotation=annotation.strip()
+                            )
+                        else:
+                            st = _WebVTTCueSpanStartTag(name=ct, classes=classes)
                         it = _WebVTTCueInternalText(components=children)
                         cp: _WebVTTCueComponent
                         if ct == "c":
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index eca74ef4..adc3aac5 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -2228,7 +2228,7 @@
       "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).",
       "properties": {
         "start_time": {
-          "$ref": "#/$defs/_WebVTTTimestamp",
+          "$ref": "#/$defs/WebVTTTimestamp",
           "description": "Start time offset of the track cue",
           "examples": [
             "00.11.000",
@@ -2237,7 +2237,7 @@
           ]
         },
         "end_time": {
-          "$ref": "#/$defs/_WebVTTTimestamp",
+          "$ref": "#/$defs/WebVTTTimestamp",
           "description": "End time offset of the track cue",
           "examples": [
             "00.12.000",
@@ -3184,8 +3184,8 @@
       "title": "TitleItem",
       "type": "object"
     },
-    "_WebVTTTimestamp": {
-      "description": "WebVTT timestamp.\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.",
+    "WebVTTTimestamp": {
+      "description": "WebVTT timestamp.\n\nThe timestamp is a string consisting of the following components in the given order:\n\n- hours (optional, required if non-zero): two or more digits\n- minutes: two digits between 0 and 59\n- a colon character (:)\n- seconds: two digits between 0 and 59\n- a full stop character (.)\n- thousandths of a second: three digits\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.",
       "properties": {
         "raw": {
           "description": "A representation of the WebVTT Timestamp as a single string",
@@ -3196,7 +3196,7 @@
       "required": [
         "raw"
       ],
-      "title": "_WebVTTTimestamp",
+      "title": "WebVTTTimestamp",
       "type": "object"
     }
   },
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index b4d408cb..f4013831 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -9,17 +9,18 @@
 from pydantic import ValidationError
 
 from docling_core.types.doc.webvtt import (
+    WebVTTTimestamp,
     _WebVTTCueBlock,
     _WebVTTCueComponentWithTerminator,
     _WebVTTCueInternalText,
     _WebVTTCueItalicSpan,
     _WebVTTCueLanguageSpan,
+    _WebVTTCueLanguageSpanStartTag,
     _WebVTTCueSpanStartTagAnnotated,
     _WebVTTCueTextSpan,
     _WebVTTCueTimings,
     _WebVTTCueVoiceSpan,
     _WebVTTFile,
-    _WebVTTTimestamp,
 )
 
 from .test_data_gen_flag import GEN_TEST_DATA
@@ -42,7 +43,7 @@ def test_vtt_cue_commponents() -> None:
         0.0,
     ]
     for idx, ts in enumerate(valid_timestamps):
-        model = _WebVTTTimestamp(raw=ts)
+        model = WebVTTTimestamp(raw=ts)
         assert model.seconds == valid_total_seconds[idx]
 
     """Test invalid WebVTT timestamps."""
@@ -57,35 +58,35 @@ def test_vtt_cue_commponents() -> None:
     ]
     for ts in invalid_timestamps:
         with pytest.raises(ValidationError):
-            _WebVTTTimestamp(raw=ts)
+            WebVTTTimestamp(raw=ts)
 
     """Test the timestamp __str__ method."""
-    model = _WebVTTTimestamp(raw="00:01:02.345")
+    model = WebVTTTimestamp(raw="00:01:02.345")
     assert str(model) == "00:01:02.345"
 
     """Test valid cue timings."""
-    start = _WebVTTTimestamp(raw="00:10.005")
-    end = _WebVTTTimestamp(raw="00:14.007")
+    start = WebVTTTimestamp(raw="00:10.005")
+    end = WebVTTTimestamp(raw="00:14.007")
     cue_timings = _WebVTTCueTimings(start=start, end=end)
     assert cue_timings.start == start
     assert cue_timings.end == end
     assert str(cue_timings) == "00:10.005 --> 00:14.007"
 
     """Test invalid cue timings with end timestamp before start."""
-    start = _WebVTTTimestamp(raw="00:10.700")
-    end = _WebVTTTimestamp(raw="00:10.500")
+    start = WebVTTTimestamp(raw="00:10.700")
+    end = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
         _WebVTTCueTimings(start=start, end=end)
     assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
 
     """Test invalid cue timings with missing end."""
-    start = _WebVTTTimestamp(raw="00:10.500")
+    start = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
         _WebVTTCueTimings(start=start)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test invalid cue timings with missing start."""
-    end = _WebVTTTimestamp(raw="00:10.500")
+    end = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
         _WebVTTCueTimings(end=end)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
@@ -272,3 +273,13 @@ def test_webvtt_file() -> None:
     assert len(block.payload) == 1
     assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Good."
+
+
+def test_webvtt_cue_language_span_start_tag():
+    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')
+    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}')
+    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}')
+    with pytest.raises(ValidationError, match="BCP 47"):
+        _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}')
+    with pytest.raises(ValidationError, match="BCP 47"):
+        _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}')

From 190fd2789a27a9770ca6720c061d72fba6a2fd5b Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 12 Dec 2025 10:49:23 +0100
Subject: [PATCH 06/22] refactor(webvtt): set languages to a list of strings in
 ProvenanceTrack

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/document.py |  6 +++---
 docs/DoclingDocument.json          | 21 ++++++++++++++-------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 000d0811..c1dcaaa6 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1231,10 +1231,10 @@ class ProvenanceTrack(BaseModel):
         examples=["Mary", "Fred", "Name Surname"],
         description="The cue voice (speaker)",
     )
-    language: Optional[str] = Field(
+    languages: Optional[list[str]] = Field(
         None,
-        examples=["en", "en-GB", "fr-CA"],
-        description="Language of the cue in BCP 47 language tag format",
+        examples=[["en", "en-GB"], ["fr-CA"]],
+        description="Languages of the cue in BCP 47 language tag format",
     )
     classes: Optional[list[str]] = Field(
         None,
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index adc3aac5..35175601 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -2281,23 +2281,30 @@
           ],
           "title": "Voice"
         },
-        "language": {
+        "languages": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "type": "string"
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "Language of the cue in BCP 47 language tag format",
+          "description": "Languages of the cue in BCP 47 language tag format",
           "examples": [
-            "en",
-            "en-GB",
-            "fr-CA"
+            [
+              "en",
+              "en-GB"
+            ],
+            [
+              "fr-CA"
+            ]
           ],
-          "title": "Language"
+          "title": "Languages"
         },
         "classes": {
           "anyOf": [

From 0ee98a0465eae4e4587952c2d9ac5a855aa833ba Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 12 Dec 2025 11:04:59 +0100
Subject: [PATCH 07/22] tests(webvtt): add test for ProvenanceTrack

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 test/test_doc_base.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 709e2eac..18d2cf11 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,6 +1,8 @@
 import pytest
 from pydantic import ValidationError
 
+from docling_core.types.doc.document import ProvenanceTrack
+from docling_core.types.doc.webvtt import WebVTTTimestamp
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -37,3 +39,34 @@ def test_prov():
     with pytest.raises(ValidationError, match="at least 2 items"):
         prov["span"] = [0]
         Prov(**prov)
+
+
+def test_prov_track():
+    """Test the class ProvenanceTrack."""
+
+    valid_track = ProvenanceTrack(
+        start_time=WebVTTTimestamp(raw="00:11.000"),
+        end_time=WebVTTTimestamp(raw="00:12.000"),
+        identifier="test",
+        voice="Mary",
+        languages=["en", "en-GB"],
+        classes=["v.first.loud", "i.foreignphrase"],
+    )
+
+    assert valid_track
+    assert valid_track.start_time == WebVTTTimestamp(raw="00:11.000")
+    assert valid_track.end_time == WebVTTTimestamp(raw="00:12.000")
+    assert valid_track.identifier == "test"
+    assert valid_track.voice == "Mary"
+    assert valid_track.languages == ["en", "en-GB"]
+    assert valid_track.classes == ["v.first.loud", "i.foreignphrase"]
+
+    with pytest.raises(ValidationError, match="end_time"):
+        ProvenanceTrack(start_time=WebVTTTimestamp(raw="00:11.000"))
+
+    with pytest.raises(ValidationError, match="should be a valid list"):
+        ProvenanceTrack(
+            start_time=WebVTTTimestamp(raw="00:11.000"),
+            end_time=WebVTTTimestamp(raw="00:12.000"),
+            languages="en",
+        )

From 5206c0ce0956b3f6cc43f22558bc4715b0bc96a3 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 12 Dec 2025 11:29:51 +0100
Subject: [PATCH 08/22] refactor(webvtt): make all WebVTT classes public for
 reuse

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py | 156 +++++++++++++++++++------------
 test/test_webvtt.py              | 130 +++++++++++++-------------
 2 files changed, 161 insertions(+), 125 deletions(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index f6a6ea73..550498a9 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -84,7 +84,7 @@ def __str__(self) -> str:
         return self.raw
 
 
-class _WebVTTCueTimings(BaseModel):
+class WebVTTCueTimings(BaseModel):
     """WebVTT cue timings."""
 
     start: Annotated[WebVTTTimestamp, Field(description="Start time offset of the cue")]
@@ -92,6 +92,7 @@ class _WebVTTCueTimings(BaseModel):
 
     @model_validator(mode="after")
     def check_order(self) -> Self:
+        """Ensure start timestamp is strictly less than end timestamp."""
         if self.start and self.end:
             if self.end.seconds <= self.start.seconds:
                 raise ValueError("End timestamp must be greater than start timestamp")
@@ -99,10 +100,11 @@ def check_order(self) -> Self:
 
     @override
     def __str__(self):
+        """Return a string representation of the cue timings."""
         return f"{self.start} --> {self.end}"
 
 
-class _WebVTTCueTextSpan(BaseModel):
+class WebVTTCueTextSpan(BaseModel):
     """WebVTT cue text span."""
 
     kind: Literal["text"] = "text"
@@ -111,6 +113,7 @@ class _WebVTTCueTextSpan(BaseModel):
     @field_validator("text", mode="after")
     @classmethod
     def is_valid_text(cls, value: str) -> str:
+        """Ensure cue text contains only permitted characters and HTML entities."""
         for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
             if entity not in _VALID_ENTITIES:
@@ -126,36 +129,39 @@ def is_valid_text(cls, value: str) -> str:
 
     @override
     def __str__(self):
+        """Return a string representation of the cue text span."""
         return self.text
 
 
-class _WebVTTCueComponentWithTerminator(BaseModel):
+class WebVTTCueComponentWithTerminator(BaseModel):
     """WebVTT caption or subtitle cue component optionally with a line terminator."""
 
-    component: "_WebVTTCueComponent"
+    component: "WebVTTCueComponent"
     terminator: Optional[_WebVTTLineTerminator] = None
 
     @override
     def __str__(self):
+        """Return a string representation of the cue component with terminator."""
         return f"{self.component}{self.terminator.value if self.terminator else ''}"
 
 
-class _WebVTTCueInternalText(BaseModel):
+class WebVTTCueInternalText(BaseModel):
     """WebVTT cue internal text."""
 
     terminator: Optional[_WebVTTLineTerminator] = None
     components: Annotated[
-        list[_WebVTTCueComponentWithTerminator],
+        list[WebVTTCueComponentWithTerminator],
         Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
     ] = []
 
     @override
     def __str__(self):
+        """Return a string representation of the cue internal text."""
         cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}"
         return cue_str
 
 
-class _WebVTTCueSpanStartTag(BaseModel):
+class WebVTTCueSpanStartTag(BaseModel):
     """WebVTT cue span start tag."""
 
     name: Annotated[_START_TAG_NAMES, Field(description="The tag name")]
@@ -167,6 +173,7 @@ class _WebVTTCueSpanStartTag(BaseModel):
     @field_validator("classes", mode="after")
     @classmethod
     def validate_classes(cls, value: list[str]) -> list[str]:
+        """Validate cue span start tag classes."""
         for item in value:
             if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                 raise ValueError("A cue span start tag class contains invalid characters")
@@ -175,14 +182,16 @@ def validate_classes(cls, value: list[str]) -> list[str]:
         return value
 
     def _get_name_with_classes(self) -> str:
+        """Return the name of the cue span start tag with classes."""
         return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name
 
     @override
     def __str__(self):
+        """Return a string representation of the cue span start tag."""
         return f"<{self._get_name_with_classes()}>"
 
 
-class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag):
+class WebVTTCueSpanStartTagAnnotated(WebVTTCueSpanStartTag):
     """WebVTT cue span start tag requiring an annotation."""
 
     annotation: Annotated[str, Field(description="Cue span start tag annotation")]
@@ -190,6 +199,7 @@ class _WebVTTCueSpanStartTagAnnotated(_WebVTTCueSpanStartTag):
     @field_validator("annotation", mode="after")
     @classmethod
     def is_valid_annotation(cls, value: str) -> str:
+        """Ensure annotation contains only permitted characters and HTML entities."""
         for match in _ENTITY_PATTERN.finditer(value):
             entity = match.group(1)
             if entity not in _VALID_ENTITIES:
@@ -205,10 +215,13 @@ def is_valid_annotation(cls, value: str) -> str:
 
     @override
     def __str__(self):
+        """Return a string representation of the cue span start tag."""
         return f"<{self._get_name_with_classes()} {self.annotation}>"
 
 
-class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated):
+class WebVTTCueLanguageSpanStartTag(WebVTTCueSpanStartTagAnnotated):
+    """WebVTT cue language span start tag."""
+
     _pattern: ClassVar[re.Pattern] = re.compile(r"^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{2,8})*$", re.IGNORECASE)
 
     name: Literal["lang"] = Field("lang", description="The tag name")
@@ -217,13 +230,14 @@ class _WebVTTCueLanguageSpanStartTag(_WebVTTCueSpanStartTagAnnotated):
     @classmethod
     @override
     def is_valid_annotation(cls, value: str) -> str:
+        """Ensure that the language annotation is in BCP 47 language tag format."""
         if cls._pattern.match(value):
             return value
         else:
             raise ValueError("Annotation should be in BCP 47 language tag format")
 
 
-class _WebVTTCueComponentBase(BaseModel):
+class WebVTTCueComponentBase(BaseModel):
     """WebVTT caption or subtitle cue component.
 
     All the WebVTT caption or subtitle cue components are represented by this class
@@ -231,28 +245,30 @@ class _WebVTTCueComponentBase(BaseModel):
     """
 
     kind: Literal["c", "b", "i", "u", "v", "lang"]
-    start_tag: _WebVTTCueSpanStartTag
-    internal_text: _WebVTTCueInternalText
+    start_tag: WebVTTCueSpanStartTag
+    internal_text: WebVTTCueInternalText
 
     @model_validator(mode="after")
     def check_tag_names_match(self) -> Self:
+        """Ensure that the start tag name matches this cue component type."""
         if self.kind != self.start_tag.name:
             raise ValueError("The tag name of this cue component should be {self.kind}")
         return self
 
     @override
     def __str__(self):
+        """Return a string representation of the cue component."""
         return f"{self.start_tag}{self.internal_text}</{self.start_tag.name}>"
 
 
-class _WebVTTCueVoiceSpan(_WebVTTCueComponentBase):
+class WebVTTCueVoiceSpan(WebVTTCueComponentBase):
     """WebVTT cue voice span associated with a specific voice."""
 
     kind: Literal["v"] = "v"
-    start_tag: _WebVTTCueSpanStartTagAnnotated
+    start_tag: WebVTTCueSpanStartTagAnnotated
 
 
-class _WebVTTCueClassSpan(_WebVTTCueComponentBase):
+class WebVTTCueClassSpan(WebVTTCueComponentBase):
     """WebVTT cue class span.
 
     It represents a span of text and it is used to annotate parts of the cue with
@@ -260,31 +276,31 @@ class _WebVTTCueClassSpan(_WebVTTCueComponentBase):
     """
 
     kind: Literal["c"] = "c"
-    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="c")
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="c")
 
 
-class _WebVTTCueItalicSpan(_WebVTTCueComponentBase):
+class WebVTTCueItalicSpan(WebVTTCueComponentBase):
     """WebVTT cue italic span representing a span of italic text."""
 
     kind: Literal["i"] = "i"
-    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="i")
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="i")
 
 
-class _WebVTTCueBoldSpan(_WebVTTCueComponentBase):
+class WebVTTCueBoldSpan(WebVTTCueComponentBase):
     """WebVTT cue bold span representing a span of bold text."""
 
     kind: Literal["b"] = "b"
-    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="b")
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="b")
 
 
-class _WebVTTCueUnderlineSpan(_WebVTTCueComponentBase):
+class WebVTTCueUnderlineSpan(WebVTTCueComponentBase):
     """WebVTT cue underline span representing a span of underline text."""
 
     kind: Literal["u"] = "u"
-    start_tag: _WebVTTCueSpanStartTag = _WebVTTCueSpanStartTag(name="u")
+    start_tag: WebVTTCueSpanStartTag = WebVTTCueSpanStartTag(name="u")
 
 
-class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
+class WebVTTCueLanguageSpan(WebVTTCueComponentBase):
     """WebVTT cue language span.
 
     It represents a span of text and it is used to annotate parts of the cue where the
@@ -293,18 +309,18 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
     """
 
     kind: Literal["lang"] = "lang"
-    start_tag: _WebVTTCueLanguageSpanStartTag
+    start_tag: WebVTTCueLanguageSpanStartTag
 
 
-_WebVTTCueComponent = Annotated[
+WebVTTCueComponent = Annotated[
     Union[
-        _WebVTTCueTextSpan,
-        _WebVTTCueClassSpan,
-        _WebVTTCueItalicSpan,
-        _WebVTTCueBoldSpan,
-        _WebVTTCueUnderlineSpan,
-        _WebVTTCueVoiceSpan,
-        _WebVTTCueLanguageSpan,
+        WebVTTCueTextSpan,
+        WebVTTCueClassSpan,
+        WebVTTCueItalicSpan,
+        WebVTTCueBoldSpan,
+        WebVTTCueUnderlineSpan,
+        WebVTTCueVoiceSpan,
+        WebVTTCueLanguageSpan,
     ],
     Field(
         discriminator="kind",
@@ -313,7 +329,7 @@ class _WebVTTCueLanguageSpan(_WebVTTCueComponentBase):
 ]
 
 
-class _WebVTTCueBlock(BaseModel):
+class WebVTTCueBlock(BaseModel):
     """Model representing a WebVTT cue block.
 
     The optional WebVTT cue settings list is not supported.
@@ -324,9 +340,9 @@ class _WebVTTCueBlock(BaseModel):
     model_config = ConfigDict(regex_engine="python-re")
 
     identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
-    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
+    timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
-        list[_WebVTTCueComponentWithTerminator],
+        list[WebVTTCueComponentWithTerminator],
         Field(description="The WebVTT caption or subtitle cue text"),
     ]
 
@@ -341,13 +357,22 @@ class _WebVTTCueBlock(BaseModel):
     @field_validator("payload", mode="after")
     @classmethod
     def validate_payload(cls, payload):
+        """Ensure that no cue payload component contains the '-->' substring."""
         for voice in payload:
             if "-->" in str(voice):
                 raise ValueError("Cue payload must not contain '-->'")
         return payload
 
     @classmethod
-    def parse(cls, raw: str) -> "_WebVTTCueBlock":
+    def parse(cls, raw: str) -> "WebVTTCueBlock":
+        """Parse a WebVTT cue block from a string.
+
+        Args:
+            raw: The raw WebVTT cue block string.
+
+        Returns:
+            The parsed WebVTT cue block.
+        """
         lines = raw.strip().splitlines()
         if not lines:
             raise ValueError("Cue block must have at least one line")
@@ -365,7 +390,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
 
         start, end = [t.strip() for t in timing_line.split("-->")]
         end = re.split(" |\t", end)[0]  # ignore the cue settings list
-        timings: _WebVTTCueTimings = _WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
+        timings: WebVTTCueTimings = WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
         cue_text = " ".join(cue_lines).strip()
         # adding close tag for cue spans without end tag
         for omm in {"v"}:
@@ -373,7 +398,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                 cue_text += f"</{omm}>"
                 break
 
-        stack: list[list[_WebVTTCueComponentWithTerminator]] = [[]]
+        stack: list[list[WebVTTCueComponentWithTerminator]] = [[]]
         tag_stack: list[dict] = []
 
         pos = 0
@@ -383,7 +408,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
             match = matches[i]
             if match.start() > pos:
                 stack[-1].append(
-                    _WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
+                    WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
                 )
             gps = {k: (v if v else None) for k, v in match.groupdict().items()}
 
@@ -402,36 +427,34 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
                         classes: list[str] = []
                         if class_string:
                             classes = [c for c in class_string.split(".") if c]
-                        st: _WebVTTCueSpanStartTag
+                        st: WebVTTCueSpanStartTag
                         if annotation and ct == "lang":
-                            st = _WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip())
+                            st = WebVTTCueLanguageSpanStartTag(name=ct, classes=classes, annotation=annotation.strip())
                         elif annotation:
-                            st = _WebVTTCueSpanStartTagAnnotated(
-                                name=ct, classes=classes, annotation=annotation.strip()
-                            )
+                            st = WebVTTCueSpanStartTagAnnotated(name=ct, classes=classes, annotation=annotation.strip())
                         else:
-                            st = _WebVTTCueSpanStartTag(name=ct, classes=classes)
-                        it = _WebVTTCueInternalText(components=children)
-                        cp: _WebVTTCueComponent
+                            st = WebVTTCueSpanStartTag(name=ct, classes=classes)
+                        it = WebVTTCueInternalText(components=children)
+                        cp: WebVTTCueComponent
                         if ct == "c":
-                            cp = _WebVTTCueClassSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueClassSpan(start_tag=st, internal_text=it)
                         elif ct == "b":
-                            cp = _WebVTTCueBoldSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueBoldSpan(start_tag=st, internal_text=it)
                         elif ct == "i":
-                            cp = _WebVTTCueItalicSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueItalicSpan(start_tag=st, internal_text=it)
                         elif ct == "u":
-                            cp = _WebVTTCueUnderlineSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueUnderlineSpan(start_tag=st, internal_text=it)
                         elif ct == "lang":
-                            cp = _WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
+                            cp = WebVTTCueLanguageSpan(start_tag=st, internal_text=it)
                         elif ct == "v":
-                            cp = _WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
-                        stack[-1].append(_WebVTTCueComponentWithTerminator(component=cp))
+                            cp = WebVTTCueVoiceSpan(start_tag=st, internal_text=it)
+                        stack[-1].append(WebVTTCueComponentWithTerminator(component=cp))
 
             pos = match.end()
             i += 1
 
         if pos < len(cue_text):
-            stack[-1].append(_WebVTTCueComponentWithTerminator(component=_WebVTTCueTextSpan(text=cue_text[pos:])))
+            stack[-1].append(WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos:])))
 
         return cls(
             identifier=identifier,
@@ -440,6 +463,7 @@ def parse(cls, raw: str) -> "_WebVTTCueBlock":
         )
 
     def __str__(self):
+        """Return a string representation of the WebVTT cue block."""
         parts = []
         if self.identifier:
             parts.append(f"{self.identifier}\n")
@@ -455,13 +479,14 @@ def __str__(self):
         return "".join(parts) + "\n"
 
 
-class _WebVTTFile(BaseModel):
+class WebVTTFile(BaseModel):
     """A model representing a WebVTT file."""
 
-    cue_blocks: list[_WebVTTCueBlock]
+    cue_blocks: list[WebVTTCueBlock]
 
     @staticmethod
     def verify_signature(content: str) -> bool:
+        """Verify the WebVTT file signature."""
         if not content:
             return False
         elif len(content) == 6:
@@ -472,7 +497,15 @@ def verify_signature(content: str) -> bool:
             return False
 
     @classmethod
-    def parse(cls, raw: str) -> "_WebVTTFile":
+    def parse(cls, raw: str) -> "WebVTTFile":
+        """Parse a WebVTT file.
+
+        Args:
+            raw: The raw WebVTT file content.
+
+        Returns:
+            The parsed WebVTT file.
+        """
         # Normalize newlines to LF
         raw = raw.replace("\r\n", "\n").replace("\r", "\n")
 
@@ -490,20 +523,23 @@ def parse(cls, raw: str) -> "_WebVTTFile":
 
         # Split into cue blocks
         raw_blocks = re.split(r"\n\s*\n", body.strip())
-        cues: list[_WebVTTCueBlock] = []
+        cues: list[WebVTTCueBlock] = []
         for block in raw_blocks:
             try:
-                cues.append(_WebVTTCueBlock.parse(block))
+                cues.append(WebVTTCueBlock.parse(block))
             except ValueError as e:
                 _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
 
         return cls(cue_blocks=cues)
 
     def __iter__(self):
+        """Return an iterator over the cue blocks."""
         return iter(self.cue_blocks)
 
     def __getitem__(self, idx):
+        """Return the cue block at the given index."""
         return self.cue_blocks[idx]
 
     def __len__(self):
+        """Return the number of cue blocks."""
         return len(self.cue_blocks)
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index f4013831..9e47f1a8 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -9,18 +9,18 @@
 from pydantic import ValidationError
 
 from docling_core.types.doc.webvtt import (
+    WebVTTCueBlock,
+    WebVTTCueComponentWithTerminator,
+    WebVTTCueInternalText,
+    WebVTTCueItalicSpan,
+    WebVTTCueLanguageSpan,
+    WebVTTCueLanguageSpanStartTag,
+    WebVTTCueSpanStartTagAnnotated,
+    WebVTTCueTextSpan,
+    WebVTTCueTimings,
+    WebVTTCueVoiceSpan,
+    WebVTTFile,
     WebVTTTimestamp,
-    _WebVTTCueBlock,
-    _WebVTTCueComponentWithTerminator,
-    _WebVTTCueInternalText,
-    _WebVTTCueItalicSpan,
-    _WebVTTCueLanguageSpan,
-    _WebVTTCueLanguageSpanStartTag,
-    _WebVTTCueSpanStartTagAnnotated,
-    _WebVTTCueTextSpan,
-    _WebVTTCueTimings,
-    _WebVTTCueVoiceSpan,
-    _WebVTTFile,
 )
 
 from .test_data_gen_flag import GEN_TEST_DATA
@@ -67,7 +67,7 @@ def test_vtt_cue_commponents() -> None:
     """Test valid cue timings."""
     start = WebVTTTimestamp(raw="00:10.005")
     end = WebVTTTimestamp(raw="00:14.007")
-    cue_timings = _WebVTTCueTimings(start=start, end=end)
+    cue_timings = WebVTTCueTimings(start=start, end=end)
     assert cue_timings.start == start
     assert cue_timings.end == end
     assert str(cue_timings) == "00:10.005 --> 00:14.007"
@@ -76,92 +76,92 @@ def test_vtt_cue_commponents() -> None:
     start = WebVTTTimestamp(raw="00:10.700")
     end = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start, end=end)
+        WebVTTCueTimings(start=start, end=end)
     assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
 
     """Test invalid cue timings with missing end."""
     start = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start)  # type: ignore[call-arg]
+        WebVTTCueTimings(start=start)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test invalid cue timings with missing start."""
     end = WebVTTTimestamp(raw="00:10.500")
     with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(end=end)  # type: ignore[call-arg]
+        WebVTTCueTimings(end=end)  # type: ignore[call-arg]
     assert "Field required" in str(excinfo.value)
 
     """Test with valid text."""
     valid_text = "This is a valid cue text span."
-    span = _WebVTTCueTextSpan(text=valid_text)
+    span = WebVTTCueTextSpan(text=valid_text)
     assert span.text == valid_text
     assert str(span) == valid_text
 
     """Test with text containing newline characters."""
     invalid_text = "This cue text span\ncontains a newline."
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
+        WebVTTCueTextSpan(text=invalid_text)
 
     """Test with text containing ampersand."""
     invalid_text = "This cue text span contains &."
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
+        WebVTTCueTextSpan(text=invalid_text)
     invalid_text = "An invalid &foo; entity"
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
+        WebVTTCueTextSpan(text=invalid_text)
     valid_text = "My favorite book is Pride &amp; Prejudice"
-    span = _WebVTTCueTextSpan(text=valid_text)
+    span = WebVTTCueTextSpan(text=valid_text)
     assert span.text == valid_text
 
     """Test with text containing less-than sign."""
     invalid_text = "This cue text span contains <."
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
+        WebVTTCueTextSpan(text=invalid_text)
 
     """Test with empty text."""
     with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text="")
+        WebVTTCueTextSpan(text="")
 
     """Test that annotation validation works correctly."""
     valid_annotation = "valid-annotation"
     invalid_annotation = "invalid\nannotation"
     with pytest.raises(ValidationError):
-        _WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation)
-    assert _WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation)
+        WebVTTCueSpanStartTagAnnotated(name="v", annotation=invalid_annotation)
+    assert WebVTTCueSpanStartTagAnnotated(name="v", annotation=valid_annotation)
 
     """Test that classes validation works correctly."""
     annotation = "speaker name"
     valid_classes = ["class1", "class2"]
     invalid_classes = ["class\nwith\nnewlines", ""]
     with pytest.raises(ValidationError):
-        _WebVTTCueSpanStartTagAnnotated(
+        WebVTTCueSpanStartTagAnnotated(
             name="v", annotation=annotation, classes=invalid_classes
         )
-    assert _WebVTTCueSpanStartTagAnnotated(
+    assert WebVTTCueSpanStartTagAnnotated(
         name="v", annotation=annotation, classes=valid_classes
     )
 
     """Test that components validation works correctly."""
     annotation = "speaker name"
     valid_components = [
-        _WebVTTCueComponentWithTerminator(
-            component=_WebVTTCueTextSpan(text="random text")
+        WebVTTCueComponentWithTerminator(
+            component=WebVTTCueTextSpan(text="random text")
         )
     ]
     invalid_components = [123, "not a component"]
     with pytest.raises(ValidationError):
-        _WebVTTCueInternalText(components=invalid_components)
-    assert _WebVTTCueInternalText(components=valid_components)
+        WebVTTCueInternalText(components=invalid_components)
+    assert WebVTTCueInternalText(components=valid_components)
 
     """Test valid cue voice spans."""
-    cue_span = _WebVTTCueVoiceSpan(
-        start_tag=_WebVTTCueSpanStartTagAnnotated(
+    cue_span = WebVTTCueVoiceSpan(
+        start_tag=WebVTTCueSpanStartTagAnnotated(
             name="v", annotation="speaker", classes=["loud", "clear"]
         ),
-        internal_text=_WebVTTCueInternalText(
+        internal_text=WebVTTCueInternalText(
             components=[
-                _WebVTTCueComponentWithTerminator(
-                    component=_WebVTTCueTextSpan(text="random text")
+                WebVTTCueComponentWithTerminator(
+                    component=WebVTTCueTextSpan(text="random text")
                 )
             ]
         ),
@@ -169,12 +169,12 @@ def test_vtt_cue_commponents() -> None:
     expected_str = "<v.loud.clear speaker>random text</v>"
     assert str(cue_span) == expected_str
 
-    cue_span = _WebVTTCueVoiceSpan(
-        start_tag=_WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"),
-        internal_text=_WebVTTCueInternalText(
+    cue_span = WebVTTCueVoiceSpan(
+        start_tag=WebVTTCueSpanStartTagAnnotated(name="v", annotation="speaker"),
+        internal_text=WebVTTCueInternalText(
             components=[
-                _WebVTTCueComponentWithTerminator(
-                    component=_WebVTTCueTextSpan(text="random text")
+                WebVTTCueComponentWithTerminator(
+                    component=WebVTTCueTextSpan(text="random text")
                 )
             ]
         ),
@@ -188,11 +188,11 @@ def test_webvttcueblock_parse() -> None:
     raw: str = (
         "04:02.500 --> 04:05.000\n" "J’ai commencé le basket à l'âge de 13, 14 ans\n"
     )
-    block: _WebVTTCueBlock = _WebVTTCueBlock.parse(raw)
+    block: WebVTTCueBlock = WebVTTCueBlock.parse(raw)
     assert str(block.timings) == "04:02.500 --> 04:05.000"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator)
-    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert (
         block.payload[0].component.text
         == "J’ai commencé le basket à l'âge de 13, 14 ans"
@@ -203,23 +203,23 @@ def test_webvttcueblock_parse() -> None:
         "04:05.001 --> 04:07.800\n"
         "Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier\n"
     )
-    block = _WebVTTCueBlock.parse(raw)
+    block = WebVTTCueBlock.parse(raw)
     assert str(block.timings) == "04:05.001 --> 04:07.800"
     assert len(block.payload) == 3
-    assert isinstance(block.payload[0], _WebVTTCueComponentWithTerminator)
-    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert isinstance(block.payload[0], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Sur les "
-    assert isinstance(block.payload[1], _WebVTTCueComponentWithTerminator)
-    assert isinstance(block.payload[1].component, _WebVTTCueItalicSpan)
+    assert isinstance(block.payload[1], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[1].component, WebVTTCueItalicSpan)
     assert len(block.payload[1].component.internal_text.components) == 1
     lang_span = block.payload[1].component.internal_text.components[0].component
-    assert isinstance(lang_span, _WebVTTCueLanguageSpan)
+    assert isinstance(lang_span, WebVTTCueLanguageSpan)
     assert isinstance(
-        lang_span.internal_text.components[0].component, _WebVTTCueTextSpan
+        lang_span.internal_text.components[0].component, WebVTTCueTextSpan
     )
     assert lang_span.internal_text.components[0].component.text == "playground"
-    assert isinstance(block.payload[2], _WebVTTCueComponentWithTerminator)
-    assert isinstance(block.payload[2].component, _WebVTTCueTextSpan)
+    assert isinstance(block.payload[2], WebVTTCueComponentWithTerminator)
+    assert isinstance(block.payload[2].component, WebVTTCueTextSpan)
     assert block.payload[2].component.text == ", ici à Montpellier"
     assert raw == str(block)
 
@@ -228,26 +228,26 @@ def test_webvtt_file() -> None:
     """Test WebVTT files."""
     with open("./test/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
         content = f.read()
-        vtt = _WebVTTFile.parse(content)
+        vtt = WebVTTFile.parse(content)
     assert len(vtt) == 13
     block = vtt.cue_blocks[11]
     assert str(block.timings) == "00:32.500 --> 00:33.500"
     assert len(block.payload) == 1
     cue_span = block.payload[0]
-    assert isinstance(cue_span.component, _WebVTTCueVoiceSpan)
+    assert isinstance(cue_span.component, WebVTTCueVoiceSpan)
     assert cue_span.component.start_tag.annotation == "Neil deGrasse Tyson"
     assert not cue_span.component.start_tag.classes
     assert len(cue_span.component.internal_text.components) == 1
     comp = cue_span.component.internal_text.components[0]
-    assert isinstance(comp.component, _WebVTTCueItalicSpan)
+    assert isinstance(comp.component, WebVTTCueItalicSpan)
     assert len(comp.component.internal_text.components) == 1
     comp2 = comp.component.internal_text.components[0]
-    assert isinstance(comp2.component, _WebVTTCueTextSpan)
+    assert isinstance(comp2.component, WebVTTCueTextSpan)
     assert comp2.component.text == "Laughs"
 
     with open("./test/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
         content = f.read()
-        vtt = _WebVTTFile.parse(content)
+        vtt = WebVTTFile.parse(content)
     assert len(vtt) == 4
     reverse = (
         "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
@@ -258,7 +258,7 @@ def test_webvtt_file() -> None:
 
     with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
         content = f.read()
-        vtt = _WebVTTFile.parse(content)
+        vtt = WebVTTFile.parse(content)
     assert len(vtt) == 13
     for block in vtt:
         assert block.identifier
@@ -266,20 +266,20 @@ def test_webvtt_file() -> None:
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
     assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0].component, _WebVTTCueVoiceSpan)
+    assert isinstance(block.payload[0].component, WebVTTCueVoiceSpan)
     block = vtt.cue_blocks[2]
     assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
     assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
     assert len(block.payload) == 1
-    assert isinstance(block.payload[0].component, _WebVTTCueTextSpan)
+    assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Good."
 
 
 def test_webvtt_cue_language_span_start_tag():
-    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')
-    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}')
-    _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}')
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en-US"}')
+    WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "zh-Hant"}')
     with pytest.raises(ValidationError, match="BCP 47"):
-        _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}')
+        WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en_US"}')
     with pytest.raises(ValidationError, match="BCP 47"):
-        _WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}')
+        WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "123-de"}')

From 15ec4c496d2a846818c9b636cd064550ddbb2afc Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 12 Dec 2025 17:05:16 +0100
Subject: [PATCH 09/22] chore(webvtt): preserve newlines as
 WebVTTLineTerminator

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py       | 38 ++++++++++++++++++--------
 test/data/webvtt/webvtt_example_04.vtt | 13 +++++++++
 test/test_webvtt.py                    | 13 +++++++++
 3 files changed, 53 insertions(+), 11 deletions(-)
 create mode 100644 test/data/webvtt/webvtt_example_04.vtt

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 550498a9..023b0192 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -2,6 +2,7 @@
 
 import logging
 import re
+from collections.abc import Iterator
 from enum import Enum
 from typing import Annotated, ClassVar, Literal, Optional, Union
 
@@ -17,13 +18,15 @@
 _START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
 
 
-class _WebVTTLineTerminator(str, Enum):
+class WebVTTLineTerminator(str, Enum):
+    """WebVTT line terminator."""
+
     CRLF = "\r\n"
     LF = "\n"
     CR = "\r"
 
 
-_WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
+WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
 
 
 class WebVTTTimestamp(BaseModel):
@@ -137,7 +140,7 @@ class WebVTTCueComponentWithTerminator(BaseModel):
     """WebVTT caption or subtitle cue component optionally with a line terminator."""
 
     component: "WebVTTCueComponent"
-    terminator: Optional[_WebVTTLineTerminator] = None
+    terminator: Optional[WebVTTLineTerminator] = None
 
     @override
     def __str__(self):
@@ -148,7 +151,7 @@ def __str__(self):
 class WebVTTCueInternalText(BaseModel):
     """WebVTT cue internal text."""
 
-    terminator: Optional[_WebVTTLineTerminator] = None
+    terminator: Optional[WebVTTLineTerminator] = None
     components: Annotated[
         list[WebVTTCueComponentWithTerminator],
         Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
@@ -339,7 +342,7 @@ class WebVTTCueBlock(BaseModel):
 
     model_config = ConfigDict(regex_engine="python-re")
 
-    identifier: Optional[_WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
+    identifier: Optional[WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
     timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
         list[WebVTTCueComponentWithTerminator],
@@ -363,6 +366,19 @@ def validate_payload(cls, payload):
                 raise ValueError("Cue payload must not contain '-->'")
         return payload
 
+    @staticmethod
+    def _create_text_components(
+        text: str,
+    ) -> Iterator[WebVTTCueComponentWithTerminator]:
+        """Yield a text span per non-empty line of `text`; all lines but the last carry an LF terminator."""
+        lines = text.split("\n")
+        # A trailing "\n" only produces a final empty item, which is skipped below,
+        # so comparing against the last index is enough to pick the terminator.
+        for idx, line in enumerate(lines):
+            terminator = WebVTTLineTerminator.LF if idx < len(lines) - 1 else None
+            if line:
+                yield WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=line), terminator=terminator)
+
     @classmethod
     def parse(cls, raw: str) -> "WebVTTCueBlock":
         """Parse a WebVTT cue block from a string.
@@ -376,7 +392,7 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
         lines = raw.strip().splitlines()
         if not lines:
             raise ValueError("Cue block must have at least one line")
-        identifier: Optional[_WebVTTCueIdentifier] = None
+        identifier: Optional[WebVTTCueIdentifier] = None
         timing_line = lines[0]
         if "-->" not in timing_line and len(lines) > 1:
             identifier = timing_line
@@ -391,7 +407,7 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
         start, end = [t.strip() for t in timing_line.split("-->")]
         end = re.split(" |\t", end)[0]  # ignore the cue settings list
         timings: WebVTTCueTimings = WebVTTCueTimings(start=WebVTTTimestamp(raw=start), end=WebVTTTimestamp(raw=end))
-        cue_text = " ".join(cue_lines).strip()
+        cue_text = "\n".join(cue_lines).strip()
         # adding close tag for cue spans without end tag
         for omm in {"v"}:
             if cue_text.startswith(f"<{omm}") and f"</{omm}>" not in cue_text:
@@ -407,9 +423,8 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
         while i < len(matches):
             match = matches[i]
             if match.start() > pos:
-                stack[-1].append(
-                    WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
-                )
+                text = cue_text[pos : match.start()]
+                stack[-1].extend(cls._create_text_components(text))
             gps = {k: (v if v else None) for k, v in match.groupdict().items()}
 
             if gps["tag"] in {"c", "b", "i", "u", "v", "lang"}:
@@ -454,7 +469,8 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
             i += 1
 
         if pos < len(cue_text):
-            stack[-1].append(WebVTTCueComponentWithTerminator(component=WebVTTCueTextSpan(text=cue_text[pos:])))
+            text = cue_text[pos:]
+            stack[-1].extend(cls._create_text_components(text))
 
         return cls(
             identifier=identifier,
diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt
new file mode 100644
index 00000000..91be3530
--- /dev/null
+++ b/test/data/webvtt/webvtt_example_04.vtt
@@ -0,0 +1,13 @@
+WEBVTT
+
+NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
+
+00:01.000 --> 00:04.000
+Never drink liquid nitrogen.
+
+NOTE I’m not sure the timing is right on the following cue.
+
+00:05.000 --> 00:09.000
+— It will perforate your stomach.
+— You could <b.loud>die</b>.
+<v John>This is true.</v>
\ No newline at end of file
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 9e47f1a8..1bf9edb8 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -274,6 +274,19 @@ def test_webvtt_file() -> None:
     assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Good."
 
+    with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f:
+        content = f.read()
+        vtt = WebVTTFile.parse(content)
+    assert len(vtt) == 2
+    block = vtt.cue_blocks[1]
+    assert len(block.payload) == 5
+    assert str(block) == (
+        "00:05.000 --> 00:09.000\n"
+        "— It will perforate your stomach.\n"
+        "— You could <b.loud>die</b>.\n"
+        "<v John>This is true.</v>\n"
+    )
+
 
 def test_webvtt_cue_language_span_start_tag():
     WebVTTCueLanguageSpanStartTag.model_validate_json('{"annotation": "en"}')

From 714c60abaa7450fe6a83131ad1304923e158c2c2 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Sun, 14 Dec 2025 23:52:36 +0100
Subject: [PATCH 10/22] refactor(webvtt): set ProvenanceTrack time fields as
 float

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/document.py | 20 ++++++++++-----
 docling_core/types/doc/webvtt.py   |  2 +-
 docs/DoclingDocument.json          | 41 ++++++++++--------------------
 test/test_doc_base.py              | 23 ++++++++++-------
 4 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index c1dcaaa6..dd911ab6 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -67,7 +67,6 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
-from docling_core.types.doc.webvtt import WebVTTTimestamp
 
 _logger = logging.getLogger(__name__)
 
@@ -1208,17 +1207,17 @@ class ProvenanceTrack(BaseModel):
     """
 
     start_time: Annotated[
-        WebVTTTimestamp,
+        float,
         Field(
-            examples=["00.11.000", "00:00:06.500", "01:28:34.300"],
-            description="Start time offset of the track cue",
+            examples=[11.0, 6.5, 5370.0],
+            description="Start time offset of the track cue in seconds",
         ),
     ]
     end_time: Annotated[
-        WebVTTTimestamp,
+        float,
         Field(
-            examples=["00.12.000", "00:00:08.200", "01:29:30.100"],
-            description="End time offset of the track cue",
+            examples=[12.0, 8.2, 5370.1],
+            description="End time offset of the track cue in seconds",
         ),
     ]
     identifier: Optional[str] = Field(
@@ -1243,6 +1242,13 @@ class ProvenanceTrack(BaseModel):
         description="Classes for describing the cue significance",
     )
 
+    @model_validator(mode="after")
+    def check_order(self) -> Self:
+        """Raise a `ValueError` if the end time is less than or equal to the start time."""
+        if self.start_time >= self.end_time:
+            raise ValueError("End time must be greater than start time")
+        return self
+
 
 def get_provenance_discriminator_value(v: Any) -> str:
     """Callable discriminator for provenance instances.
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 023b0192..30fa1a4f 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -95,7 +95,7 @@ class WebVTTCueTimings(BaseModel):
 
     @model_validator(mode="after")
     def check_order(self) -> Self:
-        """Ensure start timestamp is less than or equal to end timestamp."""
+        """Ensure start timestamp is less than end timestamp."""
         if self.start and self.end:
             if self.end.seconds <= self.start.seconds:
                 raise ValueError("End timestamp must be greater than start timestamp")
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index 35175601..45a5d889 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -2228,22 +2228,24 @@
       "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).",
       "properties": {
         "start_time": {
-          "$ref": "#/$defs/WebVTTTimestamp",
-          "description": "Start time offset of the track cue",
+          "description": "Start time offset of the track cue in seconds",
           "examples": [
-            "00.11.000",
-            "00:00:06.500",
-            "01:28:34.300"
-          ]
+            11.0,
+            6.5,
+            5370.0
+          ],
+          "title": "Start Time",
+          "type": "number"
         },
         "end_time": {
-          "$ref": "#/$defs/WebVTTTimestamp",
-          "description": "End time offset of the track cue",
+          "description": "End time offset of the track cue in seconds",
           "examples": [
-            "00.12.000",
-            "00:00:08.200",
-            "01:29:30.100"
-          ]
+            12.0,
+            8.2,
+            5370.1
+          ],
+          "title": "End Time",
+          "type": "number"
         },
         "identifier": {
           "anyOf": [
@@ -3190,21 +3192,6 @@
       ],
       "title": "TitleItem",
       "type": "object"
-    },
-    "WebVTTTimestamp": {
-      "description": "WebVTT timestamp.\n\nThe timestamp is a string consisting of the following components in the given order:\n\n- hours (optional, required if non-zero): two or more digits\n- minutes: two digits between 0 and 59\n- a colon character (:)\n- seconds: two digits between 0 and 59\n- a full stop character (.)\n- thousandths of a second: three digits\n\nA WebVTT timestamp is always interpreted relative to the current playback position\nof the media data that the WebVTT file is to be synchronized with.",
-      "properties": {
-        "raw": {
-          "description": "A representation of the WebVTT Timestamp as a single string",
-          "title": "Raw",
-          "type": "string"
-        }
-      },
-      "required": [
-        "raw"
-      ],
-      "title": "WebVTTTimestamp",
-      "type": "object"
     }
   },
   "description": "DoclingDocument.",
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 18d2cf11..2d1ce498 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,8 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.doc.document import ProvenanceTrack
-from docling_core.types.doc.webvtt import WebVTTTimestamp
+from docling_core.types.doc import ProvenanceTrack
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -45,8 +44,8 @@ def test_prov_track():
     """Test the class ProvenanceTrack."""
 
     valid_track = ProvenanceTrack(
-        start_time=WebVTTTimestamp(raw="00:11.000"),
-        end_time=WebVTTTimestamp(raw="00:12.000"),
+        start_time=11.0,
+        end_time=12.0,
         identifier="test",
         voice="Mary",
         languages=["en", "en-GB"],
@@ -54,19 +53,25 @@ def test_prov_track():
     )
 
     assert valid_track
-    assert valid_track.start_time == WebVTTTimestamp(raw="00:11.000")
-    assert valid_track.end_time == WebVTTTimestamp(raw="00:12.000")
+    assert valid_track.start_time == 11.0
+    assert valid_track.end_time == 12.0
     assert valid_track.identifier == "test"
     assert valid_track.voice == "Mary"
     assert valid_track.languages == ["en", "en-GB"]
     assert valid_track.classes == ["v.first.loud", "i.foreignphrase"]
 
     with pytest.raises(ValidationError, match="end_time"):
-        ProvenanceTrack(start_time=WebVTTTimestamp(raw="00:11.000"))
+        ProvenanceTrack(start_time=11.0)
 
     with pytest.raises(ValidationError, match="should be a valid list"):
         ProvenanceTrack(
-            start_time=WebVTTTimestamp(raw="00:11.000"),
-            end_time=WebVTTTimestamp(raw="00:12.000"),
+            start_time=11.0,
+            end_time=12.0,
             languages="en",
         )
+
+    with pytest.raises(ValidationError, match="must be greater than start"):
+        ProvenanceTrack(
+            start_time=11.0,
+            end_time=11.0,
+        )

From 612bcdae384262fda8fbeb5762b2b8407bdf4375 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 15 Dec 2025 22:13:16 +0100
Subject: [PATCH 11/22] chore(webvtt): ensure start time offsets are in
 sequence

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py | 34 +++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 30fa1a4f..bf5b7227 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -4,6 +4,7 @@
 import re
 from collections.abc import Iterator
 from enum import Enum
+from functools import total_ordering
 from typing import Annotated, ClassVar, Literal, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
@@ -29,6 +30,7 @@ class WebVTTLineTerminator(str, Enum):
 WebVTTCueIdentifier = Annotated[str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")]
 
 
+@total_ordering
 class WebVTTTimestamp(BaseModel):
     """WebVTT timestamp.
 
@@ -81,6 +83,18 @@ def seconds(self) -> float:
         """A representation of the WebVTT Timestamp in seconds."""
         return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0
 
+    def __eq__(self, other: object) -> bool:
+        """Compare timestamps by their value in seconds; defer for other types."""
+        if isinstance(other, WebVTTTimestamp):
+            return self.seconds == other.seconds
+        return NotImplemented
+
+    def __lt__(self, other: "WebVTTTimestamp") -> bool:
+        """Order timestamps by their value in seconds; defer for other types."""
+        if isinstance(other, WebVTTTimestamp):
+            return self.seconds < other.seconds
+        return NotImplemented
+
     @override
     def __str__(self) -> str:
         """Return a string representation of a WebVTT timestamp."""
@@ -97,7 +111,7 @@ class WebVTTCueTimings(BaseModel):
     def check_order(self) -> Self:
         """Ensure start timestamp is less than end timestamp."""
         if self.start and self.end:
-            if self.end.seconds <= self.start.seconds:
+            if self.end <= self.start:
                 raise ValueError("End timestamp must be greater than start timestamp")
         return self
 
@@ -512,6 +526,24 @@ def verify_signature(content: str) -> bool:
         else:
             return False
 
+    @model_validator(mode="after")
+    def validate_start_time(self) -> Self:
+        """Check that the cue blocks are ordered by start time.
+
+        Each cue must start at or after the cue that directly precedes it; the
+        first adjacent pair found out of order raises a `ValueError`.
+        """
+        blocks = self.cue_blocks
+        for pos, (prev, curr) in enumerate(zip(blocks, blocks[1:]), start=1):
+            # `pos` is the zero-based index of `curr`, the later cue of the pair.
+            if curr.timings.start < prev.timings.start:
+                raise ValueError(
+                    f"The start time offset of block {pos} must be greater than or"
+                    " equal to the start time offsets of all previous cues in the file"
+                )
+
+        return self
+
     @classmethod
     def parse(cls, raw: str) -> "WebVTTFile":
         """Parse a WebVTT file.

From 10cdcdb564b4ba7aaf7c9177cad33a2b0666bd8b Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Tue, 16 Dec 2025 17:04:20 +0100
Subject: [PATCH 12/22] chore(webvtt): improve regex to remove
 note,region,style blocks

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py       | 11 ++++-------
 test/data/webvtt/webvtt_example_04.vtt | 20 ++++++++++++++++++++
 test/test_webvtt.py                    |  6 +++++-
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index bf5b7227..c4f7336f 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -1,7 +1,7 @@
 """Models for the Docling's adoption of Web Video Text Tracks format."""
 
-import logging
 import re
+import warnings
 from collections.abc import Iterator
 from enum import Enum
 from functools import total_ordering
@@ -11,9 +11,6 @@
 from pydantic.types import StringConstraints
 from typing_extensions import Self, override
 
-_log = logging.getLogger(__name__)
-
-
 _VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
 _ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);")
 _START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
@@ -512,6 +509,7 @@ def __str__(self):
 class WebVTTFile(BaseModel):
     """A model representing a WebVTT file."""
 
+    _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
     cue_blocks: list[WebVTTCueBlock]
 
     @staticmethod
@@ -566,8 +564,7 @@ def parse(cls, raw: str) -> "WebVTTFile":
         body = lines[1] if len(lines) > 1 else ""
 
         # Remove NOTE/STYLE/REGION blocks
-        body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
-        body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
+        body = re.sub(cls._pattern, "", body)
 
         # Split into cue blocks
         raw_blocks = re.split(r"\n\s*\n", body.strip())
@@ -576,7 +573,7 @@ def parse(cls, raw: str) -> "WebVTTFile":
             try:
                 cues.append(WebVTTCueBlock.parse(block))
             except ValueError as e:
-                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
+                warnings.warn(f"Failed to parse cue block:\n{block}\n{e}", RuntimeWarning)
 
         return cls(cue_blocks=cues)
 
diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt
index 91be3530..b0519be2 100644
--- a/test/data/webvtt/webvtt_example_04.vtt
+++ b/test/data/webvtt/webvtt_example_04.vtt
@@ -2,6 +2,26 @@ WEBVTT
 
 NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 
+STYLE
+::cue {
+    background-image: linear-gradient(to bottom, dimgray, lightgray);
+    color: papayawhip;
+}
+/* Style blocks cannot use blank lines nor "dash dash greater than" */
+
+REGION
+id:editor-comments
+width: 40%
+regionanchor:0%,100%
+viewportanchor:10%,90%
+
+REGION
+id:scroll
+width: 40%
+regionanchor:100%,100%
+viewportanchor:90%,90%
+scroll:up
+
 00:01.000 --> 00:04.000
 Never drink liquid nitrogen.
 
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 1bf9edb8..51f448ed 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -5,6 +5,8 @@
 Copyright © 2019 World Wide Web Consortium.
 """
 
+import warnings
+
 import pytest
 from pydantic import ValidationError
 
@@ -276,7 +278,9 @@ def test_webvtt_file() -> None:
 
     with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f:
         content = f.read()
-        vtt = WebVTTFile.parse(content)
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            vtt = WebVTTFile.parse(content)
     assert len(vtt) == 2
     block = vtt.cue_blocks[1]
     assert len(block.payload) == 5

From b0b5c53acc0cf35d003794dff148f7b50ceea590 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Tue, 16 Dec 2025 17:23:34 +0100
Subject: [PATCH 13/22] chore(webvtt): parse the WebVTT file title

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/webvtt.py       | 4 +++-
 test/data/webvtt/webvtt_example_04.vtt | 2 +-
 test/test_webvtt.py                    | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index c4f7336f..6b4eba1f 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -510,6 +510,7 @@ class WebVTTFile(BaseModel):
     """A model representing a WebVTT file."""
 
     _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
+    title: Optional[str] = None
     cue_blocks: list[WebVTTCueBlock]
 
     @staticmethod
@@ -561,6 +562,7 @@ def parse(cls, raw: str) -> "WebVTTFile":
 
         # Strip "WEBVTT" header line
         lines = raw.split("\n", 1)
+        title = lines[0].removeprefix("WEBVTT").strip() or None
         body = lines[1] if len(lines) > 1 else ""
 
         # Remove NOTE/STYLE/REGION blocks
@@ -575,7 +577,7 @@ def parse(cls, raw: str) -> "WebVTTFile":
             except ValueError as e:
                 warnings.warn(f"Failed to parse cue block:\n{block}\n{e}", RuntimeWarning)
 
-        return cls(cue_blocks=cues)
+        return cls(title=title, cue_blocks=cues)
 
     def __iter__(self):
         """Return an iterator over the cue blocks."""
diff --git a/test/data/webvtt/webvtt_example_04.vtt b/test/data/webvtt/webvtt_example_04.vtt
index b0519be2..78b5ba0c 100644
--- a/test/data/webvtt/webvtt_example_04.vtt
+++ b/test/data/webvtt/webvtt_example_04.vtt
@@ -1,4 +1,4 @@
-WEBVTT
+WEBVTT Danger of Nitrogen
 
 NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 51f448ed..a3443fd2 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -275,6 +275,7 @@ def test_webvtt_file() -> None:
     assert len(block.payload) == 1
     assert isinstance(block.payload[0].component, WebVTTCueTextSpan)
     assert block.payload[0].component.text == "Good."
+    assert not vtt.title
 
     with open("./test/data/webvtt/webvtt_example_04.vtt", encoding="utf-8") as f:
         content = f.read()
@@ -290,6 +291,7 @@ def test_webvtt_file() -> None:
         "— You could <b.loud>die</b>.\n"
         "<v John>This is true.</v>\n"
     )
+    assert vtt.title == "Danger of Nitrogen"
 
 
 def test_webvtt_cue_language_span_start_tag():

From 4cf5e8e60f3f300a76b660a0e452e75bac414a9b Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Tue, 6 Jan 2026 15:47:34 +0100
Subject: [PATCH 14/22] chore(webvtt): rebase to latest changes in idoctags

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/experimental/idoctags.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py
index dd19d7f0..7376062b 100644
--- a/docling_core/experimental/idoctags.py
+++ b/docling_core/experimental/idoctags.py
@@ -175,6 +175,8 @@ def _create_location_tokens_for_item(
         return ""
     out: list[str] = []
     for prov in item.prov:
+        if not isinstance(prov, ProvenanceItem):
+            continue
         page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
         bbox = prov.bbox.to_top_left_origin(page_h).as_tuple()
         out.append(_create_location_tokens_for_bbox(bbox=bbox, page_w=page_w, page_h=page_h, xres=xres, yres=yres))
@@ -1379,12 +1381,14 @@ def serialize(
             # we will need to do something more complex I believe ...
             res: list[SerializationResult] = []
             for idp, prov_ in enumerate(item.prov):
-                item_ = copy.deepcopy(item)
+                if not isinstance(prov_, ProvenanceItem):
+                    continue
+                item_: TextItem = copy.deepcopy(item)
                 item_.prov = [prov_]
                 item_.text = item.orig[prov_.charspan[0] : prov_.charspan[1]]  # it must be `orig`, not `text` here!
                 item_.orig = item.orig[prov_.charspan[0] : prov_.charspan[1]]
-
-                item_.prov[0].charspan = (0, len(item_.orig))
+                if isinstance(item_.prov[0], ProvenanceItem):
+                    item_.prov[0].charspan = (0, len(item_.orig))
 
                 # marker field should be cleared on subsequent split parts
                 if idp > 0 and isinstance(item_, ListItem):
@@ -1748,7 +1752,7 @@ def _emit_otsl(
 
         if params.add_table_cell_location:
             # Check if we have all required information for location serialization
-            if item.prov and len(item.prov) > 0:
+            if item.prov and isinstance(item.prov[0], ProvenanceItem):
                 page_no = item.prov[0].page_no
                 if doc.pages and page_no in doc.pages:
                     page_w, page_h = doc.pages[page_no].size.as_tuple()
@@ -1897,6 +1901,8 @@ def serialize(
             for it, _ in doc.iterate_items(root=item):
                 if isinstance(it, DocItem) and it.prov:
                     for prov in it.prov:
+                        if not isinstance(prov, ProvenanceItem):
+                            continue
                         page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
                         boxes.append(prov.bbox.to_top_left_origin(page_h).as_tuple())
                         prov_page_w_h = (page_w, page_h, prov.page_no)

From 993195e636eb9bb9c6c81b416ea84b64f07e4d8a Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 19 Jan 2026 18:54:07 +0100
Subject: [PATCH 15/22] feat(webvtt): add WebVTT serializer

Add a DoclingDocument serializer to WebVTT format.
Improve WebVTT data model.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/common.py |   3 +-
 docling_core/transforms/serializer/webvtt.py | 545 +++++++++++++++++++
 docling_core/types/doc/document.py           |   6 +-
 docling_core/types/doc/webvtt.py             | 150 ++++-
 docs/DoclingDocument.json                    |   6 +-
 test/data/doc/webvtt_example_01.gt.vtt       |  40 ++
 test/data/doc/webvtt_example_01.json         | 313 +++++++++++
 test/data/doc/webvtt_example_02.gt.vtt       |  16 +
 test/data/doc/webvtt_example_02.json         | 272 +++++++++
 test/data/doc/webvtt_example_03.gt.vtt       |  57 ++
 test/data/doc/webvtt_example_03.json         | 406 ++++++++++++++
 test/data/doc/webvtt_example_04.gt.vtt       |   9 +
 test/data/doc/webvtt_example_04.json         | 194 +++++++
 test/data/doc/webvtt_example_05.gt.vtt       |  10 +
 test/data/doc/webvtt_example_05.json         | 344 ++++++++++++
 test/test_serialization.py                   |  25 +
 test/test_webvtt.py                          |   7 +-
 17 files changed, 2372 insertions(+), 31 deletions(-)
 create mode 100644 docling_core/transforms/serializer/webvtt.py
 create mode 100644 test/data/doc/webvtt_example_01.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_01.json
 create mode 100644 test/data/doc/webvtt_example_02.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_02.json
 create mode 100644 test/data/doc/webvtt_example_03.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_03.json
 create mode 100644 test/data/doc/webvtt_example_04.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_04.json
 create mode 100644 test/data/doc/webvtt_example_05.gt.vtt
 create mode 100644 test/data/doc/webvtt_example_05.json

diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index c9c497f4..c36062e0 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -324,7 +324,7 @@ def serialize_doc(
         parts: list[SerializationResult],
         **kwargs: Any,
     ) -> SerializationResult:
-        """Serialize a document out of its pages."""
+        """Serialize a document out of its parts."""
         ...
 
     def _serialize_body(self, **kwargs) -> SerializationResult:
@@ -355,7 +355,6 @@ def serialize(
         empty_res = create_ser_result()
 
         my_item = item or self.doc.body
-
         if my_item == self.doc.body:
             if my_item.meta and not self._meta_is_wrapped():
                 meta_part = self.serialize_meta(item=my_item, **my_kwargs)
diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
new file mode 100644
index 00000000..15fdbc3b
--- /dev/null
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -0,0 +1,545 @@
+"""Define classes for WebVTT serialization."""
+
+import logging
+import re
+from typing import Any, Optional, get_args
+
+from pydantic import BaseModel
+from typing_extensions import override
+
+from docling_core.transforms.serializer.base import (
+    BaseAnnotationSerializer,
+    BaseDocSerializer,
+    BaseFallbackSerializer,
+    BaseFormSerializer,
+    BaseInlineSerializer,
+    BaseKeyValueSerializer,
+    BaseListSerializer,
+    BaseMetaSerializer,
+    BasePictureSerializer,
+    BaseTableSerializer,
+    BaseTextSerializer,
+    SerializationResult,
+)
+from docling_core.transforms.serializer.common import (
+    CommonParams,
+    DocSerializer,
+    create_ser_result,
+)
+from docling_core.types.doc.document import (
+    ContentLayer,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    Formatting,
+    FormItem,
+    InlineGroup,
+    KeyValueItem,
+    ListGroup,
+    NodeItem,
+    PictureItem,
+    ProvenanceTrack,
+    TableItem,
+    TextItem,
+    TitleItem,
+)
+from docling_core.types.doc.webvtt import (
+    START_TAG_NAMES,
+    WebVTTCueBlock,
+    WebVTTCueSpanStartTag,
+    WebVTTCueSpanStartTagAnnotated,
+    WebVTTCueTimings,
+    WebVTTFile,
+    WebVTTLineTerminator,
+    WebVTTTimestamp,
+)
+
+_logger = logging.getLogger(__name__)
+
+
+def _remove_consecutive_pairs(text: str) -> str:
+    """Collapse redundant end/start tag pairs in WebVTT cue text (single pass).
+
+    Two substitutions are applied, each once; callers repeat until a fixed point:
+    1. <tag>content</tag>whitespace<tag>: drop the inner </tag> and second <tag>
+       when both start tags carry the same classes and annotation (whitespace kept).
+    2. </tag><othertag><tag>: keep only <othertag> (classes/annotation unchecked).
+
+    Args:
+        text: Cue text that may contain b, c, i, u, v or lang tags.
+
+    Returns:
+        The text after one pass of both substitutions.
+    """
+    # Pattern 1: <tag>content</tag>, optional whitespace, then <tag> opened again
+    pattern1 = re.compile(
+        r"<([bciuv]|lang)((?:\.\w+)*)(?:\s+([^>]+))?>"  # Opening tag: capture tag, classes, annotation
+        r"((?:(?!</\1>).)*?)"  # Content up to the closing tag; "." stops at newlines (no re.DOTALL)
+        r"</\1>"  # Closing tag
+        r"(\s*)"  # Capture whitespace between tags (including newlines)
+        r"<\1((?:\.\w+)*)(?:\s+([^>]+))?>"  # Next opening tag: capture classes and annotation
+    )
+
+    def replacer1(match: re.Match[str]) -> str:
+        tag = match.group(1)
+        classes1 = match.group(2) or ""
+        anno1 = match.group(3) or ""
+        content = match.group(4)
+        whitespace = match.group(5)  # Whitespace between tags
+        classes2 = match.group(6) or ""
+        anno2 = match.group(7) or ""
+
+        # Only merge if classes and annotations match
+        if classes1 == classes2 and anno1 == anno2:
+            # Merge: rebuild the first opening tag, keep content and whitespace, drop the rest
+            return f"<{tag}{classes1}{' ' + anno1 if anno1 else ''}>{content}{whitespace}"
+        else:
+            # Don't merge - return the matched text unchanged
+            return match.group(0)
+
+    # Pattern 2: Tags with exactly one other tag in between </tag><othertag><tag>
+    # The reopened <tag> is dropped without comparing its classes or annotation
+    pattern2 = re.compile(
+        r"</([bciuv]|lang)>"  # Closing tag
+        r"(<[^>]+>)"  # Any single tag in between (start or end tag)
+        r"<\1(?:\.\w+)*(?:\s+[^>]+)?>"  # Same opening tag (with any classes/annotations)
+    )
+
+    def replacer2(match: re.Match[str]) -> str:
+        # Just keep the middle tag, remove the closing and opening of the same type
+        return match.group(2)
+
+    result = pattern1.sub(replacer1, text)
+    result = pattern2.sub(replacer2, result)
+
+    return result
+
+
+class WebVTTParams(CommonParams):
+    """Serialization parameters for the Web Video Text Tracks (WebVTT) format."""
+
+    layers: set[ContentLayer] = {ContentLayer.BODY}  # default is the body content layer only
+
+
+class WebVTTTextSerializer(BaseModel, BaseTextSerializer):
+    """Serializer of text items into WebVTT cue text."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TextItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        is_inline_scope: bool = False,
+        visited: Optional[set[str]] = None,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize a text item as WebVTT cue text, or verbatim if it is a title."""
+        # A title is not expected to carry cue provenance, yet its text is needed:
+        # emit it as is
+        if isinstance(item, TitleItem):
+            return create_ser_result(text=item.text, span_source=item)
+
+        # Everything else must be a WebVTT cue, i.e. have text and start with a
+        # ProvenanceTrack; otherwise the result is empty
+        if not (item.text and item.prov and isinstance(item.prov[0], ProvenanceTrack)):
+            return create_ser_result()
+
+        # Add the formatting, class, language and voice tags described by the first
+        # provenance entry
+        track: ProvenanceTrack = item.prov[0]
+        cue_text: str = doc_serializer.post_process(
+            text=item.text,
+            formatting=item.formatting,
+            voice=track.voice,
+            languages=track.languages,
+            classes=track.classes,
+        )
+
+        if is_inline_scope:
+            # Inside an inline group, collapse redundant tag pairs until a pass
+            # leaves the text unchanged
+            while (collapsed := _remove_consecutive_pairs(cue_text)) != cue_text:
+                cue_text = collapsed
+
+        return create_ser_result(text=cue_text, span_source=item)
+
+
+class _WebVTTTableSerializer(BaseTableSerializer):
+    """Table serializer that produces no output: tables are not represented in WebVTT."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # all arguments are deliberately ignored
+        return create_ser_result()
+
+
+class _WebVTTPictureSerializer(BasePictureSerializer):
+    """Picture serializer that produces no output: pictures are not represented in WebVTT."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: PictureItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # all arguments are deliberately ignored
+        return create_ser_result()
+
+
+class _WebVTTKeyValueSerializer(BaseKeyValueSerializer):
+    """Key-value serializer that produces no output: such items are not represented in WebVTT."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: KeyValueItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # all arguments are deliberately ignored
+        return create_ser_result()
+
+
+class _WebVTTFormSerializer(BaseFormSerializer):
+    """Form serializer that produces no output: forms are not represented in WebVTT."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: FormItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # all arguments are deliberately ignored
+        return create_ser_result()
+
+
+class _WebVTTFallbackSerializer(BaseFallbackSerializer):
+    """Fallback serializer that produces no output: such items are not represented in WebVTT."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del item, doc_serializer, doc, kwargs  # all arguments are deliberately ignored
+        return create_ser_result()
+
+
+class _WebVTTListSerializer(BaseModel, BaseListSerializer):
+    """List serializer that produces no output: lists are not represented in WebVTT."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: ListGroup,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        list_level: int = 0,
+        is_inline_scope: bool = False,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del doc, list_level, is_inline_scope, item, doc_serializer, kwargs  # deliberately ignored
+        return create_ser_result()
+
+
+class WebVTTInlineSerializer(BaseInlineSerializer):
+    """Serializer of inline groups into WebVTT cue text."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: InlineGroup,
+        doc_serializer: "BaseDocSerializer",
+        doc: DoclingDocument,
+        list_level: int = 0,
+        visited: Optional[set[str]] = None,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Join the serialized children of an inline group and tidy up their tags."""
+        del doc  # unused
+        if visited is None:
+            visited = set()
+        parts = doc_serializer.get_parts(
+            item=item,
+            list_level=list_level,
+            is_inline_scope=True,
+            visited=visited,
+            **kwargs,
+        )
+
+        # Only a text of None is left out, so that empty and whitespace-only texts
+        # still end up in the result (a truthiness test would drop them)
+        joined = "".join(part.text for part in parts if part.text is not None)
+
+        # Tag pairs can become redundant across part boundaries, so repeat the
+        # clean-up pass on the joined text until it no longer changes
+        while (collapsed := _remove_consecutive_pairs(joined)) != joined:
+            joined = collapsed
+
+        return create_ser_result(text=joined, span_source=parts)
+
+
+class _WebVTTMetaSerializer(BaseModel, BaseMetaSerializer):
+    """Meta serializer that produces no output: metadata is not represented in WebVTT."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del doc, item, kwargs  # all arguments are deliberately ignored
+        return create_ser_result()
+
+
+class _WebVTTAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
+    """Annotation serializer that produces no output: annotations are not represented in WebVTT."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: DocItem,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        del doc, item, kwargs  # all arguments are deliberately ignored
+        return create_ser_result()
+
+
+class WebVTTDocSerializer(DocSerializer):
+    """Document serializer to Web Video Text Tracks (WebVTT) format."""
+
+    text_serializer: BaseTextSerializer = WebVTTTextSerializer()
+    table_serializer: BaseTableSerializer = _WebVTTTableSerializer()  # no-op
+    picture_serializer: BasePictureSerializer = _WebVTTPictureSerializer()  # no-op
+    key_value_serializer: BaseKeyValueSerializer = _WebVTTKeyValueSerializer()  # no-op
+    form_serializer: BaseFormSerializer = _WebVTTFormSerializer()  # no-op
+    fallback_serializer: BaseFallbackSerializer = _WebVTTFallbackSerializer()  # no-op
+    list_serializer: BaseListSerializer = _WebVTTListSerializer()  # no-op
+    inline_serializer: BaseInlineSerializer = WebVTTInlineSerializer()
+    meta_serializer: Optional[BaseMetaSerializer] = _WebVTTMetaSerializer()  # no-op
+    annotation_serializer: BaseAnnotationSerializer = _WebVTTAnnotationSerializer()  # no-op
+
+    params: CommonParams = CommonParams()  # NOTE(review): WebVTTParams exists in this module but is not the default - confirm
+
+    @override
+    def requires_page_break(self) -> bool:
+        """Tell whether page breaks are to be added.
+
+        Always False: the WebVTT format does not support page breaks.
+        """
+        return False
+
+    @override
+    def serialize_bold(self, text: str, **kwargs: Any) -> str:
+        """Wrap the text in a WebVTT bold cue span (<b>).
+
+        The classes of the tag are read from the "b" entry of the `classes` keyword
+        argument, a mapping of tag name to class names; none are used if it is absent.
+        """
+        css_by_tag: dict[str, list[str]] = kwargs.get("classes", {})
+
+        return self.serialize_cue_span(text, tag="b", css=css_by_tag.get("b", []))
+
+    @override
+    def serialize_italic(self, text: str, **kwargs: Any) -> str:
+        """Wrap the text in a WebVTT italic cue span (<i>).
+
+        The classes of the tag are read from the "i" entry of the `classes` keyword
+        argument, a mapping of tag name to class names; none are used if it is absent.
+        """
+        css_by_tag: dict[str, list[str]] = kwargs.get("classes", {})
+
+        return self.serialize_cue_span(text, tag="i", css=css_by_tag.get("i", []))
+
+    @override
+    def serialize_underline(self, text: str, **kwargs: Any) -> str:
+        """Wrap the text in a WebVTT underline cue span (<u>).
+
+        The classes of the tag are read from the "u" entry of the `classes` keyword
+        argument, a mapping of tag name to class names; none are used if it is absent.
+        """
+        css_by_tag: dict[str, list[str]] = kwargs.get("classes", {})
+
+        return self.serialize_cue_span(text, tag="u", css=css_by_tag.get("u", []))
+
+    def serialize_cue_span(
+        self,
+        text: str,
+        tag: START_TAG_NAMES,
+        anno: Optional[str] = None,
+        css: Optional[list[str]] = None,
+    ) -> str:
+        """Wrap text in a WebVTT cue span; a `css` of None means no classes.
+
+        The text is returned as is for an unknown tag or a "v"/"lang" tag without annotation.
+        """
+        start_tag: WebVTTCueSpanStartTag
+        if tag in {"b", "i", "u", "c"}:
+            start_tag = WebVTTCueSpanStartTag(name=tag, classes=css or [])
+        elif tag in {"v", "lang"}:
+            if not anno:
+                _logger.warning(f"Invalid {tag} cue span without annotation: {text}")
+                return text
+            start_tag = WebVTTCueSpanStartTagAnnotated(name=tag, classes=css or [], annotation=anno)
+        else:
+            return text
+        return f"{start_tag}{text}</{tag}>"
+
+    @staticmethod
+    def _extract_classes(classes: list[str]) -> dict[str, list[str]]:
+        """Group provenance class entries by the cue span tag they belong to.
+
+        Args:
+            classes: The classes from a ProvenanceTrack object, each either a bare tag
+                name such as "b" or a tag name followed by dotted classes ("v.loud").
+
+        Returns:
+            Map of tag to class values. Entries whose tag is not a WebVTT start tag
+            name are left out; a later entry replaces an earlier one for the same tag.
+        """
+        known_tags = get_args(START_TAG_NAMES)
+        by_tag: dict[str, list[str]] = {}
+        for entry in classes or []:
+            # "tag.cls1.cls2" -> ("tag", ".", "cls1.cls2"); a bare "tag" has no dot
+            tag, dot, dotted_classes = entry.partition(".")
+            if tag not in known_tags:
+                continue
+            by_tag[tag] = dotted_classes.split(".") if dot else []
+        return by_tag
+
+    @override
+    def serialize_doc(
+        self,
+        *,
+        parts: list[SerializationResult],
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Assemble the serialized parts into the text of a WebVTT file.
+
+        A title part sets the file title. A cue part that has the same identifier and
+        timings as the cue block in progress is added to it as a new line; any other
+        cue part starts a new block. Parts of any other kind are skipped.
+        """
+        newline: str = WebVTTLineTerminator.LF.value
+        title: Optional[str] = None
+        cue_id: Optional[str] = None
+        timings: Optional[WebVTTCueTimings] = None
+        pending: str = ""  # raw text of the cue block in progress
+        cue_blocks: list[WebVTTCueBlock] = []
+
+        for part in parts:
+            if not (part.text and part.spans):
+                continue
+            # The item behind the first span decides how the part is handled
+            node: DocItem = part.spans[0].item
+            is_title_text = isinstance(node, TextItem) and node.label == DocItemLabel.TITLE
+            if isinstance(node, TitleItem) or is_title_text:
+                title = part.text
+                continue
+            # An inline group is represented by its first child
+            if isinstance(node, InlineGroup) and node.children:
+                node = node.children[0].resolve(doc=self.doc)
+            if not (isinstance(node, TextItem) and node.prov and isinstance(node.prov[0], ProvenanceTrack)):
+                continue
+
+            track: ProvenanceTrack = node.prov[0]
+            same_cue = (
+                track.identifier == cue_id
+                and timings
+                and timings.start.seconds == track.start_time
+                and timings.end.seconds == track.end_time
+            )
+            if same_cue:
+                # Add the part as a new line, then merge tags such as </v>\n<v Speaker A>
+                # by repeating the clean-up pass until the text is stable
+                merged = pending.rstrip() + newline + part.text
+                while (collapsed := _remove_consecutive_pairs(merged)) != merged:
+                    merged = collapsed
+                pending = merged + newline
+                continue
+
+            # A new cue begins: store the block in progress and open the next one
+            if pending:
+                cue_blocks.append(WebVTTCueBlock.parse(pending))
+            timings = WebVTTCueTimings(
+                start=WebVTTTimestamp.from_seconds(track.start_time),
+                end=WebVTTTimestamp.from_seconds(track.end_time),
+            )
+            cue_id = track.identifier
+            header = f"{cue_id}{newline}{timings}" if cue_id else f"{timings}"
+            pending = f"{header}{newline}{part.text}{newline}"
+
+        if pending:
+            cue_blocks.append(WebVTTCueBlock.parse(pending))
+        return create_ser_result(text=str(WebVTTFile(title=title, cue_blocks=cue_blocks)), span_source=parts)
+
+    def post_process(
+        self,
+        text: str,
+        formatting: Optional[Formatting] = None,
+        voice: Optional[str] = None,
+        languages: Optional[list[str]] = None,
+        classes: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> str:
+        """Wrap the text in WebVTT cue span tags.
+
+        Together with `DocSerializer.post_process`, which is called for the formatting
+        step, the tags are nested in this order from the innermost to the outermost:
+            1. language (<lang>), one span per entry of `languages`
+            2. underline (<u>)
+            3. italic (<i>)
+            4. bold (<b>)
+            5. class (<c>), only if `classes` has an entry for the "c" tag
+            6. voice (<v>), only if `voice` is set
+        """
+        css_by_tag: dict[str, list[str]] = {}
+        if classes:
+            css_by_tag = self._extract_classes(classes)
+
+        wrapped: str = text
+        for language in languages or []:
+            wrapped = self.serialize_cue_span(
+                text=wrapped,
+                tag="lang",
+                anno=language,
+                css=css_by_tag.get("lang", []),
+            )
+
+        # Formatting is delegated to the base implementation, together with the per-tag classes
+        wrapped = super().post_process(text=wrapped, formatting=formatting, classes=css_by_tag)
+
+        if "c" in css_by_tag:
+            wrapped = self.serialize_cue_span(text=wrapped, tag="c", css=css_by_tag["c"])
+        if voice:
+            voice_css = css_by_tag.get("v", [])
+            wrapped = self.serialize_cue_span(text=wrapped, tag="v", anno=voice, css=voice_css)
+
+        return wrapped
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index dd911ab6..3d3f29d1 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1238,7 +1238,7 @@ class ProvenanceTrack(BaseModel):
     classes: Optional[list[str]] = Field(
         None,
         min_length=1,
-        examples=["first", "loud", "yellow"],
+        examples=["b.first", "v.loud", "c.yellow"],
         description="Classes for describing the cue significance",
     )
 
@@ -1250,7 +1250,7 @@ def check_order(self) -> Self:
         return self
 
 
-def get_provenance_discriminator_value(v: Any) -> str:
+def _get_provenance_discriminator_value(v: Any) -> str:
     """Callable discriminator for provenance instances.
 
     Args:
@@ -1267,7 +1267,7 @@ def get_provenance_discriminator_value(v: Any) -> str:
 
 ProvenanceType = Annotated[
     Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]],
-    Discriminator(get_provenance_discriminator_value),
+    Discriminator(_get_provenance_discriminator_value),
 ]
 
 
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 6b4eba1f..6bc4a219 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -13,7 +13,7 @@
 
 _VALID_ENTITIES: set = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
 _ENTITY_PATTERN: re.Pattern = re.compile(r"&([a-zA-Z0-9]+);")
-_START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
+START_TAG_NAMES = Literal["c", "b", "i", "u", "v", "lang"]
 
 
 class WebVTTLineTerminator(str, Enum):
@@ -80,6 +80,23 @@ def seconds(self) -> float:
         """A representation of the WebVTT Timestamp in seconds."""
         return self._hours * 3600 + self._minutes * 60 + self._seconds + self._millis / 1000.0
 
+    @classmethod
+    def from_seconds(cls, seconds: float) -> Self:
+        """Create a WebVTT timestamp from seconds.
+
+        Args:
+            seconds: The time in seconds (can include fractional seconds for milliseconds).
+
+        Returns:
+            A WebVTT timestamp instance, rounded to the nearest millisecond.
+        """
+        # Round once on the whole value so that e.g. 1.9996 carries over to 00:00:02.000
+        total_millis: int = round(seconds * 1000)
+        hours, remainder = divmod(total_millis, 3_600_000)
+        minutes, remainder = divmod(remainder, 60_000)
+        secs, millis = divmod(remainder, 1000)
+        return cls(raw=f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}")
+
     def __eq__(self, other: object) -> bool:
         """Two timestamps are equal if their total number of seconds is equal."""
         if not isinstance(other, WebVTTTimestamp):
@@ -92,9 +109,27 @@ def __lt__(self, other: "WebVTTTimestamp") -> bool:
             return NotImplemented
         return self.seconds < other.seconds
 
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Render the timestamp, optionally in the short form without hours.
+
+        Args:
+            omit_hours_if_zero: Whether to leave out the hours field when it is 0.
+
+        Returns:
+            "MM:SS.mmm" when the hours are left out, else the unmodified `raw` string.
+        """
+        if not omit_hours_if_zero or self._hours != 0:
+            return self.raw
+        return f"{self._minutes:02d}:{self._seconds:02d}.{self._millis:03d}"
+
     @override
     def __str__(self) -> str:
-        """Return a string representation of a WebVTT timestamp."""
+        """Return a string representation of a WebVTT timestamp.
+
+        Always returns the full timestamp format including hours (HH:MM:SS.mmm),
+        even when hours are zero. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation (MM:SS.mmm) when hours are zero.
+        """
         return self.raw
 
 
@@ -112,9 +147,27 @@ def check_order(self) -> Self:
                 raise ValueError("End timestamp must be greater than start timestamp")
         return self
 
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Render the cue timings as "start --> end".
+
+        Args:
+            omit_hours_if_zero: Forwarded to the `format` method of both timestamps, each
+                of which leaves out its own hours field when that field is 0.
+
+        Returns:
+            The two formatted timestamps joined by " --> ".
+        """
+        rendered = [ts.format(omit_hours_if_zero=omit_hours_if_zero) for ts in (self.start, self.end)]
+        return " --> ".join(rendered)
+
     @override
-    def __str__(self):
-        """Return a string representation of the cue timings."""
+    def __str__(self) -> str:
+        """Return a string representation of the cue timings.
+
+        Always returns the full format including hours (HH:MM:SS.mmm --> HH:MM:SS.mmm),
+        even when hours are zero. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation when hours are zero.
+        """
         return f"{self.start} --> {self.end}"
 
 
@@ -142,7 +195,7 @@ def is_valid_text(cls, value: str) -> str:
         return value
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue text span."""
         return self.text
 
@@ -154,7 +207,7 @@ class WebVTTCueComponentWithTerminator(BaseModel):
     terminator: Optional[WebVTTLineTerminator] = None
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue component with terminator."""
         return f"{self.component}{self.terminator.value if self.terminator else ''}"
 
@@ -169,7 +222,7 @@ class WebVTTCueInternalText(BaseModel):
     ] = []
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue internal text."""
         cue_str = f"{self.terminator.value if self.terminator else ''}{''.join(str(span) for span in self.components)}"
         return cue_str
@@ -178,7 +231,7 @@ def __str__(self):
 class WebVTTCueSpanStartTag(BaseModel):
     """WebVTT cue span start tag."""
 
-    name: Annotated[_START_TAG_NAMES, Field(description="The tag name")]
+    name: Annotated[START_TAG_NAMES, Field(description="The tag name")]
     classes: Annotated[
         list[str],
         Field(description="List of classes representing the cue span's significance"),
@@ -200,7 +253,7 @@ def _get_name_with_classes(self) -> str:
         return f"{self.name}.{'.'.join(self.classes)}" if self.classes else self.name
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue span start tag."""
         return f"<{self._get_name_with_classes()}>"
 
@@ -228,7 +281,7 @@ def is_valid_annotation(cls, value: str) -> str:
         return value
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue span start tag."""
         return f"<{self._get_name_with_classes()} {self.annotation}>"
 
@@ -270,7 +323,7 @@ def check_tag_names_match(self) -> Self:
         return self
 
     @override
-    def __str__(self):
+    def __str__(self) -> str:
         """Return a string representation of the cue component."""
         return f"{self.start_tag}{self.internal_text}</{self.start_tag.name}>"
 
@@ -391,7 +444,7 @@ def _create_text_components(
                 )
 
     @classmethod
-    def parse(cls, raw: str) -> "WebVTTCueBlock":
+    def parse(cls, raw: str) -> Self:
         """Parse a WebVTT cue block from a string.
 
         Args:
@@ -489,29 +542,50 @@ def parse(cls, raw: str) -> "WebVTTCueBlock":
             payload=stack[0],
         )
 
-    def __str__(self):
-        """Return a string representation of the WebVTT cue block."""
+    def format(self, omit_hours_if_zero: bool = False, omit_voice_end: bool = False) -> str:
+        """Format the WebVTT cue block as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours when they are 0 in the timings.
+            omit_voice_end: If True and this cue block has a WebVTT cue voice span as
+                its only component, omit the voice end tag for brevity.
+
+        Returns:
+            Formatted cue block string.
+        """
         parts = []
         if self.identifier:
             parts.append(f"{self.identifier}\n")
-        timings_line = str(self.timings)
+        timings_line = self.timings.format(omit_hours_if_zero=omit_hours_if_zero)
         parts.append(timings_line + "\n")
         for idx, span in enumerate(self.payload):
-            if idx == 0 and len(self.payload) == 1 and span.component.kind == "v":
-                # the end tag may be omitted for brevity
+            if omit_voice_end and idx == 0 and len(self.payload) == 1 and span.component.kind == "v":
                 parts.append(str(span).removesuffix("</v>"))
             else:
                 parts.append(str(span))
 
         return "".join(parts) + "\n"
 
+    def __str__(self) -> str:
+        """Return a string representation of the WebVTT cue block.
+
+        Always returns the full format including hours in timestamps (HH:MM:SS.mmm),
+        even when hours are zero. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation when hours are zero.
+        Always returns the WebVTT cue voice spans with the voice end tag, even if this
+        cue block has a WebVTT cue voice span as a single component in the payload. Use
+        `format(omit_voice_end=True)` to get a shorter representation without the voice
+        end tag.
+        """
+        return self.format()
+
 
 class WebVTTFile(BaseModel):
     """A model representing a WebVTT file."""
 
     _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
-    title: Optional[str] = None
     cue_blocks: list[WebVTTCueBlock]
+    title: Optional[str] = None
 
     @staticmethod
     def verify_signature(content: str) -> bool:
@@ -544,7 +618,7 @@ def validate_start_time(self) -> Self:
         return self
 
     @classmethod
-    def parse(cls, raw: str) -> "WebVTTFile":
+    def parse(cls, raw: str) -> Self:
         """Parse a WebVTT file.
 
         Args:
@@ -579,14 +653,46 @@ def parse(cls, raw: str) -> "WebVTTFile":
 
         return cls(title=title, cue_blocks=cues)
 
-    def __iter__(self):
+    def __iter__(self) -> Iterator[WebVTTCueBlock]:  # type: ignore[override]
         """Return an iterator over the cue blocks."""
         return iter(self.cue_blocks)
 
-    def __getitem__(self, idx):
+    def __getitem__(self, idx) -> WebVTTCueBlock:
         """Return the cue block at the given index."""
         return self.cue_blocks[idx]
 
-    def __len__(self):
+    def __len__(self) -> int:
         """Return the number of cue blocks."""
         return len(self.cue_blocks)
+
+    def format(self, omit_hours_if_zero: bool = False) -> str:
+        """Format the WebVTT file as a string.
+
+        Args:
+            omit_hours_if_zero: If True, omit hours when they are 0 in the timings.
+
+        Returns:
+            Formatted WebVTT file string.
+        """
+        parts: list[str] = []
+
+        if self.title:
+            parts.append(f"WEBVTT {self.title}\n")
+        else:
+            parts.append("WEBVTT\n")
+
+        for cue_block in self.cue_blocks:
+            parts.append("\n")
+            parts.append(cue_block.format(omit_hours_if_zero=omit_hours_if_zero))
+
+        # Remove the trailing newline from the last cue block
+        return "".join(parts).rstrip("\n")
+
+    def __str__(self) -> str:
+        """Return a string representation of the WebVTT file.
+
+        Always returns the full format including hours in timestamps (HH:MM:SS.mmm),
+        even when hours are zero. Use `format(omit_hours_if_zero=True)` to get
+        a shorter representation when hours are zero.
+        """
+        return self.format()
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index 45a5d889..cea39ba5 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -2324,9 +2324,9 @@
           "default": null,
           "description": "Classes for describing the cue significance",
           "examples": [
-            "first",
-            "loud",
-            "yellow"
+            "b.first",
+            "v.loud",
+            "c.yellow"
           ],
           "title": "Classes"
         }
diff --git a/test/data/doc/webvtt_example_01.gt.vtt b/test/data/doc/webvtt_example_01.gt.vtt
new file mode 100644
index 00000000..cad1c72a
--- /dev/null
+++ b/test/data/doc/webvtt_example_01.gt.vtt
@@ -0,0 +1,40 @@
+WEBVTT
+
+00:00:11.000 --> 00:00:13.000
+<v Roger Bingham>We are in New York City</v>
+
+00:00:13.000 --> 00:00:16.000
+<v Roger Bingham>We’re actually at the Lucern Hotel, just down the street</v>
+
+00:00:16.000 --> 00:00:18.000
+<v Roger Bingham>from the American Museum of Natural History</v>
+
+00:00:18.000 --> 00:00:20.000
+<v Roger Bingham>And with me is Neil deGrasse Tyson</v>
+
+00:00:20.000 --> 00:00:22.000
+<v Roger Bingham>Astrophysicist, Director of the Hayden Planetarium</v>
+
+00:00:22.000 --> 00:00:24.000
+<v Roger Bingham>at the AMNH.</v>
+
+00:00:24.000 --> 00:00:26.000
+<v Roger Bingham>Thank you for walking down here.</v>
+
+00:00:27.000 --> 00:00:30.000
+<v Roger Bingham>And I want to do a follow-up on the last conversation we did.</v>
+
+00:00:30.000 --> 00:00:31.500
+<v Roger Bingham>When we e-mailed—</v>
+
+00:00:30.500 --> 00:00:32.500
+<v Neil deGrasse Tyson>Didn’t we talk about enough in that conversation?</v>
+
+00:00:32.000 --> 00:00:35.500
+<v Roger Bingham>No! No no no no; 'cos 'cos obviously 'cos</v>
+
+00:00:32.500 --> 00:00:33.500
+<v Neil deGrasse Tyson><i>Laughs</i></v>
+
+00:00:35.500 --> 00:00:38.000
+<v Roger Bingham>You know I’m so excited my glasses are falling off here.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json
new file mode 100644
index 00000000..5a7c9d29
--- /dev/null
+++ b/test/data/doc/webvtt_example_01.json
@@ -0,0 +1,313 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_01",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 16887312431371817791,
+    "filename": "webvtt_example_01.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/texts/3"
+      },
+      {
+        "$ref": "#/texts/4"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/texts/6"
+      },
+      {
+        "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
+      },
+      {
+        "$ref": "#/texts/9"
+      },
+      {
+        "$ref": "#/texts/10"
+      },
+      {
+        "$ref": "#/texts/11"
+      },
+      {
+        "$ref": "#/texts/12"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 11.0,
+          "end_time": 13.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "We are in New York City",
+      "text": "We are in New York City"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 13.0,
+          "end_time": 16.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "We’re actually at the Lucern Hotel, just down the street",
+      "text": "We’re actually at the Lucern Hotel, just down the street"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 16.0,
+          "end_time": 18.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "from the American Museum of Natural History",
+      "text": "from the American Museum of Natural History"
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 18.0,
+          "end_time": 20.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "And with me is Neil deGrasse Tyson",
+      "text": "And with me is Neil deGrasse Tyson"
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 20.0,
+          "end_time": 22.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "Astrophysicist, Director of the Hayden Planetarium",
+      "text": "Astrophysicist, Director of the Hayden Planetarium"
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 22.0,
+          "end_time": 24.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "at the AMNH.",
+      "text": "at the AMNH."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 24.0,
+          "end_time": 26.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "Thank you for walking down here.",
+      "text": "Thank you for walking down here."
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 27.0,
+          "end_time": 30.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "And I want to do a follow-up on the last conversation we did.",
+      "text": "And I want to do a follow-up on the last conversation we did."
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 30.0,
+          "end_time": 31.5,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "When we e-mailed—",
+      "text": "When we e-mailed—"
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 30.5,
+          "end_time": 32.5,
+          "voice": "Neil deGrasse Tyson"
+        }
+      ],
+      "orig": "Didn’t we talk about enough in that conversation?",
+      "text": "Didn’t we talk about enough in that conversation?"
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 32.0,
+          "end_time": 35.5,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "No! No no no no; 'cos 'cos obviously 'cos",
+      "text": "No! No no no no; 'cos 'cos obviously 'cos"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 32.5,
+          "end_time": 33.5,
+          "voice": "Neil deGrasse Tyson"
+        }
+      ],
+      "orig": "Laughs",
+      "text": "Laughs",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 35.5,
+          "end_time": 38.0,
+          "voice": "Roger Bingham"
+        }
+      ],
+      "orig": "You know I’m so excited my glasses are falling off here.",
+      "text": "You know I’m so excited my glasses are falling off here."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_02.gt.vtt b/test/data/doc/webvtt_example_02.gt.vtt
new file mode 100644
index 00000000..8f9811e7
--- /dev/null
+++ b/test/data/doc/webvtt_example_02.gt.vtt
@@ -0,0 +1,16 @@
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+<v.first.loud Esme>It’s a blue apple tree!</v>
+
+00:00:02.000 --> 00:00:04.000
+<v Mary>No way!</v>
+
+00:00:04.000 --> 00:00:06.000
+<v Esme>Hee!</v> <i>laughter</i>
+
+00:00:06.000 --> 00:00:08.000
+<v.loud Mary>That’s awesome!</v>
+
+00:00:08.000 --> 00:00:10.000
+Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json
new file mode 100644
index 00000000..2966a2e0
--- /dev/null
+++ b/test/data/doc/webvtt_example_02.json
@@ -0,0 +1,272 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_02",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 8584853280299071027,
+    "filename": "webvtt_example_02.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/groups/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 0.0,
+          "end_time": 2.0,
+          "voice": "Esme",
+          "classes": [
+            "v.first.loud"
+          ]
+        }
+      ],
+      "orig": "It\u2019s a blue apple tree!",
+      "text": "It\u2019s a blue apple tree!"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 2.0,
+          "end_time": 4.0,
+          "voice": "Mary"
+        }
+      ],
+      "orig": "No way!",
+      "text": "No way!"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0,
+          "voice": "Esme"
+        }
+      ],
+      "orig": "Hee!",
+      "text": "Hee!"
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": " ",
+      "text": " "
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": "laughter",
+      "text": "laughter",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 6.0,
+          "end_time": 8.0,
+          "voice": "Mary",
+          "classes": [
+            "v.loud"
+          ]
+        }
+      ],
+      "orig": "That\u2019s awesome!",
+      "text": "That\u2019s awesome!"
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": "Sur les ",
+      "text": "Sur les "
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0,
+          "languages": [
+            "en"
+          ],
+          "classes": [
+            "i.foreignphrase"
+          ]
+        }
+      ],
+      "orig": "playground",
+      "text": "playground",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": ", ici \u00e0 Montpellier",
+      "text": ", ici \u00e0 Montpellier"
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_03.gt.vtt b/test/data/doc/webvtt_example_03.gt.vtt
new file mode 100644
index 00000000..a4dc1291
--- /dev/null
+++ b/test/data/doc/webvtt_example_03.gt.vtt
@@ -0,0 +1,57 @@
+WEBVTT
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+00:00:04.963 --> 00:00:08.571
+<v Speaker A>OK,
+I think now we should be recording</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
+00:00:08.571 --> 00:00:09.403
+<v Speaker A>properly.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
+00:00:10.683 --> 00:00:11.563
+Good.
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
+00:00:13.363 --> 00:00:13.803
+<v Speaker A>Yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
+00:00:49.603 --> 00:00:53.363
+<v Speaker B>I was also thinking.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
+00:00:54.963 --> 00:01:02.072
+<v Speaker B>Would be maybe good to create items,</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
+00:01:02.072 --> 00:01:06.811
+<v Speaker B>some metadata,
+some options that can be specific.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
+00:01:10.243 --> 00:01:13.014
+<v Speaker A>Yeah,
+I mean I think you went even more than</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
+00:01:10.563 --> 00:01:12.643
+<v Speaker B>But we preserved the atoms.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+00:01:13.014 --> 00:01:15.907
+<v Speaker A>than me.
+I just opened the format.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+00:01:50.222 --> 00:01:51.643
+<v Speaker A>give it a try, yeah.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+00:01:52.043 --> 00:01:55.043
+<v Speaker B>Okay, talk to you later.</v>
+
+62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+00:01:54.603 --> 00:01:55.283
+<v Speaker A>See you.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json
new file mode 100644
index 00000000..dddce0f2
--- /dev/null
+++ b/test/data/doc/webvtt_example_03.json
@@ -0,0 +1,406 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_03",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 11620880316586573676,
+    "filename": "webvtt_example_03.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/texts/3"
+      },
+      {
+        "$ref": "#/texts/4"
+      },
+      {
+        "$ref": "#/texts/5"
+      },
+      {
+        "$ref": "#/texts/6"
+      },
+      {
+        "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
+      },
+      {
+        "$ref": "#/texts/9"
+      },
+      {
+        "$ref": "#/texts/10"
+      },
+      {
+        "$ref": "#/texts/11"
+      },
+      {
+        "$ref": "#/texts/12"
+      },
+      {
+        "$ref": "#/texts/13"
+      },
+      {
+        "$ref": "#/texts/14"
+      },
+      {
+        "$ref": "#/texts/15"
+      },
+      {
+        "$ref": "#/texts/16"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "OK,",
+      "text": "OK,"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "I think now we should be recording",
+      "text": "I think now we should be recording"
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 8.571,
+          "end_time": 9.403,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "properly.",
+      "text": "properly."
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 10.683,
+          "end_time": 11.563,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
+        }
+      ],
+      "orig": "Good.",
+      "text": "Good."
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 13.363,
+          "end_time": 13.803,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "Yeah.",
+      "text": "Yeah."
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 49.603,
+          "end_time": 53.363,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "I was also thinking.",
+      "text": "I was also thinking."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 54.963,
+          "end_time": 62.072,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "Would be maybe good to create items,",
+      "text": "Would be maybe good to create items,"
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "some metadata,",
+      "text": "some metadata,"
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "some options that can be specific.",
+      "text": "some options that can be specific."
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "Yeah,",
+      "text": "Yeah,"
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "I mean I think you went even more than",
+      "text": "I mean I think you went even more than"
+    },
+    {
+      "self_ref": "#/texts/11",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 70.563,
+          "end_time": 72.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "But we preserved the atoms.",
+      "text": "But we preserved the atoms."
+    },
+    {
+      "self_ref": "#/texts/12",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "than me.",
+      "text": "than me."
+    },
+    {
+      "self_ref": "#/texts/13",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "I just opened the format.",
+      "text": "I just opened the format."
+    },
+    {
+      "self_ref": "#/texts/14",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 110.222,
+          "end_time": 111.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "give it a try, yeah.",
+      "text": "give it a try, yeah."
+    },
+    {
+      "self_ref": "#/texts/15",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 112.043,
+          "end_time": 115.043,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "Okay, talk to you later.",
+      "text": "Okay, talk to you later."
+    },
+    {
+      "self_ref": "#/texts/16",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 114.603,
+          "end_time": 115.283,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "See you.",
+      "text": "See you."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_04.gt.vtt b/test/data/doc/webvtt_example_04.gt.vtt
new file mode 100644
index 00000000..ce7fcf65
--- /dev/null
+++ b/test/data/doc/webvtt_example_04.gt.vtt
@@ -0,0 +1,9 @@
+WEBVTT Danger of Nitrogen
+
+00:00:01.000 --> 00:00:04.000
+Never drink liquid nitrogen.
+
+00:00:05.000 --> 00:00:09.000
+— It will perforate your stomach.
+— You could <b.loud>die</b>.
+<v John>This is true.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json
new file mode 100644
index 00000000..f96765fc
--- /dev/null
+++ b/test/data/doc/webvtt_example_04.json
@@ -0,0 +1,194 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_04",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 11822397499369478441,
+    "filename": "webvtt_example_04.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/texts/6"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        },
+        {
+          "$ref": "#/texts/5"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "title",
+      "prov": [],
+      "orig": "Danger of Nitrogen",
+      "text": "Danger of Nitrogen"
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 1.0,
+          "end_time": 4.0
+        }
+      ],
+      "orig": "Never drink liquid nitrogen.",
+      "text": "Never drink liquid nitrogen."
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": "\u2014 It will perforate your stomach.",
+      "text": "\u2014 It will perforate your stomach."
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": "\u2014 You could ",
+      "text": "\u2014 You could "
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0,
+          "classes": [
+            "b.loud"
+          ]
+        }
+      ],
+      "orig": "die",
+      "text": "die",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0
+        }
+      ],
+      "orig": ".",
+      "text": "."
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 5.0,
+          "end_time": 9.0,
+          "voice": "John"
+        }
+      ],
+      "orig": "This is true.",
+      "text": "This is true."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_05.gt.vtt b/test/data/doc/webvtt_example_05.gt.vtt
new file mode 100644
index 00000000..fd7b788c
--- /dev/null
+++ b/test/data/doc/webvtt_example_05.gt.vtt
@@ -0,0 +1,10 @@
+WEBVTT
+
+agcvs-08234
+04:03:00.000 --> 04:06:00.000
+Last night the chef surprised us with a culinary adventure.
+
+agcvs-08234
+04:06:00.000 --> 04:06:58.239
+The waiter offered a <i>steaming bowl of <lang es-ES>paella</lang></i> that instantly transported the diners to a sunny Mediterranean coast.
+The dessert’s <i><b.loud>unexpected</b> <u><lang it>arcobaleno</lang></u> of flavors</i> left everyone in awe.
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json
new file mode 100644
index 00000000..616c94fc
--- /dev/null
+++ b/test/data/doc/webvtt_example_05.json
@@ -0,0 +1,344 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_04",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 5389775195091554844,
+    "filename": "webvtt_example_04.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/groups/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/1"
+        },
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/5"
+        },
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        },
+        {
+          "$ref": "#/texts/9"
+        },
+        {
+          "$ref": "#/texts/10"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14580.0,
+          "end_time": 14760.0,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "Last night the chef surprised us with a culinary adventure.",
+      "text": "Last night the chef surprised us with a culinary adventure."
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The waiter offered a ",
+      "text": "The waiter offered a "
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "steaming bowl of ",
+      "text": "steaming bowl of ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "languages": [
+            "es-ES"
+          ]
+        }
+      ],
+      "orig": "paella",
+      "text": "paella",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " that instantly transported the diners to a sunny Mediterranean coast.",
+      "text": " that instantly transported the diners to a sunny Mediterranean coast."
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The dessert\u2019s ",
+      "text": "The dessert\u2019s "
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "classes": [
+            "b.loud"
+          ]
+        }
+      ],
+      "orig": "unexpected",
+      "text": "unexpected",
+      "formatting": {
+        "bold": true,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " ",
+      "text": " ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "languages": [
+            "it"
+          ]
+        }
+      ],
+      "orig": "arcobaleno",
+      "text": "arcobaleno",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": true,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " of flavors",
+      "text": " of flavors",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " left everyone in awe.",
+      "text": " left everyone in awe."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 6fe3b386..fd68a347 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -15,6 +15,7 @@
     MarkdownParams,
     OrigListItemMarkerMode,
 )
+from docling_core.transforms.serializer.webvtt import WebVTTDocSerializer
 from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
@@ -563,3 +564,27 @@ def test_html_inline_and_formatting():
     ser = HTMLDocSerializer(doc=doc)
     actual = ser.serialize().text
     verify(exp_file=src.with_suffix(".gt.html"), actual=actual)
+
+
+# ===============================
+# WebVTT tests
+# ===============================
+
+
+@pytest.mark.parametrize(
+    "file_name",
+    [
+        "webvtt_example_01",
+        "webvtt_example_02",
+        "webvtt_example_03",
+        "webvtt_example_04",
+        "webvtt_example_05",
+    ],
+)
+def test_webvtt(file_name):
+    src = Path(f"./test/data/doc/{file_name}.json")
+    doc = DoclingDocument.load_from_json(src)
+
+    ser = WebVTTDocSerializer(doc=doc)
+    actual = ser.serialize().text
+    verify(exp_file=src.with_suffix(".gt.vtt"), actual=actual)
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index a3443fd2..938da37c 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -255,7 +255,12 @@ def test_webvtt_file() -> None:
         "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
         "https://www.w3.org/TR/webvtt1/\n\n"
     )
-    reverse += "\n".join([str(block) for block in vtt.cue_blocks])
+    reverse += "\n".join(
+        [
+            block.format(omit_hours_if_zero=True, omit_voice_end=True)
+            for block in vtt.cue_blocks
+        ]
+    )
     assert content == reverse.rstrip()
 
     with open("./test/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:

From c2f6350e4dbb68414e29834666b3665df9cf12a9 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 19 Jan 2026 20:04:17 +0100
Subject: [PATCH 16/22] fix(webvtt): add 'text/vtt' as extra mimetype

Add 'text/vtt' as an extra MIME type to support WebVTT serialization, since the
standard-library 'mimetypes' module does not recognize it on Python < 3.11.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/types/doc/document.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 3d3f29d1..720b0967 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -990,6 +990,7 @@ class DocumentOrigin(BaseModel):
         "text/asciidoc",
         "text/markdown",
         "text/csv",
+        "text/vtt",
         "audio/x-wav",
         "audio/wav",
         "audio/mp3",

From fafdf3ffb51499a0d04427d87a220201b9a1e924 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Thu, 22 Jan 2026 18:21:22 +0100
Subject: [PATCH 17/22] refactor(webvtt): roll back DocItem.prov as list of
 ProvenanceItem

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/experimental/idoctags.py         |  14 +-
 docling_core/transforms/serializer/azure.py   |  11 +-
 docling_core/transforms/serializer/common.py  |  15 +-
 docling_core/transforms/serializer/doctags.py |   6 +-
 docling_core/transforms/serializer/webvtt.py  |  14 +-
 .../visualizer/key_value_visualizer.py        |   5 +-
 .../visualizer/layout_visualizer.py           |   3 +-
 .../visualizer/reading_order_visualizer.py    |   3 +-
 .../transforms/visualizer/table_visualizer.py |  11 +-
 docling_core/types/doc/__init__.py            |   4 +-
 docling_core/types/doc/document.py            | 174 ++++---
 docling_core/types/doc/webvtt.py              |   2 +-
 docling_core/utils/legacy.py                  |   3 -
 docs/DoclingDocument.json                     | 467 +++++++++++-------
 test/data/doc/webvtt_example_01.json          |  39 +-
 test/data/doc/webvtt_example_02.json          |  27 +-
 test/data/doc/webvtt_example_03.json          |  51 +-
 test/data/doc/webvtt_example_04.json          |  18 +-
 test/data/doc/webvtt_example_05.json          |  33 +-
 test/test_deserializer_idoctags.py            |   4 +-
 test/test_doc_base.py                         |  12 +-
 test/test_serialization_doctag.py             |   3 +-
 test/test_serialization_idoctag.py            |  22 +-
 test/test_webvtt.py                           |   1 -
 24 files changed, 553 insertions(+), 389 deletions(-)

diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py
index 7376062b..dd19d7f0 100644
--- a/docling_core/experimental/idoctags.py
+++ b/docling_core/experimental/idoctags.py
@@ -175,8 +175,6 @@ def _create_location_tokens_for_item(
         return ""
     out: list[str] = []
     for prov in item.prov:
-        if not isinstance(prov, ProvenanceItem):
-            continue
         page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
         bbox = prov.bbox.to_top_left_origin(page_h).as_tuple()
         out.append(_create_location_tokens_for_bbox(bbox=bbox, page_w=page_w, page_h=page_h, xres=xres, yres=yres))
@@ -1381,14 +1379,12 @@ def serialize(
             # we will need to do something more complex I believe ...
             res: list[SerializationResult] = []
             for idp, prov_ in enumerate(item.prov):
-                if not isinstance(prov_, ProvenanceItem):
-                    continue
-                item_: TextItem = copy.deepcopy(item)
+                item_ = copy.deepcopy(item)
                 item_.prov = [prov_]
                 item_.text = item.orig[prov_.charspan[0] : prov_.charspan[1]]  # it must be `orig`, not `text` here!
                 item_.orig = item.orig[prov_.charspan[0] : prov_.charspan[1]]
-                if isinstance(item_.prov[0], ProvenanceItem):
-                    item_.prov[0].charspan = (0, len(item_.orig))
+
+                item_.prov[0].charspan = (0, len(item_.orig))
 
                 # marker field should be cleared on subsequent split parts
                 if idp > 0 and isinstance(item_, ListItem):
@@ -1752,7 +1748,7 @@ def _emit_otsl(
 
         if params.add_table_cell_location:
             # Check if we have all required information for location serialization
-            if item.prov and isinstance(item.prov[0], ProvenanceItem):
+            if item.prov and len(item.prov) > 0:
                 page_no = item.prov[0].page_no
                 if doc.pages and page_no in doc.pages:
                     page_w, page_h = doc.pages[page_no].size.as_tuple()
@@ -1901,8 +1897,6 @@ def serialize(
             for it, _ in doc.iterate_items(root=item):
                 if isinstance(it, DocItem) and it.prov:
                     for prov in it.prov:
-                        if not isinstance(prov, ProvenanceItem):
-                            continue
                         page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
                         boxes.append(prov.bbox.to_top_left_origin(page_h).as_tuple())
                         prov_page_w_h = (page_w, page_h, prov.page_no)
diff --git a/docling_core/transforms/serializer/azure.py b/docling_core/transforms/serializer/azure.py
index ed91aee2..1addf996 100644
--- a/docling_core/transforms/serializer/azure.py
+++ b/docling_core/transforms/serializer/azure.py
@@ -55,7 +55,6 @@
     ListGroup,
     NodeItem,
     PictureItem,
-    ProvenanceItem,
     RefItem,
     RichTableCell,
     TableItem,
@@ -77,7 +76,7 @@ def _bbox_to_polygon_coords(
 
 def _bbox_to_polygon_for_item(doc: DoclingDocument, item: DocItem) -> Optional[list[float]]:
     """Compute a TOPLEFT-origin polygon for the first provenance of the item."""
-    if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
+    if not item.prov:
         return None
 
     prov = item.prov[0]
@@ -188,7 +187,7 @@ def serialize(
 
         # Lists may be represented either as TextItem(ListItem) or via groups;
         # we treat any TextItem as a paragraph-like entry.
-        if item.prov and isinstance(item.prov[0], ProvenanceItem):
+        if item.prov:
             prov = item.prov[0]
             page_no = prov.page_no
             polygon = _bbox_to_polygon_for_item(doc, item)
@@ -238,7 +237,7 @@ def serialize(
     ) -> SerializationResult:
         assert isinstance(doc_serializer, AzureDocSerializer)
 
-        if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
+        if not item.prov:
             return create_ser_result()
 
         prov = item.prov[0]
@@ -309,7 +308,7 @@ def serialize(
     ) -> SerializationResult:
         assert isinstance(doc_serializer, AzureDocSerializer)
 
-        if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
+        if not item.prov:
             return create_ser_result()
 
         prov = item.prov[0]
@@ -325,7 +324,7 @@ def serialize(
         for foot_ref in item.footnotes:
             if isinstance(foot_ref, RefItem):
                 tgt = foot_ref.resolve(doc)
-                if isinstance(tgt, TextItem) and tgt.prov and isinstance(tgt.prov[0], ProvenanceItem):
+                if isinstance(tgt, TextItem) and tgt.prov:
                     f_poly = _bbox_to_polygon_for_item(doc, tgt)
                     if f_poly is not None:
                         foots.append(
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index c36062e0..43bfd54b 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -52,7 +52,6 @@
     PictureDataType,
     PictureItem,
     PictureMoleculeData,
-    ProvenanceItem,
     Script,
     TableAnnotationType,
     TableItem,
@@ -109,7 +108,7 @@ def _iterate_items(
                     add_page_breaks=add_page_breaks,
                     visited=my_visited,
                 ):
-                    if isinstance(it, DocItem) and it.prov and isinstance(it.prov[0], ProvenanceItem):
+                    if isinstance(it, DocItem) and it.prov:
                         page_no = it.prov[0].page_no
                         if prev_page_nr is not None and page_no > prev_page_nr:
                             yield (
@@ -121,7 +120,7 @@ def _iterate_items(
                                 lvl,
                             )
                         break
-            elif isinstance(item, DocItem) and item.prov and isinstance(item.prov[0], ProvenanceItem):
+            elif isinstance(item, DocItem) and item.prov:
                 page_no = item.prov[0].page_no
                 if prev_page_nr is None or page_no > prev_page_nr:
                     if prev_page_nr is not None:  # close previous range
@@ -302,13 +301,7 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
                             or item.content_layer not in params.layers
                             or (
                                 params.pages is not None
-                                and (
-                                    (not item.prov)
-                                    or (
-                                        isinstance(item.prov[0], ProvenanceItem)
-                                        and item.prov[0].page_no not in params.pages
-                                    )
-                                )
+                                and ((not item.prov) or item.prov[0].page_no not in params.pages)
                             )
                         )
                     )
@@ -355,6 +348,7 @@ def serialize(
         empty_res = create_ser_result()
 
         my_item = item or self.doc.body
+
         if my_item == self.doc.body:
             if my_item.meta and not self._meta_is_wrapped():
                 meta_part = self.serialize_meta(item=my_item, **my_kwargs)
@@ -677,7 +671,6 @@ def _get_applicable_pages(self) -> Optional[list[int]]:
             if (
                 isinstance(item, DocItem)
                 and item.prov
-                and isinstance(item.prov[0], ProvenanceItem)
                 and (self.params.pages is None or item.prov[0].page_no in self.params.pages)
                 and ix >= self.params.start_idx
                 and ix < self.params.stop_idx
diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py
index 16549652..dc8c520f 100644
--- a/docling_core/transforms/serializer/doctags.py
+++ b/docling_core/transforms/serializer/doctags.py
@@ -345,7 +345,7 @@ def serialize(
         results: list[SerializationResult] = []
 
         page_no = 1
-        if len(item.prov) > 0 and isinstance(item.prov[0], ProvenanceItem):
+        if len(item.prov) > 0:
             page_no = item.prov[0].page_no
 
         if params.add_location:
@@ -363,7 +363,7 @@ def serialize(
 
         for cell in item.graph.cells:
             cell_txt = ""
-            if cell.prov is not None and isinstance(cell.prov, ProvenanceItem):
+            if cell.prov is not None:
                 if len(doc.pages.keys()):
                     page_w, page_h = doc.pages[page_no].size.as_tuple()
                     cell_txt += DocumentToken.get_location(
@@ -471,7 +471,7 @@ def _get_inline_location_tags(
         doc_items: list[DocItem] = []
         for it, _ in doc.iterate_items(root=item):
             if isinstance(it, DocItem):
-                for prov in (im for im in it.prov if isinstance(im, ProvenanceItem)):
+                for prov in it.prov:
                     boxes.append(prov.bbox)
                     doc_items.append(it)
         if prov is None:
diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
index 15fdbc3b..bfd1fd55 100644
--- a/docling_core/transforms/serializer/webvtt.py
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -38,10 +38,10 @@
     ListGroup,
     NodeItem,
     PictureItem,
-    ProvenanceTrack,
     TableItem,
     TextItem,
     TitleItem,
+    TrackProvenance,
 )
 from docling_core.types.doc.webvtt import (
     START_TAG_NAMES,
@@ -140,15 +140,15 @@ def serialize(
         if isinstance(item, TitleItem):
             return create_ser_result(text=item.text, span_source=item)
 
-        # Only process items with ProvenanceTrack (WebVTT cues)
-        if not item.text or not item.prov or not isinstance(item.prov[0], ProvenanceTrack):
+        # Only process items with TrackProvenance (WebVTT cues)
+        if not item.text or not item.source or item.source[0].kind != "track":
             return create_ser_result()
 
         # Apply post-processing here: formatting, classes, language, and voice
         # If the TextItem is part of an InlineGroup, we need to further post-process it
         # within the group context
 
-        prov: ProvenanceTrack = item.prov[0]
+        prov: TrackProvenance = item.source[0]
         text: str = doc_serializer.post_process(
             text=item.text,
             formatting=item.formatting,
@@ -417,7 +417,7 @@ def _extract_classes(classes: list[str]) -> dict[str, list[str]]:
         """Extract tag and values from provenance classes.
 
         Args:
-            classes: The classes from a ProvenanceTrack object.
+            classes: The classes from a TrackProvenance object.
 
         Returns:
             Map of tag to class values.
@@ -463,8 +463,8 @@ def serialize_doc(
                 continue
             if isinstance(doc_item, InlineGroup) and doc_item.children:
                 doc_item = doc_item.children[0].resolve(doc=self.doc)
-            if isinstance(doc_item, TextItem) and doc_item.prov and isinstance(doc_item.prov[0], ProvenanceTrack):
-                prov: ProvenanceTrack = doc_item.prov[0]
+            if isinstance(doc_item, TextItem) and doc_item.source and doc_item.source[0].kind == "track":
+                prov: TrackProvenance = doc_item.source[0]
                 if (
                     prov.identifier == id
                     and timings
diff --git a/docling_core/transforms/visualizer/key_value_visualizer.py b/docling_core/transforms/visualizer/key_value_visualizer.py
index e2b10264..89b07f77 100644
--- a/docling_core/transforms/visualizer/key_value_visualizer.py
+++ b/docling_core/transforms/visualizer/key_value_visualizer.py
@@ -21,7 +21,6 @@
     DoclingDocument,
     GraphCellLabel,
     GraphLinkLabel,
-    ProvenanceItem,
 )
 
 # ---------------------------------------------------------------------------
@@ -87,7 +86,7 @@ def _draw_key_value_layer(
             # First draw cells (rectangles + optional labels)
             # ------------------------------------------------------------------
             for cell in cell_dict.values():
-                if cell.prov is None or not isinstance(cell.prov, ProvenanceItem) or cell.prov.page_no != page_no:
+                if cell.prov is None or cell.prov.page_no != page_no:
                     continue  # skip cells not on this page or without bbox
 
                 tl_bbox = cell.prov.bbox.to_top_left_origin(page_height=doc.pages[page_no].size.height)
@@ -154,8 +153,6 @@ def _draw_key_value_layer(
                 if (
                     src_cell.prov is None
                     or tgt_cell.prov is None
-                    or not isinstance(src_cell.prov, ProvenanceItem)
-                    or not isinstance(tgt_cell.prov, ProvenanceItem)
                     or src_cell.prov.page_no != page_no
                     or tgt_cell.prov.page_no != page_no
                 ):
diff --git a/docling_core/transforms/visualizer/layout_visualizer.py b/docling_core/transforms/visualizer/layout_visualizer.py
index 8ac6bf81..043fedac 100644
--- a/docling_core/transforms/visualizer/layout_visualizer.py
+++ b/docling_core/transforms/visualizer/layout_visualizer.py
@@ -17,7 +17,6 @@
     DocItem,
     DocItemLabel,
     DoclingDocument,
-    ProvenanceItem,
     TextCell,
 )
 
@@ -179,7 +178,7 @@ def _draw_doc_layout(
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
-            for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)):
+            for prov in elem.prov:
                 page_nr = prov.page_no
 
                 if page_nr in my_images:
diff --git a/docling_core/transforms/visualizer/reading_order_visualizer.py b/docling_core/transforms/visualizer/reading_order_visualizer.py
index 27583613..60874333 100644
--- a/docling_core/transforms/visualizer/reading_order_visualizer.py
+++ b/docling_core/transforms/visualizer/reading_order_visualizer.py
@@ -14,7 +14,6 @@
     DocItem,
     DoclingDocument,
     PictureItem,
-    ProvenanceItem,
 )
 
 
@@ -131,7 +130,7 @@ def _draw_doc_reading_order(
             if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
-            for prov in (item for item in elem.prov if isinstance(item, ProvenanceItem)):
+            for prov in elem.prov:
                 page_no = prov.page_no
                 image = my_images.get(page_no)
 
diff --git a/docling_core/transforms/visualizer/table_visualizer.py b/docling_core/transforms/visualizer/table_visualizer.py
index 787a1bff..1bba8c89 100644
--- a/docling_core/transforms/visualizer/table_visualizer.py
+++ b/docling_core/transforms/visualizer/table_visualizer.py
@@ -10,12 +10,7 @@
 from typing_extensions import override
 
 from docling_core.transforms.visualizer.base import BaseVisualizer
-from docling_core.types.doc import (
-    ContentLayer,
-    DoclingDocument,
-    ProvenanceItem,
-    TableItem,
-)
+from docling_core.types.doc import ContentLayer, DoclingDocument, TableItem
 
 _log = logging.getLogger(__name__)
 
@@ -195,10 +190,10 @@ def _draw_doc_tables(
                 image = pil_img.copy()
                 my_images[page_nr] = image
 
-        for _, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)):
+        for idx, (elem, _) in enumerate(doc.iterate_items(included_content_layers=included_content_layers)):
             if not isinstance(elem, TableItem):
                 continue
-            if len(elem.prov) == 0 or not isinstance(elem.prov[0], ProvenanceItem):
+            if len(elem.prov) == 0:
                 continue  # Skip elements without provenances
 
             if len(elem.prov) == 1:
diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py
index d8ddd0b4..c3a2b237 100644
--- a/docling_core/types/doc/__init__.py
+++ b/docling_core/types/doc/__init__.py
@@ -46,6 +46,7 @@
     PictureClassificationClass,
     PictureClassificationData,
     PictureClassificationMetaField,
+    PictureClassificationPrediction,
     PictureDataType,
     PictureItem,
     PictureLineChartData,
@@ -56,7 +57,7 @@
     PictureStackedBarChartData,
     PictureTabularChartData,
     ProvenanceItem,
-    ProvenanceTrack,
+    ProvenanceType,
     RefItem,
     RichTableCell,
     Script,
@@ -69,6 +70,7 @@
     TabularChartMetaField,
     TextItem,
     TitleItem,
+    TrackProvenance,
     UnorderedList,
 )
 from .labels import (
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 720b0967..f1dd2dd8 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -32,12 +32,10 @@
     AnyUrl,
     BaseModel,
     ConfigDict,
-    Discriminator,
     Field,
     FieldSerializationInfo,
     SerializerFunctionWrapHandler,
     StringConstraints,
-    Tag,
     computed_field,
     field_serializer,
     field_validator,
@@ -1200,13 +1198,27 @@ class ProvenanceItem(BaseModel):
     charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")]
 
 
-class ProvenanceTrack(BaseModel):
-    """Provenance information for elements extracted from media assets.
+class BaseProvenance(BaseModel):
+    """Base class for provenance information.
 
-    A `ProvenanceTrack` instance describes a cue in a text track associated with a
-    media element (audio, video, subtitles, screen recordings, ...).
+    Represents the provenance of an extracted component within a digital asset.
     """
 
+    kind: Annotated[
+        str, Field(description="Kind of provenance. It is used as a discriminator for the provenance type.")
+    ]
+
+
+class TrackProvenance(BaseProvenance):
+    """Provenance metadata for a cue extracted from a media track.
+
+    A `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,
+    etc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle
+    block, an audio clip, or a timed marker in a screen-recording.
+    """
+
+    kind: Annotated[Literal["track"], Field(description="Identifiers this type of provenance.")] = "track"
+
     start_time: Annotated[
         float,
         Field(
@@ -1251,25 +1263,22 @@ def check_order(self) -> Self:
         return self
 
 
-def _get_provenance_discriminator_value(v: Any) -> str:
-    """Callable discriminator for provenance instances.
+ProvenanceType = Annotated[Union[TrackProvenance], Field(discriminator="kind")]
+"""Union type for all provenance types.
 
-    Args:
-        v: Either dict or model input.
-
-    Returns:
-        A string discriminator of provenance instances.
-    """
-    fields = {"bbox", "page_no", "charspan"}
-    if isinstance(v, dict):
-        return "item" if any(f in v for f in fields) else "track"
-    return "item" if any(hasattr(v, f) for f in fields) else "track"
+This type alias represents a discriminated union of all available provenance types that can be associated with
+extracted elements in a document. The `kind` field is used as a discriminator to determine the specific
+provenance type at runtime.
 
+Currently supported provenance types:
+    - `TrackProvenance`: For elements extracted from media assets (audio, video, subtitles)
 
-ProvenanceType = Annotated[
-    Union[Annotated[ProvenanceItem, Tag("item")], Annotated[ProvenanceTrack, Tag("track")]],
-    Discriminator(_get_provenance_discriminator_value),
-]
+Notes:
+    - Additional provenance types may be added to this union in the future to support
+        other content sources.
+    - For documents with an implicit or explicit layout, such as PDF, HTML, docx, pptx, or markdown files, the
+        `ProvenanceItem` should still be used.
+"""
 
 
 class ContentLayer(str, Enum):
@@ -1574,20 +1583,28 @@ class FineRef(RefItem):
     range: Optional[tuple[int, int]] = None  # start_inclusive, end_exclusive
 
 
-class DocItem(NodeItem):  # Base type for any element that carries content, can be a leaf node
-    """DocItem."""
+class DocItem(NodeItem):
+    """Base type for any element that carries content, can be a leaf node."""
 
     label: DocItemLabel
-    prov: list[ProvenanceType] = []
+    prov: list[ProvenanceItem] = []
+    source: Annotated[
+        list[ProvenanceType],
+        Field(
+            description="The provenance of this document item. Currently, it is only used for media track provenance."
+        ),
+    ] = []
     comments: list[FineRef] = []  # References to comment items annotating this content
 
     @model_serializer(mode="wrap")
     def _custom_pydantic_serialize(self, handler: SerializerFunctionWrapHandler) -> dict:
         dumped = handler(self)
 
-        # suppress serializing comment list when empty:
-        if dumped.get("comments") == []:
-            del dumped["comments"]
+        # suppress serializing comment and source lists when empty:
+        for field in {"comments", "source"}:
+            if dumped.get(field) == []:
+                del dumped[field]
+
         return dumped
 
     def get_location_tokens(
@@ -1603,7 +1620,7 @@ def get_location_tokens(
             return ""
 
         location = ""
-        for prov in (item for item in self.prov if isinstance(item, ProvenanceItem)):
+        for prov in self.prov:
             page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
 
             loc_str = DocumentToken.get_location(
@@ -1639,9 +1656,9 @@ def get_image(self, doc: "DoclingDocument", prov_index: int = 0) -> Optional[PIL
         if not page_image:
             return None
         crop_bbox = (
-            prov.bbox.to_top_left_origin(page_height=page.size.height).scale_to_size(
-                old_size=page.size, new_size=page.image.size
-            )
+            self.prov[prov_index]
+            .bbox.to_top_left_origin(page_height=page.size.height)
+            .scale_to_size(old_size=page.size, new_size=page.image.size)
             # .scaled(scale=page_image.height / page.size.height)
         )
         return page_image.crop(crop_bbox.as_tuple())
@@ -2312,7 +2329,7 @@ def export_to_otsl(
             return ""
 
         page_no = 0
-        if len(self.prov) > 0 and isinstance(self.prov[0], ProvenanceItem):
+        if len(self.prov) > 0:
             page_no = self.prov[0].page_no
 
         for i in range(nrows):
@@ -2442,7 +2459,7 @@ class GraphCell(BaseModel):
     text: str  # sanitized text
     orig: str  # text as seen on document
 
-    prov: Optional[ProvenanceType] = None
+    prov: Optional[ProvenanceItem] = None
 
     # in case you have a text, table or picture item
     item_ref: Optional[RefItem] = None
@@ -3091,7 +3108,7 @@ def add_list_item(
         enumerated: bool = False,
         marker: Optional[str] = None,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3102,7 +3119,7 @@ def add_list_item(
         :param label: str:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
 
         """
@@ -3143,7 +3160,7 @@ def add_text(
         label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3154,7 +3171,7 @@ def add_text(
         :param label: str:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
 
         """
@@ -3280,7 +3297,7 @@ def add_table(
         self,
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
@@ -3290,7 +3307,7 @@ def add_table(
 
         :param data: TableData:
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
 
@@ -3326,7 +3343,7 @@ def add_picture(
         annotations: Optional[list[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
     ):
@@ -3335,7 +3352,7 @@ def add_picture(
         :param data: Optional[list[PictureData]]: (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3367,7 +3384,7 @@ def add_title(
         self,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3378,7 +3395,7 @@ def add_title(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3413,7 +3430,7 @@ def add_code(
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3426,7 +3443,7 @@ def add_code(
         :param orig: Optional[str]:  (Default value = None)
         :param caption: Optional[Union[TextItem:
         :param RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3464,7 +3481,7 @@ def add_formula(
         self,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3475,7 +3492,7 @@ def add_formula(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3509,7 +3526,7 @@ def add_heading(
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
@@ -3521,7 +3538,7 @@ def add_heading(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3554,13 +3571,13 @@ def add_heading(
     def add_key_values(
         self,
         graph: GraphData,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
     ):
         """add_key_values.
 
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3585,13 +3602,13 @@ def add_key_values(
     def add_form(
         self,
         graph: GraphData,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
     ):
         """add_form.
 
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param parent: Optional[NodeItem]:  (Default value = None)
         """
         if not parent:
@@ -3780,7 +3797,7 @@ def insert_list_item(
         enumerated: bool = False,
         marker: Optional[str] = None,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3793,7 +3810,7 @@ def insert_list_item(
         :param enumerated: bool:  (Default value = False)
         :param marker: Optional[str]:  (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3852,7 +3869,7 @@ def insert_text(
         label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -3864,7 +3881,7 @@ def insert_text(
         :param label: DocItemLabel:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -3964,7 +3981,7 @@ def insert_table(
         sibling: NodeItem,
         data: TableData,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         label: DocItemLabel = DocItemLabel.TABLE,
         content_layer: Optional[ContentLayer] = None,
         annotations: Optional[list[TableAnnotationType]] = None,
@@ -3975,7 +3992,7 @@ def insert_table(
         :param sibling: NodeItem:
         :param data: TableData:
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param label: DocItemLabel:  (Default value = DocItemLabel.TABLE)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param annotations: Optional[list[TableAnnotationType]]: (Default value = None)
@@ -4012,7 +4029,7 @@ def insert_picture(
         annotations: Optional[list[PictureDataType]] = None,
         image: Optional[ImageRef] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         after: bool = True,
     ) -> PictureItem:
@@ -4022,7 +4039,7 @@ def insert_picture(
         :param annotations: Optional[list[PictureDataType]]: (Default value = None)
         :param image: Optional[ImageRef]:  (Default value = None)
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
@@ -4056,7 +4073,7 @@ def insert_title(
         sibling: NodeItem,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4067,7 +4084,7 @@ def insert_title(
         :param sibling: NodeItem:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4107,7 +4124,7 @@ def insert_code(
         code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         caption: Optional[Union[TextItem, RefItem]] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4120,7 +4137,7 @@ def insert_code(
         :param code_language: Optional[str]: (Default value = None)
         :param orig: Optional[str]:  (Default value = None)
         :param caption: Optional[Union[TextItem, RefItem]]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4162,7 +4179,7 @@ def insert_formula(
         sibling: NodeItem,
         text: str,
         orig: Optional[str] = None,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4173,7 +4190,7 @@ def insert_formula(
         :param sibling: NodeItem:
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4212,7 +4229,7 @@ def insert_heading(
         text: str,
         orig: Optional[str] = None,
         level: LevelNumber = 1,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -4224,7 +4241,7 @@ def insert_heading(
         :param text: str:
         :param orig: Optional[str]:  (Default value = None)
         :param level: LevelNumber:  (Default value = 1)
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param content_layer: Optional[ContentLayer]:  (Default value = None)
         :param formatting: Optional[Formatting]:  (Default value = None)
         :param hyperlink: Optional[Union[AnyUrl, Path]]:  (Default value = None)
@@ -4262,14 +4279,14 @@ def insert_key_values(
         self,
         sibling: NodeItem,
         graph: GraphData,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         after: bool = True,
     ) -> KeyValueItem:
         """Creates a new KeyValueItem item and inserts it into the document.
 
         :param sibling: NodeItem:
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
         :returns: KeyValueItem: The newly created KeyValueItem item.
@@ -4291,14 +4308,14 @@ def insert_form(
         self,
         sibling: NodeItem,
         graph: GraphData,
-        prov: Optional[ProvenanceType] = None,
+        prov: Optional[ProvenanceItem] = None,
         after: bool = True,
     ) -> FormItem:
         """Creates a new FormItem item and inserts it into the document.
 
         :param sibling: NodeItem:
         :param graph: GraphData:
-        :param prov: Optional[ProvenanceType]:  (Default value = None)
+        :param prov: Optional[ProvenanceItem]:  (Default value = None)
         :param after: bool:  (Default value = True)
 
         :returns: FormItem: The newly created FormItem item.
@@ -4635,10 +4652,7 @@ def _iterate_items_with_stack(
             (not isinstance(root, GroupItem) or with_groups)
             and (
                 not isinstance(root, DocItem)
-                or (
-                    page_nrs is None
-                    or any(prov.page_no in page_nrs for prov in root.prov if isinstance(prov, ProvenanceItem))
-                )
+                or (page_nrs is None or any(prov.page_no in page_nrs for prov in root.prov))
             )
             and root.content_layer in my_layers
         )
@@ -4760,7 +4774,7 @@ def _with_pictures_refs(
                             else:
                                 obj_path = loc_path
 
-                            if item.image is None and isinstance(item.prov[0], ProvenanceItem):
+                            if item.image is None:
                                 scale = img.size[0] / item.prov[0].bbox.width
                                 item.image = ImageRef.from_pil(image=img, dpi=round(72 * scale))
                             elif item.image is not None:
@@ -6166,7 +6180,7 @@ def index(self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None) ->
                     if isinstance(new_item, DocItem):
                         # update page numbers
                         # NOTE other prov sources (e.g. GraphCell) currently not covered
-                        for prov in (item for item in new_item.prov if isinstance(item, ProvenanceItem)):
+                        for prov in new_item.prov:
                             prov.page_no += page_delta
 
                     if item.parent:
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 6bc4a219..297e97fb 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -406,7 +406,7 @@ class WebVTTCueBlock(BaseModel):
 
     model_config = ConfigDict(regex_engine="python-re")
 
-    identifier: Optional[WebVTTCueIdentifier] = Field(None, description="The WebVTT cue identifier")
+    identifier: Annotated[Optional[WebVTTCueIdentifier], Field(description="The WebVTT cue identifier")] = None
     timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
         list[WebVTTCueComponentWithTerminator],
diff --git a/docling_core/utils/legacy.py b/docling_core/utils/legacy.py
index 5ebac4be..26042436 100644
--- a/docling_core/utils/legacy.py
+++ b/docling_core/utils/legacy.py
@@ -165,7 +165,6 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
                         span=[0, len(item.text)],
                     )
                     for p in item.prov
-                    if isinstance(p, ProvenanceItem)
                 ]
                 main_text.append(
                     BaseText(
@@ -287,7 +286,6 @@ def _make_spans(cell: TableCell, table_item: TableItem):
                                 span=[0, 0],
                             )
                             for p in item.prov
-                            if isinstance(p, ProvenanceItem)
                         ],
                     )
                 )
@@ -315,7 +313,6 @@ def _make_spans(cell: TableCell, table_item: TableItem):
                                 span=[0, len(caption)],
                             )
                             for p in item.prov
-                            if isinstance(p, ProvenanceItem)
                         ],
                         obj_type=doc_item_label_to_legacy_type(item.label),
                         text=caption,
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index cea39ba5..b37260eb 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -233,16 +233,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -658,16 +670,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -807,16 +831,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -894,21 +930,13 @@
         "prov": {
           "anyOf": [
             {
-              "oneOf": [
-                {
-                  "$ref": "#/$defs/ProvenanceItem"
-                },
-                {
-                  "$ref": "#/$defs/ProvenanceTrack"
-                }
-              ]
+              "$ref": "#/$defs/ProvenanceItem"
             },
             {
               "type": "null"
             }
           ],
-          "default": null,
-          "title": "Prov"
+          "default": null
         },
         "item_ref": {
           "anyOf": [
@@ -1227,16 +1255,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -1406,16 +1446,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -1789,16 +1841,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -2224,120 +2288,6 @@
       "title": "ProvenanceItem",
       "type": "object"
     },
-    "ProvenanceTrack": {
-      "description": "Provenance information for elements extracted from media assets.\n\nA `ProvenanceTrack` instance describes a cue in a text track associated with a\nmedia element (audio, video, subtitles, screen recordings, ...).",
-      "properties": {
-        "start_time": {
-          "description": "Start time offset of the track cue in seconds",
-          "examples": [
-            11.0,
-            6.5,
-            5370.0
-          ],
-          "title": "Start Time",
-          "type": "number"
-        },
-        "end_time": {
-          "description": "End time offset of the track cue in seconds",
-          "examples": [
-            12.0,
-            8.2,
-            5370.1
-          ],
-          "title": "End Time",
-          "type": "number"
-        },
-        "identifier": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "An identifier of the cue",
-          "examples": [
-            "test",
-            "123",
-            "b72d946"
-          ],
-          "title": "Identifier"
-        },
-        "voice": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "The cue voice (speaker)",
-          "examples": [
-            "Mary",
-            "Fred",
-            "Name Surname"
-          ],
-          "title": "Voice"
-        },
-        "languages": {
-          "anyOf": [
-            {
-              "items": {
-                "type": "string"
-              },
-              "type": "array"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "Languages of the cue in BCP 47 language tag format",
-          "examples": [
-            [
-              "en",
-              "en-GB"
-            ],
-            [
-              "fr-CA"
-            ]
-          ],
-          "title": "Languages"
-        },
-        "classes": {
-          "anyOf": [
-            {
-              "items": {
-                "type": "string"
-              },
-              "minItems": 1,
-              "type": "array"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "Classes for describing the cue significance",
-          "examples": [
-            "b.first",
-            "v.loud",
-            "c.yellow"
-          ],
-          "title": "Classes"
-        }
-      },
-      "required": [
-        "start_time",
-        "end_time"
-      ],
-      "title": "ProvenanceTrack",
-      "type": "object"
-    },
     "RefItem": {
       "description": "RefItem.",
       "properties": {
@@ -2494,16 +2444,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -2796,16 +2758,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -3008,16 +2982,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -3127,16 +3113,28 @@
         "prov": {
           "default": [],
           "items": {
-            "oneOf": [
-              {
-                "$ref": "#/$defs/ProvenanceItem"
+            "$ref": "#/$defs/ProvenanceItem"
+          },
+          "title": "Prov",
+          "type": "array"
+        },
+        "source": {
+          "default": [],
+          "description": "The provenance of this document item. Currently, it is only used for media track provenance.",
+          "items": {
+            "discriminator": {
+              "mapping": {
+                "track": "#/$defs/TrackProvenance"
               },
+              "propertyName": "kind"
+            },
+            "oneOf": [
               {
-                "$ref": "#/$defs/ProvenanceTrack"
+                "$ref": "#/$defs/TrackProvenance"
               }
             ]
           },
-          "title": "Prov",
+          "title": "Source",
           "type": "array"
         },
         "comments": {
@@ -3192,6 +3190,127 @@
       ],
       "title": "TitleItem",
       "type": "object"
+    },
+    "TrackProvenance": {
+      "description": "Provenance metadata for a cue extracted from a media track.\n\nA `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,\netc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle\nblock, an audio clip, or a timed marker in a screen-recording.",
+      "properties": {
+        "kind": {
+          "const": "track",
+          "default": "track",
+          "description": "Identifies this type of provenance.",
+          "title": "Kind",
+          "type": "string"
+        },
+        "start_time": {
+          "description": "Start time offset of the track cue in seconds",
+          "examples": [
+            11.0,
+            6.5,
+            5370.0
+          ],
+          "title": "Start Time",
+          "type": "number"
+        },
+        "end_time": {
+          "description": "End time offset of the track cue in seconds",
+          "examples": [
+            12.0,
+            8.2,
+            5370.1
+          ],
+          "title": "End Time",
+          "type": "number"
+        },
+        "identifier": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "An identifier of the cue",
+          "examples": [
+            "test",
+            "123",
+            "b72d946"
+          ],
+          "title": "Identifier"
+        },
+        "voice": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "The cue voice (speaker)",
+          "examples": [
+            "Mary",
+            "Fred",
+            "Name Surname"
+          ],
+          "title": "Voice"
+        },
+        "languages": {
+          "anyOf": [
+            {
+              "items": {
+                "type": "string"
+              },
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Languages of the cue in BCP 47 language tag format",
+          "examples": [
+            [
+              "en",
+              "en-GB"
+            ],
+            [
+              "fr-CA"
+            ]
+          ],
+          "title": "Languages"
+        },
+        "classes": {
+          "anyOf": [
+            {
+              "items": {
+                "type": "string"
+              },
+              "minItems": 1,
+              "type": "array"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Classes for describing the cue significance",
+          "examples": [
+            "b.first",
+            "v.loud",
+            "c.yellow"
+          ],
+          "title": "Classes"
+        }
+      },
+      "required": [
+        "start_time",
+        "end_time"
+      ],
+      "title": "TrackProvenance",
+      "type": "object"
     }
   },
   "description": "DoclingDocument.",
diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json
index 5a7c9d29..78ce13b6 100644
--- a/test/data/doc/webvtt_example_01.json
+++ b/test/data/doc/webvtt_example_01.json
@@ -71,8 +71,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 11.0,
           "end_time": 13.0,
           "voice": "Roger Bingham"
@@ -89,8 +90,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 13.0,
           "end_time": 16.0,
           "voice": "Roger Bingham"
@@ -107,8 +109,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 16.0,
           "end_time": 18.0,
           "voice": "Roger Bingham"
@@ -125,8 +128,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 18.0,
           "end_time": 20.0,
           "voice": "Roger Bingham"
@@ -143,8 +147,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 20.0,
           "end_time": 22.0,
           "voice": "Roger Bingham"
@@ -161,8 +166,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 22.0,
           "end_time": 24.0,
           "voice": "Roger Bingham"
@@ -179,8 +185,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 24.0,
           "end_time": 26.0,
           "voice": "Roger Bingham"
@@ -197,8 +204,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 27.0,
           "end_time": 30.0,
           "voice": "Roger Bingham"
@@ -215,8 +223,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 30.0,
           "end_time": 31.5,
           "voice": "Roger Bingham"
@@ -233,8 +242,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 30.5,
           "end_time": 32.5,
           "voice": "Neil deGrasse Tyson"
@@ -251,8 +261,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 32.0,
           "end_time": 35.5,
           "voice": "Roger Bingham"
@@ -269,8 +280,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 32.5,
           "end_time": 33.5,
           "voice": "Neil deGrasse Tyson"
@@ -294,8 +306,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 35.5,
           "end_time": 38.0,
           "voice": "Roger Bingham"
diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json
index 2966a2e0..35c53692 100644
--- a/test/data/doc/webvtt_example_02.json
+++ b/test/data/doc/webvtt_example_02.json
@@ -88,8 +88,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 0.0,
           "end_time": 2.0,
           "voice": "Esme",
@@ -109,8 +110,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 2.0,
           "end_time": 4.0,
           "voice": "Mary"
@@ -127,8 +129,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0,
           "voice": "Esme"
@@ -145,8 +148,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0
         }
@@ -162,8 +166,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0
         }
@@ -186,8 +191,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 6.0,
           "end_time": 8.0,
           "voice": "Mary",
@@ -207,8 +213,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 8.0,
           "end_time": 10.0
         }
@@ -224,8 +231,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 8.0,
           "end_time": 10.0,
           "languages": [
@@ -254,8 +262,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 8.0,
           "end_time": 10.0
         }
diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json
index dddce0f2..42d9e5b2 100644
--- a/test/data/doc/webvtt_example_03.json
+++ b/test/data/doc/webvtt_example_03.json
@@ -83,8 +83,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
@@ -102,8 +103,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
@@ -121,8 +123,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 8.571,
           "end_time": 9.403,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
@@ -140,8 +143,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 10.683,
           "end_time": 11.563,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
@@ -158,8 +162,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 13.363,
           "end_time": 13.803,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
@@ -177,8 +182,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 49.603,
           "end_time": 53.363,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
@@ -196,8 +202,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 54.963,
           "end_time": 62.072,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
@@ -215,8 +222,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
@@ -234,8 +242,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
@@ -253,8 +262,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
@@ -272,8 +282,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
@@ -291,8 +302,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 70.563,
           "end_time": 72.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
@@ -310,8 +322,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
@@ -329,8 +342,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
@@ -348,8 +362,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 110.222,
           "end_time": 111.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
@@ -367,8 +382,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 112.043,
           "end_time": 115.043,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
@@ -386,8 +402,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 114.603,
           "end_time": 115.283,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json
index f96765fc..7e12385d 100644
--- a/test/data/doc/webvtt_example_04.json
+++ b/test/data/doc/webvtt_example_04.json
@@ -80,8 +80,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 1.0,
           "end_time": 4.0
         }
@@ -97,8 +98,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0
         }
@@ -114,8 +116,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0
         }
@@ -131,8 +134,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
           "classes": [
@@ -158,8 +162,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0
         }
@@ -175,8 +180,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
           "voice": "John"
diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json
index 616c94fc..9a53b3b0 100644
--- a/test/data/doc/webvtt_example_05.json
+++ b/test/data/doc/webvtt_example_05.json
@@ -94,8 +94,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14580.0,
           "end_time": 14760.0,
           "identifier": "agcvs-08234"
@@ -112,8 +113,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -130,8 +132,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -155,8 +158,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
@@ -183,8 +187,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -201,8 +206,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -219,8 +225,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
@@ -247,8 +254,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -272,8 +280,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
@@ -300,8 +309,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
@@ -325,8 +335,9 @@
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [
+      "source": [
         {
+          "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234"
diff --git a/test/test_deserializer_idoctags.py b/test/test_deserializer_idoctags.py
index 58fb50db..28b41ad6 100644
--- a/test/test_deserializer_idoctags.py
+++ b/test/test_deserializer_idoctags.py
@@ -1,5 +1,4 @@
 from pathlib import Path
-from test.test_serialization_doctag import verify
 
 import pytest
 
@@ -21,7 +20,8 @@
     TableData,
 )
 from docling_core.types.doc.labels import CodeLanguageLabel
-from test.test_serialization_idoctag import add_texts_section, add_list_section
+from test.test_serialization_doctag import verify
+from test.test_serialization_idoctag import add_list_section, add_texts_section
 
 DO_PRINT: bool = False
 
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 2d1ce498..45a9445c 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,7 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.doc import ProvenanceTrack
+from docling_core.types.doc import TrackProvenance
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -41,9 +41,9 @@ def test_prov():
 
 
 def test_prov_track():
-    """Test the class ProvenanceTrack."""
+    """Test the class TrackProvenance."""
 
-    valid_track = ProvenanceTrack(
+    valid_track = TrackProvenance(
         start_time=11.0,
         end_time=12.0,
         identifier="test",
@@ -61,17 +61,17 @@ def test_prov_track():
     assert valid_track.classes == ["v.first.loud", "i.foreignphrase"]
 
     with pytest.raises(ValidationError, match="end_time"):
-        ProvenanceTrack(start_time=11.0)
+        TrackProvenance(start_time=11.0)
 
     with pytest.raises(ValidationError, match="should be a valid list"):
-        ProvenanceTrack(
+        TrackProvenance(
             start_time=11.0,
             end_time=12.0,
             languages="en",
         )
 
     with pytest.raises(ValidationError, match="must be greater than start"):
-        ProvenanceTrack(
+        TrackProvenance(
             start_time=11.0,
             end_time=11.0,
         )
diff --git a/test/test_serialization_doctag.py b/test/test_serialization_doctag.py
index 45d0c983..9b378b03 100644
--- a/test/test_serialization_doctag.py
+++ b/test/test_serialization_doctag.py
@@ -6,8 +6,7 @@
     DocTagsDocSerializer,
     DocTagsParams,
 )
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DoclingDocument, TableData
+from docling_core.types.doc import DoclingDocument, TableData
 from docling_core.types.doc.labels import DocItemLabel
 
 from .test_serialization import verify
diff --git a/test/test_serialization_idoctag.py b/test/test_serialization_idoctag.py
index 43aaa79e..1c0f8479 100644
--- a/test/test_serialization_idoctag.py
+++ b/test/test_serialization_idoctag.py
@@ -2,37 +2,39 @@
 
 from pathlib import Path
 from typing import Optional
-from test.test_serialization import verify
 
 import pytest
 
 from docling_core.experimental.idoctags import (
     ContentType,
-    WrapMode,
     EscapeMode,
     IDocTagsDocSerializer,
     IDocTagsParams,
     IDocTagsSerializationMode,
     IDocTagsVocabulary,
+    WrapMode,
 )
 from docling_core.types.doc import (
+    BoundingBox,
+    CodeLanguageLabel,
+    CoordOrigin,
+    DescriptionMetaField,
     DocItemLabel,
     DoclingDocument,
     Formatting,
-    Script,
-    TableData,
-)
-from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.document import (
-    DescriptionMetaField,
+    PictureClassificationLabel,
     PictureClassificationMetaField,
     PictureClassificationPrediction,
     PictureMeta,
     ProvenanceItem,
+    Script,
+    Size,
     SummaryMetaField,
+    TableData,
     TabularChartMetaField,
 )
-from docling_core.types.doc.labels import CodeLanguageLabel, PictureClassificationLabel
+from test.test_serialization import verify
+
 
 def add_texts_section(doc: DoclingDocument):
     doc.add_text(label=DocItemLabel.TEXT, text="Simple text")
@@ -427,7 +429,7 @@ def test_content_allow_all_types(sample_doc: DoclingDocument):
     serializer = IDocTagsDocSerializer(
         doc=doc,
         params=IDocTagsParams(
-            content_types={ct for ct in ContentType},
+            content_types=set(ContentType),
         ),
     )
     ser_txt = serializer.serialize().text
diff --git a/test/test_webvtt.py b/test/test_webvtt.py
index 938da37c..5b1693e3 100644
--- a/test/test_webvtt.py
+++ b/test/test_webvtt.py
@@ -1,6 +1,5 @@
 """Test the data model for WebVTT files.
 
-Assisted by watsonx Code Assistant.
 Examples extracted from https://www.w3.org/TR/webvtt1/
 Copyright © 2019 World Wide Web Consortium.
 """

From 685a6112533eecae0c6574e1215c335857006ef7 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 23 Jan 2026 16:28:56 +0100
Subject: [PATCH 18/22] tests(webvtt): fix test with STYLE and NOTE blocks

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 test/test_serialization_doctag.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_serialization_doctag.py b/test/test_serialization_doctag.py
index 9b378b03..86237a9a 100644
--- a/test/test_serialization_doctag.py
+++ b/test/test_serialization_doctag.py
@@ -6,8 +6,7 @@
     DocTagsDocSerializer,
     DocTagsParams,
 )
-from docling_core.types.doc import DoclingDocument, TableData
-from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc import DocItemLabel, DoclingDocument, TableData
 
 from .test_serialization import verify
 

From 50f8ba9bc43cbbb2d74936b277ab86a3cbeb64ba Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Fri, 23 Jan 2026 19:04:04 +0100
Subject: [PATCH 19/22] style(webvtt): apply X | Y annotation instead of
 Optional, Union

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/webvtt.py | 26 +++++++++---------
 docling_core/types/doc/webvtt.py             | 28 +++++++++-----------
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
index bfd1fd55..bbc6e344 100644
--- a/docling_core/transforms/serializer/webvtt.py
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -2,7 +2,7 @@
 
 import logging
 import re
-from typing import Any, Optional, get_args
+from typing import Any, get_args
 
 from pydantic import BaseModel
 from typing_extensions import override
@@ -132,7 +132,7 @@ def serialize(
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
         is_inline_scope: bool = False,
-        visited: Optional[set[str]] = None,
+        visited: set[str] | None = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed item."""
@@ -158,7 +158,7 @@ def serialize(
         )
         if is_inline_scope:
             # Iteratively remove unnecessary consecutive tag pairs until no more changes
-            prev_text: Optional[str] = None
+            prev_text: str | None = None
             while prev_text != text:
                 prev_text = text
                 text = _remove_consecutive_pairs(text)
@@ -275,7 +275,7 @@ def serialize(
         doc_serializer: "BaseDocSerializer",
         doc: DoclingDocument,
         list_level: int = 0,
-        visited: Optional[set[str]] = None,
+        visited: set[str] | None = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes an inline group to WebVTT format."""
@@ -343,7 +343,7 @@ class WebVTTDocSerializer(DocSerializer):
     fallback_serializer: BaseFallbackSerializer = _WebVTTFallbackSerializer()
     list_serializer: BaseListSerializer = _WebVTTListSerializer()
     inline_serializer: BaseInlineSerializer = WebVTTInlineSerializer()
-    meta_serializer: Optional[BaseMetaSerializer] = _WebVTTMetaSerializer()
+    meta_serializer: BaseMetaSerializer | None = _WebVTTMetaSerializer()
     annotation_serializer: BaseAnnotationSerializer = _WebVTTAnnotationSerializer()
 
     params: CommonParams = CommonParams()
@@ -393,7 +393,7 @@ def serialize_cue_span(
         self,
         text: str,
         tag: START_TAG_NAMES,
-        anno: Optional[str] = None,
+        anno: str | None = None,
         css: list[str] = [],
     ) -> str:
         """Apply serialization to a WebVTT cue span."""
@@ -442,10 +442,10 @@ def serialize_doc(
         **kwargs: Any,
     ) -> SerializationResult:
         """Serialize a document out of its parts."""
-        title: Optional[str] = None
+        title: str | None = None
 
-        timings: Optional[WebVTTCueTimings] = None
-        id: Optional[str] = None
+        timings: WebVTTCueTimings | None = None
+        id: str | None = None
         text: str = ""
         cue_blocks: list[WebVTTCueBlock] = []
         for part in parts:
@@ -503,10 +503,10 @@ def serialize_doc(
     def post_process(
         self,
         text: str,
-        formatting: Optional[Formatting] = None,
-        voice: Optional[str] = None,
-        languages: Optional[list[str]] = None,
-        classes: Optional[list[str]] = None,
+        formatting: Formatting | None = None,
+        voice: str | None = None,
+        languages: list[str] | None = None,
+        classes: list[str] | None = None,
         **kwargs: Any,
     ) -> str:
         """Apply some text post-processing steps by adding formatting tags.
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index 297e97fb..f7c4eea6 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -5,7 +5,7 @@
 from collections.abc import Iterator
 from enum import Enum
 from functools import total_ordering
-from typing import Annotated, ClassVar, Literal, Optional, Union
+from typing import Annotated, ClassVar, Literal
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from pydantic.types import StringConstraints
@@ -204,7 +204,7 @@ class WebVTTCueComponentWithTerminator(BaseModel):
     """WebVTT caption or subtitle cue component optionally with a line terminator."""
 
     component: "WebVTTCueComponent"
-    terminator: Optional[WebVTTLineTerminator] = None
+    terminator: WebVTTLineTerminator | None = None
 
     @override
     def __str__(self) -> str:
@@ -215,7 +215,7 @@ def __str__(self) -> str:
 class WebVTTCueInternalText(BaseModel):
     """WebVTT cue internal text."""
 
-    terminator: Optional[WebVTTLineTerminator] = None
+    terminator: WebVTTLineTerminator | None = None
     components: Annotated[
         list[WebVTTCueComponentWithTerminator],
         Field(description=("WebVTT caption or subtitle cue components representing the cue internal text")),
@@ -380,15 +380,13 @@ class WebVTTCueLanguageSpan(WebVTTCueComponentBase):
 
 
 WebVTTCueComponent = Annotated[
-    Union[
-        WebVTTCueTextSpan,
-        WebVTTCueClassSpan,
-        WebVTTCueItalicSpan,
-        WebVTTCueBoldSpan,
-        WebVTTCueUnderlineSpan,
-        WebVTTCueVoiceSpan,
-        WebVTTCueLanguageSpan,
-    ],
+    WebVTTCueTextSpan
+    | WebVTTCueClassSpan
+    | WebVTTCueItalicSpan
+    | WebVTTCueBoldSpan
+    | WebVTTCueUnderlineSpan
+    | WebVTTCueVoiceSpan
+    | WebVTTCueLanguageSpan,
     Field(
         discriminator="kind",
         description="The type of WebVTT caption or subtitle cue component.",
@@ -406,7 +404,7 @@ class WebVTTCueBlock(BaseModel):
 
     model_config = ConfigDict(regex_engine="python-re")
 
-    identifier: Annotated[Optional[WebVTTCueIdentifier], Field(description="The WebVTT cue identifier")] = None
+    identifier: Annotated[WebVTTCueIdentifier | None, Field(description="The WebVTT cue identifier")] = None
     timings: Annotated[WebVTTCueTimings, Field(description="The WebVTT cue timings")]
     payload: Annotated[
         list[WebVTTCueComponentWithTerminator],
@@ -456,7 +454,7 @@ def parse(cls, raw: str) -> Self:
         lines = raw.strip().splitlines()
         if not lines:
             raise ValueError("Cue block must have at least one line")
-        identifier: Optional[WebVTTCueIdentifier] = None
+        identifier: WebVTTCueIdentifier | None = None
         timing_line = lines[0]
         if "-->" not in timing_line and len(lines) > 1:
             identifier = timing_line
@@ -585,7 +583,7 @@ class WebVTTFile(BaseModel):
 
     _pattern: ClassVar[re.Pattern] = re.compile(r"(?m)^(STYLE|NOTE|REGION)\b[\s\S]*?(?:\n\s*\n|\Z)")
     cue_blocks: list[WebVTTCueBlock]
-    title: Optional[str] = None
+    title: str | None = None
 
     @staticmethod
     def verify_signature(content: str) -> bool:

From aedca1313f4bd04a9363678d2d310e90f632eccb Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 26 Jan 2026 00:10:56 +0100
Subject: [PATCH 20/22] refactor(webvtt): simplify TrackProvenance model with
 tags

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/webvtt.py |  47 +++++---
 docling_core/types/doc/document.py           |  39 +++---
 docling_core/types/doc/webvtt.py             |  12 +-
 docs/DoclingDocument.json                    | 118 ++++++++++++++-----
 test/data/doc/webvtt_example_01.json         |  91 ++++++++++++--
 test/data/doc/webvtt_example_02.json         |  53 +++++++--
 test/data/doc/webvtt_example_03.json         | 112 +++++++++++++++---
 test/data/doc/webvtt_example_04.json         |  16 ++-
 test/data/doc/webvtt_example_05.json         |  23 +++-
 test/test_doc_base.py                        |  32 +++--
 10 files changed, 407 insertions(+), 136 deletions(-)

diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
index bbc6e344..eba06b36 100644
--- a/docling_core/transforms/serializer/webvtt.py
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -152,9 +152,7 @@ def serialize(
         text: str = doc_serializer.post_process(
             text=item.text,
             formatting=item.formatting,
-            voice=prov.voice,
-            languages=prov.languages,
-            classes=prov.classes,
+            tags=prov.tags,
         )
         if is_inline_scope:
             # Iteratively remove unnecessary consecutive tag pairs until no more changes
@@ -394,7 +392,7 @@ def serialize_cue_span(
         text: str,
         tag: START_TAG_NAMES,
         anno: str | None = None,
-        css: list[str] = [],
+        css: list[str] | None = None,
     ) -> str:
         """Apply serialization to a WebVTT cue span."""
         start_tag: WebVTTCueSpanStartTag
@@ -504,9 +502,7 @@ def post_process(
         self,
         text: str,
         formatting: Formatting | None = None,
-        voice: str | None = None,
-        languages: list[str] | None = None,
-        classes: list[str] | None = None,
+        tags: list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None = None,
         **kwargs: Any,
     ) -> str:
         """Apply some text post-processing steps by adding formatting tags.
@@ -521,25 +517,40 @@ def post_process(
             6. voice (<v>)
         """
         res: str = text
-        cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {}
-
-        for lang in languages or []:
-            res = self.serialize_cue_span(text=res, tag="lang", anno=lang, css=cls.get("lang", []))
-
-        res = super().post_process(text=res, formatting=formatting, classes=cls)
-
-        if "c" in cls:
+        # cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {}
+
+        languages: list[WebVTTCueSpanStartTagAnnotated] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "lang"
+        ]
+        for lang in languages:
+            res = self.serialize_cue_span(text=res, tag="lang", anno=lang.annotation, css=lang.classes)
+
+        format_classes = {
+            item.name: item.classes
+            for item in tags or []
+            if isinstance(item, WebVTTCueSpanStartTag) and item.name in {"u", "i", "b"}
+        }
+        res = super().post_process(text=res, formatting=formatting, classes=format_classes)
+
+        class_tag: list[WebVTTCueSpanStartTag] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTag) and item.name == "c"
+        ]
+        if class_tag:
             res = self.serialize_cue_span(
                 text=res,
                 tag="c",
-                css=cls.get("c", []),
+                css=class_tag[0].classes,
             )
+
+        voice: list[WebVTTCueSpanStartTagAnnotated] = [
+            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "v"
+        ]
         if voice:
             res = self.serialize_cue_span(
                 text=res,
                 tag="v",
-                anno=voice,
-                css=cls.get("v", []),
+                anno=voice[0].annotation,
+                css=voice[0].classes,
             )
 
         return res
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index f1dd2dd8..117066cc 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -65,6 +65,7 @@
 )
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
+from docling_core.types.doc.webvtt import WebVTTCueIdentifier, WebVTTCueSpanStartTag, WebVTTCueSpanStartTagAnnotated
 
 _logger = logging.getLogger(__name__)
 
@@ -1217,8 +1218,8 @@ class TrackProvenance(BaseProvenance):
     block, an audio clip, or a timed marker in a screen-recording.
     """
 
+    model_config = ConfigDict(regex_engine="python-re")
     kind: Annotated[Literal["track"], Field(description="Identifiers this type of provenance.")] = "track"
-
     start_time: Annotated[
         float,
         Field(
@@ -1233,27 +1234,19 @@ class TrackProvenance(BaseProvenance):
             description="End time offset of the track cue in seconds",
         ),
     ]
-    identifier: Optional[str] = Field(
-        None,
-        examples=["test", "123", "b72d946"],
-        description="An identifier of the cue",
-    )
-    voice: Optional[str] = Field(
-        None,
-        examples=["Mary", "Fred", "Name Surname"],
-        description="The cue voice (speaker)",
-    )
-    languages: Optional[list[str]] = Field(
-        None,
-        examples=[["en", "en-GB"], ["fr-CA"]],
-        description="Languages of the cue in BCP 47 language tag format",
-    )
-    classes: Optional[list[str]] = Field(
-        None,
-        min_length=1,
-        examples=["b.first", "v.loud", "c.yellow"],
-        description="Classes for describing the cue significance",
-    )
+    identifier: Annotated[
+        WebVTTCueIdentifier | None, Field(description="An identifier of the cue", examples=["test", "123", "b72d946"])
+    ] = None
+    tags: Annotated[
+        list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None,
+        Field(
+            description="A list of tags that apply to a cue, including the voice tag (the speaker in a track).",
+            examples=[
+                [WebVTTCueSpanStartTagAnnotated(name="v", classes=["loud"], annotation="John")],
+                [WebVTTCueSpanStartTag(name="i", classes=["foreignphrase"])],
+            ],
+        ),
+    ] = None
 
     @model_validator(mode="after")
     def check_order(self) -> Self:
@@ -1436,7 +1429,7 @@ class PictureMeta(FloatingMeta):
     tabular_chart: Optional[TabularChartMetaField] = None
 
 
-class NodeItem(BaseModel):
+class NodeItem(BaseModel, validate_assignment=True):
     """NodeItem."""
 
     self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
diff --git a/docling_core/types/doc/webvtt.py b/docling_core/types/doc/webvtt.py
index f7c4eea6..32bfc12d 100644
--- a/docling_core/types/doc/webvtt.py
+++ b/docling_core/types/doc/webvtt.py
@@ -233,19 +233,19 @@ class WebVTTCueSpanStartTag(BaseModel):
 
     name: Annotated[START_TAG_NAMES, Field(description="The tag name")]
     classes: Annotated[
-        list[str],
+        list[str] | None,
         Field(description="List of classes representing the cue span's significance"),
-    ] = []
+    ] = None
 
     @field_validator("classes", mode="after")
     @classmethod
-    def validate_classes(cls, value: list[str]) -> list[str]:
+    def validate_classes(cls, value: list[str] | None) -> list[str] | None:
         """Validate cue span start tag classes."""
-        for item in value:
+        for item in value or []:
             if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
                 raise ValueError("A cue span start tag class contains invalid characters")
             if not item:
-                raise ValueError("Cue span start tag classes cannot be empty")
+                raise ValueError("A cue span start tag class cannot be empty")
         return value
 
     def _get_name_with_classes(self) -> str:
@@ -501,7 +501,7 @@ def parse(cls, raw: str) -> Self:
                             raise ValueError(f"Incorrect end tag: {ct}")
                         class_string = closed["class"]
                         annotation = closed["annotation"]
-                        classes: list[str] = []
+                        classes: list[str] | None = None
                         if class_string:
                             classes = [c for c in class_string.split(".") if c]
                         st: WebVTTCueSpanStartTag
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index b37260eb..6b617f28 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -3224,6 +3224,7 @@
         "identifier": {
           "anyOf": [
             {
+              "pattern": "^(?!.*-->)[^\\n\\r]+$",
               "type": "string"
             },
             {
@@ -3239,25 +3240,73 @@
           ],
           "title": "Identifier"
         },
-        "voice": {
+        "tags": {
           "anyOf": [
             {
-              "type": "string"
+              "items": {
+                "anyOf": [
+                  {
+                    "$ref": "#/$defs/WebVTTCueSpanStartTag"
+                  },
+                  {
+                    "$ref": "#/$defs/WebVTTCueSpanStartTagAnnotated"
+                  }
+                ]
+              },
+              "type": "array"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "The cue voice (speaker)",
+          "description": "A list of tags that apply to a cue, including the voice tag (the speaker in a track).",
           "examples": [
-            "Mary",
-            "Fred",
-            "Name Surname"
+            [
+              {
+                "annotation": "John",
+                "classes": [
+                  "loud"
+                ],
+                "name": "v"
+              }
+            ],
+            [
+              {
+                "classes": [
+                  "foreignphrase"
+                ],
+                "name": "i"
+              }
+            ]
+          ],
+          "title": "Tags"
+        }
+      },
+      "required": [
+        "start_time",
+        "end_time"
+      ],
+      "title": "TrackProvenance",
+      "type": "object"
+    },
+    "WebVTTCueSpanStartTag": {
+      "description": "WebVTT cue span start tag.",
+      "properties": {
+        "name": {
+          "description": "The tag name",
+          "enum": [
+            "c",
+            "b",
+            "i",
+            "u",
+            "v",
+            "lang"
           ],
-          "title": "Voice"
+          "title": "Name",
+          "type": "string"
         },
-        "languages": {
+        "classes": {
           "anyOf": [
             {
               "items": {
@@ -3270,17 +3319,31 @@
             }
           ],
           "default": null,
-          "description": "Languages of the cue in BCP 47 language tag format",
-          "examples": [
-            [
-              "en",
-              "en-GB"
-            ],
-            [
-              "fr-CA"
-            ]
+          "description": "List of classes representing the cue span's significance",
+          "title": "Classes"
+        }
+      },
+      "required": [
+        "name"
+      ],
+      "title": "WebVTTCueSpanStartTag",
+      "type": "object"
+    },
+    "WebVTTCueSpanStartTagAnnotated": {
+      "description": "WebVTT cue span start tag requiring an annotation.",
+      "properties": {
+        "name": {
+          "description": "The tag name",
+          "enum": [
+            "c",
+            "b",
+            "i",
+            "u",
+            "v",
+            "lang"
           ],
-          "title": "Languages"
+          "title": "Name",
+          "type": "string"
         },
         "classes": {
           "anyOf": [
@@ -3288,7 +3351,6 @@
               "items": {
                 "type": "string"
               },
-              "minItems": 1,
               "type": "array"
             },
             {
@@ -3296,20 +3358,20 @@
             }
           ],
           "default": null,
-          "description": "Classes for describing the cue significance",
-          "examples": [
-            "b.first",
-            "v.loud",
-            "c.yellow"
-          ],
+          "description": "List of classes representing the cue span's significance",
           "title": "Classes"
+        },
+        "annotation": {
+          "description": "Cue span start tag annotation",
+          "title": "Annotation",
+          "type": "string"
         }
       },
       "required": [
-        "start_time",
-        "end_time"
+        "name",
+        "annotation"
       ],
-      "title": "TrackProvenance",
+      "title": "WebVTTCueSpanStartTagAnnotated",
       "type": "object"
     }
   },
diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json
index 78ce13b6..85d119be 100644
--- a/test/data/doc/webvtt_example_01.json
+++ b/test/data/doc/webvtt_example_01.json
@@ -76,7 +76,12 @@
           "kind": "track",
           "start_time": 11.0,
           "end_time": 13.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "We are in New York City",
@@ -95,7 +100,12 @@
           "kind": "track",
           "start_time": 13.0,
           "end_time": 16.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "We’re actually at the Lucern Hotel, just down the street",
@@ -114,7 +124,12 @@
           "kind": "track",
           "start_time": 16.0,
           "end_time": 18.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "from the American Museum of Natural History",
@@ -133,7 +148,12 @@
           "kind": "track",
           "start_time": 18.0,
           "end_time": 20.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "And with me is Neil deGrasse Tyson",
@@ -152,7 +172,12 @@
           "kind": "track",
           "start_time": 20.0,
           "end_time": 22.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "Astrophysicist, Director of the Hayden Planetarium",
@@ -171,7 +196,12 @@
           "kind": "track",
           "start_time": 22.0,
           "end_time": 24.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "at the AMNH.",
@@ -190,7 +220,12 @@
           "kind": "track",
           "start_time": 24.0,
           "end_time": 26.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "Thank you for walking down here.",
@@ -209,7 +244,12 @@
           "kind": "track",
           "start_time": 27.0,
           "end_time": 30.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "And I want to do a follow-up on the last conversation we did.",
@@ -228,7 +268,12 @@
           "kind": "track",
           "start_time": 30.0,
           "end_time": 31.5,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "When we e-mailed—",
@@ -247,7 +292,12 @@
           "kind": "track",
           "start_time": 30.5,
           "end_time": 32.5,
-          "voice": "Neil deGrasse Tyson"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Neil deGrasse Tyson"
+            }
+          ]
         }
       ],
       "orig": "Didn’t we talk about enough in that conversation?",
@@ -266,7 +316,12 @@
           "kind": "track",
           "start_time": 32.0,
           "end_time": 35.5,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "No! No no no no; 'cos 'cos obviously 'cos",
@@ -285,7 +340,12 @@
           "kind": "track",
           "start_time": 32.5,
           "end_time": 33.5,
-          "voice": "Neil deGrasse Tyson"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Neil deGrasse Tyson"
+            }
+          ]
         }
       ],
       "orig": "Laughs",
@@ -311,7 +371,12 @@
           "kind": "track",
           "start_time": 35.5,
           "end_time": 38.0,
-          "voice": "Roger Bingham"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Roger Bingham"
+            }
+          ]
         }
       ],
       "orig": "You know I’m so excited my glasses are falling off here.",
diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json
index 35c53692..55fd15ea 100644
--- a/test/data/doc/webvtt_example_02.json
+++ b/test/data/doc/webvtt_example_02.json
@@ -93,9 +93,15 @@
           "kind": "track",
           "start_time": 0.0,
           "end_time": 2.0,
-          "voice": "Esme",
-          "classes": [
-            "v.first.loud"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Esme",
+              "classes": [
+                "first",
+                "loud"
+              ]
+            }
           ]
         }
       ],
@@ -115,7 +121,12 @@
           "kind": "track",
           "start_time": 2.0,
           "end_time": 4.0,
-          "voice": "Mary"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Mary"
+            }
+          ]
         }
       ],
       "orig": "No way!",
@@ -134,7 +145,12 @@
           "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0,
-          "voice": "Esme"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Esme"
+            }
+          ]
         }
       ],
       "orig": "Hee!",
@@ -196,9 +212,14 @@
           "kind": "track",
           "start_time": 6.0,
           "end_time": 8.0,
-          "voice": "Mary",
-          "classes": [
-            "v.loud"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Mary",
+              "classes": [
+                "loud"
+              ]
+            }
           ]
         }
       ],
@@ -236,11 +257,17 @@
           "kind": "track",
           "start_time": 8.0,
           "end_time": 10.0,
-          "languages": [
-            "en"
-          ],
-          "classes": [
-            "i.foreignphrase"
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "en"
+            },
+            {
+              "name": "i",
+              "classes": [
+                "foreignphrase"
+              ]
+            }
           ]
         }
       ],
diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json
index 42d9e5b2..7b6faa6c 100644
--- a/test/data/doc/webvtt_example_03.json
+++ b/test/data/doc/webvtt_example_03.json
@@ -89,7 +89,12 @@
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "OK,",
@@ -109,7 +114,12 @@
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "I think now we should be recording",
@@ -129,7 +139,12 @@
           "start_time": 8.571,
           "end_time": 9.403,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "properly.",
@@ -168,7 +183,12 @@
           "start_time": 13.363,
           "end_time": 13.803,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "Yeah.",
@@ -188,7 +208,12 @@
           "start_time": 49.603,
           "end_time": 53.363,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "I was also thinking.",
@@ -208,7 +233,12 @@
           "start_time": 54.963,
           "end_time": 62.072,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "Would be maybe good to create items,",
@@ -228,7 +258,12 @@
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "some metadata,",
@@ -248,7 +283,12 @@
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "some options that can be specific.",
@@ -268,7 +308,12 @@
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "Yeah,",
@@ -288,7 +333,12 @@
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "I mean I think you went even more than",
@@ -308,7 +358,12 @@
           "start_time": 70.563,
           "end_time": 72.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "But we preserved the atoms.",
@@ -328,7 +383,12 @@
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "than me.",
@@ -348,7 +408,12 @@
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "I just opened the format.",
@@ -368,7 +433,12 @@
           "start_time": 110.222,
           "end_time": 111.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "give it a try, yeah.",
@@ -388,7 +458,12 @@
           "start_time": 112.043,
           "end_time": 115.043,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
-          "voice": "Speaker B"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker B"
+            }
+          ]
         }
       ],
       "orig": "Okay, talk to you later.",
@@ -408,7 +483,12 @@
           "start_time": 114.603,
           "end_time": 115.283,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
-          "voice": "Speaker A"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "Speaker A"
+            }
+          ]
         }
       ],
       "orig": "See you.",
diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json
index 7e12385d..98e7da21 100644
--- a/test/data/doc/webvtt_example_04.json
+++ b/test/data/doc/webvtt_example_04.json
@@ -139,8 +139,13 @@
           "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
-          "classes": [
-            "b.loud"
+          "tags": [
+            {
+              "name": "b",
+              "classes": [
+                "loud"
+              ]
+            }
           ]
         }
       ],
@@ -185,7 +190,12 @@
           "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
-          "voice": "John"
+          "tags": [
+            {
+              "name": "v",
+              "annotation": "John"
+            }
+          ]
         }
       ],
       "orig": "This is true.",
diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json
index 9a53b3b0..4af18174 100644
--- a/test/data/doc/webvtt_example_05.json
+++ b/test/data/doc/webvtt_example_05.json
@@ -164,8 +164,11 @@
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
-          "languages": [
-            "es-ES"
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "es-ES"
+            }
           ]
         }
       ],
@@ -231,8 +234,13 @@
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
-          "classes": [
-            "b.loud"
+          "tags": [
+            {
+              "name": "b",
+              "classes": [
+                "loud"
+              ]
+            }
           ]
         }
       ],
@@ -286,8 +294,11 @@
           "start_time": 14760.0,
           "end_time": 14818.239,
           "identifier": "agcvs-08234",
-          "languages": [
-            "it"
+          "tags": [
+            {
+              "name": "lang",
+              "annotation": "it"
+            }
           ]
         }
       ],
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 45a9445c..5d569716 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,7 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.doc import TrackProvenance
+from docling_core.types.doc import DocItemLabel, DoclingDocument, TrackProvenance
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -40,34 +40,40 @@ def test_prov():
         Prov(**prov)
 
 
-def test_prov_track():
+def test_track_provenance():
     """Test the class TrackProvenance."""
 
     valid_track = TrackProvenance(
         start_time=11.0,
         end_time=12.0,
         identifier="test",
-        voice="Mary",
-        languages=["en", "en-GB"],
-        classes=["v.first.loud", "i.foreignphrase"],
+        tags = [
+            {"name": "v", "annotation": "Mary", "classes": ["first", "loud"]},
+            {"name": "lang", "annotation": "en"},
+            {"name": "lang", "annotation": "en-GB"},
+            {"name": "i", "classes": ["foreignphrase"]},
+        ]
     )
 
     assert valid_track
     assert valid_track.start_time == 11.0
     assert valid_track.end_time == 12.0
     assert valid_track.identifier == "test"
-    assert valid_track.voice == "Mary"
-    assert valid_track.languages == ["en", "en-GB"]
-    assert valid_track.classes == ["v.first.loud", "i.foreignphrase"]
+    assert valid_track.tags
+    assert valid_track.tags[0].annotation == "Mary"
+    assert valid_track.tags[0].classes == ["first", "loud"]
+    assert valid_track.tags[1].annotation == "en"
+    assert valid_track.tags[2].annotation == "en-GB"
+    assert valid_track.tags[3].classes == ["foreignphrase"]
 
     with pytest.raises(ValidationError, match="end_time"):
         TrackProvenance(start_time=11.0)
 
-    with pytest.raises(ValidationError, match="should be a valid list"):
+    with pytest.raises(ValidationError, match="should be a valid dictionary"):
         TrackProvenance(
             start_time=11.0,
             end_time=12.0,
-            languages="en",
+            tags=["en"],
         )
 
     with pytest.raises(ValidationError, match="must be greater than start"):
@@ -75,3 +81,9 @@ def test_prov_track():
             start_time=11.0,
             end_time=11.0,
         )
+
+    doc = DoclingDocument(name="Unknown")
+    item = doc.add_text(text="Hello world", label=DocItemLabel.TEXT)
+    item.source = [valid_track]
+    with pytest.raises(ValidationError, match="should be a valid list"):
+        item.source = "Invalid source"

From 563c041821d364b9adec8891ac124920d284a0d5 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 26 Jan 2026 12:52:23 +0100
Subject: [PATCH 21/22] refactor(webvtt): align class and field names to new
 'source' type

Classes and fields that are related to the new source type should align with their names.
The term 'provenance' will identify the legacy implementation.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/webvtt.py | 10 ++--
 docling_core/types/doc/__init__.py           |  4 +-
 docling_core/types/doc/document.py           | 35 +++++++-------
 docs/DoclingDocument.json                    | 48 ++++++++++----------
 test/test_doc_base.py                        | 12 ++---
 5 files changed, 53 insertions(+), 56 deletions(-)

diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
index eba06b36..5e5acad0 100644
--- a/docling_core/transforms/serializer/webvtt.py
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -41,7 +41,7 @@
     TableItem,
     TextItem,
     TitleItem,
-    TrackProvenance,
+    TrackSource,
 )
 from docling_core.types.doc.webvtt import (
     START_TAG_NAMES,
@@ -140,7 +140,7 @@ def serialize(
         if isinstance(item, TitleItem):
             return create_ser_result(text=item.text, span_source=item)
 
-        # Only process items with TrackProvenance (WebVTT cues)
+        # Only process items with TrackSource (WebVTT cues)
         if not item.text or not item.source or item.source[0].kind != "track":
             return create_ser_result()
 
@@ -148,7 +148,7 @@ def serialize(
         # If the TextItem is part of an InlineGroup, we need to further post-process it
         # within the group context
 
-        prov: TrackProvenance = item.source[0]
+        prov: TrackSource = item.source[0]
         text: str = doc_serializer.post_process(
             text=item.text,
             formatting=item.formatting,
@@ -415,7 +415,7 @@ def _extract_classes(classes: list[str]) -> dict[str, list[str]]:
         """Extract tag and values from provenance classes.
 
         Args:
-            classes: The classes from a TrackProvenance object.
+            classes: The classes from a TrackSource object.
 
         Returns:
             Map of tag to class values.
@@ -462,7 +462,7 @@ def serialize_doc(
             if isinstance(doc_item, InlineGroup) and doc_item.children:
                 doc_item = doc_item.children[0].resolve(doc=self.doc)
             if isinstance(doc_item, TextItem) and doc_item.source and doc_item.source[0].kind == "track":
-                prov: TrackProvenance = doc_item.source[0]
+                prov: TrackSource = doc_item.source[0]
                 if (
                     prov.identifier == id
                     and timings
diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py
index c3a2b237..e25f04db 100644
--- a/docling_core/types/doc/__init__.py
+++ b/docling_core/types/doc/__init__.py
@@ -57,11 +57,11 @@
     PictureStackedBarChartData,
     PictureTabularChartData,
     ProvenanceItem,
-    ProvenanceType,
     RefItem,
     RichTableCell,
     Script,
     SectionHeaderItem,
+    SourceType,
     SummaryMetaField,
     TableAnnotationType,
     TableCell,
@@ -70,7 +70,7 @@
     TabularChartMetaField,
     TextItem,
     TitleItem,
-    TrackProvenance,
+    TrackSource,
     UnorderedList,
 )
 from .labels import (
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 117066cc..b0fbee2b 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1199,27 +1199,25 @@ class ProvenanceItem(BaseModel):
     charspan: Annotated[tuple[int, int], Field(description="Character span (0-indexed)")]
 
 
-class BaseProvenance(BaseModel):
-    """Base class for provenance information.
+class BaseSource(BaseModel):
+    """Base class for source information.
 
-    Represents the provenance of an extracted component within a digital asset.
+    Represents the source of an extracted component within a digital asset.
     """
 
-    kind: Annotated[
-        str, Field(description="Kind of provenance. It is used as a discriminator for the provenance type.")
-    ]
+    kind: Annotated[str, Field(description="Kind of source. It is used as a discriminator for the source type.")]
 
 
-class TrackProvenance(BaseProvenance):
-    """Provenance metadata for a cue extracted from a media track.
+class TrackSource(BaseSource):
+    """Source metadata for a cue extracted from a media track.
 
-    A `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,
+    A `TrackSource` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,
     etc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle
     block, an audio clip, or a timed marker in a screen-recording.
     """
 
     model_config = ConfigDict(regex_engine="python-re")
-    kind: Annotated[Literal["track"], Field(description="Identifiers this type of provenance.")] = "track"
+    kind: Annotated[Literal["track"], Field(description="Identifies this type of source.")] = "track"
     start_time: Annotated[
         float,
         Field(
@@ -1256,19 +1254,18 @@ def check_order(self) -> Self:
         return self
 
 
-ProvenanceType = Annotated[Union[TrackProvenance], Field(discriminator="kind")]
-"""Union type for all provenance types.
+SourceType = Annotated[Union[TrackSource], Field(discriminator="kind")]
+"""Union type for all source types.
 
-This type alias represents a discriminated union of all available provenance types that can be associated with
+This type alias represents a discriminated union of all available source types that can be associated with
 extracted elements in a document. The `kind` field is used as a discriminator to determine the specific
-provenance type at runtime.
+source type at runtime.
 
-Currently supported provenance types:
-    - `TrackProvenance`: For elements extracted from media assets (audio, video, subtitles)
+Currently supported source types:
+    - `TrackSource`: For elements extracted from media assets (audio, video, subtitles)
 
 Notes:
-    - Additional provenance types may be added to this union in the future to support
-        other content sources.
+    - Additional source types may be added to this union in the future to support other content sources.
     - For documents with an implicit or explicity layout, such as PDF, HTML, docx, pptx, or markdown files, the
         `ProvenanceItem` should still be used.
 """
@@ -1582,7 +1579,7 @@ class DocItem(NodeItem):
     label: DocItemLabel
     prov: list[ProvenanceItem] = []
     source: Annotated[
-        list[ProvenanceType],
+        list[SourceType],
         Field(
             description="The provenance of this document item. Currently, it is only used for media track provenance."
         ),
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index 6b617f28..e07e0a08 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -244,13 +244,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -681,13 +681,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -842,13 +842,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -1266,13 +1266,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -1457,13 +1457,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -1852,13 +1852,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -2455,13 +2455,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -2769,13 +2769,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -2993,13 +2993,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -3124,13 +3124,13 @@
           "items": {
             "discriminator": {
               "mapping": {
-                "track": "#/$defs/TrackProvenance"
+                "track": "#/$defs/TrackSource"
               },
               "propertyName": "kind"
             },
             "oneOf": [
               {
-                "$ref": "#/$defs/TrackProvenance"
+                "$ref": "#/$defs/TrackSource"
               }
             ]
           },
@@ -3191,13 +3191,13 @@
       "title": "TitleItem",
       "type": "object"
     },
-    "TrackProvenance": {
-      "description": "Provenance metadata for a cue extracted from a media track.\n\nA `TrackProvenance` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,\netc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle\nblock, an audio clip, or a timed marker in a screen-recording.",
+    "TrackSource": {
+      "description": "Source metadata for a cue extracted from a media track.\n\nA `TrackSource` instance identifies a cue in a media track (audio, video, subtitles, screen-recording captions,\netc.). A *cue* here refers to any discrete segment that was pulled out of the original asset, e.g., a subtitle\nblock, an audio clip, or a timed marker in a screen-recording.",
       "properties": {
         "kind": {
           "const": "track",
           "default": "track",
-          "description": "Identifiers this type of provenance.",
+          "description": "Identifies this type of source.",
           "title": "Kind",
           "type": "string"
         },
@@ -3287,7 +3287,7 @@
         "start_time",
         "end_time"
       ],
-      "title": "TrackProvenance",
+      "title": "TrackSource",
       "type": "object"
     },
     "WebVTTCueSpanStartTag": {
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 5d569716..20ad02d3 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -1,7 +1,7 @@
 import pytest
 from pydantic import ValidationError
 
-from docling_core.types.doc import DocItemLabel, DoclingDocument, TrackProvenance
+from docling_core.types.doc import DocItemLabel, DoclingDocument, TrackSource
 from docling_core.types.legacy_doc.base import Prov, S3Reference
 
 
@@ -41,9 +41,9 @@ def test_prov():
 
 
 def test_track_provenance():
-    """Test the class TrackProvenance."""
+    """Test the class TrackSource."""
 
-    valid_track = TrackProvenance(
+    valid_track = TrackSource(
         start_time=11.0,
         end_time=12.0,
         identifier="test",
@@ -67,17 +67,17 @@ def test_track_provenance():
     assert valid_track.tags[3].classes == ["foreignphrase"]
 
     with pytest.raises(ValidationError, match="end_time"):
-        TrackProvenance(start_time=11.0)
+        TrackSource(start_time=11.0)
 
     with pytest.raises(ValidationError, match="should be a valid dictionary"):
-        TrackProvenance(
+        TrackSource(
             start_time=11.0,
             end_time=12.0,
             tags=["en"],
         )
 
     with pytest.raises(ValidationError, match="must be greater than start"):
-        TrackProvenance(
+        TrackSource(
             start_time=11.0,
             end_time=11.0,
         )

From 621e55e88e434b6665b85dfcf863b2ff5cebae14 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Date: Mon, 26 Jan 2026 13:09:03 +0100
Subject: [PATCH 22/22] chore(DoclingDocument): drop the validation on field
 assignment

Drop the validation on field assignment in NodeItem objects.
Add the 'source' argument to the convenience function 'add_text' to create a TextItem with track source data.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

refactor(webvtt): drop cue span classes, 'lang' and 'c' tags

Drop WebVTT formatting features not covered by Docling across formats.
Only 'u', 'b', 'i', and 'v' are supported and without classes.
Make the 'v' tag explicit as the 'voice' feature in the TrackSource class.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
---
 docling_core/transforms/serializer/webvtt.py | 106 +++++-----------
 docling_core/types/doc/document.py           |  20 ++-
 docs/DoclingDocument.json                    | 123 ++-----------------
 test/data/doc/webvtt_example_01.json         |  91 ++------------
 test/data/doc/webvtt_example_02.gt.vtt       |   6 +-
 test/data/doc/webvtt_example_02.json         |  49 +-------
 test/data/doc/webvtt_example_03.json         | 112 +++--------------
 test/data/doc/webvtt_example_04.gt.vtt       |   2 +-
 test/data/doc/webvtt_example_04.json         |  17 +--
 test/data/doc/webvtt_example_05.gt.vtt       |   4 +-
 test/data/doc/webvtt_example_05.json         |  28 +----
 test/test_doc_base.py                        |  28 ++---
 12 files changed, 102 insertions(+), 484 deletions(-)

diff --git a/docling_core/transforms/serializer/webvtt.py b/docling_core/transforms/serializer/webvtt.py
index 5e5acad0..044ea457 100644
--- a/docling_core/transforms/serializer/webvtt.py
+++ b/docling_core/transforms/serializer/webvtt.py
@@ -2,9 +2,10 @@
 
 import logging
 import re
+from pathlib import Path
 from typing import Any, get_args
 
-from pydantic import BaseModel
+from pydantic import AnyUrl, BaseModel
 from typing_extensions import override
 
 from docling_core.transforms.serializer.base import (
@@ -144,15 +145,13 @@ def serialize(
         if not item.text or not item.source or item.source[0].kind != "track":
             return create_ser_result()
 
-        # Apply post-processing here: formatting, classes, language, and voice
-        # If the TextItem is part of an InlineGroup, we need to further post-process it
-        # within the group context
-
-        prov: TrackSource = item.source[0]
+        # Apply post-processing here: formatting and voice.
+        # If the TextItem is part of an InlineGroup, we need to further post-process it within the group context.
+        source: TrackSource = item.source[0]
         text: str = doc_serializer.post_process(
             text=item.text,
             formatting=item.formatting,
-            tags=prov.tags,
+            voice=source.voice,
         )
         if is_inline_scope:
             # Iteratively remove unnecessary consecutive tag pairs until no more changes
@@ -355,55 +354,42 @@ def requires_page_break(self) -> bool:
         return False
 
     @override
-    def serialize_bold(self, text: str, **kwargs: Any) -> str:
+    def serialize_bold(self, text: str, **kwargs) -> str:
         """Apply WebVTT-specific bold serialization."""
-        classes: list[str] = kwargs.get("classes", {}).get("b", [])
 
-        return self.serialize_cue_span(
-            text,
-            tag="b",
-            css=classes,
-        )
+        return self.serialize_cue_span(text=text, tag="b")
 
     @override
-    def serialize_italic(self, text: str, **kwargs: Any) -> str:
+    def serialize_italic(self, text: str, **kwargs) -> str:
         """Apply WebVTT-specific italic serialization."""
-        classes: list[str] = kwargs.get("classes", {}).get("i", [])
 
-        return self.serialize_cue_span(
-            text,
-            tag="i",
-            css=classes,
-        )
+        return self.serialize_cue_span(text=text, tag="i")
 
     @override
-    def serialize_underline(self, text: str, **kwargs: Any) -> str:
+    def serialize_underline(self, text: str, **kwargs) -> str:
         """Apply WebVTT-specific underline serialization."""
-        classes: list[str] = kwargs.get("classes", {}).get("u", [])
 
-        return self.serialize_cue_span(
-            text,
-            tag="u",
-            css=classes,
-        )
+        return self.serialize_cue_span(text=text, tag="u")
 
     def serialize_cue_span(
         self,
         text: str,
         tag: START_TAG_NAMES,
         anno: str | None = None,
-        css: list[str] | None = None,
     ) -> str:
-        """Apply serialization to a WebVTT cue span."""
+        """Apply serialization to a WebVTT cue span.
+
+        Currently, only b, i, u, and v tags are supported.
+        """
         start_tag: WebVTTCueSpanStartTag
-        if tag in {"b", "i", "u", "c"}:
-            start_tag = WebVTTCueSpanStartTag(name=tag, classes=css)
-        elif tag in {"v", "lang"}:
+        if tag in {"b", "i", "u"}:
+            start_tag = WebVTTCueSpanStartTag(name=tag)
+        elif tag in {"v"}:
             if not anno:
                 _logger.warning(f"Invalid {tag} cue span without annotation: {text}")
                 return text
             else:
-                start_tag = WebVTTCueSpanStartTagAnnotated(name=tag, classes=css, annotation=anno)
+                start_tag = WebVTTCueSpanStartTagAnnotated(name=tag, annotation=anno)
         else:
             return text
 
@@ -501,56 +487,26 @@ def serialize_doc(
     def post_process(
         self,
         text: str,
+        *,
         formatting: Formatting | None = None,
-        tags: list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None = None,
+        hyperlink: AnyUrl | Path | None = None,
         **kwargs: Any,
     ) -> str:
         """Apply some text post-processing steps by adding formatting tags.
 
         The order of the formatting tags is determined by this function and `DocSerializer.post_process`,
         from the innermost to the outermost:
-            1. language (<lang>)
-            2. underline (<u>)
-            3. italic (<i>)
-            4. bold (<b>)
-            5. class (<c>)
-            6. voice (<v>)
+            1. underline (<u>)
+            2. italic (<i>)
+            3. bold (<b>)
+            4. voice (<v>)
         """
         res: str = text
-        # cls: dict[str, list[str]] = self._extract_classes(classes) if classes else {}
-
-        languages: list[WebVTTCueSpanStartTagAnnotated] = [
-            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "lang"
-        ]
-        for lang in languages:
-            res = self.serialize_cue_span(text=res, tag="lang", anno=lang.annotation, css=lang.classes)
-
-        format_classes = {
-            item.name: item.classes
-            for item in tags or []
-            if isinstance(item, WebVTTCueSpanStartTag) and item.name in {"u", "i", "b"}
-        }
-        res = super().post_process(text=res, formatting=formatting, classes=format_classes)
-
-        class_tag: list[WebVTTCueSpanStartTag] = [
-            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTag) and item.name == "c"
-        ]
-        if class_tag:
-            res = self.serialize_cue_span(
-                text=res,
-                tag="c",
-                css=class_tag[0].classes,
-            )
-
-        voice: list[WebVTTCueSpanStartTagAnnotated] = [
-            item for item in tags or [] if isinstance(item, WebVTTCueSpanStartTagAnnotated) and item.name == "v"
-        ]
+
+        res = super().post_process(text=res, formatting=formatting)
+
+        voice: str | None = kwargs.get("voice", None)
         if voice:
-            res = self.serialize_cue_span(
-                text=res,
-                tag="v",
-                anno=voice[0].annotation,
-                css=voice[0].classes,
-            )
+            res = self.serialize_cue_span(text=res, tag="v", anno=voice)
 
         return res
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index b0fbee2b..4a8eddbb 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1233,17 +1233,11 @@ class TrackSource(BaseSource):
         ),
     ]
     identifier: Annotated[
-        WebVTTCueIdentifier | None, Field(description="An identifier of the cue", examples=["test", "123", "b72d946"])
+        str | None, Field(description="An identifier of the cue", examples=["test", "123", "b72d946"])
     ] = None
-    tags: Annotated[
-        list[WebVTTCueSpanStartTag | WebVTTCueSpanStartTagAnnotated] | None,
-        Field(
-            description="A list of tags that apply to a cue, including the voice tag (the speaker in a track).",
-            examples=[
-                [WebVTTCueSpanStartTagAnnotated(name="v", classes=["loud"], annotation="John")],
-                [WebVTTCueSpanStartTag(name="i", classes=["foreignphrase"])],
-            ],
-        ),
+    voice: Annotated[
+        str | None,
+        Field(description="The name of the voice in this track (the speaker)", examples=["John", "Mary", "Speaker 1"]),
     ] = None
 
     @model_validator(mode="after")
@@ -1426,7 +1420,7 @@ class PictureMeta(FloatingMeta):
     tabular_chart: Optional[TabularChartMetaField] = None
 
 
-class NodeItem(BaseModel, validate_assignment=True):
+class NodeItem(BaseModel):
     """NodeItem."""
 
     self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
@@ -3155,6 +3149,8 @@ def add_text(
         content_layer: Optional[ContentLayer] = None,
         formatting: Optional[Formatting] = None,
         hyperlink: Optional[Union[AnyUrl, Path]] = None,
+        *,
+        source: Optional[SourceType] = None,
     ):
         """add_text.
 
@@ -3242,6 +3238,8 @@ def add_text(
             )
             if prov:
                 text_item.prov.append(prov)
+            if source:
+                text_item.source.append(source)
 
             if content_layer:
                 text_item.content_layer = content_layer
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index e07e0a08..d91ff5c5 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -3224,7 +3224,6 @@
         "identifier": {
           "anyOf": [
             {
-              "pattern": "^(?!.*-->)[^\\n\\r]+$",
               "type": "string"
             },
             {
@@ -3240,47 +3239,23 @@
           ],
           "title": "Identifier"
         },
-        "tags": {
+        "voice": {
           "anyOf": [
             {
-              "items": {
-                "anyOf": [
-                  {
-                    "$ref": "#/$defs/WebVTTCueSpanStartTag"
-                  },
-                  {
-                    "$ref": "#/$defs/WebVTTCueSpanStartTagAnnotated"
-                  }
-                ]
-              },
-              "type": "array"
+              "type": "string"
             },
             {
               "type": "null"
             }
           ],
           "default": null,
-          "description": "A list of tags that apply to a cue, including the voice tag (the speaker in a track).",
+          "description": "The name of the voice in this track (the speaker)",
           "examples": [
-            [
-              {
-                "annotation": "John",
-                "classes": [
-                  "loud"
-                ],
-                "name": "v"
-              }
-            ],
-            [
-              {
-                "classes": [
-                  "foreignphrase"
-                ],
-                "name": "i"
-              }
-            ]
+            "John",
+            "Mary",
+            "Speaker 1"
           ],
-          "title": "Tags"
+          "title": "Voice"
         }
       },
       "required": [
@@ -3289,90 +3264,6 @@
       ],
       "title": "TrackSource",
       "type": "object"
-    },
-    "WebVTTCueSpanStartTag": {
-      "description": "WebVTT cue span start tag.",
-      "properties": {
-        "name": {
-          "description": "The tag name",
-          "enum": [
-            "c",
-            "b",
-            "i",
-            "u",
-            "v",
-            "lang"
-          ],
-          "title": "Name",
-          "type": "string"
-        },
-        "classes": {
-          "anyOf": [
-            {
-              "items": {
-                "type": "string"
-              },
-              "type": "array"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "List of classes representing the cue span's significance",
-          "title": "Classes"
-        }
-      },
-      "required": [
-        "name"
-      ],
-      "title": "WebVTTCueSpanStartTag",
-      "type": "object"
-    },
-    "WebVTTCueSpanStartTagAnnotated": {
-      "description": "WebVTT cue span start tag requiring an annotation.",
-      "properties": {
-        "name": {
-          "description": "The tag name",
-          "enum": [
-            "c",
-            "b",
-            "i",
-            "u",
-            "v",
-            "lang"
-          ],
-          "title": "Name",
-          "type": "string"
-        },
-        "classes": {
-          "anyOf": [
-            {
-              "items": {
-                "type": "string"
-              },
-              "type": "array"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "default": null,
-          "description": "List of classes representing the cue span's significance",
-          "title": "Classes"
-        },
-        "annotation": {
-          "description": "Cue span start tag annotation",
-          "title": "Annotation",
-          "type": "string"
-        }
-      },
-      "required": [
-        "name",
-        "annotation"
-      ],
-      "title": "WebVTTCueSpanStartTagAnnotated",
-      "type": "object"
     }
   },
   "description": "DoclingDocument.",
diff --git a/test/data/doc/webvtt_example_01.json b/test/data/doc/webvtt_example_01.json
index 85d119be..78ce13b6 100644
--- a/test/data/doc/webvtt_example_01.json
+++ b/test/data/doc/webvtt_example_01.json
@@ -76,12 +76,7 @@
           "kind": "track",
           "start_time": 11.0,
           "end_time": 13.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "We are in New York City",
@@ -100,12 +95,7 @@
           "kind": "track",
           "start_time": 13.0,
           "end_time": 16.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "We’re actually at the Lucern Hotel, just down the street",
@@ -124,12 +114,7 @@
           "kind": "track",
           "start_time": 16.0,
           "end_time": 18.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "from the American Museum of Natural History",
@@ -148,12 +133,7 @@
           "kind": "track",
           "start_time": 18.0,
           "end_time": 20.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "And with me is Neil deGrasse Tyson",
@@ -172,12 +152,7 @@
           "kind": "track",
           "start_time": 20.0,
           "end_time": 22.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "Astrophysicist, Director of the Hayden Planetarium",
@@ -196,12 +171,7 @@
           "kind": "track",
           "start_time": 22.0,
           "end_time": 24.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "at the AMNH.",
@@ -220,12 +190,7 @@
           "kind": "track",
           "start_time": 24.0,
           "end_time": 26.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "Thank you for walking down here.",
@@ -244,12 +209,7 @@
           "kind": "track",
           "start_time": 27.0,
           "end_time": 30.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "And I want to do a follow-up on the last conversation we did.",
@@ -268,12 +228,7 @@
           "kind": "track",
           "start_time": 30.0,
           "end_time": 31.5,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "When we e-mailed—",
@@ -292,12 +247,7 @@
           "kind": "track",
           "start_time": 30.5,
           "end_time": 32.5,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Neil deGrasse Tyson"
-            }
-          ]
+          "voice": "Neil deGrasse Tyson"
         }
       ],
       "orig": "Didn’t we talk about enough in that conversation?",
@@ -316,12 +266,7 @@
           "kind": "track",
           "start_time": 32.0,
           "end_time": 35.5,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "No! No no no no; 'cos 'cos obviously 'cos",
@@ -340,12 +285,7 @@
           "kind": "track",
           "start_time": 32.5,
           "end_time": 33.5,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Neil deGrasse Tyson"
-            }
-          ]
+          "voice": "Neil deGrasse Tyson"
         }
       ],
       "orig": "Laughs",
@@ -371,12 +311,7 @@
           "kind": "track",
           "start_time": 35.5,
           "end_time": 38.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Roger Bingham"
-            }
-          ]
+          "voice": "Roger Bingham"
         }
       ],
       "orig": "You know I’m so excited my glasses are falling off here.",
diff --git a/test/data/doc/webvtt_example_02.gt.vtt b/test/data/doc/webvtt_example_02.gt.vtt
index 8f9811e7..83907dbe 100644
--- a/test/data/doc/webvtt_example_02.gt.vtt
+++ b/test/data/doc/webvtt_example_02.gt.vtt
@@ -1,7 +1,7 @@
 WEBVTT
 
 00:00:00.000 --> 00:00:02.000
-<v.first.loud Esme>It’s a blue apple tree!</v>
+<v Esme>It’s a blue apple tree!</v>
 
 00:00:02.000 --> 00:00:04.000
 <v Mary>No way!</v>
@@ -10,7 +10,7 @@ WEBVTT
 <v Esme>Hee!</v> <i>laughter</i>
 
 00:00:06.000 --> 00:00:08.000
-<v.loud Mary>That’s awesome!</v>
+<v Mary>That’s awesome!</v>
 
 00:00:08.000 --> 00:00:10.000
-Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier
\ No newline at end of file
+Sur les <i>playground</i>, ici à Montpellier
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_02.json b/test/data/doc/webvtt_example_02.json
index 55fd15ea..c418ac50 100644
--- a/test/data/doc/webvtt_example_02.json
+++ b/test/data/doc/webvtt_example_02.json
@@ -93,16 +93,7 @@
           "kind": "track",
           "start_time": 0.0,
           "end_time": 2.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Esme",
-              "classes": [
-                "first",
-                "loud"
-              ]
-            }
-          ]
+          "voice": "Esme"
         }
       ],
       "orig": "It\u2019s a blue apple tree!",
@@ -121,12 +112,7 @@
           "kind": "track",
           "start_time": 2.0,
           "end_time": 4.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Mary"
-            }
-          ]
+          "voice": "Mary"
         }
       ],
       "orig": "No way!",
@@ -145,12 +131,7 @@
           "kind": "track",
           "start_time": 4.0,
           "end_time": 6.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Esme"
-            }
-          ]
+          "voice": "Esme"
         }
       ],
       "orig": "Hee!",
@@ -212,15 +193,7 @@
           "kind": "track",
           "start_time": 6.0,
           "end_time": 8.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Mary",
-              "classes": [
-                "loud"
-              ]
-            }
-          ]
+          "voice": "Mary"
         }
       ],
       "orig": "That\u2019s awesome!",
@@ -256,19 +229,7 @@
         {
           "kind": "track",
           "start_time": 8.0,
-          "end_time": 10.0,
-          "tags": [
-            {
-              "name": "lang",
-              "annotation": "en"
-            },
-            {
-              "name": "i",
-              "classes": [
-                "foreignphrase"
-              ]
-            }
-          ]
+          "end_time": 10.0
         }
       ],
       "orig": "playground",
diff --git a/test/data/doc/webvtt_example_03.json b/test/data/doc/webvtt_example_03.json
index 7b6faa6c..42d9e5b2 100644
--- a/test/data/doc/webvtt_example_03.json
+++ b/test/data/doc/webvtt_example_03.json
@@ -89,12 +89,7 @@
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "OK,",
@@ -114,12 +109,7 @@
           "start_time": 4.963,
           "end_time": 8.571,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "I think now we should be recording",
@@ -139,12 +129,7 @@
           "start_time": 8.571,
           "end_time": 9.403,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "properly.",
@@ -183,12 +168,7 @@
           "start_time": 13.363,
           "end_time": 13.803,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "Yeah.",
@@ -208,12 +188,7 @@
           "start_time": 49.603,
           "end_time": 53.363,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker B"
-            }
-          ]
+          "voice": "Speaker B"
         }
       ],
       "orig": "I was also thinking.",
@@ -233,12 +208,7 @@
           "start_time": 54.963,
           "end_time": 62.072,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker B"
-            }
-          ]
+          "voice": "Speaker B"
         }
       ],
       "orig": "Would be maybe good to create items,",
@@ -258,12 +228,7 @@
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker B"
-            }
-          ]
+          "voice": "Speaker B"
         }
       ],
       "orig": "some metadata,",
@@ -283,12 +248,7 @@
           "start_time": 62.072,
           "end_time": 66.811,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker B"
-            }
-          ]
+          "voice": "Speaker B"
         }
       ],
       "orig": "some options that can be specific.",
@@ -308,12 +268,7 @@
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "Yeah,",
@@ -333,12 +288,7 @@
           "start_time": 70.243,
           "end_time": 73.014,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "I mean I think you went even more than",
@@ -358,12 +308,7 @@
           "start_time": 70.563,
           "end_time": 72.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker B"
-            }
-          ]
+          "voice": "Speaker B"
         }
       ],
       "orig": "But we preserved the atoms.",
@@ -383,12 +328,7 @@
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "than me.",
@@ -408,12 +348,7 @@
           "start_time": 73.014,
           "end_time": 75.907,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "I just opened the format.",
@@ -433,12 +368,7 @@
           "start_time": 110.222,
           "end_time": 111.643,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "give it a try, yeah.",
@@ -458,12 +388,7 @@
           "start_time": 112.043,
           "end_time": 115.043,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker B"
-            }
-          ]
+          "voice": "Speaker B"
         }
       ],
       "orig": "Okay, talk to you later.",
@@ -483,12 +408,7 @@
           "start_time": 114.603,
           "end_time": 115.283,
           "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "Speaker A"
-            }
-          ]
+          "voice": "Speaker A"
         }
       ],
       "orig": "See you.",
diff --git a/test/data/doc/webvtt_example_04.gt.vtt b/test/data/doc/webvtt_example_04.gt.vtt
index ce7fcf65..221919b6 100644
--- a/test/data/doc/webvtt_example_04.gt.vtt
+++ b/test/data/doc/webvtt_example_04.gt.vtt
@@ -5,5 +5,5 @@ Never drink liquid nitrogen.
 
 00:00:05.000 --> 00:00:09.000
 — It will perforate your stomach.
-— You could <b.loud>die</b>.
+— You could <b>die</b>.
 <v John>This is true.</v>
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_04.json b/test/data/doc/webvtt_example_04.json
index 98e7da21..6fe7d0ea 100644
--- a/test/data/doc/webvtt_example_04.json
+++ b/test/data/doc/webvtt_example_04.json
@@ -138,15 +138,7 @@
         {
           "kind": "track",
           "start_time": 5.0,
-          "end_time": 9.0,
-          "tags": [
-            {
-              "name": "b",
-              "classes": [
-                "loud"
-              ]
-            }
-          ]
+          "end_time": 9.0
         }
       ],
       "orig": "die",
@@ -190,12 +182,7 @@
           "kind": "track",
           "start_time": 5.0,
           "end_time": 9.0,
-          "tags": [
-            {
-              "name": "v",
-              "annotation": "John"
-            }
-          ]
+          "voice": "John"
         }
       ],
       "orig": "This is true.",
diff --git a/test/data/doc/webvtt_example_05.gt.vtt b/test/data/doc/webvtt_example_05.gt.vtt
index fd7b788c..7c10e008 100644
--- a/test/data/doc/webvtt_example_05.gt.vtt
+++ b/test/data/doc/webvtt_example_05.gt.vtt
@@ -6,5 +6,5 @@ Last night the chef surprised us with a culinary adventure.
 
 agcvs-08234
 04:06:00.000 --> 04:06:58.239
-The waiter offered a <i>steaming bowl of <lang es-ES>paella</lang></i> that instantly transported the diners to a sunny Mediterranean coast.
-The dessert’s <i><b.loud>unexpected</b> <u><lang it>arcobaleno</lang></u> of flavors</i> left everyone in awe.
\ No newline at end of file
+The waiter offered a <i>steaming bowl of <b>paella</b></i> that instantly transported the diners to a sunny Mediterranean coast.
+The dessert’s <i><b>unexpected</b> <u>arcobaleno</u> of flavors</i> left everyone in awe.
\ No newline at end of file
diff --git a/test/data/doc/webvtt_example_05.json b/test/data/doc/webvtt_example_05.json
index 4af18174..76803cf6 100644
--- a/test/data/doc/webvtt_example_05.json
+++ b/test/data/doc/webvtt_example_05.json
@@ -163,19 +163,13 @@
           "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
-          "identifier": "agcvs-08234",
-          "tags": [
-            {
-              "name": "lang",
-              "annotation": "es-ES"
-            }
-          ]
+          "identifier": "agcvs-08234"
         }
       ],
       "orig": "paella",
       "text": "paella",
       "formatting": {
-        "bold": false,
+        "bold": true,
         "italic": true,
         "underline": false,
         "strikethrough": false,
@@ -233,15 +227,7 @@
           "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
-          "identifier": "agcvs-08234",
-          "tags": [
-            {
-              "name": "b",
-              "classes": [
-                "loud"
-              ]
-            }
-          ]
+          "identifier": "agcvs-08234"
         }
       ],
       "orig": "unexpected",
@@ -293,13 +279,7 @@
           "kind": "track",
           "start_time": 14760.0,
           "end_time": 14818.239,
-          "identifier": "agcvs-08234",
-          "tags": [
-            {
-              "name": "lang",
-              "annotation": "it"
-            }
-          ]
+          "identifier": "agcvs-08234"
         }
       ],
       "orig": "arcobaleno",
diff --git a/test/test_doc_base.py b/test/test_doc_base.py
index 20ad02d3..d1045cea 100644
--- a/test/test_doc_base.py
+++ b/test/test_doc_base.py
@@ -40,40 +40,30 @@ def test_prov():
         Prov(**prov)
 
 
-def test_track_provenance():
+def test_track_source():
     """Test the class TrackSource."""
 
     valid_track = TrackSource(
         start_time=11.0,
         end_time=12.0,
         identifier="test",
-        tags = [
-            {"name": "v", "annotation": "Mary", "classes": ["first", "loud"]},
-            {"name": "lang", "annotation": "en"},
-            {"name": "lang", "annotation": "en-GB"},
-            {"name": "i", "classes": ["foreignphrase"]},
-        ]
+        voice="Mary",
     )
 
     assert valid_track
     assert valid_track.start_time == 11.0
     assert valid_track.end_time == 12.0
     assert valid_track.identifier == "test"
-    assert valid_track.tags
-    assert valid_track.tags[0].annotation == "Mary"
-    assert valid_track.tags[0].classes == ["first", "loud"]
-    assert valid_track.tags[1].annotation == "en"
-    assert valid_track.tags[2].annotation == "en-GB"
-    assert valid_track.tags[3].classes == ["foreignphrase"]
+    assert valid_track.voice == "Mary"
 
     with pytest.raises(ValidationError, match="end_time"):
         TrackSource(start_time=11.0)
 
-    with pytest.raises(ValidationError, match="should be a valid dictionary"):
+    with pytest.raises(ValidationError, match="should be a valid string"):
         TrackSource(
             start_time=11.0,
             end_time=12.0,
-            tags=["en"],
+            voice=["Mary"],
         )
 
     with pytest.raises(ValidationError, match="must be greater than start"):
@@ -83,7 +73,7 @@ def test_track_provenance():
         )
 
     doc = DoclingDocument(name="Unknown")
-    item = doc.add_text(text="Hello world", label=DocItemLabel.TEXT)
-    item.source = [valid_track]
-    with pytest.raises(ValidationError, match="should be a valid list"):
-        item.source = "Invalid source"
+    item = doc.add_text(text="Hello world", label=DocItemLabel.TEXT, source=valid_track)
+    assert item.source
+    assert len(item.source) == 1
+    assert item.source[0] == valid_track