diff --git a/docling/backend/webvtt_backend.py b/docling/backend/webvtt_backend.py
index 2a7d02ce74..001fc3eac8 100644
--- a/docling/backend/webvtt_backend.py
+++ b/docling/backend/webvtt_backend.py
@@ -1,8 +1,7 @@
 import logging
-import re
+from dataclasses import dataclass, field
 from io import BytesIO
 from pathlib import Path
-from typing import Annotated, ClassVar, Literal, Optional, Union, cast
 
 from docling_core.types.doc import (
     ContentLayer,
@@ -10,12 +9,19 @@
     DoclingDocument,
     DocumentOrigin,
     Formatting,
-    GroupLabel,
-    NodeItem,
+    TrackSource,
 )
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-from pydantic.types import StringConstraints
-from typing_extensions import Self, override
+from docling_core.types.doc.webvtt import (
+    WebVTTCueBoldSpan,
+    WebVTTCueComponent,
+    WebVTTCueComponentWithTerminator,
+    WebVTTCueItalicSpan,
+    WebVTTCueTextSpan,
+    WebVTTCueUnderlineSpan,
+    WebVTTCueVoiceSpan,
+    WebVTTFile,
+)
+from typing_extensions import override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -24,409 +30,23 @@
 _log = logging.getLogger(__name__)
 
 
-class _WebVTTTimestamp(BaseModel):
-    """Model representing a WebVTT timestamp.
-
-    A WebVTT timestamp is always interpreted relative to the current playback position
-    of the media data that the WebVTT file is to be synchronized with.
-    """
-
-    model_config = ConfigDict(regex_engine="python-re")
-
-    raw: Annotated[
-        str,
-        Field(
-            description="A representation of the WebVTT Timestamp as a single string"
-        ),
-    ]
-
-    _pattern: ClassVar[re.Pattern] = re.compile(
-        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
-    )
-    _hours: int
-    _minutes: int
-    _seconds: int
-    _millis: int
-
-    @model_validator(mode="after")
-    def validate_raw(self) -> Self:
-        m = self._pattern.match(self.raw)
-        if not m:
-            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
-        self._hours = int(m.group(1)) if m.group(1) else 0
-        self._minutes = int(m.group(2))
-        self._seconds = int(m.group(3))
-        self._millis = int(m.group(4))
-
-        if self._minutes < 0 or self._minutes > 59:
-            raise ValueError("Minutes must be between 0 and 59")
-        if self._seconds < 0 or self._seconds > 59:
-            raise ValueError("Seconds must be between 0 and 59")
-
-        return self
-
-    @property
-    def seconds(self) -> float:
-        """A representation of the WebVTT Timestamp in seconds"""
-        return (
-            self._hours * 3600
-            + self._minutes * 60
-            + self._seconds
-            + self._millis / 1000.0
-        )
-
-    @override
-    def __str__(self) -> str:
-        return self.raw
-
-
-_WebVTTCueIdentifier = Annotated[
-    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
-]
-
-
-class _WebVTTCueTimings(BaseModel):
-    """Model representating WebVTT cue timings."""
-
-    start: Annotated[
-        _WebVTTTimestamp, Field(description="Start time offset of the cue")
-    ]
-    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
-
-    @model_validator(mode="after")
-    def check_order(self) -> Self:
-        if self.start and self.end:
-            if self.end.seconds <= self.start.seconds:
-                raise ValueError("End timestamp must be greater than start timestamp")
-        return self
-
-    @override
-    def __str__(self):
-        return f"{self.start} --> {self.end}"
-
-
-class _WebVTTCueTextSpan(BaseModel):
-    """Model representing a WebVTT cue text span."""
-
+@dataclass
+class AnnotatedText:
     text: str
-    span_type: Literal["text"] = "text"
-
-    @field_validator("text", mode="after")
-    @classmethod
-    def validate_text(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", "<"}):
-            raise ValueError("Cue text span contains invalid characters")
-        if len(value) == 0:
-            raise ValueError("Cue text span cannot be empty")
-        return value
-
-    @override
-    def __str__(self):
-        return self.text
-
-
-class _WebVTTCueVoiceSpan(BaseModel):
-    """Model representing a WebVTT cue voice span."""
-
-    annotation: Annotated[
-        str,
-        Field(
-            description=(
-                "Cue span start tag annotation text representing the name of thevoice"
-            )
-        ),
-    ]
-    classes: Annotated[
-        list[str],
-        Field(description="List of classes representing the cue span's significance"),
-    ] = []
-    components: Annotated[
-        list["_WebVTTCueComponent"],
-        Field(description="The components representing the cue internal text"),
-    ] = []
-    span_type: Literal["v"] = "v"
-
-    @field_validator("annotation", mode="after")
-    @classmethod
-    def validate_annotation(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
-            raise ValueError(
-                "Cue span start tag annotation contains invalid characters"
-            )
-        if not value:
-            raise ValueError("Cue text span cannot be empty")
-        return value
-
-    @field_validator("classes", mode="after")
-    @classmethod
-    def validate_classes(cls, value: list[str]) -> list[str]:
-        for item in value:
-            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
-                raise ValueError(
-                    "A cue span start tag class contains invalid characters"
-                )
-            if not item:
-                raise ValueError("Cue span start tag classes cannot be empty")
-        return value
-
-    @override
-    def __str__(self):
-        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
-        inner = "".join(str(span) for span in self.components)
-        return f"<{tag} {self.annotation}>{inner}</v>"
-
-
-class _WebVTTCueClassSpan(BaseModel):
-    span_type: Literal["c"] = "c"
-    components: list["_WebVTTCueComponent"]
-
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<c>{inner}</c>"
-
-
-class _WebVTTCueItalicSpan(BaseModel):
-    span_type: Literal["i"] = "i"
-    components: list["_WebVTTCueComponent"]
-
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<i>{inner}</i>"
-
-
-class _WebVTTCueBoldSpan(BaseModel):
-    span_type: Literal["b"] = "b"
-    components: list["_WebVTTCueComponent"]
-
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<b>{inner}</b>"
-
-
-class _WebVTTCueUnderlineSpan(BaseModel):
-    span_type: Literal["u"] = "u"
-    components: list["_WebVTTCueComponent"]
-
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<u>{inner}</u>"
-
-
-_WebVTTCueComponent = Annotated[
-    Union[
-        _WebVTTCueTextSpan,
-        _WebVTTCueClassSpan,
-        _WebVTTCueItalicSpan,
-        _WebVTTCueBoldSpan,
-        _WebVTTCueUnderlineSpan,
-        _WebVTTCueVoiceSpan,
-    ],
-    Field(discriminator="span_type", description="The WebVTT cue component"),
-]
-
-
-class _WebVTTCueBlock(BaseModel):
-    """Model representing a WebVTT cue block.
-
-    The optional WebVTT cue settings list is not supported.
-    The cue payload is limited to the following spans: text, class, italic, bold,
-    underline, and voice.
-    """
-
-    model_config = ConfigDict(regex_engine="python-re")
-
-    identifier: Optional[_WebVTTCueIdentifier] = Field(
-        None, description="The WebVTT cue identifier"
-    )
-    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
-    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
-
-    _pattern_block: ClassVar[re.Pattern] = re.compile(
-        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
-    )
-    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
-        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
-        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
-    )
-
-    @field_validator("payload", mode="after")
-    @classmethod
-    def validate_payload(cls, payload):
-        for voice in payload:
-            if "-->" in str(voice):
-                raise ValueError("Cue payload must not contain '-->'")
-        return payload
-
-    @classmethod
-    def parse(cls, raw: str) -> "_WebVTTCueBlock":
-        lines = raw.strip().splitlines()
-        if not lines:
-            raise ValueError("Cue block must have at least one line")
-        identifier: Optional[_WebVTTCueIdentifier] = None
-        timing_line = lines[0]
-        if "-->" not in timing_line and len(lines) > 1:
-            identifier = timing_line
-            timing_line = lines[1]
-            cue_lines = lines[2:]
-        else:
-            cue_lines = lines[1:]
-
-        if "-->" not in timing_line:
-            raise ValueError("Cue block must contain WebVTT cue timings")
-
-        start, end = [t.strip() for t in timing_line.split("-->")]
-        end = re.split(" |\t", end)[0]  # ignore the cue settings list
-        timings: _WebVTTCueTimings = _WebVTTCueTimings(
-            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
+    voice: str | None = None
+    formatting: Formatting | None = None
+
+    def copy_meta(self, text):
+        return AnnotatedText(
+            text=text,
+            voice=self.voice,
+            formatting=self.formatting.model_copy() if self.formatting else None,
         )
-        cue_text = " ".join(cue_lines).strip()
-        if cue_text.startswith("<v") and "</v>" not in cue_text:
-            # adding close tag for cue voice spans without end tag
-            cue_text += "</v>"
-
-        stack: list[list[_WebVTTCueComponent]] = [[]]
-        tag_stack: list[Union[str, tuple]] = []
-
-        pos = 0
-        matches = list(cls._pattern_block.finditer(cue_text))
-        i = 0
-        while i < len(matches):
-            match = matches[i]
-            if match.start() > pos:
-                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
-            tag = match.group(0)
-
-            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
-                tag_type = tag[1:2]
-                tag_stack.append(tag_type)
-                stack.append([])
-            elif tag == "</i>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueItalicSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</b>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueBoldSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</u>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</c>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueClassSpan(components=children))
-                tag_stack.pop()
-            elif tag.startswith("<v"):
-                tag_stack.append(("v", tag))
-                stack.append([])
-            elif tag.startswith("</v"):
-                children = stack.pop() if stack else []
-                if (
-                    tag_stack
-                    and isinstance(tag_stack[-1], tuple)
-                    and tag_stack[-1][0] == "v"
-                ):
-                    _, voice = cast(tuple, tag_stack.pop())
-                    voice_match = cls._pattern_voice_tag.match(voice)
-                    if voice_match:
-                        class_string = voice_match.group("class")
-                        annotation = voice_match.group("annotation")
-                        if annotation:
-                            classes: list[str] = []
-                            if class_string:
-                                classes = [c for c in class_string.split(".") if c]
-                            stack[-1].append(
-                                _WebVTTCueVoiceSpan(
-                                    annotation=annotation.strip(),
-                                    classes=classes,
-                                    components=children,
-                                )
-                            )
-
-            pos = match.end()
-            i += 1
-
-        if pos < len(cue_text):
-            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
-
-        return cls(
-            identifier=identifier,
-            timings=timings,
-            payload=stack[0],
-        )
-
-    def __str__(self):
-        parts = []
-        if self.identifier:
-            parts.append(f"{self.identifier}\n")
-        timings_line = str(self.timings)
-        parts.append(timings_line + "\n")
-        for idx, span in enumerate(self.payload):
-            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
-                # the end tag may be omitted for brevity
-                parts.append(str(span).removesuffix("</v>"))
-            else:
-                parts.append(str(span))
-
-        return "".join(parts)
-
-
-class _WebVTTFile(BaseModel):
-    """A model representing a WebVTT file."""
-
-    cue_blocks: list[_WebVTTCueBlock]
-
-    @staticmethod
-    def verify_signature(content: str) -> bool:
-        if not content:
-            return False
-        elif len(content) == 6:
-            return content == "WEBVTT"
-        elif len(content) > 6 and content.startswith("WEBVTT"):
-            return content[6] in (" ", "\t", "\n")
-        else:
-            return False
-
-    @classmethod
-    def parse(cls, raw: str) -> "_WebVTTFile":
-        # Normalize newlines to LF
-        raw = raw.replace("\r\n", "\n").replace("\r", "\n")
-
-        # Check WebVTT signature
-        if not cls.verify_signature(raw):
-            raise ValueError("Invalid WebVTT file signature")
 
-        # Strip "WEBVTT" header line
-        lines = raw.split("\n", 1)
-        body = lines[1] if len(lines) > 1 else ""
 
-        # Remove NOTE/STYLE/REGION blocks
-        body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
-        body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
-
-        # Split into cue blocks
-        raw_blocks = re.split(r"\n\s*\n", body.strip())
-        cues: list[_WebVTTCueBlock] = []
-        for block in raw_blocks:
-            try:
-                cues.append(_WebVTTCueBlock.parse(block))
-            except ValueError as e:
-                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
-
-        return cls(cue_blocks=cues)
-
-    def __iter__(self):
-        return iter(self.cue_blocks)
-
-    def __getitem__(self, idx):
-        return self.cue_blocks[idx]
-
-    def __len__(self):
-        return len(self.cue_blocks)
+@dataclass
+class AnnotatedPar:
+    items: list[AnnotatedText]
 
 
 class WebVTTDocumentBackend(DeclarativeDocumentBackend):
@@ -440,7 +60,7 @@ class WebVTTDocumentBackend(DeclarativeDocumentBackend):
     """
 
     @override
-    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+    def __init__(self, in_doc: InputDocument, path_or_stream: BytesIO | Path):
         super().__init__(in_doc, path_or_stream)
 
         self.content: str = ""
@@ -458,7 +78,7 @@ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
 
     @override
     def is_valid(self) -> bool:
-        return _WebVTTFile.verify_signature(self.content)
+        return WebVTTFile.verify_signature(self.content)
 
     @classmethod
     @override
@@ -476,38 +96,6 @@ def unload(self):
     def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.VTT}
 
-    @staticmethod
-    def _add_text_from_component(
-        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
-    ) -> None:
-        """Adds a TextItem to a document by extracting text from a cue span component.
-
-        TODO: address nesting
-        """
-        formatting = Formatting()
-        text = ""
-        if isinstance(item, _WebVTTCueItalicSpan):
-            formatting.italic = True
-        elif isinstance(item, _WebVTTCueBoldSpan):
-            formatting.bold = True
-        elif isinstance(item, _WebVTTCueUnderlineSpan):
-            formatting.underline = True
-        if isinstance(item, _WebVTTCueTextSpan):
-            text = item.text
-        else:
-            # TODO: address nesting
-            text = "".join(
-                [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)]
-            )
-        if text := text.strip():
-            doc.add_text(
-                label=DocItemLabel.TEXT,
-                text=text,
-                parent=parent,
-                content_layer=ContentLayer.BODY,
-                formatting=formatting,
-            )
-
     @override
     def convert(self) -> DoclingDocument:
         _log.debug("Starting WebVTT conversion...")
@@ -521,52 +109,100 @@ def convert(self) -> DoclingDocument:
         )
         doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
 
-        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
-        for block in vtt.cue_blocks:
-            block_group = doc.add_group(
-                label=GroupLabel.SECTION,
-                name="WebVTT cue block",
-                parent=None,
-                content_layer=ContentLayer.BODY,
-            )
-            if block.identifier:
-                doc.add_text(
-                    label=DocItemLabel.TEXT,
-                    text=str(block.identifier),
-                    parent=block_group,
-                    content_layer=ContentLayer.BODY,
+        vtt: WebVTTFile = WebVTTFile.parse(self.content)
+        cue_text: list[AnnotatedPar] = []
+        parents: list[AnnotatedText] = []
+
+        def _extract_components(
+            payload: list[WebVTTCueComponentWithTerminator],
+        ) -> None:
+            """Flatten cue components into ``cue_text``, split at terminators."""
+            nonlocal cue_text, parents
+            if not cue_text:
+                cue_text.append(AnnotatedPar(items=[]))
+            for comp in payload:
+                # start from the enclosing span's voice and formatting, if any
+                item: AnnotatedText = (
+                    parents[-1].copy_meta("") if parents else AnnotatedText(text="")
                 )
+                component: WebVTTCueComponent = comp.component
+                if isinstance(component, WebVTTCueTextSpan):
+                    item.text = component.text
+                    # always target the newest paragraph: a nested call may
+                    # have opened one since this call started
+                    cue_text[-1].items.append(item)
+                else:
+                    # configure metadata based on span type
+                    if isinstance(component, WebVTTCueBoldSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.bold = True
+                    elif isinstance(component, WebVTTCueItalicSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.italic = True
+                    elif isinstance(component, WebVTTCueUnderlineSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.underline = True
+                    elif isinstance(component, WebVTTCueVoiceSpan):
+                        # voice spans cannot be embedded
+                        item.voice = component.start_tag.annotation
+
+                    parents.append(item)
+                    _extract_components(component.internal_text.components)
+                    parents.pop()
+
+                if comp.terminator is not None:
+                    # a terminated component starts a new paragraph
+                    cue_text.append(AnnotatedPar(items=[]))
+
+        def _add_text_item(
+            text: str,
+            formatting: Formatting | None,
+            item: AnnotatedText,
+            parent=None,
+        ):
+            track = TrackSource(
+                start_time=block.timings.start.seconds,
+                end_time=block.timings.end.seconds,
+                identifier=identifier,
+                voice=item.voice or None,
+            )
+
             doc.add_text(
                 label=DocItemLabel.TEXT,
-                text=str(block.timings),
-                parent=block_group,
+                text=text,
                 content_layer=ContentLayer.BODY,
+                formatting=formatting,
+                parent=parent,
+                source=track,
             )
-            for cue_span in block.payload:
-                if isinstance(cue_span, _WebVTTCueVoiceSpan):
-                    voice_group = doc.add_group(
-                        label=GroupLabel.INLINE,
-                        name="WebVTT cue voice span",
-                        parent=block_group,
-                        content_layer=ContentLayer.BODY,
-                    )
-                    voice = cue_span.annotation
-                    if classes := cue_span.classes:
-                        voice += f" ({', '.join(classes)})"
-                    voice += ": "
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        text=voice,
-                        parent=voice_group,
-                        content_layer=ContentLayer.BODY,
+
+        if vtt.title:
+            doc.add_title(vtt.title, content_layer=ContentLayer.BODY)
+        for block in vtt.cue_blocks:
+            cue_text = []
+            parents = []
+            identifier = str(block.identifier) if block.identifier else None
+            _extract_components(block.payload)
+            for par in cue_text:
+                if not par.items:
+                    continue
+                if len(par.items) == 1:
+                    item = par.items[0]
+                    _add_text_item(
+                        text=item.text,
+                        formatting=item.formatting,
+                        item=item,
                     )
-                    for item in cue_span.components:
-                        WebVTTDocumentBackend._add_text_from_component(
-                            doc, item, voice_group
-                        )
                 else:
-                    WebVTTDocumentBackend._add_text_from_component(
-                        doc, cue_span, block_group
+                    group = doc.add_inline_group(
+                        "WebVTT cue span", content_layer=ContentLayer.BODY
                     )
+                    for item in par.items:
+                        _add_text_item(
+                            text=item.text,
+                            formatting=item.formatting,
+                            item=item,
+                            parent=group,
+                        )
 
         return doc
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
index 2bb94e42a6..7c8ea4cf3d 100644
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@@ -1,47 +1,35 @@
 import logging
-import os
-import re
 import sys
 import tempfile
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Union, cast
-
-from docling_core.types.doc import DoclingDocument, DocumentOrigin
-
-# import whisper  # type: ignore
-# import librosa
-# import numpy as np
-# import soundfile as sf  # type: ignore
-from docling_core.types.doc.labels import DocItemLabel
-from pydantic import BaseModel, Field, validator
+from typing import Optional, Union
+
+from docling_core.types.doc import (
+    ContentLayer,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    TrackSource,
+)
+from pydantic import BaseModel, Field
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.noop_backend import NoOpBackend
-
-# from pydub import AudioSegment  # type: ignore
-# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
 )
 from docling.datamodel.base_models import (
     ConversionStatus,
-    FormatToMimeType,
 )
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     AsrPipelineOptions,
 )
 from docling.datamodel.pipeline_options_asr_model import (
     InlineAsrMlxWhisperOptions,
     InlineAsrNativeWhisperOptions,
-    # AsrResponseFormat,
-    InlineAsrOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import (
-    InferenceFramework,
-)
-from docling.datamodel.settings import settings
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -190,8 +178,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
             )
 
             for citem in conversation:
+                track: TrackSource = TrackSource(
+                    start_time=citem.start_time,
+                    end_time=citem.end_time,
+                    voice=citem.speaker,
+                )
                 conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=citem.to_string()
+                    label=DocItemLabel.TEXT,
+                    text=citem.text,
+                    content_layer=ContentLayer.BODY,
+                    source=track,
                 )
 
             return conv_res
@@ -299,8 +295,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
             )
 
             for citem in conversation:
+                track: TrackSource = TrackSource(
+                    start_time=citem.start_time,
+                    end_time=citem.end_time,
+                    voice=citem.speaker,
+                )
                 conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=citem.to_string()
+                    label=DocItemLabel.TEXT,
+                    text=citem.text,
+                    content_layer=ContentLayer.BODY,
+                    source=track,
                 )
 
             conv_res.status = ConversionStatus.SUCCESS
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 67be9e0de4..d284a4777c 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -2,7 +2,7 @@
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import List, Optional, Union, cast
+from typing import List, Union, cast
 
 from docling_core.types.doc import (
     BoundingBox,
@@ -12,8 +12,6 @@
     ImageRef,
     PictureItem,
     ProvenanceItem,
-    TableCell,
-    TableData,
     TextItem,
 )
 from docling_core.types.doc.base import (
@@ -21,7 +19,6 @@
     Size,
 )
 from docling_core.types.doc.document import DocTagsDocument
-from lxml import etree
 from PIL import Image as PILImage
 
 from docling.backend.abstract_backend import (
@@ -42,7 +39,6 @@
     InlineVlmOptions,
     ResponseFormat,
 )
-from docling.datamodel.settings import settings
 from docling.models.vlm_pipeline_models.api_vlm_model import ApiVlmModel
 from docling.models.vlm_pipeline_models.hf_transformers_model import (
     HuggingFaceTransformersVlmModel,
diff --git a/pyproject.toml b/pyproject.toml
index 1898c52f1e..dec2c06813 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ authors = [
 requires-python = '>=3.10,<4.0'
 dependencies = [
   'pydantic (>=2.0.0,<3.0.0)',
-  'docling-core[chunking] (>=2.58.0,<3.0.0)',
+  'docling-core[chunking] (>=2.62.0,<3.0.0)',
   'docling-parse (>=4.7.0,<5.0.0)',
   "docling-ibm-models>=3.9.1,<4",
   'filetype (>=1.2.0,<2.0.0)',
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
index d7840e9941..db52ba1b79 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -1,66 +1,14 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: section: group WebVTT cue block
-    item-2 at level 2: text: 00:11.000 --> 00:13.000
-    item-3 at level 2: inline: group WebVTT cue voice span
-      item-4 at level 3: text: Roger Bingham: 
-      item-5 at level 3: text: We are in New York City
-  item-6 at level 1: section: group WebVTT cue block
-    item-7 at level 2: text: 00:13.000 --> 00:16.000
-    item-8 at level 2: inline: group WebVTT cue voice span
-      item-9 at level 3: text: Roger Bingham: 
-      item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
-  item-11 at level 1: section: group WebVTT cue block
-    item-12 at level 2: text: 00:16.000 --> 00:18.000
-    item-13 at level 2: inline: group WebVTT cue voice span
-      item-14 at level 3: text: Roger Bingham: 
-      item-15 at level 3: text: from the American Museum of Natural History
-  item-16 at level 1: section: group WebVTT cue block
-    item-17 at level 2: text: 00:18.000 --> 00:20.000
-    item-18 at level 2: inline: group WebVTT cue voice span
-      item-19 at level 3: text: Roger Bingham: 
-      item-20 at level 3: text: And with me is Neil deGrasse Tyson
-  item-21 at level 1: section: group WebVTT cue block
-    item-22 at level 2: text: 00:20.000 --> 00:22.000
-    item-23 at level 2: inline: group WebVTT cue voice span
-      item-24 at level 3: text: Roger Bingham: 
-      item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
-  item-26 at level 1: section: group WebVTT cue block
-    item-27 at level 2: text: 00:22.000 --> 00:24.000
-    item-28 at level 2: inline: group WebVTT cue voice span
-      item-29 at level 3: text: Roger Bingham: 
-      item-30 at level 3: text: at the AMNH.
-  item-31 at level 1: section: group WebVTT cue block
-    item-32 at level 2: text: 00:24.000 --> 00:26.000
-    item-33 at level 2: inline: group WebVTT cue voice span
-      item-34 at level 3: text: Roger Bingham: 
-      item-35 at level 3: text: Thank you for walking down here.
-  item-36 at level 1: section: group WebVTT cue block
-    item-37 at level 2: text: 00:27.000 --> 00:30.000
-    item-38 at level 2: inline: group WebVTT cue voice span
-      item-39 at level 3: text: Roger Bingham: 
-      item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
-  item-41 at level 1: section: group WebVTT cue block
-    item-42 at level 2: text: 00:30.000 --> 00:31.500
-    item-43 at level 2: inline: group WebVTT cue voice span
-      item-44 at level 3: text: Roger Bingham: 
-      item-45 at level 3: text: When we e-mailed—
-  item-46 at level 1: section: group WebVTT cue block
-    item-47 at level 2: text: 00:30.500 --> 00:32.500
-    item-48 at level 2: inline: group WebVTT cue voice span
-      item-49 at level 3: text: Neil deGrasse Tyson: 
-      item-50 at level 3: text: Didn’t we talk about enough in that conversation?
-  item-51 at level 1: section: group WebVTT cue block
-    item-52 at level 2: text: 00:32.000 --> 00:35.500
-    item-53 at level 2: inline: group WebVTT cue voice span
-      item-54 at level 3: text: Roger Bingham: 
-      item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
-  item-56 at level 1: section: group WebVTT cue block
-    item-57 at level 2: text: 00:32.500 --> 00:33.500
-    item-58 at level 2: inline: group WebVTT cue voice span
-      item-59 at level 3: text: Neil deGrasse Tyson: 
-      item-60 at level 3: text: Laughs
-  item-61 at level 1: section: group WebVTT cue block
-    item-62 at level 2: text: 00:35.500 --> 00:38.000
-    item-63 at level 2: inline: group WebVTT cue voice span
-      item-64 at level 3: text: Roger Bingham: 
-      item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
+  item-1 at level 1: text: We are in New York City
+  item-2 at level 1: text: We’re actually at the Lucern Hotel, just down the street
+  item-3 at level 1: text: from the American Museum of Natural History
+  item-4 at level 1: text: And with me is Neil deGrasse Tyson
+  item-5 at level 1: text: Astrophysicist, Director of the Hayden Planetarium
+  item-6 at level 1: text: at the AMNH.
+  item-7 at level 1: text: Thank you for walking down here.
+  item-8 at level 1: text: And I want to do a follow-up on the last conversation we did.
+  item-9 at level 1: text: When we e-mailed—
+  item-10 at level 1: text: Didn’t we talk about enough in that conversation?
+  item-11 at level 1: text: No! No no no no; 'cos 'cos obviously 'cos
+  item-12 at level 1: text: Laughs
+  item-13 at level 1: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
index 8311825601..56548734b1 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.7.0",
+  "version": "1.8.0",
   "name": "webvtt_example_01",
   "origin": {
     "mimetype": "text/vtt",
@@ -18,1052 +18,316 @@
     "self_ref": "#/body",
     "children": [
       {
-        "$ref": "#/groups/0"
+        "$ref": "#/texts/0"
       },
       {
-        "$ref": "#/groups/2"
+        "$ref": "#/texts/1"
       },
       {
-        "$ref": "#/groups/4"
+        "$ref": "#/texts/2"
       },
       {
-        "$ref": "#/groups/6"
+        "$ref": "#/texts/3"
       },
       {
-        "$ref": "#/groups/8"
+        "$ref": "#/texts/4"
       },
       {
-        "$ref": "#/groups/10"
+        "$ref": "#/texts/5"
       },
       {
-        "$ref": "#/groups/12"
+        "$ref": "#/texts/6"
       },
       {
-        "$ref": "#/groups/14"
+        "$ref": "#/texts/7"
       },
       {
-        "$ref": "#/groups/16"
+        "$ref": "#/texts/8"
       },
       {
-        "$ref": "#/groups/18"
+        "$ref": "#/texts/9"
       },
       {
-        "$ref": "#/groups/20"
+        "$ref": "#/texts/10"
       },
       {
-        "$ref": "#/groups/22"
+        "$ref": "#/texts/11"
       },
       {
-        "$ref": "#/groups/24"
+        "$ref": "#/texts/12"
       }
     ],
     "content_layer": "body",
     "name": "_root_",
     "label": "unspecified"
   },
-  "groups": [
-    {
-      "self_ref": "#/groups/0",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/0"
-        },
-        {
-          "$ref": "#/groups/1"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/1"
-        },
-        {
-          "$ref": "#/texts/2"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    },
+  "groups": [],
+  "texts": [
     {
-      "self_ref": "#/groups/2",
+      "self_ref": "#/texts/0",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/3"
-        },
-        {
-          "$ref": "#/groups/3"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/3",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/4"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/5"
+          "kind": "track",
+          "start_time": 11.0,
+          "end_time": 13.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "We are in New York City",
+      "text": "We are in New York City"
     },
     {
-      "self_ref": "#/groups/4",
+      "self_ref": "#/texts/1",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/6"
-        },
-        {
-          "$ref": "#/groups/5"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/5",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/7"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/8"
+          "kind": "track",
+          "start_time": 13.0,
+          "end_time": 16.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "We’re actually at the Lucern Hotel, just down the street",
+      "text": "We’re actually at the Lucern Hotel, just down the street"
     },
     {
-      "self_ref": "#/groups/6",
+      "self_ref": "#/texts/2",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/9"
-        },
-        {
-          "$ref": "#/groups/7"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/7",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/10"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/11"
+          "kind": "track",
+          "start_time": 16.0,
+          "end_time": 18.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "from the American Museum of Natural History",
+      "text": "from the American Museum of Natural History"
     },
     {
-      "self_ref": "#/groups/8",
+      "self_ref": "#/texts/3",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/12"
-        },
-        {
-          "$ref": "#/groups/9"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/9",
-      "parent": {
-        "$ref": "#/groups/8"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/13"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/14"
+          "kind": "track",
+          "start_time": 18.0,
+          "end_time": 20.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "And with me is Neil deGrasse Tyson",
+      "text": "And with me is Neil deGrasse Tyson"
     },
     {
-      "self_ref": "#/groups/10",
+      "self_ref": "#/texts/4",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/15"
-        },
-        {
-          "$ref": "#/groups/11"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/11",
-      "parent": {
-        "$ref": "#/groups/10"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/16"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/17"
+          "kind": "track",
+          "start_time": 20.0,
+          "end_time": 22.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Astrophysicist, Director of the Hayden Planetarium",
+      "text": "Astrophysicist, Director of the Hayden Planetarium"
     },
     {
-      "self_ref": "#/groups/12",
+      "self_ref": "#/texts/5",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/18"
-        },
-        {
-          "$ref": "#/groups/13"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/13",
-      "parent": {
-        "$ref": "#/groups/12"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/19"
-        },
+      "label": "text",
+      "source": [
         {
-          "$ref": "#/texts/20"
+          "kind": "track",
+          "start_time": 22.0,
+          "end_time": 24.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "at the AMNH.",
+      "text": "at the AMNH."
     },
     {
-      "self_ref": "#/groups/14",
+      "self_ref": "#/texts/6",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/21"
-        },
-        {
-          "$ref": "#/groups/15"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/15",
-      "parent": {
-        "$ref": "#/groups/14"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/22"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/23"
+          "kind": "track",
+          "start_time": 24.0,
+          "end_time": 26.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Thank you for walking down here.",
+      "text": "Thank you for walking down here."
     },
     {
-      "self_ref": "#/groups/16",
+      "self_ref": "#/texts/7",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/24"
-        },
-        {
-          "$ref": "#/groups/17"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/17",
-      "parent": {
-        "$ref": "#/groups/16"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/25"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/26"
+          "kind": "track",
+          "start_time": 27.0,
+          "end_time": 30.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "And I want to do a follow-up on the last conversation we did.",
+      "text": "And I want to do a follow-up on the last conversation we did."
     },
     {
-      "self_ref": "#/groups/18",
+      "self_ref": "#/texts/8",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/27"
-        },
-        {
-          "$ref": "#/groups/19"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/19",
-      "parent": {
-        "$ref": "#/groups/18"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/28"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/29"
+          "kind": "track",
+          "start_time": 30.0,
+          "end_time": 31.5,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "When we e-mailed—",
+      "text": "When we e-mailed—"
     },
     {
-      "self_ref": "#/groups/20",
+      "self_ref": "#/texts/9",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/30"
-        },
-        {
-          "$ref": "#/groups/21"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/21",
-      "parent": {
-        "$ref": "#/groups/20"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/31"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/32"
+          "kind": "track",
+          "start_time": 30.5,
+          "end_time": 32.5,
+          "voice": "Neil deGrasse Tyson"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Didn’t we talk about enough in that conversation?",
+      "text": "Didn’t we talk about enough in that conversation?"
     },
     {
-      "self_ref": "#/groups/22",
+      "self_ref": "#/texts/10",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/33"
-        },
-        {
-          "$ref": "#/groups/23"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/23",
-      "parent": {
-        "$ref": "#/groups/22"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/34"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/35"
+          "kind": "track",
+          "start_time": 32.0,
+          "end_time": 35.5,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "No! No no no no; 'cos 'cos obviously 'cos",
+      "text": "No! No no no no; 'cos 'cos obviously 'cos"
     },
     {
-      "self_ref": "#/groups/24",
+      "self_ref": "#/texts/11",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/36"
-        },
-        {
-          "$ref": "#/groups/25"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/25",
-      "parent": {
-        "$ref": "#/groups/24"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/37"
-        },
-        {
-          "$ref": "#/texts/38"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    }
-  ],
-  "texts": [
-    {
-      "self_ref": "#/texts/0",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:11.000 --> 00:13.000",
-      "text": "00:11.000 --> 00:13.000"
-    },
-    {
-      "self_ref": "#/texts/1",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/2",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "We are in New York City",
-      "text": "We are in New York City",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 32.5,
+          "end_time": 33.5,
+          "voice": "Neil deGrasse Tyson"
+        }
+      ],
+      "orig": "Laughs",
+      "text": "Laughs",
       "formatting": {
         "bold": false,
-        "italic": false,
+        "italic": true,
         "underline": false,
         "strikethrough": false,
         "script": "baseline"
       }
     },
     {
-      "self_ref": "#/texts/3",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:13.000 --> 00:16.000",
-      "text": "00:13.000 --> 00:16.000"
-    },
-    {
-      "self_ref": "#/texts/4",
-      "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/5",
+      "self_ref": "#/texts/12",
       "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "We’re actually at the Lucern Hotel, just down the street",
-      "text": "We’re actually at the Lucern Hotel, just down the street",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/6",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:16.000 --> 00:18.000",
-      "text": "00:16.000 --> 00:18.000"
-    },
-    {
-      "self_ref": "#/texts/7",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/8",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "from the American Museum of Natural History",
-      "text": "from the American Museum of Natural History",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/9",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:18.000 --> 00:20.000",
-      "text": "00:18.000 --> 00:20.000"
-    },
-    {
-      "self_ref": "#/texts/10",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/11",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "And with me is Neil deGrasse Tyson",
-      "text": "And with me is Neil deGrasse Tyson",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/12",
-      "parent": {
-        "$ref": "#/groups/8"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:20.000 --> 00:22.000",
-      "text": "00:20.000 --> 00:22.000"
-    },
-    {
-      "self_ref": "#/texts/13",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/14",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Astrophysicist, Director of the Hayden Planetarium",
-      "text": "Astrophysicist, Director of the Hayden Planetarium",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/15",
-      "parent": {
-        "$ref": "#/groups/10"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:22.000 --> 00:24.000",
-      "text": "00:22.000 --> 00:24.000"
-    },
-    {
-      "self_ref": "#/texts/16",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/17",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "at the AMNH.",
-      "text": "at the AMNH.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/18",
-      "parent": {
-        "$ref": "#/groups/12"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:24.000 --> 00:26.000",
-      "text": "00:24.000 --> 00:26.000"
-    },
-    {
-      "self_ref": "#/texts/19",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/20",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Thank you for walking down here.",
-      "text": "Thank you for walking down here.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/21",
-      "parent": {
-        "$ref": "#/groups/14"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:27.000 --> 00:30.000",
-      "text": "00:27.000 --> 00:30.000"
-    },
-    {
-      "self_ref": "#/texts/22",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/23",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "And I want to do a follow-up on the last conversation we did.",
-      "text": "And I want to do a follow-up on the last conversation we did.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/24",
-      "parent": {
-        "$ref": "#/groups/16"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:30.000 --> 00:31.500",
-      "text": "00:30.000 --> 00:31.500"
-    },
-    {
-      "self_ref": "#/texts/25",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/26",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "When we e-mailed—",
-      "text": "When we e-mailed—",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/27",
-      "parent": {
-        "$ref": "#/groups/18"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:30.500 --> 00:32.500",
-      "text": "00:30.500 --> 00:32.500"
-    },
-    {
-      "self_ref": "#/texts/28",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Neil deGrasse Tyson: ",
-      "text": "Neil deGrasse Tyson: "
-    },
-    {
-      "self_ref": "#/texts/29",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Didn’t we talk about enough in that conversation?",
-      "text": "Didn’t we talk about enough in that conversation?",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/30",
-      "parent": {
-        "$ref": "#/groups/20"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:32.000 --> 00:35.500",
-      "text": "00:32.000 --> 00:35.500"
-    },
-    {
-      "self_ref": "#/texts/31",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/32",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "No! No no no no; 'cos 'cos obviously 'cos",
-      "text": "No! No no no no; 'cos 'cos obviously 'cos",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/33",
-      "parent": {
-        "$ref": "#/groups/22"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:32.500 --> 00:33.500",
-      "text": "00:32.500 --> 00:33.500"
-    },
-    {
-      "self_ref": "#/texts/34",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Neil deGrasse Tyson: ",
-      "text": "Neil deGrasse Tyson: "
-    },
-    {
-      "self_ref": "#/texts/35",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Laughs",
-      "text": "Laughs",
-      "formatting": {
-        "bold": false,
-        "italic": true,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/36",
-      "parent": {
-        "$ref": "#/groups/24"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:35.500 --> 00:38.000",
-      "text": "00:35.500 --> 00:38.000"
-    },
-    {
-      "self_ref": "#/texts/37",
-      "parent": {
-        "$ref": "#/groups/25"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/38",
-      "parent": {
-        "$ref": "#/groups/25"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 35.5,
+          "end_time": 38.0,
+          "voice": "Roger Bingham"
+        }
+      ],
       "orig": "You know I’m so excited my glasses are falling off here.",
-      "text": "You know I’m so excited my glasses are falling off here.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "text": "You know I’m so excited my glasses are falling off here."
     }
   ],
   "pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
index c57670289f..95d9e65753 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -1,51 +1,25 @@
-00:11.000 --> 00:13.000
+We are in New York City
 
-Roger Bingham:  We are in New York City
+We’re actually at the Lucern Hotel, just down the street
 
-00:13.000 --> 00:16.000
+from the American Museum of Natural History
 
-Roger Bingham:  We’re actually at the Lucern Hotel, just down the street
+And with me is Neil deGrasse Tyson
 
-00:16.000 --> 00:18.000
+Astrophysicist, Director of the Hayden Planetarium
 
-Roger Bingham:  from the American Museum of Natural History
+at the AMNH.
 
-00:18.000 --> 00:20.000
+Thank you for walking down here.
 
-Roger Bingham:  And with me is Neil deGrasse Tyson
+And I want to do a follow-up on the last conversation we did.
 
-00:20.000 --> 00:22.000
+When we e-mailed—
 
-Roger Bingham:  Astrophysicist, Director of the Hayden Planetarium
+Didn’t we talk about enough in that conversation?
 
-00:22.000 --> 00:24.000
+No! No no no no; 'cos 'cos obviously 'cos
 
-Roger Bingham:  at the AMNH.
+*Laughs*
 
-00:24.000 --> 00:26.000
-
-Roger Bingham:  Thank you for walking down here.
-
-00:27.000 --> 00:30.000
-
-Roger Bingham:  And I want to do a follow-up on the last conversation we did.
-
-00:30.000 --> 00:31.500
-
-Roger Bingham:  When we e-mailed—
-
-00:30.500 --> 00:32.500
-
-Neil deGrasse Tyson:  Didn’t we talk about enough in that conversation?
-
-00:32.000 --> 00:35.500
-
-Roger Bingham:  No! No no no no; 'cos 'cos obviously 'cos
-
-00:32.500 --> 00:33.500
-
-Neil deGrasse Tyson:  *Laughs*
-
-00:35.500 --> 00:38.000
-
-Roger Bingham:  You know I’m so excited my glasses are falling off here.
\ No newline at end of file
+You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
index 6d90404ff7..56f63bc3f5 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -1,22 +1,12 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: section: group WebVTT cue block
-    item-2 at level 2: text: 00:00.000 --> 00:02.000
-    item-3 at level 2: inline: group WebVTT cue voice span
-      item-4 at level 3: text: Esme (first, loud): 
-      item-5 at level 3: text: It’s a blue apple tree!
-  item-6 at level 1: section: group WebVTT cue block
-    item-7 at level 2: text: 00:02.000 --> 00:04.000
-    item-8 at level 2: inline: group WebVTT cue voice span
-      item-9 at level 3: text: Mary: 
-      item-10 at level 3: text: No way!
-  item-11 at level 1: section: group WebVTT cue block
-    item-12 at level 2: text: 00:04.000 --> 00:06.000
-    item-13 at level 2: inline: group WebVTT cue voice span
-      item-14 at level 3: text: Esme: 
-      item-15 at level 3: text: Hee!
-    item-16 at level 2: text: laughter
-  item-17 at level 1: section: group WebVTT cue block
-    item-18 at level 2: text: 00:06.000 --> 00:08.000
-    item-19 at level 2: inline: group WebVTT cue voice span
-      item-20 at level 3: text: Mary (loud): 
-      item-21 at level 3: text: That’s awesome!
\ No newline at end of file
+  item-1 at level 1: text: It’s a blue apple tree!
+  item-2 at level 1: text: No way!
+  item-3 at level 1: inline: group WebVTT cue span
+    item-4 at level 2: text: Hee!
+    item-5 at level 2: text:  
+    item-6 at level 2: text: laughter
+  item-7 at level 1: text: That’s awesome!
+  item-8 at level 1: inline: group WebVTT cue span
+    item-9 at level 2: text: Sur les 
+    item-10 at level 2: text: playground
+    item-11 at level 2: text: , ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
index 72647d93d0..3103261655 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -1,10 +1,10 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.7.0",
+  "version": "1.8.0",
   "name": "webvtt_example_02",
   "origin": {
     "mimetype": "text/vtt",
-    "binary_hash": 5029965721282070624,
+    "binary_hash": 8584853280299071027,
     "filename": "webvtt_example_02.vtt"
   },
   "furniture": {
@@ -18,16 +18,19 @@
     "self_ref": "#/body",
     "children": [
       {
-        "$ref": "#/groups/0"
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
       },
       {
-        "$ref": "#/groups/2"
+        "$ref": "#/groups/0"
       },
       {
-        "$ref": "#/groups/4"
+        "$ref": "#/texts/5"
       },
       {
-        "$ref": "#/groups/6"
+        "$ref": "#/groups/1"
       }
     ],
     "content_layer": "body",
@@ -41,70 +44,22 @@
         "$ref": "#/body"
       },
       "children": [
-        {
-          "$ref": "#/texts/0"
-        },
-        {
-          "$ref": "#/groups/1"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/1"
-        },
         {
           "$ref": "#/texts/2"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    },
-    {
-      "self_ref": "#/groups/2",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [
+        },
         {
           "$ref": "#/texts/3"
         },
-        {
-          "$ref": "#/groups/3"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/3",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [
         {
           "$ref": "#/texts/4"
-        },
-        {
-          "$ref": "#/texts/5"
         }
       ],
       "content_layer": "body",
-      "name": "WebVTT cue voice span",
+      "name": "WebVTT cue span",
       "label": "inline"
     },
     {
-      "self_ref": "#/groups/4",
+      "self_ref": "#/groups/1",
       "parent": {
         "$ref": "#/body"
       },
@@ -112,23 +67,6 @@
         {
           "$ref": "#/texts/6"
         },
-        {
-          "$ref": "#/groups/5"
-        },
-        {
-          "$ref": "#/texts/9"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/5",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [
         {
           "$ref": "#/texts/7"
         },
@@ -137,41 +75,7 @@
         }
       ],
       "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    },
-    {
-      "self_ref": "#/groups/6",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/10"
-        },
-        {
-          "$ref": "#/groups/7"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/7",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/11"
-        },
-        {
-          "$ref": "#/texts/12"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
+      "name": "WebVTT cue span",
       "label": "inline"
     }
   ],
@@ -179,143 +83,177 @@
     {
       "self_ref": "#/texts/0",
       "parent": {
-        "$ref": "#/groups/0"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "00:00.000 --> 00:02.000",
-      "text": "00:00.000 --> 00:02.000"
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 0.0,
+          "end_time": 2.0,
+          "voice": "Esme",
+          "classes": [
+            "v.first.loud"
+          ]
+        }
+      ],
+      "orig": "It’s a blue apple tree!",
+      "text": "It’s a blue apple tree!"
     },
     {
       "self_ref": "#/texts/1",
       "parent": {
-        "$ref": "#/groups/1"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "Esme (first, loud): ",
-      "text": "Esme (first, loud): "
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 2.0,
+          "end_time": 4.0,
+          "voice": "Mary"
+        }
+      ],
+      "orig": "No way!",
+      "text": "No way!"
     },
     {
       "self_ref": "#/texts/2",
       "parent": {
-        "$ref": "#/groups/1"
+        "$ref": "#/groups/0"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "It’s a blue apple tree!",
-      "text": "It’s a blue apple tree!",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 4.0,
+          "end_time": 6.0,
+          "voice": "Esme"
+        }
+      ],
+      "orig": "Hee!",
+      "text": "Hee!"
     },
     {
       "self_ref": "#/texts/3",
       "parent": {
-        "$ref": "#/groups/2"
+        "$ref": "#/groups/0"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "00:02.000 --> 00:04.000",
-      "text": "00:02.000 --> 00:04.000"
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": " ",
+      "text": " "
     },
     {
       "self_ref": "#/texts/4",
       "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Mary: ",
-      "text": "Mary: "
-    },
-    {
-      "self_ref": "#/texts/5",
-      "parent": {
-        "$ref": "#/groups/3"
+        "$ref": "#/groups/0"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "No way!",
-      "text": "No way!",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": "laughter",
+      "text": "laughter",
       "formatting": {
         "bold": false,
-        "italic": false,
+        "italic": true,
         "underline": false,
         "strikethrough": false,
         "script": "baseline"
       }
     },
     {
-      "self_ref": "#/texts/6",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:04.000 --> 00:06.000",
-      "text": "00:04.000 --> 00:06.000"
-    },
-    {
-      "self_ref": "#/texts/7",
+      "self_ref": "#/texts/5",
       "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "Esme: ",
-      "text": "Esme: "
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 6.0,
+          "end_time": 8.0,
+          "voice": "Mary",
+          "classes": [
+            "v.loud"
+          ]
+        }
+      ],
+      "orig": "That’s awesome!",
+      "text": "That’s awesome!"
     },
     {
-      "self_ref": "#/texts/8",
+      "self_ref": "#/texts/6",
       "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/groups/1"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "Hee!",
-      "text": "Hee!",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": "Sur les ",
+      "text": "Sur les "
     },
     {
-      "self_ref": "#/texts/9",
+      "self_ref": "#/texts/7",
       "parent": {
-        "$ref": "#/groups/4"
+        "$ref": "#/groups/1"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "laughter",
-      "text": "laughter",
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 8.0,
+          "end_time": 10.0,
+          "languages": [
+            "en"
+          ],
+          "classes": [
+            "i.foreignphrase"
+          ]
+        }
+      ],
+      "orig": "playground",
+      "text": "playground",
       "formatting": {
         "bold": false,
         "italic": true,
@@ -325,47 +263,23 @@
       }
     },
     {
-      "self_ref": "#/texts/10",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:06.000 --> 00:08.000",
-      "text": "00:06.000 --> 00:08.000"
-    },
-    {
-      "self_ref": "#/texts/11",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Mary (loud): ",
-      "text": "Mary (loud): "
-    },
-    {
-      "self_ref": "#/texts/12",
+      "self_ref": "#/texts/8",
       "parent": {
-        "$ref": "#/groups/7"
+        "$ref": "#/groups/1"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "That’s awesome!",
-      "text": "That’s awesome!",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": ", ici à Montpellier",
+      "text": ", ici à Montpellier"
     }
   ],
   "pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
index db84cf116d..7f62407381 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -1,17 +1,9 @@
-00:00.000 --> 00:02.000
+It’s a blue apple tree!
 
-Esme (first, loud):  It’s a blue apple tree!
+No way!
 
-00:02.000 --> 00:04.000
+Hee!   *laughter*
 
-Mary:  No way!
+That’s awesome!
 
-00:04.000 --> 00:06.000
-
-Esme:  Hee!
-
-*laughter*
-
-00:06.000 --> 00:08.000
-
-Mary (loud):  That’s awesome!
\ No newline at end of file
+Sur les  *playground* , ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
index ca344e5957..a46794123c 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -1,77 +1,18 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: section: group WebVTT cue block
-    item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
-    item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
-    item-4 at level 2: inline: group WebVTT cue voice span
-      item-5 at level 3: text: Speaker A: 
-      item-6 at level 3: text: OK, I think now we should be recording
-  item-7 at level 1: section: group WebVTT cue block
-    item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
-    item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
-    item-10 at level 2: inline: group WebVTT cue voice span
-      item-11 at level 3: text: Speaker A: 
-      item-12 at level 3: text: properly.
-  item-13 at level 1: section: group WebVTT cue block
-    item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
-    item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
-    item-16 at level 2: text: Good.
-  item-17 at level 1: section: group WebVTT cue block
-    item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
-    item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
-    item-20 at level 2: inline: group WebVTT cue voice span
-      item-21 at level 3: text: Speaker A: 
-      item-22 at level 3: text: Yeah.
-  item-23 at level 1: section: group WebVTT cue block
-    item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
-    item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
-    item-26 at level 2: inline: group WebVTT cue voice span
-      item-27 at level 3: text: Speaker B: 
-      item-28 at level 3: text: I was also thinking.
-  item-29 at level 1: section: group WebVTT cue block
-    item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
-    item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
-    item-32 at level 2: inline: group WebVTT cue voice span
-      item-33 at level 3: text: Speaker B: 
-      item-34 at level 3: text: Would be maybe good to create items,
-  item-35 at level 1: section: group WebVTT cue block
-    item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
-    item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
-    item-38 at level 2: inline: group WebVTT cue voice span
-      item-39 at level 3: text: Speaker B: 
-      item-40 at level 3: text: some metadata, some options that can be specific.
-  item-41 at level 1: section: group WebVTT cue block
-    item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
-    item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
-    item-44 at level 2: inline: group WebVTT cue voice span
-      item-45 at level 3: text: Speaker A: 
-      item-46 at level 3: text: Yeah, I mean I think you went even more than
-  item-47 at level 1: section: group WebVTT cue block
-    item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
-    item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
-    item-50 at level 2: inline: group WebVTT cue voice span
-      item-51 at level 3: text: Speaker B: 
-      item-52 at level 3: text: But we preserved the atoms.
-  item-53 at level 1: section: group WebVTT cue block
-    item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
-    item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
-    item-56 at level 2: inline: group WebVTT cue voice span
-      item-57 at level 3: text: Speaker A: 
-      item-58 at level 3: text: than me. I just opened the format.
-  item-59 at level 1: section: group WebVTT cue block
-    item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
-    item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
-    item-62 at level 2: inline: group WebVTT cue voice span
-      item-63 at level 3: text: Speaker A: 
-      item-64 at level 3: text: give it a try, yeah.
-  item-65 at level 1: section: group WebVTT cue block
-    item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
-    item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
-    item-68 at level 2: inline: group WebVTT cue voice span
-      item-69 at level 3: text: Speaker B: 
-      item-70 at level 3: text: Okay, talk to you later.
-  item-71 at level 1: section: group WebVTT cue block
-    item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
-    item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
-    item-74 at level 2: inline: group WebVTT cue voice span
-      item-75 at level 3: text: Speaker A: 
-      item-76 at level 3: text: See you.
\ No newline at end of file
+  item-1 at level 1: text: OK,
+  item-2 at level 1: text: I think now we should be recording
+  item-3 at level 1: text: properly.
+  item-4 at level 1: text: Good.
+  item-5 at level 1: text: Yeah.
+  item-6 at level 1: text: I was also thinking.
+  item-7 at level 1: text: Would be maybe good to create items,
+  item-8 at level 1: text: some metadata,
+  item-9 at level 1: text: some options that can be specific.
+  item-10 at level 1: text: Yeah,
+  item-11 at level 1: text: I mean I think you went even more than
+  item-12 at level 1: text: But we preserved the atoms.
+  item-13 at level 1: text: than me.
+  item-14 at level 1: text: I just opened the format.
+  item-15 at level 1: text: give it a try, yeah.
+  item-16 at level 1: text: Okay, talk to you later.
+  item-17 at level 1: text: See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
index 5df08e2bf3..e744229666 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.7.0",
+  "version": "1.8.0",
   "name": "webvtt_example_03",
   "origin": {
     "mimetype": "text/vtt",
@@ -18,1218 +18,418 @@
     "self_ref": "#/body",
     "children": [
       {
-        "$ref": "#/groups/0"
+        "$ref": "#/texts/0"
       },
       {
-        "$ref": "#/groups/2"
+        "$ref": "#/texts/1"
       },
       {
-        "$ref": "#/groups/4"
+        "$ref": "#/texts/2"
       },
       {
-        "$ref": "#/groups/5"
+        "$ref": "#/texts/3"
       },
       {
-        "$ref": "#/groups/7"
+        "$ref": "#/texts/4"
       },
       {
-        "$ref": "#/groups/9"
+        "$ref": "#/texts/5"
       },
       {
-        "$ref": "#/groups/11"
+        "$ref": "#/texts/6"
       },
       {
-        "$ref": "#/groups/13"
+        "$ref": "#/texts/7"
       },
       {
-        "$ref": "#/groups/15"
+        "$ref": "#/texts/8"
       },
       {
-        "$ref": "#/groups/17"
+        "$ref": "#/texts/9"
       },
       {
-        "$ref": "#/groups/19"
+        "$ref": "#/texts/10"
       },
       {
-        "$ref": "#/groups/21"
+        "$ref": "#/texts/11"
       },
       {
-        "$ref": "#/groups/23"
+        "$ref": "#/texts/12"
+      },
+      {
+        "$ref": "#/texts/13"
+      },
+      {
+        "$ref": "#/texts/14"
+      },
+      {
+        "$ref": "#/texts/15"
+      },
+      {
+        "$ref": "#/texts/16"
       }
     ],
     "content_layer": "body",
     "name": "_root_",
     "label": "unspecified"
   },
-  "groups": [
+  "groups": [],
+  "texts": [
     {
-      "self_ref": "#/groups/0",
+      "self_ref": "#/texts/0",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/0"
-        },
-        {
-          "$ref": "#/texts/1"
-        },
-        {
-          "$ref": "#/groups/1"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/2"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/3"
+          "kind": "track",
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "OK,",
+      "text": "OK,"
     },
     {
-      "self_ref": "#/groups/2",
+      "self_ref": "#/texts/1",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/4"
-        },
-        {
-          "$ref": "#/texts/5"
-        },
-        {
-          "$ref": "#/groups/3"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/3",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/6"
-        },
-        {
-          "$ref": "#/texts/7"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    },
-    {
-      "self_ref": "#/groups/4",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/8"
-        },
-        {
-          "$ref": "#/texts/9"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/10"
+          "kind": "track",
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
+      "orig": "I think now we should be recording",
+      "text": "I think now we should be recording"
     },
     {
-      "self_ref": "#/groups/5",
+      "self_ref": "#/texts/2",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/11"
-        },
-        {
-          "$ref": "#/texts/12"
-        },
-        {
-          "$ref": "#/groups/6"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/6",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/13"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/14"
+          "kind": "track",
+          "start_time": 8.571,
+          "end_time": 9.403,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "properly.",
+      "text": "properly."
     },
     {
-      "self_ref": "#/groups/7",
+      "self_ref": "#/texts/3",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/15"
-        },
-        {
-          "$ref": "#/texts/16"
-        },
-        {
-          "$ref": "#/groups/8"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/8",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/17"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/18"
+          "kind": "track",
+          "start_time": 10.683,
+          "end_time": 11.563,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Good.",
+      "text": "Good."
     },
     {
-      "self_ref": "#/groups/9",
+      "self_ref": "#/texts/4",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/19"
-        },
-        {
-          "$ref": "#/texts/20"
-        },
-        {
-          "$ref": "#/groups/10"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/10",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/21"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/22"
+          "kind": "track",
+          "start_time": 13.363,
+          "end_time": 13.803,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Yeah.",
+      "text": "Yeah."
     },
     {
-      "self_ref": "#/groups/11",
+      "self_ref": "#/texts/5",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/23"
-        },
-        {
-          "$ref": "#/texts/24"
-        },
-        {
-          "$ref": "#/groups/12"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/12",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/25"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/26"
+          "kind": "track",
+          "start_time": 49.603,
+          "end_time": 53.363,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "I was also thinking.",
+      "text": "I was also thinking."
     },
     {
-      "self_ref": "#/groups/13",
+      "self_ref": "#/texts/6",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/27"
-        },
-        {
-          "$ref": "#/texts/28"
-        },
-        {
-          "$ref": "#/groups/14"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/14",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/29"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/30"
+          "kind": "track",
+          "start_time": 54.963,
+          "end_time": 62.072,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Would be maybe good to create items,",
+      "text": "Would be maybe good to create items,"
     },
     {
-      "self_ref": "#/groups/15",
+      "self_ref": "#/texts/7",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/31"
-        },
-        {
-          "$ref": "#/texts/32"
-        },
-        {
-          "$ref": "#/groups/16"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/16",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/33"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/34"
+          "kind": "track",
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "some metadata,",
+      "text": "some metadata,"
     },
     {
-      "self_ref": "#/groups/17",
+      "self_ref": "#/texts/8",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/35"
-        },
-        {
-          "$ref": "#/texts/36"
-        },
-        {
-          "$ref": "#/groups/18"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/18",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/37"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/38"
+          "kind": "track",
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "some options that can be specific.",
+      "text": "some options that can be specific."
     },
     {
-      "self_ref": "#/groups/19",
+      "self_ref": "#/texts/9",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/39"
-        },
-        {
-          "$ref": "#/texts/40"
-        },
-        {
-          "$ref": "#/groups/20"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/20",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/41"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/42"
+          "kind": "track",
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Yeah,",
+      "text": "Yeah,"
     },
     {
-      "self_ref": "#/groups/21",
+      "self_ref": "#/texts/10",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/43"
-        },
-        {
-          "$ref": "#/texts/44"
-        },
-        {
-          "$ref": "#/groups/22"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/22",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/45"
-        },
+      "label": "text",
+      "prov": [],
+      "source": [
         {
-          "$ref": "#/texts/46"
+          "kind": "track",
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "I mean I think you went even more than",
+      "text": "I mean I think you went even more than"
     },
     {
-      "self_ref": "#/groups/23",
+      "self_ref": "#/texts/11",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/47"
-        },
-        {
-          "$ref": "#/texts/48"
-        },
-        {
-          "$ref": "#/groups/24"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/24",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/49"
-        },
-        {
-          "$ref": "#/texts/50"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    }
-  ],
-  "texts": [
-    {
-      "self_ref": "#/texts/0",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
-    },
-    {
-      "self_ref": "#/texts/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "00:00:04.963 --> 00:00:08.571",
-      "text": "00:00:04.963 --> 00:00:08.571"
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 70.563,
+          "end_time": 72.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "But we preserved the atoms.",
+      "text": "But we preserved the atoms."
     },
     {
-      "self_ref": "#/texts/2",
+      "self_ref": "#/texts/12",
       "parent": {
-        "$ref": "#/groups/1"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "than me.",
+      "text": "than me."
     },
     {
-      "self_ref": "#/texts/3",
+      "self_ref": "#/texts/13",
       "parent": {
-        "$ref": "#/groups/1"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "OK, I think now we should be recording",
-      "text": "OK, I think now we should be recording",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "I just opened the format.",
+      "text": "I just opened the format."
     },
     {
-      "self_ref": "#/texts/4",
+      "self_ref": "#/texts/14",
       "parent": {
-        "$ref": "#/groups/2"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1"
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 110.222,
+          "end_time": 111.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "give it a try, yeah.",
+      "text": "give it a try, yeah."
     },
     {
-      "self_ref": "#/texts/5",
+      "self_ref": "#/texts/15",
       "parent": {
-        "$ref": "#/groups/2"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
-      "orig": "00:00:08.571 --> 00:00:09.403",
-      "text": "00:00:08.571 --> 00:00:09.403"
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 112.043,
+          "end_time": 115.043,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "Okay, talk to you later.",
+      "text": "Okay, talk to you later."
     },
     {
-      "self_ref": "#/texts/6",
+      "self_ref": "#/texts/16",
       "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/7",
-      "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "properly.",
-      "text": "properly.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/8",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
-    },
-    {
-      "self_ref": "#/texts/9",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:10.683 --> 00:00:11.563",
-      "text": "00:00:10.683 --> 00:00:11.563"
-    },
-    {
-      "self_ref": "#/texts/10",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Good.",
-      "text": "Good.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/11",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0"
-    },
-    {
-      "self_ref": "#/texts/12",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:13.363 --> 00:00:13.803",
-      "text": "00:00:13.363 --> 00:00:13.803"
-    },
-    {
-      "self_ref": "#/texts/13",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/14",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Yeah.",
-      "text": "Yeah.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/15",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0"
-    },
-    {
-      "self_ref": "#/texts/16",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:49.603 --> 00:00:53.363",
-      "text": "00:00:49.603 --> 00:00:53.363"
-    },
-    {
-      "self_ref": "#/texts/17",
-      "parent": {
-        "$ref": "#/groups/8"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/18",
-      "parent": {
-        "$ref": "#/groups/8"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "I was also thinking.",
-      "text": "I was also thinking.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/19",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0"
-    },
-    {
-      "self_ref": "#/texts/20",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:54.963 --> 00:01:02.072",
-      "text": "00:00:54.963 --> 00:01:02.072"
-    },
-    {
-      "self_ref": "#/texts/21",
-      "parent": {
-        "$ref": "#/groups/10"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/22",
-      "parent": {
-        "$ref": "#/groups/10"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Would be maybe good to create items,",
-      "text": "Would be maybe good to create items,",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/23",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1"
-    },
-    {
-      "self_ref": "#/texts/24",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:02.072 --> 00:01:06.811",
-      "text": "00:01:02.072 --> 00:01:06.811"
-    },
-    {
-      "self_ref": "#/texts/25",
-      "parent": {
-        "$ref": "#/groups/12"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/26",
-      "parent": {
-        "$ref": "#/groups/12"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "some metadata, some options that can be specific.",
-      "text": "some metadata, some options that can be specific.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/27",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0"
-    },
-    {
-      "self_ref": "#/texts/28",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:10.243 --> 00:01:13.014",
-      "text": "00:01:10.243 --> 00:01:13.014"
-    },
-    {
-      "self_ref": "#/texts/29",
-      "parent": {
-        "$ref": "#/groups/14"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/30",
-      "parent": {
-        "$ref": "#/groups/14"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Yeah, I mean I think you went even more than",
-      "text": "Yeah, I mean I think you went even more than",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/31",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0"
-    },
-    {
-      "self_ref": "#/texts/32",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:10.563 --> 00:01:12.643",
-      "text": "00:01:10.563 --> 00:01:12.643"
-    },
-    {
-      "self_ref": "#/texts/33",
-      "parent": {
-        "$ref": "#/groups/16"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/34",
-      "parent": {
-        "$ref": "#/groups/16"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "But we preserved the atoms.",
-      "text": "But we preserved the atoms.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/35",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1"
-    },
-    {
-      "self_ref": "#/texts/36",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:13.014 --> 00:01:15.907",
-      "text": "00:01:13.014 --> 00:01:15.907"
-    },
-    {
-      "self_ref": "#/texts/37",
-      "parent": {
-        "$ref": "#/groups/18"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/38",
-      "parent": {
-        "$ref": "#/groups/18"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "than me. I just opened the format.",
-      "text": "than me. I just opened the format.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/39",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1"
-    },
-    {
-      "self_ref": "#/texts/40",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:50.222 --> 00:01:51.643",
-      "text": "00:01:50.222 --> 00:01:51.643"
-    },
-    {
-      "self_ref": "#/texts/41",
-      "parent": {
-        "$ref": "#/groups/20"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/42",
-      "parent": {
-        "$ref": "#/groups/20"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "give it a try, yeah.",
-      "text": "give it a try, yeah.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/43",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0"
-    },
-    {
-      "self_ref": "#/texts/44",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:52.043 --> 00:01:55.043",
-      "text": "00:01:52.043 --> 00:01:55.043"
-    },
-    {
-      "self_ref": "#/texts/45",
-      "parent": {
-        "$ref": "#/groups/22"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/46",
-      "parent": {
-        "$ref": "#/groups/22"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Okay, talk to you later.",
-      "text": "Okay, talk to you later.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/47",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0"
-    },
-    {
-      "self_ref": "#/texts/48",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:54.603 --> 00:01:55.283",
-      "text": "00:01:54.603 --> 00:01:55.283"
-    },
-    {
-      "self_ref": "#/texts/49",
-      "parent": {
-        "$ref": "#/groups/24"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/50",
-      "parent": {
-        "$ref": "#/groups/24"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
       "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 114.603,
+          "end_time": 115.283,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
+          "voice": "Speaker A"
+        }
+      ],
       "orig": "See you.",
-      "text": "See you.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "text": "See you."
     }
   ],
   "pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
index 859a6dde3f..b58d350b3d 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -1,77 +1,33 @@
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+OK,
 
-00:00:04.963 --> 00:00:08.571
+I think now we should be recording
 
-Speaker A:  OK, I think now we should be recording
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
-
-00:00:08.571 --> 00:00:09.403
-
-Speaker A:  properly.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
-
-00:00:10.683 --> 00:00:11.563
+properly.
 
 Good.
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
-
-00:00:13.363 --> 00:00:13.803
-
-Speaker A:  Yeah.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
-
-00:00:49.603 --> 00:00:53.363
-
-Speaker B:  I was also thinking.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
-
-00:00:54.963 --> 00:01:02.072
-
-Speaker B:  Would be maybe good to create items,
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
-
-00:01:02.072 --> 00:01:06.811
-
-Speaker B:  some metadata, some options that can be specific.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
-
-00:01:10.243 --> 00:01:13.014
-
-Speaker A:  Yeah, I mean I think you went even more than
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
-
-00:01:10.563 --> 00:01:12.643
-
-Speaker B:  But we preserved the atoms.
+Yeah.
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+I was also thinking.
 
-00:01:13.014 --> 00:01:15.907
+Would be maybe good to create items,
 
-Speaker A:  than me. I just opened the format.
+some metadata,
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+some options that can be specific.
 
-00:01:50.222 --> 00:01:51.643
+Yeah,
 
-Speaker A:  give it a try, yeah.
+I mean I think you went even more than
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+But we preserved the atoms.
 
-00:01:52.043 --> 00:01:55.043
+than me.
 
-Speaker B:  Okay, talk to you later.
+I just opened the format.
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+give it a try, yeah.
 
-00:01:54.603 --> 00:01:55.283
+Okay, talk to you later.
 
-Speaker A:  See you.
\ No newline at end of file
+See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt
new file mode 100644
index 0000000000..93feba5e9a
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt
@@ -0,0 +1,14 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: text: Last night the chef surprised us with a culinary adventure.
+  item-2 at level 1: inline: group WebVTT cue span
+    item-3 at level 2: text: The waiter offered a 
+    item-4 at level 2: text: steaming bowl of 
+    item-5 at level 2: text: paella
+    item-6 at level 2: text:  that instantly transported the diners to a sunny Mediterranean coast.
+  item-7 at level 1: inline: group WebVTT cue span
+    item-8 at level 2: text: The dessert’s 
+    item-9 at level 2: text: unexpected
+    item-10 at level 2: text:  
+    item-11 at level 2: text: arcobaleno
+    item-12 at level 2: text:  of flavors
+    item-13 at level 2: text:  left everyone in awe.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json
new file mode 100644
index 0000000000..3a07d69e9b
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json
@@ -0,0 +1,366 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_04",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 5389775195091554844,
+    "filename": "webvtt_example_04.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/groups/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/1"
+        },
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/5"
+        },
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        },
+        {
+          "$ref": "#/texts/9"
+        },
+        {
+          "$ref": "#/texts/10"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14580.0,
+          "end_time": 14760.0,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "Last night the chef surprised us with a culinary adventure.",
+      "text": "Last night the chef surprised us with a culinary adventure."
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The waiter offered a ",
+      "text": "The waiter offered a "
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "steaming bowl of ",
+      "text": "steaming bowl of ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "languages": [
+            "es-ES"
+          ]
+        }
+      ],
+      "orig": "paella",
+      "text": "paella",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " that instantly transported the diners to a sunny Mediterranean coast.",
+      "text": " that instantly transported the diners to a sunny Mediterranean coast."
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The dessert’s ",
+      "text": "The dessert’s "
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "classes": [
+            "b.loud"
+          ]
+        }
+      ],
+      "orig": "unexpected",
+      "text": "unexpected",
+      "formatting": {
+        "bold": true,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " ",
+      "text": " ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "languages": [
+            "it"
+          ]
+        }
+      ],
+      "orig": "arcobaleno",
+      "text": "arcobaleno",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": true,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " of flavors",
+      "text": " of flavors",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "source": [
+        {
+          "kind": "track",
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " left everyone in awe.",
+      "text": " left everyone in awe."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md
new file mode 100644
index 0000000000..f2312a059c
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md
@@ -0,0 +1,5 @@
+Last night the chef surprised us with a culinary adventure.
+
+The waiter offered a  *steaming bowl of * *paella*  that instantly transported the diners to a sunny Mediterranean coast.
+
+The dessert’s  ***unexpected*** * * *arcobaleno* * of flavors*  left everyone in awe.
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_02.vtt b/tests/data/webvtt/webvtt_example_02.vtt
index 1152a1e8fa..6bd1821011 100644
--- a/tests/data/webvtt/webvtt_example_02.vtt
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -12,4 +12,7 @@ NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 <v Esme>Hee!</v> <i>laughter</i>
 
 00:06.000 --> 00:08.000
-<v.loud Mary>That’s awesome!
\ No newline at end of file
+<v.loud Mary>That’s awesome!
+
+00:08.000 --> 00:10.000
+Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_04.vtt b/tests/data/webvtt/webvtt_example_04.vtt
new file mode 100644
index 0000000000..fd7b788c06
--- /dev/null
+++ b/tests/data/webvtt/webvtt_example_04.vtt
@@ -0,0 +1,10 @@
+WEBVTT
+
+agcvs-08234
+04:03:00.000 --> 04:06:00.000
+Last night the chef surprised us with a culinary adventure.
+
+agcvs-08234
+04:06:00.000 --> 04:06:58.239
+The waiter offered a <i>steaming bowl of <lang es-ES>paella</lang></i> that instantly transported the diners to a sunny Mediterranean coast.
+The dessert’s <i><b.loud>unexpected</b> <u><lang it>arcobaleno</lang></u> of flavors</i> left everyone in awe.
\ No newline at end of file
diff --git a/tests/test_backend_vtt.py b/tests/test_backend_vtt.py
index a910671bb5..cadcef9b33 100644
--- a/tests/test_backend_vtt.py
+++ b/tests/test_backend_vtt.py
@@ -1,21 +1,12 @@
-# Assisted by watsonx Code Assistant
-
+import warnings
+from io import BytesIO
 from pathlib import Path
 
 import pytest
-from docling_core.types.doc import DoclingDocument
-from pydantic import ValidationError
-
-from docling.backend.webvtt_backend import (
-    _WebVTTCueItalicSpan,
-    _WebVTTCueTextSpan,
-    _WebVTTCueTimings,
-    _WebVTTCueVoiceSpan,
-    _WebVTTFile,
-    _WebVTTTimestamp,
-)
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import ConversionResult
+from docling_core.types.doc import DoclingDocument, GroupItem, TextItem
+
+from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.datamodel.document import ConversionResult, _DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
 from .test_data_gen_flag import GEN_TEST_DATA
@@ -24,187 +15,6 @@
 GENERATE = GEN_TEST_DATA
 
 
-def test_vtt_cue_commponents():
-    """Test WebVTT components."""
-    valid_timestamps = [
-        "00:01:02.345",
-        "12:34:56.789",
-        "02:34.567",
-        "00:00:00.000",
-    ]
-    valid_total_seconds = [
-        1 * 60 + 2.345,
-        12 * 3600 + 34 * 60 + 56.789,
-        2 * 60 + 34.567,
-        0.0,
-    ]
-    for idx, ts in enumerate(valid_timestamps):
-        model = _WebVTTTimestamp(raw=ts)
-        assert model.seconds == valid_total_seconds[idx]
-
-    """Test invalid WebVTT timestamps."""
-    invalid_timestamps = [
-        "00:60:02.345",  # minutes > 59
-        "00:01:60.345",  # seconds > 59
-        "00:01:02.1000",  # milliseconds > 999
-        "01:02:03",  # missing milliseconds
-        "01:02",  # missing milliseconds
-        ":01:02.345",  # extra : for missing hours
-        "abc:01:02.345",  # invalid format
-    ]
-    for ts in invalid_timestamps:
-        with pytest.raises(ValidationError):
-            _WebVTTTimestamp(raw=ts)
-
-    """Test the timestamp __str__ method."""
-    model = _WebVTTTimestamp(raw="00:01:02.345")
-    assert str(model) == "00:01:02.345"
-
-    """Test valid cue timings."""
-    start = _WebVTTTimestamp(raw="00:10.005")
-    end = _WebVTTTimestamp(raw="00:14.007")
-    cue_timings = _WebVTTCueTimings(start=start, end=end)
-    assert cue_timings.start == start
-    assert cue_timings.end == end
-    assert str(cue_timings) == "00:10.005 --> 00:14.007"
-
-    """Test invalid cue timings with end timestamp before start."""
-    start = _WebVTTTimestamp(raw="00:10.700")
-    end = _WebVTTTimestamp(raw="00:10.500")
-    with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start, end=end)
-    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
-
-    """Test invalid cue timings with missing end."""
-    start = _WebVTTTimestamp(raw="00:10.500")
-    with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start)
-    assert "Field required" in str(excinfo.value)
-
-    """Test invalid cue timings with missing start."""
-    end = _WebVTTTimestamp(raw="00:10.500")
-    with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(end=end)
-    assert "Field required" in str(excinfo.value)
-
-    """Test with valid text."""
-    valid_text = "This is a valid cue text span."
-    span = _WebVTTCueTextSpan(text=valid_text)
-    assert span.text == valid_text
-    assert str(span) == valid_text
-
-    """Test with text containing newline characters."""
-    invalid_text = "This cue text span\ncontains a newline."
-    with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
-
-    """Test with text containing ampersand."""
-    invalid_text = "This cue text span contains &."
-    with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
-
-    """Test with text containing less-than sign."""
-    invalid_text = "This cue text span contains <."
-    with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
-
-    """Test with empty text."""
-    with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text="")
-
-    """Test that annotation validation works correctly."""
-    valid_annotation = "valid-annotation"
-    invalid_annotation = "invalid\nannotation"
-    with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
-    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
-
-    """Test that classes validation works correctly."""
-    annotation = "speaker name"
-    valid_classes = ["class1", "class2"]
-    invalid_classes = ["class\nwith\nnewlines", ""]
-    with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
-    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
-
-    """Test that components validation works correctly."""
-    annotation = "speaker name"
-    valid_components = [_WebVTTCueTextSpan(text="random text")]
-    invalid_components = [123, "not a component"]
-    with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
-    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
-
-    """Test valid cue voice spans."""
-    cue_span = _WebVTTCueVoiceSpan(
-        annotation="speaker",
-        classes=["loud", "clear"],
-        components=[_WebVTTCueTextSpan(text="random text")],
-    )
-
-    expected_str = "<v.loud.clear speaker>random text</v>"
-    assert str(cue_span) == expected_str
-
-    cue_span = _WebVTTCueVoiceSpan(
-        annotation="speaker",
-        components=[_WebVTTCueTextSpan(text="random text")],
-    )
-    expected_str = "<v speaker>random text</v>"
-    assert str(cue_span) == expected_str
-
-
-def test_webvtt_file():
-    """Test WebVTT files."""
-    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
-        content = f.read()
-        vtt = _WebVTTFile.parse(content)
-    assert len(vtt) == 13
-    block = vtt.cue_blocks[11]
-    assert str(block.timings) == "00:32.500 --> 00:33.500"
-    assert len(block.payload) == 1
-    cue_span = block.payload[0]
-    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
-    assert cue_span.annotation == "Neil deGrasse Tyson"
-    assert not cue_span.classes
-    assert len(cue_span.components) == 1
-    comp = cue_span.components[0]
-    assert isinstance(comp, _WebVTTCueItalicSpan)
-    assert len(comp.components) == 1
-    comp2 = comp.components[0]
-    assert isinstance(comp2, _WebVTTCueTextSpan)
-    assert comp2.text == "Laughs"
-
-    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
-        content = f.read()
-        vtt = _WebVTTFile.parse(content)
-    assert len(vtt) == 4
-    reverse = (
-        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
-        "https://www.w3.org/TR/webvtt1/\n\n"
-    )
-    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
-    assert content == reverse
-
-    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
-        content = f.read()
-        vtt = _WebVTTFile.parse(content)
-    assert len(vtt) == 13
-    for block in vtt:
-        assert block.identifier
-    block = vtt.cue_blocks[0]
-    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
-    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
-    assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
-    block = vtt.cue_blocks[2]
-    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
-    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
-    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
-    assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
-    assert block.payload[0].text == "Good."
-
-
 def test_e2e_vtt_conversions():
     directory = Path("./tests/data/webvtt/")
     vtt_paths = sorted(directory.rglob("*.vtt"))
@@ -230,3 +40,252 @@ def test_e2e_vtt_conversions():
         )
 
         assert verify_document(doc, str(gt_path) + ".json", GENERATE)
+
+
+def _create_vtt_stream(content: str) -> DocumentStream:
+    """Build a stream from stripped ``content``; assert it is detected as VTT."""
+    vtt_stream = DocumentStream(name="test.vtt", stream=BytesIO(content.strip().encode()))
+    guesser = _DocumentConversionInput(path_or_stream_iterator=[])
+    assert guesser._guess_format(vtt_stream) == InputFormat.VTT
+    return vtt_stream
+
+
+def _process_vtt_doc(doc: DoclingDocument) -> str:
+    """Concatenate the text of every track-sourced text item in ``doc``."""
+    pieces: list[str] = []
+    for entry in doc.texts:
+        if not (isinstance(entry, TextItem) and entry.source):
+            continue
+        if entry.source[0].kind != "track":
+            continue
+        owner = entry.parent.resolve(doc)
+        if owner and isinstance(owner, GroupItem):
+            pieces.append(" ")  # separator goes before any item owned by a group
+        pieces.append(entry.text)
+
+    return "".join(pieces).strip()
+
+
+@pytest.fixture(scope="module")
+def converter() -> DocumentConverter:
+    return DocumentConverter()  # module scope: one instance shared by these tests
+
+
+def test_simple_two_cues_basic(converter):
+    """Two plain cues flatten to their payloads separated by one space."""
+    cues = """
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+Hello world!
+
+00:00:02.500 --> 00:00:04.000
+Second cue.
+"""
+    converted = converter.convert(_create_vtt_stream(cues))
+    flattened = _process_vtt_doc(converted.document)
+
+    assert flattened == "Hello world! Second cue."
+
+
+def test_cue_ids_present_are_ignored_in_output(converter):
+    """Cue identifiers ("1", "2") do not show up in the flattened text."""
+    cues = """
+WEBVTT
+
+1
+00:00:00.000 --> 00:00:01.000
+First with ID.
+
+2
+00:00:01.250 --> 00:00:02.000
+Second with ID.
+"""
+    converted = converter.convert(_create_vtt_stream(cues))
+    flattened = _process_vtt_doc(converted.document)
+
+    assert flattened == "First with ID. Second with ID."
+
+
+def test_multi_line_cue_text_preserved(converter):
+    """Every payload line of a multi-line cue appears, space-separated."""
+    cues = """
+WEBVTT
+
+00:00:00.000 --> 00:00:03.000
+This is line one.
+This is line two.
+
+00:00:03.500 --> 00:00:05.000
+Another cue line one.
+Another cue line two.
+"""
+    doc = converter.convert(_create_vtt_stream(cues)).document
+    want = "This is line one. This is line two. Another cue line one. Another cue line two."
+
+    assert _process_vtt_doc(doc) == want
+
+
+def test_styling_and_voice_tags_stripped(converter):
+    """Voice, class and formatting tags are dropped; their inner text remains."""
+    cues = """
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+<v Roger><b>Hello</b> <i>there</i><u>!</u></v>
+
+00:00:02.200 --> 00:00:04.000
+<c.red>Styled</c> and <v Ann>voiced</v> text.
+"""
+    converted = converter.convert(_create_vtt_stream(cues))
+
+    # Tags are removed and inner text kept. Intended output once spacing is
+    # preserved: "Hello there! Styled and voiced text."
+    # TODO: temporary ground truth (issue docling-project/docling-core/#371)
+    expected = "Hello   there ! Styled  and  voiced  text."
+    assert _process_vtt_doc(converted.document) == expected
+
+
+def test_blank_cue_contributes_no_text(converter):
+    """A second cue with zero transcript lines adds nothing to the text."""
+    cues = """
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+Visible text.
+
+00:00:02.500 --> 00:00:04.000
+
+"""
+    converted = converter.convert(_create_vtt_stream(cues))
+    flattened = _process_vtt_doc(converted.document)
+
+    # Only the first cue carries payload text.
+    assert flattened == "Visible text."
+
+
+def test_note_blocks_are_ignored(converter):
+    """NOTE blocks before and between cues leave no trace in the text."""
+    cues = """
+WEBVTT
+
+
+NOTE This is a file-level note
+It can span multiple lines.
+
+
+00:00:00.000 --> 00:00:02.000
+First cue text.
+
+
+NOTE Another note between cues
+
+
+00:00:02.500 --> 00:00:04.000
+Second cue text.
+"""
+    converted = converter.convert(_create_vtt_stream(cues))
+    flattened = _process_vtt_doc(converted.document)
+
+    assert flattened == "First cue text. Second cue text."
+
+
+def test_region_block_ignored_but_region_reference_ok(converter):
+    """A REGION block and a cue referencing it do not leak into the text."""
+    cues = """
+WEBVTT
+
+REGION
+id:top
+width:40%
+lines:3
+
+00:00:00.000 --> 00:00:02.000 region:top line:90% position:50% size:35% align:start
+Top region text.
+
+00:00:02.500 --> 00:00:04.000
+Normal region text.
+"""
+    converted = converter.convert(_create_vtt_stream(cues))
+    flattened = _process_vtt_doc(converted.document)
+
+    assert flattened == "Top region text. Normal region text."
+
+
+def test_varied_timestamp_formats_and_settings_ignored(converter):
+    """Cue one uses MM:SS.mmm; cue two uses HH:MM:SS.mmm plus cue settings."""
+    cues = """
+WEBVTT
+
+00:01.000 --> 00:03.000
+Under one minute format.
+
+01:00:00.000 --> 01:00:02.000 line:0 position:10% align:end
+Hour format with settings.
+"""
+    converted = converter.convert(_create_vtt_stream(cues))
+    flattened = _process_vtt_doc(converted.document)
+
+    # Neither the timings nor the trailing cue settings reach the text.
+    assert flattened == "Under one minute format. Hour format with settings."
+
+
+def test_cue_ids_plus_multiline_with_voice_and_style(converter):
+    """Combine cue IDs, multi-line payloads, voice tags and style tags."""
+    cues = """
+WEBVTT
+
+
+
+intro
+00:00:00.000 --> 00:00:02.000
+<v Narrator><i>Welcome</i> to the show.</v>
+<b>Enjoy</b> your time.
+
+
+
+outro
+00:00:02.500 --> 00:00:04.000
+<v Host>Goodbye</v>, see you <u>soon</u>.
+"""
+    converted = converter.convert(_create_vtt_stream(cues))
+    flattened = _process_vtt_doc(converted.document)
+
+    # Intended: "Welcome to the show. Enjoy your time. Goodbye, see you soon."
+    # TODO: temporary ground truth (issue docling-project/docling-core/#371)
+    expected = "Welcome  to the show. Enjoy  your time. Goodbye , see you  soon ."
+    assert flattened == expected
+
+
+def test_style_blocks_and_note_between_styles_are_ignored(converter):
+    """STYLE blocks and a NOTE between them are skipped without warnings."""
+    cues = """
+WEBVTT
+
+STYLE
+::cue {
+  background-image: linear-gradient(to bottom, dimgray, lightgray);
+  color: papayawhip;
+}
+/* Style blocks cannot use blank lines nor "dash dash greater than" */
+
+NOTE comment blocks can be used between style blocks.
+
+STYLE
+::cue(b) {
+    color: peachpuff;
+}
+
+hello
+00:00:00.000 --> 00:00:10.000
+Hello <b>world</b>.
+"""
+    vtt_stream = _create_vtt_stream(cues)
+    with warnings.catch_warnings():
+        # Escalate warnings to errors: STYLE/NOTE blocks must be skipped silently.
+        warnings.simplefilter("error")
+        converted = converter.convert(vtt_stream)
+    # Intended output: "Hello world."
+    # TODO: temporary ground truth (issue docling-project/docling-core/#371)
+    expected = "Hello  world ."
+    assert _process_vtt_doc(converted.document) == expected
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 93f33e1fd1..5f559b511c 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -241,6 +241,20 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
 
             # TODO: add bbox check with tolerance
 
+        # Validate source
+        assert bool(true_item.source) == bool(pred_item.source), (
+            "Source exists mismatch"
+        )
+        if true_item.source:
+            true_source = true_item.source[0]
+            pred_source = pred_item.source[0]
+            assert true_source.start_time == pred_source.start_time, (
+                "TrackProvenance start time mismatch"
+            )
+            assert true_source.end_time == pred_source.end_time, (
+                "TrackProvenance end time mismatch"
+            )
+
         # Validate text content
         if isinstance(true_item, TextItem):
             assert isinstance(pred_item, TextItem), (
diff --git a/uv.lock b/uv.lock
index f393b112f0..52390d581c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1035,7 +1035,7 @@ requires-dist = [
     { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
     { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
     { name = "certifi", specifier = ">=2024.7.4" },
-    { name = "docling-core", extras = ["chunking"], specifier = ">=2.58.0,<3.0.0" },
+    { name = "docling-core", extras = ["chunking"], specifier = ">=2.62.0,<3.0.0" },
     { name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
     { name = "docling-parse", specifier = ">=4.7.0,<5.0.0" },
     { name = "easyocr", marker = "extra == 'easyocr'", specifier = ">=1.7,<2.0" },
@@ -1119,7 +1119,7 @@ examples = [
 
 [[package]]
 name = "docling-core"
-version = "2.60.2"
+version = "2.62.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "jsonref" },
@@ -1133,9 +1133,9 @@ dependencies = [
     { name = "typer" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/7d/e6/7ed57bc580f136db0a7457305ec63366f22c999b674ef5f7c0abe452d79f/docling_core-2.60.2.tar.gz", hash = "sha256:7a99e1671e796e39d0c735b7ae3833766a97ad287e15d434dfa417917e3b0e6d", size = 231978, upload-time = "2026-01-23T12:29:18.506Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/21/20d58a48f4baa9e16d49aaccf3048346a8e7833b65b09144315bf1d956db/docling_core-2.62.0.tar.gz", hash = "sha256:147c958fe3b552db5e78b5a301dba19349820066ec5ef189b67eb5ed00306a07", size = 250107, upload-time = "2026-01-30T14:01:44.448Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/fa/5f/d39dd904b602f3a4072f1a7c38636702c32ed36d49aaafb21ea059face28/docling_core-2.60.2-py3-none-any.whl", hash = "sha256:63aee783f06240455c12c30e9af383b80d7ade80c896f81d68a4aff6cde2e2a1", size = 222319, upload-time = "2026-01-23T12:29:17.109Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/89/e5204af5669e6b73bfdf304fc3e4c6b4b98b10d06b8bd7dc186b5190c9f3/docling_core-2.62.0-py3-none-any.whl", hash = "sha256:0073ccbd0c9cf514b38be7d53ccd78ee7b92723294a623a3f36eb7a7aea67bf0", size = 238084, upload-time = "2026-01-30T14:01:43.059Z" },
 ]
 
 [package.optional-dependencies]