diff --git a/docling/backend/webvtt_backend.py b/docling/backend/webvtt_backend.py
index 2a7d02ce74..001fc3eac8 100644
--- a/docling/backend/webvtt_backend.py
+++ b/docling/backend/webvtt_backend.py
@@ -1,8 +1,7 @@
import logging
-import re
+from dataclasses import dataclass, field
from io import BytesIO
from pathlib import Path
-from typing import Annotated, ClassVar, Literal, Optional, Union, cast
from docling_core.types.doc import (
ContentLayer,
@@ -10,12 +9,19 @@
DoclingDocument,
DocumentOrigin,
Formatting,
- GroupLabel,
- NodeItem,
+ TrackSource,
)
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-from pydantic.types import StringConstraints
-from typing_extensions import Self, override
+from docling_core.types.doc.webvtt import (
+ WebVTTCueBoldSpan,
+ WebVTTCueComponent,
+ WebVTTCueComponentWithTerminator,
+ WebVTTCueItalicSpan,
+ WebVTTCueTextSpan,
+ WebVTTCueUnderlineSpan,
+ WebVTTCueVoiceSpan,
+ WebVTTFile,
+)
+from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@@ -24,409 +30,23 @@
_log = logging.getLogger(__name__)
-class _WebVTTTimestamp(BaseModel):
- """Model representing a WebVTT timestamp.
-
- A WebVTT timestamp is always interpreted relative to the current playback position
- of the media data that the WebVTT file is to be synchronized with.
- """
-
- model_config = ConfigDict(regex_engine="python-re")
-
- raw: Annotated[
- str,
- Field(
- description="A representation of the WebVTT Timestamp as a single string"
- ),
- ]
-
- _pattern: ClassVar[re.Pattern] = re.compile(
- r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
- )
- _hours: int
- _minutes: int
- _seconds: int
- _millis: int
-
- @model_validator(mode="after")
- def validate_raw(self) -> Self:
- m = self._pattern.match(self.raw)
- if not m:
- raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
- self._hours = int(m.group(1)) if m.group(1) else 0
- self._minutes = int(m.group(2))
- self._seconds = int(m.group(3))
- self._millis = int(m.group(4))
-
- if self._minutes < 0 or self._minutes > 59:
- raise ValueError("Minutes must be between 0 and 59")
- if self._seconds < 0 or self._seconds > 59:
- raise ValueError("Seconds must be between 0 and 59")
-
- return self
-
- @property
- def seconds(self) -> float:
- """A representation of the WebVTT Timestamp in seconds"""
- return (
- self._hours * 3600
- + self._minutes * 60
- + self._seconds
- + self._millis / 1000.0
- )
-
- @override
- def __str__(self) -> str:
- return self.raw
-
-
-_WebVTTCueIdentifier = Annotated[
- str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
-]
-
-
-class _WebVTTCueTimings(BaseModel):
- """Model representating WebVTT cue timings."""
-
- start: Annotated[
- _WebVTTTimestamp, Field(description="Start time offset of the cue")
- ]
- end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
-
- @model_validator(mode="after")
- def check_order(self) -> Self:
- if self.start and self.end:
- if self.end.seconds <= self.start.seconds:
- raise ValueError("End timestamp must be greater than start timestamp")
- return self
-
- @override
- def __str__(self):
- return f"{self.start} --> {self.end}"
-
-
-class _WebVTTCueTextSpan(BaseModel):
- """Model representing a WebVTT cue text span."""
-
+@dataclass
+class AnnotatedText:
text: str
- span_type: Literal["text"] = "text"
-
- @field_validator("text", mode="after")
- @classmethod
- def validate_text(cls, value: str) -> str:
- if any(ch in value for ch in {"\n", "\r", "&", "<"}):
- raise ValueError("Cue text span contains invalid characters")
- if len(value) == 0:
- raise ValueError("Cue text span cannot be empty")
- return value
-
- @override
- def __str__(self):
- return self.text
-
-
-class _WebVTTCueVoiceSpan(BaseModel):
- """Model representing a WebVTT cue voice span."""
-
- annotation: Annotated[
- str,
- Field(
- description=(
- "Cue span start tag annotation text representing the name of thevoice"
- )
- ),
- ]
- classes: Annotated[
- list[str],
- Field(description="List of classes representing the cue span's significance"),
- ] = []
- components: Annotated[
- list["_WebVTTCueComponent"],
- Field(description="The components representing the cue internal text"),
- ] = []
- span_type: Literal["v"] = "v"
-
- @field_validator("annotation", mode="after")
- @classmethod
- def validate_annotation(cls, value: str) -> str:
- if any(ch in value for ch in {"\n", "\r", "&", ">"}):
- raise ValueError(
- "Cue span start tag annotation contains invalid characters"
- )
- if not value:
- raise ValueError("Cue text span cannot be empty")
- return value
-
- @field_validator("classes", mode="after")
- @classmethod
- def validate_classes(cls, value: list[str]) -> list[str]:
- for item in value:
- if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
- raise ValueError(
- "A cue span start tag class contains invalid characters"
- )
- if not item:
- raise ValueError("Cue span start tag classes cannot be empty")
- return value
-
- @override
- def __str__(self):
- tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
- inner = "".join(str(span) for span in self.components)
- return f"<{tag} {self.annotation}>{inner}"
-
-
-class _WebVTTCueClassSpan(BaseModel):
- span_type: Literal["c"] = "c"
- components: list["_WebVTTCueComponent"]
-
- @override
- def __str__(self):
- inner = "".join(str(span) for span in self.components)
- return f"{inner}"
-
-
-class _WebVTTCueItalicSpan(BaseModel):
- span_type: Literal["i"] = "i"
- components: list["_WebVTTCueComponent"]
-
- @override
- def __str__(self):
- inner = "".join(str(span) for span in self.components)
- return f"{inner}"
-
-
-class _WebVTTCueBoldSpan(BaseModel):
- span_type: Literal["b"] = "b"
- components: list["_WebVTTCueComponent"]
-
- @override
- def __str__(self):
- inner = "".join(str(span) for span in self.components)
- return f"{inner}"
-
-
-class _WebVTTCueUnderlineSpan(BaseModel):
- span_type: Literal["u"] = "u"
- components: list["_WebVTTCueComponent"]
-
- @override
- def __str__(self):
- inner = "".join(str(span) for span in self.components)
- return f"{inner}"
-
-
-_WebVTTCueComponent = Annotated[
- Union[
- _WebVTTCueTextSpan,
- _WebVTTCueClassSpan,
- _WebVTTCueItalicSpan,
- _WebVTTCueBoldSpan,
- _WebVTTCueUnderlineSpan,
- _WebVTTCueVoiceSpan,
- ],
- Field(discriminator="span_type", description="The WebVTT cue component"),
-]
-
-
-class _WebVTTCueBlock(BaseModel):
- """Model representing a WebVTT cue block.
-
- The optional WebVTT cue settings list is not supported.
- The cue payload is limited to the following spans: text, class, italic, bold,
- underline, and voice.
- """
-
- model_config = ConfigDict(regex_engine="python-re")
-
- identifier: Optional[_WebVTTCueIdentifier] = Field(
- None, description="The WebVTT cue identifier"
- )
- timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
- payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
-
- _pattern_block: ClassVar[re.Pattern] = re.compile(
- r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
- )
- _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
- r"^\.[^\t\n\r &<>]+)?" # zero or more classes
- r"[ \t]+(?P[^\n\r&>]+)>" # required space and annotation
- )
-
- @field_validator("payload", mode="after")
- @classmethod
- def validate_payload(cls, payload):
- for voice in payload:
- if "-->" in str(voice):
- raise ValueError("Cue payload must not contain '-->'")
- return payload
-
- @classmethod
- def parse(cls, raw: str) -> "_WebVTTCueBlock":
- lines = raw.strip().splitlines()
- if not lines:
- raise ValueError("Cue block must have at least one line")
- identifier: Optional[_WebVTTCueIdentifier] = None
- timing_line = lines[0]
- if "-->" not in timing_line and len(lines) > 1:
- identifier = timing_line
- timing_line = lines[1]
- cue_lines = lines[2:]
- else:
- cue_lines = lines[1:]
-
- if "-->" not in timing_line:
- raise ValueError("Cue block must contain WebVTT cue timings")
-
- start, end = [t.strip() for t in timing_line.split("-->")]
- end = re.split(" |\t", end)[0] # ignore the cue settings list
- timings: _WebVTTCueTimings = _WebVTTCueTimings(
- start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
+ voice: str | None = None
+ formatting: Formatting | None = None
+
+ def copy_meta(self, text):
+ return AnnotatedText(
+ text=text,
+ voice=self.voice,
+ formatting=self.formatting.model_copy() if self.formatting else None,
)
- cue_text = " ".join(cue_lines).strip()
- if cue_text.startswith("" not in cue_text:
- # adding close tag for cue voice spans without end tag
- cue_text += ""
-
- stack: list[list[_WebVTTCueComponent]] = [[]]
- tag_stack: list[Union[str, tuple]] = []
-
- pos = 0
- matches = list(cls._pattern_block.finditer(cue_text))
- i = 0
- while i < len(matches):
- match = matches[i]
- if match.start() > pos:
- stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
- tag = match.group(0)
-
- if tag.startswith(("", "", "", "")):
- tag_type = tag[1:2]
- tag_stack.append(tag_type)
- stack.append([])
- elif tag == "":
- children = stack.pop()
- stack[-1].append(_WebVTTCueItalicSpan(components=children))
- tag_stack.pop()
- elif tag == "":
- children = stack.pop()
- stack[-1].append(_WebVTTCueBoldSpan(components=children))
- tag_stack.pop()
- elif tag == "":
- children = stack.pop()
- stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
- tag_stack.pop()
- elif tag == "":
- children = stack.pop()
- stack[-1].append(_WebVTTCueClassSpan(components=children))
- tag_stack.pop()
- elif tag.startswith(""))
- else:
- parts.append(str(span))
-
- return "".join(parts)
-
-
-class _WebVTTFile(BaseModel):
- """A model representing a WebVTT file."""
-
- cue_blocks: list[_WebVTTCueBlock]
-
- @staticmethod
- def verify_signature(content: str) -> bool:
- if not content:
- return False
- elif len(content) == 6:
- return content == "WEBVTT"
- elif len(content) > 6 and content.startswith("WEBVTT"):
- return content[6] in (" ", "\t", "\n")
- else:
- return False
-
- @classmethod
- def parse(cls, raw: str) -> "_WebVTTFile":
- # Normalize newlines to LF
- raw = raw.replace("\r\n", "\n").replace("\r", "\n")
-
- # Check WebVTT signature
- if not cls.verify_signature(raw):
- raise ValueError("Invalid WebVTT file signature")
- # Strip "WEBVTT" header line
- lines = raw.split("\n", 1)
- body = lines[1] if len(lines) > 1 else ""
- # Remove NOTE/STYLE/REGION blocks
- body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
- body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
-
- # Split into cue blocks
- raw_blocks = re.split(r"\n\s*\n", body.strip())
- cues: list[_WebVTTCueBlock] = []
- for block in raw_blocks:
- try:
- cues.append(_WebVTTCueBlock.parse(block))
- except ValueError as e:
- _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
-
- return cls(cue_blocks=cues)
-
- def __iter__(self):
- return iter(self.cue_blocks)
-
- def __getitem__(self, idx):
- return self.cue_blocks[idx]
-
- def __len__(self):
- return len(self.cue_blocks)
+@dataclass
+class AnnotatedPar:
+ items: list[AnnotatedText]
class WebVTTDocumentBackend(DeclarativeDocumentBackend):
@@ -440,7 +60,7 @@ class WebVTTDocumentBackend(DeclarativeDocumentBackend):
"""
@override
- def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
+ def __init__(self, in_doc: InputDocument, path_or_stream: BytesIO | Path):
super().__init__(in_doc, path_or_stream)
self.content: str = ""
@@ -458,7 +78,7 @@ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
@override
def is_valid(self) -> bool:
- return _WebVTTFile.verify_signature(self.content)
+ return WebVTTFile.verify_signature(self.content)
@classmethod
@override
@@ -476,38 +96,6 @@ def unload(self):
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.VTT}
- @staticmethod
- def _add_text_from_component(
- doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
- ) -> None:
- """Adds a TextItem to a document by extracting text from a cue span component.
-
- TODO: address nesting
- """
- formatting = Formatting()
- text = ""
- if isinstance(item, _WebVTTCueItalicSpan):
- formatting.italic = True
- elif isinstance(item, _WebVTTCueBoldSpan):
- formatting.bold = True
- elif isinstance(item, _WebVTTCueUnderlineSpan):
- formatting.underline = True
- if isinstance(item, _WebVTTCueTextSpan):
- text = item.text
- else:
- # TODO: address nesting
- text = "".join(
- [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)]
- )
- if text := text.strip():
- doc.add_text(
- label=DocItemLabel.TEXT,
- text=text,
- parent=parent,
- content_layer=ContentLayer.BODY,
- formatting=formatting,
- )
-
@override
def convert(self) -> DoclingDocument:
_log.debug("Starting WebVTT conversion...")
@@ -521,52 +109,100 @@ def convert(self) -> DoclingDocument:
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
- vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
- for block in vtt.cue_blocks:
- block_group = doc.add_group(
- label=GroupLabel.SECTION,
- name="WebVTT cue block",
- parent=None,
- content_layer=ContentLayer.BODY,
- )
- if block.identifier:
- doc.add_text(
- label=DocItemLabel.TEXT,
- text=str(block.identifier),
- parent=block_group,
- content_layer=ContentLayer.BODY,
+ vtt: WebVTTFile = WebVTTFile.parse(self.content)
+ cue_text: list[AnnotatedPar] = []
+ parents: list[AnnotatedText] = []
+
+ def _extract_components(
+ payload: list[WebVTTCueComponentWithTerminator],
+ ) -> None:
+ nonlocal cue_text, parents
+ if not cue_text:
+ cue_text.append(AnnotatedPar(items=[]))
+ par = cue_text[-1]
+ for comp in payload:
+ item: AnnotatedText = (
+ parents[-1].copy_meta("") if parents else AnnotatedText(text="")
)
+ component: WebVTTCueComponent = comp.component
+ if isinstance(component, WebVTTCueTextSpan):
+ item.text = component.text
+ par.items.append(item)
+ else:
+ # configure metadata based on span type
+ if isinstance(component, WebVTTCueBoldSpan):
+ item.formatting = item.formatting or Formatting()
+ item.formatting.bold = True
+
+ elif isinstance(component, WebVTTCueItalicSpan):
+ item.formatting = item.formatting or Formatting()
+ item.formatting.italic = True
+
+ elif isinstance(component, WebVTTCueUnderlineSpan):
+ item.formatting = item.formatting or Formatting()
+ item.formatting.underline = True
+
+ elif isinstance(component, WebVTTCueVoiceSpan):
+ # voice spans cannot be embedded
+ item.voice = component.start_tag.annotation
+
+ parents.append(item)
+ _extract_components(component.internal_text.components)
+ parents.pop()
+
+ if comp.terminator is not None:
+ cue_text.append(AnnotatedPar(items=[]))
+ par = cue_text[-1]
+
+ def _add_text_item(
+ text: str,
+ formatting: Formatting | None,
+ item: AnnotatedText,
+ parent=None,
+ ):
+ track = TrackSource(
+ start_time=block.timings.start.seconds,
+ end_time=block.timings.end.seconds,
+ identifier=identifier,
+ voice=item.voice or None,
+ )
+
doc.add_text(
label=DocItemLabel.TEXT,
- text=str(block.timings),
- parent=block_group,
+ text=text,
content_layer=ContentLayer.BODY,
+ formatting=formatting,
+ parent=parent,
+ source=track,
)
- for cue_span in block.payload:
- if isinstance(cue_span, _WebVTTCueVoiceSpan):
- voice_group = doc.add_group(
- label=GroupLabel.INLINE,
- name="WebVTT cue voice span",
- parent=block_group,
- content_layer=ContentLayer.BODY,
- )
- voice = cue_span.annotation
- if classes := cue_span.classes:
- voice += f" ({', '.join(classes)})"
- voice += ": "
- doc.add_text(
- label=DocItemLabel.TEXT,
- text=voice,
- parent=voice_group,
- content_layer=ContentLayer.BODY,
+
+ if vtt.title:
+ doc.add_title(vtt.title, content_layer=ContentLayer.BODY)
+ for block in vtt.cue_blocks:
+ cue_text = []
+ parents = []
+ identifier = str(block.identifier) if block.identifier else None
+ _extract_components(block.payload)
+ for par in cue_text:
+ if not par.items:
+ continue
+ if len(par.items) == 1:
+ item = par.items[0]
+ _add_text_item(
+ text=item.text,
+ formatting=item.formatting,
+ item=item,
)
- for item in cue_span.components:
- WebVTTDocumentBackend._add_text_from_component(
- doc, item, voice_group
- )
else:
- WebVTTDocumentBackend._add_text_from_component(
- doc, cue_span, block_group
+ group = doc.add_inline_group(
+ "WebVTT cue span", content_layer=ContentLayer.BODY
)
+ for item in par.items:
+ _add_text_item(
+ text=item.text,
+ formatting=item.formatting,
+ item=item,
+ parent=group,
+ )
return doc
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
index 2bb94e42a6..7c8ea4cf3d 100644
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@@ -1,47 +1,35 @@
import logging
-import os
-import re
import sys
import tempfile
from io import BytesIO
from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Union, cast
-
-from docling_core.types.doc import DoclingDocument, DocumentOrigin
-
-# import whisper # type: ignore
-# import librosa
-# import numpy as np
-# import soundfile as sf # type: ignore
-from docling_core.types.doc.labels import DocItemLabel
-from pydantic import BaseModel, Field, validator
+from typing import Optional, Union
+
+from docling_core.types.doc import (
+ ContentLayer,
+ DocItemLabel,
+ DoclingDocument,
+ DocumentOrigin,
+ TrackSource,
+)
+from pydantic import BaseModel, Field
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.noop_backend import NoOpBackend
-
-# from pydub import AudioSegment # type: ignore
-# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import (
ConversionStatus,
- FormatToMimeType,
)
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
)
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrMlxWhisperOptions,
InlineAsrNativeWhisperOptions,
- # AsrResponseFormat,
- InlineAsrOptions,
)
-from docling.datamodel.pipeline_options_vlm_model import (
- InferenceFramework,
-)
-from docling.datamodel.settings import settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -190,8 +178,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
)
for citem in conversation:
+ track: TrackSource = TrackSource(
+ start_time=citem.start_time,
+ end_time=citem.end_time,
+ voice=citem.speaker,
+ )
conv_res.document.add_text(
- label=DocItemLabel.TEXT, text=citem.to_string()
+ label=DocItemLabel.TEXT,
+ text=citem.text,
+ content_layer=ContentLayer.BODY,
+ source=track,
)
return conv_res
@@ -299,8 +295,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
)
for citem in conversation:
+ track: TrackSource = TrackSource(
+ start_time=citem.start_time,
+ end_time=citem.end_time,
+ voice=citem.speaker,
+ )
conv_res.document.add_text(
- label=DocItemLabel.TEXT, text=citem.to_string()
+ label=DocItemLabel.TEXT,
+ text=citem.text,
+ content_layer=ContentLayer.BODY,
+ source=track,
)
conv_res.status = ConversionStatus.SUCCESS
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 67be9e0de4..d284a4777c 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -2,7 +2,7 @@
import re
from io import BytesIO
from pathlib import Path
-from typing import List, Optional, Union, cast
+from typing import List, Union, cast
from docling_core.types.doc import (
BoundingBox,
@@ -12,8 +12,6 @@
ImageRef,
PictureItem,
ProvenanceItem,
- TableCell,
- TableData,
TextItem,
)
from docling_core.types.doc.base import (
@@ -21,7 +19,6 @@
Size,
)
from docling_core.types.doc.document import DocTagsDocument
-from lxml import etree
from PIL import Image as PILImage
from docling.backend.abstract_backend import (
@@ -42,7 +39,6 @@
InlineVlmOptions,
ResponseFormat,
)
-from docling.datamodel.settings import settings
from docling.models.vlm_pipeline_models.api_vlm_model import ApiVlmModel
from docling.models.vlm_pipeline_models.hf_transformers_model import (
HuggingFaceTransformersVlmModel,
diff --git a/pyproject.toml b/pyproject.toml
index 1898c52f1e..dec2c06813 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,7 @@ authors = [
requires-python = '>=3.10,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
- 'docling-core[chunking] (>=2.58.0,<3.0.0)',
+ 'docling-core[chunking] (>=2.62.0,<3.0.0)',
'docling-parse (>=4.7.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)',
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
index d7840e9941..db52ba1b79 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -1,66 +1,14 @@
item-0 at level 0: unspecified: group _root_
- item-1 at level 1: section: group WebVTT cue block
- item-2 at level 2: text: 00:11.000 --> 00:13.000
- item-3 at level 2: inline: group WebVTT cue voice span
- item-4 at level 3: text: Roger Bingham:
- item-5 at level 3: text: We are in New York City
- item-6 at level 1: section: group WebVTT cue block
- item-7 at level 2: text: 00:13.000 --> 00:16.000
- item-8 at level 2: inline: group WebVTT cue voice span
- item-9 at level 3: text: Roger Bingham:
- item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
- item-11 at level 1: section: group WebVTT cue block
- item-12 at level 2: text: 00:16.000 --> 00:18.000
- item-13 at level 2: inline: group WebVTT cue voice span
- item-14 at level 3: text: Roger Bingham:
- item-15 at level 3: text: from the American Museum of Natural History
- item-16 at level 1: section: group WebVTT cue block
- item-17 at level 2: text: 00:18.000 --> 00:20.000
- item-18 at level 2: inline: group WebVTT cue voice span
- item-19 at level 3: text: Roger Bingham:
- item-20 at level 3: text: And with me is Neil deGrasse Tyson
- item-21 at level 1: section: group WebVTT cue block
- item-22 at level 2: text: 00:20.000 --> 00:22.000
- item-23 at level 2: inline: group WebVTT cue voice span
- item-24 at level 3: text: Roger Bingham:
- item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
- item-26 at level 1: section: group WebVTT cue block
- item-27 at level 2: text: 00:22.000 --> 00:24.000
- item-28 at level 2: inline: group WebVTT cue voice span
- item-29 at level 3: text: Roger Bingham:
- item-30 at level 3: text: at the AMNH.
- item-31 at level 1: section: group WebVTT cue block
- item-32 at level 2: text: 00:24.000 --> 00:26.000
- item-33 at level 2: inline: group WebVTT cue voice span
- item-34 at level 3: text: Roger Bingham:
- item-35 at level 3: text: Thank you for walking down here.
- item-36 at level 1: section: group WebVTT cue block
- item-37 at level 2: text: 00:27.000 --> 00:30.000
- item-38 at level 2: inline: group WebVTT cue voice span
- item-39 at level 3: text: Roger Bingham:
- item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
- item-41 at level 1: section: group WebVTT cue block
- item-42 at level 2: text: 00:30.000 --> 00:31.500
- item-43 at level 2: inline: group WebVTT cue voice span
- item-44 at level 3: text: Roger Bingham:
- item-45 at level 3: text: When we e-mailed—
- item-46 at level 1: section: group WebVTT cue block
- item-47 at level 2: text: 00:30.500 --> 00:32.500
- item-48 at level 2: inline: group WebVTT cue voice span
- item-49 at level 3: text: Neil deGrasse Tyson:
- item-50 at level 3: text: Didn’t we talk about enough in that conversation?
- item-51 at level 1: section: group WebVTT cue block
- item-52 at level 2: text: 00:32.000 --> 00:35.500
- item-53 at level 2: inline: group WebVTT cue voice span
- item-54 at level 3: text: Roger Bingham:
- item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
- item-56 at level 1: section: group WebVTT cue block
- item-57 at level 2: text: 00:32.500 --> 00:33.500
- item-58 at level 2: inline: group WebVTT cue voice span
- item-59 at level 3: text: Neil deGrasse Tyson:
- item-60 at level 3: text: Laughs
- item-61 at level 1: section: group WebVTT cue block
- item-62 at level 2: text: 00:35.500 --> 00:38.000
- item-63 at level 2: inline: group WebVTT cue voice span
- item-64 at level 3: text: Roger Bingham:
- item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
+ item-1 at level 1: text: We are in New York City
+ item-2 at level 1: text: We’re actually at the Lucern Hotel, just down the street
+ item-3 at level 1: text: from the American Museum of Natural History
+ item-4 at level 1: text: And with me is Neil deGrasse Tyson
+ item-5 at level 1: text: Astrophysicist, Director of the Hayden Planetarium
+ item-6 at level 1: text: at the AMNH.
+ item-7 at level 1: text: Thank you for walking down here.
+ item-8 at level 1: text: And I want to do a follow-up on the last conversation we did.
+ item-9 at level 1: text: When we e-mailed—
+ item-10 at level 1: text: Didn’t we talk about enough in that conversation?
+ item-11 at level 1: text: No! No no no no; 'cos 'cos obviously 'cos
+ item-12 at level 1: text: Laughs
+ item-13 at level 1: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
index 8311825601..56548734b1 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
- "version": "1.7.0",
+ "version": "1.8.0",
"name": "webvtt_example_01",
"origin": {
"mimetype": "text/vtt",
@@ -18,1052 +18,316 @@
"self_ref": "#/body",
"children": [
{
- "$ref": "#/groups/0"
+ "$ref": "#/texts/0"
},
{
- "$ref": "#/groups/2"
+ "$ref": "#/texts/1"
},
{
- "$ref": "#/groups/4"
+ "$ref": "#/texts/2"
},
{
- "$ref": "#/groups/6"
+ "$ref": "#/texts/3"
},
{
- "$ref": "#/groups/8"
+ "$ref": "#/texts/4"
},
{
- "$ref": "#/groups/10"
+ "$ref": "#/texts/5"
},
{
- "$ref": "#/groups/12"
+ "$ref": "#/texts/6"
},
{
- "$ref": "#/groups/14"
+ "$ref": "#/texts/7"
},
{
- "$ref": "#/groups/16"
+ "$ref": "#/texts/8"
},
{
- "$ref": "#/groups/18"
+ "$ref": "#/texts/9"
},
{
- "$ref": "#/groups/20"
+ "$ref": "#/texts/10"
},
{
- "$ref": "#/groups/22"
+ "$ref": "#/texts/11"
},
{
- "$ref": "#/groups/24"
+ "$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
- "groups": [
- {
- "self_ref": "#/groups/0",
- "parent": {
- "$ref": "#/body"
- },
- "children": [
- {
- "$ref": "#/texts/0"
- },
- {
- "$ref": "#/groups/1"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/1",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [
- {
- "$ref": "#/texts/1"
- },
- {
- "$ref": "#/texts/2"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- },
+ "groups": [],
+ "texts": [
{
- "self_ref": "#/groups/2",
+ "self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/3"
- },
- {
- "$ref": "#/groups/3"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/3",
- "parent": {
- "$ref": "#/groups/2"
- },
- "children": [
- {
- "$ref": "#/texts/4"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/5"
+ "kind": "track",
+ "start_time": 11.0,
+ "end_time": 13.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "We are in New York City",
+ "text": "We are in New York City"
},
{
- "self_ref": "#/groups/4",
+ "self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/6"
- },
- {
- "$ref": "#/groups/5"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/5",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [
- {
- "$ref": "#/texts/7"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/8"
+ "kind": "track",
+ "start_time": 13.0,
+ "end_time": 16.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "We’re actually at the Lucern Hotel, just down the street",
+ "text": "We’re actually at the Lucern Hotel, just down the street"
},
{
- "self_ref": "#/groups/6",
+ "self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/9"
- },
- {
- "$ref": "#/groups/7"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/7",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [
- {
- "$ref": "#/texts/10"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/11"
+ "kind": "track",
+ "start_time": 16.0,
+ "end_time": 18.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "from the American Museum of Natural History",
+ "text": "from the American Museum of Natural History"
},
{
- "self_ref": "#/groups/8",
+ "self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/12"
- },
- {
- "$ref": "#/groups/9"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/9",
- "parent": {
- "$ref": "#/groups/8"
- },
- "children": [
- {
- "$ref": "#/texts/13"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/14"
+ "kind": "track",
+ "start_time": 18.0,
+ "end_time": 20.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "And with me is Neil deGrasse Tyson",
+ "text": "And with me is Neil deGrasse Tyson"
},
{
- "self_ref": "#/groups/10",
+ "self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/15"
- },
- {
- "$ref": "#/groups/11"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/11",
- "parent": {
- "$ref": "#/groups/10"
- },
- "children": [
- {
- "$ref": "#/texts/16"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/17"
+ "kind": "track",
+ "start_time": 20.0,
+ "end_time": 22.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Astrophysicist, Director of the Hayden Planetarium",
+ "text": "Astrophysicist, Director of the Hayden Planetarium"
},
{
- "self_ref": "#/groups/12",
+ "self_ref": "#/texts/5",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/18"
- },
- {
- "$ref": "#/groups/13"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/13",
- "parent": {
- "$ref": "#/groups/12"
- },
- "children": [
- {
- "$ref": "#/texts/19"
- },
+ "label": "text",
+ "source": [
{
- "$ref": "#/texts/20"
+ "kind": "track",
+ "start_time": 22.0,
+ "end_time": 24.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "at the AMNH.",
+ "text": "at the AMNH."
},
{
- "self_ref": "#/groups/14",
+ "self_ref": "#/texts/6",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/21"
- },
- {
- "$ref": "#/groups/15"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/15",
- "parent": {
- "$ref": "#/groups/14"
- },
- "children": [
- {
- "$ref": "#/texts/22"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/23"
+ "kind": "track",
+ "start_time": 24.0,
+ "end_time": 26.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Thank you for walking down here.",
+ "text": "Thank you for walking down here."
},
{
- "self_ref": "#/groups/16",
+ "self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/24"
- },
- {
- "$ref": "#/groups/17"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/17",
- "parent": {
- "$ref": "#/groups/16"
- },
- "children": [
- {
- "$ref": "#/texts/25"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/26"
+ "kind": "track",
+ "start_time": 27.0,
+ "end_time": 30.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "And I want to do a follow-up on the last conversation we did.",
+ "text": "And I want to do a follow-up on the last conversation we did."
},
{
- "self_ref": "#/groups/18",
+ "self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/27"
- },
- {
- "$ref": "#/groups/19"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/19",
- "parent": {
- "$ref": "#/groups/18"
- },
- "children": [
- {
- "$ref": "#/texts/28"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/29"
+ "kind": "track",
+ "start_time": 30.0,
+ "end_time": 31.5,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "When we e-mailed—",
+ "text": "When we e-mailed—"
},
{
- "self_ref": "#/groups/20",
+ "self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/30"
- },
- {
- "$ref": "#/groups/21"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/21",
- "parent": {
- "$ref": "#/groups/20"
- },
- "children": [
- {
- "$ref": "#/texts/31"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/32"
+ "kind": "track",
+ "start_time": 30.5,
+ "end_time": 32.5,
+ "voice": "Neil deGrasse Tyson"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Didn’t we talk about enough in that conversation?",
+ "text": "Didn’t we talk about enough in that conversation?"
},
{
- "self_ref": "#/groups/22",
+ "self_ref": "#/texts/10",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/33"
- },
- {
- "$ref": "#/groups/23"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/23",
- "parent": {
- "$ref": "#/groups/22"
- },
- "children": [
- {
- "$ref": "#/texts/34"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/35"
+ "kind": "track",
+ "start_time": 32.0,
+ "end_time": 35.5,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "No! No no no no; 'cos 'cos obviously 'cos",
+ "text": "No! No no no no; 'cos 'cos obviously 'cos"
},
{
- "self_ref": "#/groups/24",
+ "self_ref": "#/texts/11",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/36"
- },
- {
- "$ref": "#/groups/25"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/25",
- "parent": {
- "$ref": "#/groups/24"
- },
- "children": [
- {
- "$ref": "#/texts/37"
- },
- {
- "$ref": "#/texts/38"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- }
- ],
- "texts": [
- {
- "self_ref": "#/texts/0",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:11.000 --> 00:13.000",
- "text": "00:11.000 --> 00:13.000"
- },
- {
- "self_ref": "#/texts/1",
- "parent": {
- "$ref": "#/groups/1"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/2",
- "parent": {
- "$ref": "#/groups/1"
- },
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "We are in New York City",
- "text": "We are in New York City",
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 32.5,
+ "end_time": 33.5,
+ "voice": "Neil deGrasse Tyson"
+ }
+ ],
+ "orig": "Laughs",
+ "text": "Laughs",
"formatting": {
"bold": false,
- "italic": false,
+ "italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
- "self_ref": "#/texts/3",
- "parent": {
- "$ref": "#/groups/2"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:13.000 --> 00:16.000",
- "text": "00:13.000 --> 00:16.000"
- },
- {
- "self_ref": "#/texts/4",
- "parent": {
- "$ref": "#/groups/3"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/5",
+ "self_ref": "#/texts/12",
"parent": {
- "$ref": "#/groups/3"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "We’re actually at the Lucern Hotel, just down the street",
- "text": "We’re actually at the Lucern Hotel, just down the street",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/6",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:16.000 --> 00:18.000",
- "text": "00:16.000 --> 00:18.000"
- },
- {
- "self_ref": "#/texts/7",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/8",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "from the American Museum of Natural History",
- "text": "from the American Museum of Natural History",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/9",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:18.000 --> 00:20.000",
- "text": "00:18.000 --> 00:20.000"
- },
- {
- "self_ref": "#/texts/10",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/11",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "And with me is Neil deGrasse Tyson",
- "text": "And with me is Neil deGrasse Tyson",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/12",
- "parent": {
- "$ref": "#/groups/8"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:20.000 --> 00:22.000",
- "text": "00:20.000 --> 00:22.000"
- },
- {
- "self_ref": "#/texts/13",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/14",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Astrophysicist, Director of the Hayden Planetarium",
- "text": "Astrophysicist, Director of the Hayden Planetarium",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/15",
- "parent": {
- "$ref": "#/groups/10"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:22.000 --> 00:24.000",
- "text": "00:22.000 --> 00:24.000"
- },
- {
- "self_ref": "#/texts/16",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/17",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "at the AMNH.",
- "text": "at the AMNH.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/18",
- "parent": {
- "$ref": "#/groups/12"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:24.000 --> 00:26.000",
- "text": "00:24.000 --> 00:26.000"
- },
- {
- "self_ref": "#/texts/19",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/20",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Thank you for walking down here.",
- "text": "Thank you for walking down here.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/21",
- "parent": {
- "$ref": "#/groups/14"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:27.000 --> 00:30.000",
- "text": "00:27.000 --> 00:30.000"
- },
- {
- "self_ref": "#/texts/22",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/23",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "And I want to do a follow-up on the last conversation we did.",
- "text": "And I want to do a follow-up on the last conversation we did.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/24",
- "parent": {
- "$ref": "#/groups/16"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:30.000 --> 00:31.500",
- "text": "00:30.000 --> 00:31.500"
- },
- {
- "self_ref": "#/texts/25",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/26",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "When we e-mailed—",
- "text": "When we e-mailed—",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/27",
- "parent": {
- "$ref": "#/groups/18"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:30.500 --> 00:32.500",
- "text": "00:30.500 --> 00:32.500"
- },
- {
- "self_ref": "#/texts/28",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Neil deGrasse Tyson: ",
- "text": "Neil deGrasse Tyson: "
- },
- {
- "self_ref": "#/texts/29",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Didn’t we talk about enough in that conversation?",
- "text": "Didn’t we talk about enough in that conversation?",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/30",
- "parent": {
- "$ref": "#/groups/20"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:32.000 --> 00:35.500",
- "text": "00:32.000 --> 00:35.500"
- },
- {
- "self_ref": "#/texts/31",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/32",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "No! No no no no; 'cos 'cos obviously 'cos",
- "text": "No! No no no no; 'cos 'cos obviously 'cos",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/33",
- "parent": {
- "$ref": "#/groups/22"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:32.500 --> 00:33.500",
- "text": "00:32.500 --> 00:33.500"
- },
- {
- "self_ref": "#/texts/34",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Neil deGrasse Tyson: ",
- "text": "Neil deGrasse Tyson: "
- },
- {
- "self_ref": "#/texts/35",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Laughs",
- "text": "Laughs",
- "formatting": {
- "bold": false,
- "italic": true,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/36",
- "parent": {
- "$ref": "#/groups/24"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:35.500 --> 00:38.000",
- "text": "00:35.500 --> 00:38.000"
- },
- {
- "self_ref": "#/texts/37",
- "parent": {
- "$ref": "#/groups/25"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/38",
- "parent": {
- "$ref": "#/groups/25"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 35.5,
+ "end_time": 38.0,
+ "voice": "Roger Bingham"
+ }
+ ],
"orig": "You know I’m so excited my glasses are falling off here.",
- "text": "You know I’m so excited my glasses are falling off here.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "text": "You know I’m so excited my glasses are falling off here."
}
],
"pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
index c57670289f..95d9e65753 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -1,51 +1,25 @@
-00:11.000 --> 00:13.000
+We are in New York City
-Roger Bingham: We are in New York City
+We’re actually at the Lucern Hotel, just down the street
-00:13.000 --> 00:16.000
+from the American Museum of Natural History
-Roger Bingham: We’re actually at the Lucern Hotel, just down the street
+And with me is Neil deGrasse Tyson
-00:16.000 --> 00:18.000
+Astrophysicist, Director of the Hayden Planetarium
-Roger Bingham: from the American Museum of Natural History
+at the AMNH.
-00:18.000 --> 00:20.000
+Thank you for walking down here.
-Roger Bingham: And with me is Neil deGrasse Tyson
+And I want to do a follow-up on the last conversation we did.
-00:20.000 --> 00:22.000
+When we e-mailed—
-Roger Bingham: Astrophysicist, Director of the Hayden Planetarium
+Didn’t we talk about enough in that conversation?
-00:22.000 --> 00:24.000
+No! No no no no; 'cos 'cos obviously 'cos
-Roger Bingham: at the AMNH.
+*Laughs*
-00:24.000 --> 00:26.000
-
-Roger Bingham: Thank you for walking down here.
-
-00:27.000 --> 00:30.000
-
-Roger Bingham: And I want to do a follow-up on the last conversation we did.
-
-00:30.000 --> 00:31.500
-
-Roger Bingham: When we e-mailed—
-
-00:30.500 --> 00:32.500
-
-Neil deGrasse Tyson: Didn’t we talk about enough in that conversation?
-
-00:32.000 --> 00:35.500
-
-Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos
-
-00:32.500 --> 00:33.500
-
-Neil deGrasse Tyson: *Laughs*
-
-00:35.500 --> 00:38.000
-
-Roger Bingham: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
+You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
index 6d90404ff7..56f63bc3f5 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -1,22 +1,12 @@
item-0 at level 0: unspecified: group _root_
- item-1 at level 1: section: group WebVTT cue block
- item-2 at level 2: text: 00:00.000 --> 00:02.000
- item-3 at level 2: inline: group WebVTT cue voice span
- item-4 at level 3: text: Esme (first, loud):
- item-5 at level 3: text: It’s a blue apple tree!
- item-6 at level 1: section: group WebVTT cue block
- item-7 at level 2: text: 00:02.000 --> 00:04.000
- item-8 at level 2: inline: group WebVTT cue voice span
- item-9 at level 3: text: Mary:
- item-10 at level 3: text: No way!
- item-11 at level 1: section: group WebVTT cue block
- item-12 at level 2: text: 00:04.000 --> 00:06.000
- item-13 at level 2: inline: group WebVTT cue voice span
- item-14 at level 3: text: Esme:
- item-15 at level 3: text: Hee!
- item-16 at level 2: text: laughter
- item-17 at level 1: section: group WebVTT cue block
- item-18 at level 2: text: 00:06.000 --> 00:08.000
- item-19 at level 2: inline: group WebVTT cue voice span
- item-20 at level 3: text: Mary (loud):
- item-21 at level 3: text: That’s awesome!
\ No newline at end of file
+ item-1 at level 1: text: It’s a blue apple tree!
+ item-2 at level 1: text: No way!
+ item-3 at level 1: inline: group WebVTT cue span
+ item-4 at level 2: text: Hee!
+ item-5 at level 2: text:
+ item-6 at level 2: text: laughter
+ item-7 at level 1: text: That’s awesome!
+ item-8 at level 1: inline: group WebVTT cue span
+ item-9 at level 2: text: Sur les
+ item-10 at level 2: text: playground
+ item-11 at level 2: text: , ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
index 72647d93d0..3103261655 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -1,10 +1,10 @@
{
"schema_name": "DoclingDocument",
- "version": "1.7.0",
+ "version": "1.8.0",
"name": "webvtt_example_02",
"origin": {
"mimetype": "text/vtt",
- "binary_hash": 5029965721282070624,
+ "binary_hash": 8584853280299071027,
"filename": "webvtt_example_02.vtt"
},
"furniture": {
@@ -18,16 +18,19 @@
"self_ref": "#/body",
"children": [
{
- "$ref": "#/groups/0"
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/texts/1"
},
{
- "$ref": "#/groups/2"
+ "$ref": "#/groups/0"
},
{
- "$ref": "#/groups/4"
+ "$ref": "#/texts/5"
},
{
- "$ref": "#/groups/6"
+ "$ref": "#/groups/1"
}
],
"content_layer": "body",
@@ -41,70 +44,22 @@
"$ref": "#/body"
},
"children": [
- {
- "$ref": "#/texts/0"
- },
- {
- "$ref": "#/groups/1"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/1",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [
- {
- "$ref": "#/texts/1"
- },
{
"$ref": "#/texts/2"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- },
- {
- "self_ref": "#/groups/2",
- "parent": {
- "$ref": "#/body"
- },
- "children": [
+ },
{
"$ref": "#/texts/3"
},
- {
- "$ref": "#/groups/3"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/3",
- "parent": {
- "$ref": "#/groups/2"
- },
- "children": [
{
"$ref": "#/texts/4"
- },
- {
- "$ref": "#/texts/5"
}
],
"content_layer": "body",
- "name": "WebVTT cue voice span",
+ "name": "WebVTT cue span",
"label": "inline"
},
{
- "self_ref": "#/groups/4",
+ "self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
@@ -112,23 +67,6 @@
{
"$ref": "#/texts/6"
},
- {
- "$ref": "#/groups/5"
- },
- {
- "$ref": "#/texts/9"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/5",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [
{
"$ref": "#/texts/7"
},
@@ -137,41 +75,7 @@
}
],
"content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- },
- {
- "self_ref": "#/groups/6",
- "parent": {
- "$ref": "#/body"
- },
- "children": [
- {
- "$ref": "#/texts/10"
- },
- {
- "$ref": "#/groups/7"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/7",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [
- {
- "$ref": "#/texts/11"
- },
- {
- "$ref": "#/texts/12"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
+ "name": "WebVTT cue span",
"label": "inline"
}
],
@@ -179,143 +83,177 @@
{
"self_ref": "#/texts/0",
"parent": {
- "$ref": "#/groups/0"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "00:00.000 --> 00:02.000",
- "text": "00:00.000 --> 00:02.000"
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 0.0,
+ "end_time": 2.0,
+ "voice": "Esme",
+ "classes": [
+ "v.first.loud"
+ ]
+ }
+ ],
+ "orig": "It’s a blue apple tree!",
+ "text": "It’s a blue apple tree!"
},
{
"self_ref": "#/texts/1",
"parent": {
- "$ref": "#/groups/1"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "Esme (first, loud): ",
- "text": "Esme (first, loud): "
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 2.0,
+ "end_time": 4.0,
+ "voice": "Mary"
+ }
+ ],
+ "orig": "No way!",
+ "text": "No way!"
},
{
"self_ref": "#/texts/2",
"parent": {
- "$ref": "#/groups/1"
+ "$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "It’s a blue apple tree!",
- "text": "It’s a blue apple tree!",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 4.0,
+ "end_time": 6.0,
+ "voice": "Esme"
+ }
+ ],
+ "orig": "Hee!",
+ "text": "Hee!"
},
{
"self_ref": "#/texts/3",
"parent": {
- "$ref": "#/groups/2"
+ "$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "00:02.000 --> 00:04.000",
- "text": "00:02.000 --> 00:04.000"
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 4.0,
+ "end_time": 6.0
+ }
+ ],
+ "orig": " ",
+ "text": " "
},
{
"self_ref": "#/texts/4",
"parent": {
- "$ref": "#/groups/3"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Mary: ",
- "text": "Mary: "
- },
- {
- "self_ref": "#/texts/5",
- "parent": {
- "$ref": "#/groups/3"
+ "$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "No way!",
- "text": "No way!",
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 4.0,
+ "end_time": 6.0
+ }
+ ],
+ "orig": "laughter",
+ "text": "laughter",
"formatting": {
"bold": false,
- "italic": false,
+ "italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
- "self_ref": "#/texts/6",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:04.000 --> 00:06.000",
- "text": "00:04.000 --> 00:06.000"
- },
- {
- "self_ref": "#/texts/7",
+ "self_ref": "#/texts/5",
"parent": {
- "$ref": "#/groups/5"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "Esme: ",
- "text": "Esme: "
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 6.0,
+ "end_time": 8.0,
+ "voice": "Mary",
+ "classes": [
+ "v.loud"
+ ]
+ }
+ ],
+ "orig": "That’s awesome!",
+ "text": "That’s awesome!"
},
{
- "self_ref": "#/texts/8",
+ "self_ref": "#/texts/6",
"parent": {
- "$ref": "#/groups/5"
+ "$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "Hee!",
- "text": "Hee!",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 8.0,
+ "end_time": 10.0
+ }
+ ],
+ "orig": "Sur les ",
+ "text": "Sur les "
},
{
- "self_ref": "#/texts/9",
+ "self_ref": "#/texts/7",
"parent": {
- "$ref": "#/groups/4"
+ "$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "laughter",
- "text": "laughter",
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 8.0,
+ "end_time": 10.0,
+ "languages": [
+ "en"
+ ],
+ "classes": [
+ "i.foreignphrase"
+ ]
+ }
+ ],
+ "orig": "playground",
+ "text": "playground",
"formatting": {
"bold": false,
"italic": true,
@@ -325,47 +263,23 @@
}
},
{
- "self_ref": "#/texts/10",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:06.000 --> 00:08.000",
- "text": "00:06.000 --> 00:08.000"
- },
- {
- "self_ref": "#/texts/11",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Mary (loud): ",
- "text": "Mary (loud): "
- },
- {
- "self_ref": "#/texts/12",
+ "self_ref": "#/texts/8",
"parent": {
- "$ref": "#/groups/7"
+ "$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "That’s awesome!",
- "text": "That’s awesome!",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 8.0,
+ "end_time": 10.0
+ }
+ ],
+ "orig": ", ici à Montpellier",
+ "text": ", ici à Montpellier"
}
],
"pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
index db84cf116d..7f62407381 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -1,17 +1,9 @@
-00:00.000 --> 00:02.000
+It’s a blue apple tree!
-Esme (first, loud): It’s a blue apple tree!
+No way!
-00:02.000 --> 00:04.000
+Hee! *laughter*
-Mary: No way!
+That’s awesome!
-00:04.000 --> 00:06.000
-
-Esme: Hee!
-
-*laughter*
-
-00:06.000 --> 00:08.000
-
-Mary (loud): That’s awesome!
\ No newline at end of file
+Sur les *playground* , ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
index ca344e5957..a46794123c 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -1,77 +1,18 @@
item-0 at level 0: unspecified: group _root_
- item-1 at level 1: section: group WebVTT cue block
- item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
- item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
- item-4 at level 2: inline: group WebVTT cue voice span
- item-5 at level 3: text: Speaker A:
- item-6 at level 3: text: OK, I think now we should be recording
- item-7 at level 1: section: group WebVTT cue block
- item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
- item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
- item-10 at level 2: inline: group WebVTT cue voice span
- item-11 at level 3: text: Speaker A:
- item-12 at level 3: text: properly.
- item-13 at level 1: section: group WebVTT cue block
- item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
- item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
- item-16 at level 2: text: Good.
- item-17 at level 1: section: group WebVTT cue block
- item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
- item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
- item-20 at level 2: inline: group WebVTT cue voice span
- item-21 at level 3: text: Speaker A:
- item-22 at level 3: text: Yeah.
- item-23 at level 1: section: group WebVTT cue block
- item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
- item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
- item-26 at level 2: inline: group WebVTT cue voice span
- item-27 at level 3: text: Speaker B:
- item-28 at level 3: text: I was also thinking.
- item-29 at level 1: section: group WebVTT cue block
- item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
- item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
- item-32 at level 2: inline: group WebVTT cue voice span
- item-33 at level 3: text: Speaker B:
- item-34 at level 3: text: Would be maybe good to create items,
- item-35 at level 1: section: group WebVTT cue block
- item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
- item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
- item-38 at level 2: inline: group WebVTT cue voice span
- item-39 at level 3: text: Speaker B:
- item-40 at level 3: text: some metadata, some options that can be specific.
- item-41 at level 1: section: group WebVTT cue block
- item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
- item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
- item-44 at level 2: inline: group WebVTT cue voice span
- item-45 at level 3: text: Speaker A:
- item-46 at level 3: text: Yeah, I mean I think you went even more than
- item-47 at level 1: section: group WebVTT cue block
- item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
- item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
- item-50 at level 2: inline: group WebVTT cue voice span
- item-51 at level 3: text: Speaker B:
- item-52 at level 3: text: But we preserved the atoms.
- item-53 at level 1: section: group WebVTT cue block
- item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
- item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
- item-56 at level 2: inline: group WebVTT cue voice span
- item-57 at level 3: text: Speaker A:
- item-58 at level 3: text: than me. I just opened the format.
- item-59 at level 1: section: group WebVTT cue block
- item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
- item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
- item-62 at level 2: inline: group WebVTT cue voice span
- item-63 at level 3: text: Speaker A:
- item-64 at level 3: text: give it a try, yeah.
- item-65 at level 1: section: group WebVTT cue block
- item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
- item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
- item-68 at level 2: inline: group WebVTT cue voice span
- item-69 at level 3: text: Speaker B:
- item-70 at level 3: text: Okay, talk to you later.
- item-71 at level 1: section: group WebVTT cue block
- item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
- item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
- item-74 at level 2: inline: group WebVTT cue voice span
- item-75 at level 3: text: Speaker A:
- item-76 at level 3: text: See you.
\ No newline at end of file
+ item-1 at level 1: text: OK,
+ item-2 at level 1: text: I think now we should be recording
+ item-3 at level 1: text: properly.
+ item-4 at level 1: text: Good.
+ item-5 at level 1: text: Yeah.
+ item-6 at level 1: text: I was also thinking.
+ item-7 at level 1: text: Would be maybe good to create items,
+ item-8 at level 1: text: some metadata,
+ item-9 at level 1: text: some options that can be specific.
+ item-10 at level 1: text: Yeah,
+ item-11 at level 1: text: I mean I think you went even more than
+ item-12 at level 1: text: But we preserved the atoms.
+ item-13 at level 1: text: than me.
+ item-14 at level 1: text: I just opened the format.
+ item-15 at level 1: text: give it a try, yeah.
+ item-16 at level 1: text: Okay, talk to you later.
+ item-17 at level 1: text: See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
index 5df08e2bf3..e744229666 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
- "version": "1.7.0",
+ "version": "1.8.0",
"name": "webvtt_example_03",
"origin": {
"mimetype": "text/vtt",
@@ -18,1218 +18,418 @@
"self_ref": "#/body",
"children": [
{
- "$ref": "#/groups/0"
+ "$ref": "#/texts/0"
},
{
- "$ref": "#/groups/2"
+ "$ref": "#/texts/1"
},
{
- "$ref": "#/groups/4"
+ "$ref": "#/texts/2"
},
{
- "$ref": "#/groups/5"
+ "$ref": "#/texts/3"
},
{
- "$ref": "#/groups/7"
+ "$ref": "#/texts/4"
},
{
- "$ref": "#/groups/9"
+ "$ref": "#/texts/5"
},
{
- "$ref": "#/groups/11"
+ "$ref": "#/texts/6"
},
{
- "$ref": "#/groups/13"
+ "$ref": "#/texts/7"
},
{
- "$ref": "#/groups/15"
+ "$ref": "#/texts/8"
},
{
- "$ref": "#/groups/17"
+ "$ref": "#/texts/9"
},
{
- "$ref": "#/groups/19"
+ "$ref": "#/texts/10"
},
{
- "$ref": "#/groups/21"
+ "$ref": "#/texts/11"
},
{
- "$ref": "#/groups/23"
+ "$ref": "#/texts/12"
+ },
+ {
+ "$ref": "#/texts/13"
+ },
+ {
+ "$ref": "#/texts/14"
+ },
+ {
+ "$ref": "#/texts/15"
+ },
+ {
+ "$ref": "#/texts/16"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
- "groups": [
+ "groups": [],
+ "texts": [
{
- "self_ref": "#/groups/0",
+ "self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/0"
- },
- {
- "$ref": "#/texts/1"
- },
- {
- "$ref": "#/groups/1"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/1",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [
- {
- "$ref": "#/texts/2"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/3"
+ "kind": "track",
+ "start_time": 4.963,
+ "end_time": 8.571,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "OK,",
+ "text": "OK,"
},
{
- "self_ref": "#/groups/2",
+ "self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/4"
- },
- {
- "$ref": "#/texts/5"
- },
- {
- "$ref": "#/groups/3"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/3",
- "parent": {
- "$ref": "#/groups/2"
- },
- "children": [
- {
- "$ref": "#/texts/6"
- },
- {
- "$ref": "#/texts/7"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- },
- {
- "self_ref": "#/groups/4",
- "parent": {
- "$ref": "#/body"
- },
- "children": [
- {
- "$ref": "#/texts/8"
- },
- {
- "$ref": "#/texts/9"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/10"
+ "kind": "track",
+ "start_time": 4.963,
+ "end_time": 8.571,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
+ "orig": "I think now we should be recording",
+ "text": "I think now we should be recording"
},
{
- "self_ref": "#/groups/5",
+ "self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/11"
- },
- {
- "$ref": "#/texts/12"
- },
- {
- "$ref": "#/groups/6"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/6",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [
- {
- "$ref": "#/texts/13"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/14"
+ "kind": "track",
+ "start_time": 8.571,
+ "end_time": 9.403,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "properly.",
+ "text": "properly."
},
{
- "self_ref": "#/groups/7",
+ "self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/15"
- },
- {
- "$ref": "#/texts/16"
- },
- {
- "$ref": "#/groups/8"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/8",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [
- {
- "$ref": "#/texts/17"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/18"
+ "kind": "track",
+ "start_time": 10.683,
+ "end_time": 11.563,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Good.",
+ "text": "Good."
},
{
- "self_ref": "#/groups/9",
+ "self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/19"
- },
- {
- "$ref": "#/texts/20"
- },
- {
- "$ref": "#/groups/10"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/10",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [
- {
- "$ref": "#/texts/21"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/22"
+ "kind": "track",
+ "start_time": 13.363,
+ "end_time": 13.803,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Yeah.",
+ "text": "Yeah."
},
{
- "self_ref": "#/groups/11",
+ "self_ref": "#/texts/5",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/23"
- },
- {
- "$ref": "#/texts/24"
- },
- {
- "$ref": "#/groups/12"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/12",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [
- {
- "$ref": "#/texts/25"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/26"
+ "kind": "track",
+ "start_time": 49.603,
+ "end_time": 53.363,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "I was also thinking.",
+ "text": "I was also thinking."
},
{
- "self_ref": "#/groups/13",
+ "self_ref": "#/texts/6",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/27"
- },
- {
- "$ref": "#/texts/28"
- },
- {
- "$ref": "#/groups/14"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/14",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [
- {
- "$ref": "#/texts/29"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/30"
+ "kind": "track",
+ "start_time": 54.963,
+ "end_time": 62.072,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Would be maybe good to create items,",
+ "text": "Would be maybe good to create items,"
},
{
- "self_ref": "#/groups/15",
+ "self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/31"
- },
- {
- "$ref": "#/texts/32"
- },
- {
- "$ref": "#/groups/16"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/16",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [
- {
- "$ref": "#/texts/33"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/34"
+ "kind": "track",
+ "start_time": 62.072,
+ "end_time": 66.811,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "some metadata,",
+ "text": "some metadata,"
},
{
- "self_ref": "#/groups/17",
+ "self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/35"
- },
- {
- "$ref": "#/texts/36"
- },
- {
- "$ref": "#/groups/18"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/18",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [
- {
- "$ref": "#/texts/37"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/38"
+ "kind": "track",
+ "start_time": 62.072,
+ "end_time": 66.811,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "some options that can be specific.",
+ "text": "some options that can be specific."
},
{
- "self_ref": "#/groups/19",
+ "self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/39"
- },
- {
- "$ref": "#/texts/40"
- },
- {
- "$ref": "#/groups/20"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/20",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [
- {
- "$ref": "#/texts/41"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/42"
+ "kind": "track",
+ "start_time": 70.243,
+ "end_time": 73.014,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Yeah,",
+ "text": "Yeah,"
},
{
- "self_ref": "#/groups/21",
+ "self_ref": "#/texts/10",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/43"
- },
- {
- "$ref": "#/texts/44"
- },
- {
- "$ref": "#/groups/22"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/22",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [
- {
- "$ref": "#/texts/45"
- },
+ "label": "text",
+ "prov": [],
+ "source": [
{
- "$ref": "#/texts/46"
+ "kind": "track",
+ "start_time": 70.243,
+ "end_time": 73.014,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "I mean I think you went even more than",
+ "text": "I mean I think you went even more than"
},
{
- "self_ref": "#/groups/23",
+ "self_ref": "#/texts/11",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/47"
- },
- {
- "$ref": "#/texts/48"
- },
- {
- "$ref": "#/groups/24"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/24",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [
- {
- "$ref": "#/texts/49"
- },
- {
- "$ref": "#/texts/50"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- }
- ],
- "texts": [
- {
- "self_ref": "#/texts/0",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
- },
- {
- "self_ref": "#/texts/1",
- "parent": {
- "$ref": "#/groups/0"
- },
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "00:00:04.963 --> 00:00:08.571",
- "text": "00:00:04.963 --> 00:00:08.571"
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 70.563,
+ "end_time": 72.643,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
+ "voice": "Speaker B"
+ }
+ ],
+ "orig": "But we preserved the atoms.",
+ "text": "But we preserved the atoms."
},
{
- "self_ref": "#/texts/2",
+ "self_ref": "#/texts/12",
"parent": {
- "$ref": "#/groups/1"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 73.014,
+ "end_time": 75.907,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+ "voice": "Speaker A"
+ }
+ ],
+ "orig": "than me.",
+ "text": "than me."
},
{
- "self_ref": "#/texts/3",
+ "self_ref": "#/texts/13",
"parent": {
- "$ref": "#/groups/1"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "OK, I think now we should be recording",
- "text": "OK, I think now we should be recording",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 73.014,
+ "end_time": 75.907,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+ "voice": "Speaker A"
+ }
+ ],
+ "orig": "I just opened the format.",
+ "text": "I just opened the format."
},
{
- "self_ref": "#/texts/4",
+ "self_ref": "#/texts/14",
"parent": {
- "$ref": "#/groups/2"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1"
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 110.222,
+ "end_time": 111.643,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
+ "voice": "Speaker A"
+ }
+ ],
+ "orig": "give it a try, yeah.",
+ "text": "give it a try, yeah."
},
{
- "self_ref": "#/texts/5",
+ "self_ref": "#/texts/15",
"parent": {
- "$ref": "#/groups/2"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
- "orig": "00:00:08.571 --> 00:00:09.403",
- "text": "00:00:08.571 --> 00:00:09.403"
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 112.043,
+ "end_time": 115.043,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
+ "voice": "Speaker B"
+ }
+ ],
+ "orig": "Okay, talk to you later.",
+ "text": "Okay, talk to you later."
},
{
- "self_ref": "#/texts/6",
+ "self_ref": "#/texts/16",
"parent": {
- "$ref": "#/groups/3"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/7",
- "parent": {
- "$ref": "#/groups/3"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "properly.",
- "text": "properly.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/8",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
- },
- {
- "self_ref": "#/texts/9",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:10.683 --> 00:00:11.563",
- "text": "00:00:10.683 --> 00:00:11.563"
- },
- {
- "self_ref": "#/texts/10",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Good.",
- "text": "Good.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/11",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0"
- },
- {
- "self_ref": "#/texts/12",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:13.363 --> 00:00:13.803",
- "text": "00:00:13.363 --> 00:00:13.803"
- },
- {
- "self_ref": "#/texts/13",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/14",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Yeah.",
- "text": "Yeah.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/15",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0"
- },
- {
- "self_ref": "#/texts/16",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:49.603 --> 00:00:53.363",
- "text": "00:00:49.603 --> 00:00:53.363"
- },
- {
- "self_ref": "#/texts/17",
- "parent": {
- "$ref": "#/groups/8"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/18",
- "parent": {
- "$ref": "#/groups/8"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "I was also thinking.",
- "text": "I was also thinking.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/19",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0"
- },
- {
- "self_ref": "#/texts/20",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:54.963 --> 00:01:02.072",
- "text": "00:00:54.963 --> 00:01:02.072"
- },
- {
- "self_ref": "#/texts/21",
- "parent": {
- "$ref": "#/groups/10"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/22",
- "parent": {
- "$ref": "#/groups/10"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Would be maybe good to create items,",
- "text": "Would be maybe good to create items,",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/23",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1"
- },
- {
- "self_ref": "#/texts/24",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:02.072 --> 00:01:06.811",
- "text": "00:01:02.072 --> 00:01:06.811"
- },
- {
- "self_ref": "#/texts/25",
- "parent": {
- "$ref": "#/groups/12"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/26",
- "parent": {
- "$ref": "#/groups/12"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "some metadata, some options that can be specific.",
- "text": "some metadata, some options that can be specific.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/27",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0"
- },
- {
- "self_ref": "#/texts/28",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:10.243 --> 00:01:13.014",
- "text": "00:01:10.243 --> 00:01:13.014"
- },
- {
- "self_ref": "#/texts/29",
- "parent": {
- "$ref": "#/groups/14"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/30",
- "parent": {
- "$ref": "#/groups/14"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Yeah, I mean I think you went even more than",
- "text": "Yeah, I mean I think you went even more than",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/31",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0"
- },
- {
- "self_ref": "#/texts/32",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:10.563 --> 00:01:12.643",
- "text": "00:01:10.563 --> 00:01:12.643"
- },
- {
- "self_ref": "#/texts/33",
- "parent": {
- "$ref": "#/groups/16"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/34",
- "parent": {
- "$ref": "#/groups/16"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "But we preserved the atoms.",
- "text": "But we preserved the atoms.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/35",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1"
- },
- {
- "self_ref": "#/texts/36",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:13.014 --> 00:01:15.907",
- "text": "00:01:13.014 --> 00:01:15.907"
- },
- {
- "self_ref": "#/texts/37",
- "parent": {
- "$ref": "#/groups/18"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/38",
- "parent": {
- "$ref": "#/groups/18"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "than me. I just opened the format.",
- "text": "than me. I just opened the format.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/39",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1"
- },
- {
- "self_ref": "#/texts/40",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:50.222 --> 00:01:51.643",
- "text": "00:01:50.222 --> 00:01:51.643"
- },
- {
- "self_ref": "#/texts/41",
- "parent": {
- "$ref": "#/groups/20"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/42",
- "parent": {
- "$ref": "#/groups/20"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "give it a try, yeah.",
- "text": "give it a try, yeah.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/43",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0"
- },
- {
- "self_ref": "#/texts/44",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:52.043 --> 00:01:55.043",
- "text": "00:01:52.043 --> 00:01:55.043"
- },
- {
- "self_ref": "#/texts/45",
- "parent": {
- "$ref": "#/groups/22"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/46",
- "parent": {
- "$ref": "#/groups/22"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Okay, talk to you later.",
- "text": "Okay, talk to you later.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/47",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0"
- },
- {
- "self_ref": "#/texts/48",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:54.603 --> 00:01:55.283",
- "text": "00:01:54.603 --> 00:01:55.283"
- },
- {
- "self_ref": "#/texts/49",
- "parent": {
- "$ref": "#/groups/24"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/50",
- "parent": {
- "$ref": "#/groups/24"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 114.603,
+ "end_time": 115.283,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
+ "voice": "Speaker A"
+ }
+ ],
"orig": "See you.",
- "text": "See you.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "text": "See you."
}
],
"pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
index 859a6dde3f..b58d350b3d 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -1,77 +1,33 @@
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+OK,
-00:00:04.963 --> 00:00:08.571
+I think now we should be recording
-Speaker A: OK, I think now we should be recording
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
-
-00:00:08.571 --> 00:00:09.403
-
-Speaker A: properly.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
-
-00:00:10.683 --> 00:00:11.563
+properly.
Good.
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
-
-00:00:13.363 --> 00:00:13.803
-
-Speaker A: Yeah.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
-
-00:00:49.603 --> 00:00:53.363
-
-Speaker B: I was also thinking.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
-
-00:00:54.963 --> 00:01:02.072
-
-Speaker B: Would be maybe good to create items,
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
-
-00:01:02.072 --> 00:01:06.811
-
-Speaker B: some metadata, some options that can be specific.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
-
-00:01:10.243 --> 00:01:13.014
-
-Speaker A: Yeah, I mean I think you went even more than
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
-
-00:01:10.563 --> 00:01:12.643
-
-Speaker B: But we preserved the atoms.
+Yeah.
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+I was also thinking.
-00:01:13.014 --> 00:01:15.907
+Would be maybe good to create items,
-Speaker A: than me. I just opened the format.
+some metadata,
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+some options that can be specific.
-00:01:50.222 --> 00:01:51.643
+Yeah,
-Speaker A: give it a try, yeah.
+I mean I think you went even more than
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+But we preserved the atoms.
-00:01:52.043 --> 00:01:55.043
+than me.
-Speaker B: Okay, talk to you later.
+I just opened the format.
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+give it a try, yeah.
-00:01:54.603 --> 00:01:55.283
+Okay, talk to you later.
-Speaker A: See you.
\ No newline at end of file
+See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt
new file mode 100644
index 0000000000..93feba5e9a
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt
@@ -0,0 +1,14 @@
+item-0 at level 0: unspecified: group _root_
+ item-1 at level 1: text: Last night the chef surprised us with a culinary adventure.
+ item-2 at level 1: inline: group WebVTT cue span
+ item-3 at level 2: text: The waiter offered a
+ item-4 at level 2: text: steaming bowl of
+ item-5 at level 2: text: paella
+ item-6 at level 2: text: that instantly transported the diners to a sunny Mediterranean coast.
+ item-7 at level 1: inline: group WebVTT cue span
+ item-8 at level 2: text: The dessert’s
+ item-9 at level 2: text: unexpected
+ item-10 at level 2: text:
+ item-11 at level 2: text: arcobaleno
+ item-12 at level 2: text: of flavors
+ item-13 at level 2: text: left everyone in awe.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json
new file mode 100644
index 0000000000..3a07d69e9b
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json
@@ -0,0 +1,366 @@
+{
+ "schema_name": "DoclingDocument",
+ "version": "1.8.0",
+ "name": "webvtt_example_04",
+ "origin": {
+ "mimetype": "text/vtt",
+ "binary_hash": 5389775195091554844,
+ "filename": "webvtt_example_04.vtt"
+ },
+ "furniture": {
+ "self_ref": "#/furniture",
+ "children": [],
+ "content_layer": "furniture",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "body": {
+ "self_ref": "#/body",
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/groups/0"
+ },
+ {
+ "$ref": "#/groups/1"
+ }
+ ],
+ "content_layer": "body",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "groups": [
+ {
+ "self_ref": "#/groups/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/1"
+ },
+ {
+ "$ref": "#/texts/2"
+ },
+ {
+ "$ref": "#/texts/3"
+ },
+ {
+ "$ref": "#/texts/4"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/1",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/5"
+ },
+ {
+ "$ref": "#/texts/6"
+ },
+ {
+ "$ref": "#/texts/7"
+ },
+ {
+ "$ref": "#/texts/8"
+ },
+ {
+ "$ref": "#/texts/9"
+ },
+ {
+ "$ref": "#/texts/10"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue span",
+ "label": "inline"
+ }
+ ],
+ "texts": [
+ {
+ "self_ref": "#/texts/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14580.0,
+ "end_time": 14760.0,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": "Last night the chef surprised us with a culinary adventure.",
+ "text": "Last night the chef surprised us with a culinary adventure."
+ },
+ {
+ "self_ref": "#/texts/1",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": "The waiter offered a ",
+ "text": "The waiter offered a "
+ },
+ {
+ "self_ref": "#/texts/2",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": "steaming bowl of ",
+ "text": "steaming bowl of ",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/3",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234",
+ "languages": [
+ "es-ES"
+ ]
+ }
+ ],
+ "orig": "paella",
+ "text": "paella",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/4",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": " that instantly transported the diners to a sunny Mediterranean coast.",
+ "text": " that instantly transported the diners to a sunny Mediterranean coast."
+ },
+ {
+ "self_ref": "#/texts/5",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": "The dessert’s ",
+ "text": "The dessert’s "
+ },
+ {
+ "self_ref": "#/texts/6",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234",
+ "classes": [
+ "b.loud"
+ ]
+ }
+ ],
+ "orig": "unexpected",
+ "text": "unexpected",
+ "formatting": {
+ "bold": true,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/7",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": " ",
+ "text": " ",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/8",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234",
+ "languages": [
+ "it"
+ ]
+ }
+ ],
+ "orig": "arcobaleno",
+ "text": "arcobaleno",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": true,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/9",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": " of flavors",
+ "text": " of flavors",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/10",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "source": [
+ {
+ "kind": "track",
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": " left everyone in awe.",
+ "text": " left everyone in awe."
+ }
+ ],
+ "pictures": [],
+ "tables": [],
+ "key_value_items": [],
+ "form_items": [],
+ "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md
new file mode 100644
index 0000000000..f2312a059c
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md
@@ -0,0 +1,5 @@
+Last night the chef surprised us with a culinary adventure.
+
+The waiter offered a *steaming bowl of * *paella* that instantly transported the diners to a sunny Mediterranean coast.
+
+The dessert’s ***unexpected*** * * *arcobaleno* * of flavors* left everyone in awe.
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_02.vtt b/tests/data/webvtt/webvtt_example_02.vtt
index 1152a1e8fa..6bd1821011 100644
--- a/tests/data/webvtt/webvtt_example_02.vtt
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -12,4 +12,7 @@ NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
Hee! laughter
00:06.000 --> 00:08.000
-That’s awesome!
\ No newline at end of file
+That’s awesome!
+
+00:08.000 --> 00:10.000
+Sur les playground, ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_04.vtt b/tests/data/webvtt/webvtt_example_04.vtt
new file mode 100644
index 0000000000..fd7b788c06
--- /dev/null
+++ b/tests/data/webvtt/webvtt_example_04.vtt
@@ -0,0 +1,10 @@
+WEBVTT
+
+agcvs-08234
+04:03:00.000 --> 04:06:00.000
+Last night the chef surprised us with a culinary adventure.
+
+agcvs-08234
+04:06:00.000 --> 04:06:58.239
+The waiter offered a steaming bowl of paella that instantly transported the diners to a sunny Mediterranean coast.
+The dessert’s unexpected arcobaleno of flavors left everyone in awe.
\ No newline at end of file
diff --git a/tests/test_backend_vtt.py b/tests/test_backend_vtt.py
index a910671bb5..cadcef9b33 100644
--- a/tests/test_backend_vtt.py
+++ b/tests/test_backend_vtt.py
@@ -1,21 +1,12 @@
-# Assisted by watsonx Code Assistant
-
+import warnings
+from io import BytesIO
from pathlib import Path
import pytest
-from docling_core.types.doc import DoclingDocument
-from pydantic import ValidationError
-
-from docling.backend.webvtt_backend import (
- _WebVTTCueItalicSpan,
- _WebVTTCueTextSpan,
- _WebVTTCueTimings,
- _WebVTTCueVoiceSpan,
- _WebVTTFile,
- _WebVTTTimestamp,
-)
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import ConversionResult
+from docling_core.types.doc import DoclingDocument, GroupItem, TextItem
+
+from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.datamodel.document import ConversionResult, _DocumentConversionInput
from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
@@ -24,187 +15,6 @@
GENERATE = GEN_TEST_DATA
-def test_vtt_cue_commponents():
- """Test WebVTT components."""
- valid_timestamps = [
- "00:01:02.345",
- "12:34:56.789",
- "02:34.567",
- "00:00:00.000",
- ]
- valid_total_seconds = [
- 1 * 60 + 2.345,
- 12 * 3600 + 34 * 60 + 56.789,
- 2 * 60 + 34.567,
- 0.0,
- ]
- for idx, ts in enumerate(valid_timestamps):
- model = _WebVTTTimestamp(raw=ts)
- assert model.seconds == valid_total_seconds[idx]
-
- """Test invalid WebVTT timestamps."""
- invalid_timestamps = [
- "00:60:02.345", # minutes > 59
- "00:01:60.345", # seconds > 59
- "00:01:02.1000", # milliseconds > 999
- "01:02:03", # missing milliseconds
- "01:02", # missing milliseconds
- ":01:02.345", # extra : for missing hours
- "abc:01:02.345", # invalid format
- ]
- for ts in invalid_timestamps:
- with pytest.raises(ValidationError):
- _WebVTTTimestamp(raw=ts)
-
- """Test the timestamp __str__ method."""
- model = _WebVTTTimestamp(raw="00:01:02.345")
- assert str(model) == "00:01:02.345"
-
- """Test valid cue timings."""
- start = _WebVTTTimestamp(raw="00:10.005")
- end = _WebVTTTimestamp(raw="00:14.007")
- cue_timings = _WebVTTCueTimings(start=start, end=end)
- assert cue_timings.start == start
- assert cue_timings.end == end
- assert str(cue_timings) == "00:10.005 --> 00:14.007"
-
- """Test invalid cue timings with end timestamp before start."""
- start = _WebVTTTimestamp(raw="00:10.700")
- end = _WebVTTTimestamp(raw="00:10.500")
- with pytest.raises(ValidationError) as excinfo:
- _WebVTTCueTimings(start=start, end=end)
- assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
-
- """Test invalid cue timings with missing end."""
- start = _WebVTTTimestamp(raw="00:10.500")
- with pytest.raises(ValidationError) as excinfo:
- _WebVTTCueTimings(start=start)
- assert "Field required" in str(excinfo.value)
-
- """Test invalid cue timings with missing start."""
- end = _WebVTTTimestamp(raw="00:10.500")
- with pytest.raises(ValidationError) as excinfo:
- _WebVTTCueTimings(end=end)
- assert "Field required" in str(excinfo.value)
-
- """Test with valid text."""
- valid_text = "This is a valid cue text span."
- span = _WebVTTCueTextSpan(text=valid_text)
- assert span.text == valid_text
- assert str(span) == valid_text
-
- """Test with text containing newline characters."""
- invalid_text = "This cue text span\ncontains a newline."
- with pytest.raises(ValidationError):
- _WebVTTCueTextSpan(text=invalid_text)
-
- """Test with text containing ampersand."""
- invalid_text = "This cue text span contains &."
- with pytest.raises(ValidationError):
- _WebVTTCueTextSpan(text=invalid_text)
-
- """Test with text containing less-than sign."""
- invalid_text = "This cue text span contains <."
- with pytest.raises(ValidationError):
- _WebVTTCueTextSpan(text=invalid_text)
-
- """Test with empty text."""
- with pytest.raises(ValidationError):
- _WebVTTCueTextSpan(text="")
-
- """Test that annotation validation works correctly."""
- valid_annotation = "valid-annotation"
- invalid_annotation = "invalid\nannotation"
- with pytest.raises(ValidationError):
- _WebVTTCueVoiceSpan(annotation=invalid_annotation)
- assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
-
- """Test that classes validation works correctly."""
- annotation = "speaker name"
- valid_classes = ["class1", "class2"]
- invalid_classes = ["class\nwith\nnewlines", ""]
- with pytest.raises(ValidationError):
- _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
- assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
-
- """Test that components validation works correctly."""
- annotation = "speaker name"
- valid_components = [_WebVTTCueTextSpan(text="random text")]
- invalid_components = [123, "not a component"]
- with pytest.raises(ValidationError):
- _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
- assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
-
- """Test valid cue voice spans."""
- cue_span = _WebVTTCueVoiceSpan(
- annotation="speaker",
- classes=["loud", "clear"],
- components=[_WebVTTCueTextSpan(text="random text")],
- )
-
- expected_str = "random text"
- assert str(cue_span) == expected_str
-
- cue_span = _WebVTTCueVoiceSpan(
- annotation="speaker",
- components=[_WebVTTCueTextSpan(text="random text")],
- )
- expected_str = "random text"
- assert str(cue_span) == expected_str
-
-
-def test_webvtt_file():
- """Test WebVTT files."""
- with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
- content = f.read()
- vtt = _WebVTTFile.parse(content)
- assert len(vtt) == 13
- block = vtt.cue_blocks[11]
- assert str(block.timings) == "00:32.500 --> 00:33.500"
- assert len(block.payload) == 1
- cue_span = block.payload[0]
- assert isinstance(cue_span, _WebVTTCueVoiceSpan)
- assert cue_span.annotation == "Neil deGrasse Tyson"
- assert not cue_span.classes
- assert len(cue_span.components) == 1
- comp = cue_span.components[0]
- assert isinstance(comp, _WebVTTCueItalicSpan)
- assert len(comp.components) == 1
- comp2 = comp.components[0]
- assert isinstance(comp2, _WebVTTCueTextSpan)
- assert comp2.text == "Laughs"
-
- with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
- content = f.read()
- vtt = _WebVTTFile.parse(content)
- assert len(vtt) == 4
- reverse = (
- "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
- "https://www.w3.org/TR/webvtt1/\n\n"
- )
- reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
- assert content == reverse
-
- with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
- content = f.read()
- vtt = _WebVTTFile.parse(content)
- assert len(vtt) == 13
- for block in vtt:
- assert block.identifier
- block = vtt.cue_blocks[0]
- assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
- assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
- assert len(block.payload) == 1
- assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
- block = vtt.cue_blocks[2]
- assert isinstance(cue_span, _WebVTTCueVoiceSpan)
- assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
- assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
- assert len(block.payload) == 1
- assert isinstance(block.payload[0], _WebVTTCueTextSpan)
- assert block.payload[0].text == "Good."
-
-
def test_e2e_vtt_conversions():
directory = Path("./tests/data/webvtt/")
vtt_paths = sorted(directory.rglob("*.vtt"))
@@ -230,3 +40,252 @@ def test_e2e_vtt_conversions():
)
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
+
+
+def _create_vtt_stream(content: str) -> DocumentStream:
+ """Wrap WebVTT text in an in-memory ``DocumentStream`` named ``test.vtt``.
+
+ The content is stripped of surrounding whitespace and encoded to bytes.
+ The helper also asserts that format guessing identifies the stream as
+ ``InputFormat.VTT`` before returning it.
+ """
+ stream = DocumentStream(name="test.vtt", stream=BytesIO(content.strip().encode()))
+ # An input object with no sources is created only to reach `_guess_format`.
+ dci = _DocumentConversionInput(path_or_stream_iterator=[])
+ assert dci._guess_format(stream) == InputFormat.VTT
+
+ return stream
+
+
+def _process_vtt_doc(doc: DoclingDocument) -> str:
+ """Flatten the track-sourced text items of a converted document into one string.
+
+ Only entries of ``doc.texts`` that are ``TextItem`` instances and whose first
+ ``source`` entry has kind ``"track"`` are considered. A single space is added
+ for items whose resolved parent is a ``GroupItem``, and the accumulated
+ string is returned with leading and trailing whitespace stripped.
+ """
+ text: str = ""
+ for item in doc.texts:
+ if (
+ isinstance(item, TextItem)
+ and item.source
+ and item.source[0].kind == "track"
+ ):
+ # Resolve the parent reference against the document to inspect its type.
+ parent = item.parent.resolve(doc)
+ if parent and isinstance(parent, GroupItem):
+ text += " "
+ text += item.text
+
+ return text.strip()
+
+
+@pytest.fixture(scope="module")
+def converter() -> DocumentConverter:
+ """Provide a default-configured ``DocumentConverter`` as a module-scoped fixture."""
+ return DocumentConverter()
+
+
+def test_simple_two_cues_basic(converter):
+ """Two plain cues without identifiers yield their texts separated by a space."""
+ vtt = """
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+Hello world!
+
+00:00:02.500 --> 00:00:04.000
+Second cue.
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ expected = "Hello world! Second cue."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_cue_ids_present_are_ignored_in_output(converter):
+ """Cue identifiers (``1`` and ``2``) must not appear in the extracted text."""
+ vtt = """
+WEBVTT
+
+1
+00:00:00.000 --> 00:00:01.000
+First with ID.
+
+2
+00:00:01.250 --> 00:00:02.000
+Second with ID.
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ expected = "First with ID. Second with ID."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_multi_line_cue_text_preserved(converter):
+ """Every line of a multi-line cue payload is kept in the extracted text.
+
+ The lines of both cues are expected to come out joined by single spaces.
+ """
+ vtt = """
+WEBVTT
+
+00:00:00.000 --> 00:00:03.000
+This is line one.
+This is line two.
+
+00:00:03.500 --> 00:00:05.000
+Another cue line one.
+Another cue line two.
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ expected = "This is line one. This is line two. Another cue line one. Another cue line two."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_styling_and_voice_tags_stripped(converter):
+ """Check the plain text extracted from cues meant to exercise styling and voice markup.
+
+ The asserted string is the temporary ground truth referenced in the TODO
+ below; it differs from the intended output by a space before ``!``.
+ """
+ vtt = """
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+Hello there!
+
+00:00:02.200 --> 00:00:04.000
+Styled and voiced text.
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ # Expect tags removed but inner text retained, spacing preserved.
+ # expected = "Hello there! Styled and voiced text."
+ # TODO: temporary ground truth (issue docling-project/docling-core/#371)
+ expected = "Hello there ! Styled and voiced text."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_blank_cue_contributes_no_text(converter):
+ """A cue that has a timing line but no payload adds nothing to the extracted text.
+
+ Note that ``_create_vtt_stream`` strips the content, so the stream ends
+ directly after the second cue's timing line.
+ """
+ # First cue has text; second cue is intentionally blank (zero transcript lines).
+ vtt = """
+WEBVTT
+
+00:00:00.000 --> 00:00:02.000
+Visible text.
+
+00:00:02.500 --> 00:00:04.000
+
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ expected = "Visible text."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_note_blocks_are_ignored(converter):
+ """``NOTE`` comment blocks contribute no text to the converted document.
+
+ The input has a multi-line note before the first cue and a single-line
+ note between the cues, each surrounded by extra blank lines.
+ """
+ vtt = """
+WEBVTT
+
+
+NOTE This is a file-level note
+It can span multiple lines.
+
+
+00:00:00.000 --> 00:00:02.000
+First cue text.
+
+
+NOTE Another note between cues
+
+
+00:00:02.500 --> 00:00:04.000
+Second cue text.
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ expected = "First cue text. Second cue text."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_region_block_ignored_but_region_reference_ok(converter):
+ """A ``REGION`` definition block adds no text, and a cue referencing it still converts.
+
+ The first cue carries ``region:top`` plus further cue settings on its
+ timing line; only the cue payloads are expected in the output.
+ """
+ vtt = """
+WEBVTT
+
+REGION
+id:top
+width:40%
+lines:3
+
+00:00:00.000 --> 00:00:02.000 region:top line:90% position:50% size:35% align:start
+Top region text.
+
+00:00:02.500 --> 00:00:04.000
+Normal region text.
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ expected = "Top region text. Normal region text."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_varied_timestamp_formats_and_settings_ignored(converter):
+ """Both timestamp notations are accepted and cue settings do not leak into the text."""
+ # First cue uses MM:SS.mmm; second uses HH:MM:SS.mmm and includes settings.
+ vtt = """
+WEBVTT
+
+00:01.000 --> 00:03.000
+Under one minute format.
+
+01:00:00.000 --> 01:00:02.000 line:0 position:10% align:end
+Hour format with settings.
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ expected = "Under one minute format. Hour format with settings."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_cue_ids_plus_multiline_with_voice_and_style(converter):
+ """Combine named cue identifiers, multi-line payloads and extra blank lines in one file.
+
+ The asserted string is the temporary ground truth referenced in the TODO
+ below; it differs from the intended output by spaces before ``,`` and the
+ final ``.``.
+ """
+ # Mix multiple concepts: cue IDs, multi-line text, voice tags, style tags.
+ vtt = """
+WEBVTT
+
+
+
+intro
+00:00:00.000 --> 00:00:02.000
+Welcome to the show.
+Enjoy your time.
+
+
+
+outro
+00:00:02.500 --> 00:00:04.000
+Goodbye, see you soon.
+"""
+ stream = _create_vtt_stream(vtt)
+ doc = converter.convert(stream).document
+
+ # expected = "Welcome to the show. Enjoy your time. Goodbye, see you soon."
+ # TODO: temporary ground truth (issue docling-project/docling-core/#371)
+ expected = "Welcome to the show. Enjoy your time. Goodbye , see you soon ."
+ assert _process_vtt_doc(doc) == expected
+
+
+def test_style_blocks_and_note_between_styles_are_ignored(converter):
+ """``STYLE`` blocks and a ``NOTE`` between them add no text and raise no warnings.
+
+ Conversion runs with warnings escalated to errors, so any warning emitted
+ while handling these blocks fails the test. The asserted string is the
+ temporary ground truth referenced in the TODO below; it differs from the
+ intended output by a space before the final ``.``.
+ """
+ vtt = """
+WEBVTT
+
+STYLE
+::cue {
+ background-image: linear-gradient(to bottom, dimgray, lightgray);
+ color: papayawhip;
+}
+/* Style blocks cannot use blank lines nor "dash dash greater than" */
+
+NOTE comment blocks can be used between style blocks.
+
+STYLE
+::cue(b) {
+ color: peachpuff;
+}
+
+hello
+00:00:00.000 --> 00:00:10.000
+Hello world.
+"""
+ stream = _create_vtt_stream(vtt)
+ with warnings.catch_warnings():
+ # STYLE and NOTE blocks should be ignored without warnings
+ warnings.simplefilter("error")
+ doc = converter.convert(stream).document
+
+ # expected = "Hello world."
+ # TODO: temporary ground truth (issue docling-project/docling-core/#371)
+ expected = "Hello world ."
+ assert _process_vtt_doc(doc) == expected
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 93f33e1fd1..5f559b511c 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -241,6 +241,20 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
# TODO: add bbox check with tolerance
+ # Validate source
+ assert bool(true_item.source) == bool(pred_item.source), (
+ "Source exists mismatch"
+ )
+ if true_item.source:
+ true_source = true_item.source[0]
+ pred_source = pred_item.source[0]
+ assert true_source.start_time == pred_source.start_time, (
+ "TrackProvenance start time mismatch"
+ )
+ assert true_source.end_time == pred_source.end_time, (
+ "TrackProvenance end time mismatch"
+ )
+
# Validate text content
if isinstance(true_item, TextItem):
assert isinstance(pred_item, TextItem), (
diff --git a/uv.lock b/uv.lock
index f393b112f0..52390d581c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1035,7 +1035,7 @@ requires-dist = [
{ name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
{ name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
{ name = "certifi", specifier = ">=2024.7.4" },
- { name = "docling-core", extras = ["chunking"], specifier = ">=2.58.0,<3.0.0" },
+ { name = "docling-core", extras = ["chunking"], specifier = ">=2.62.0,<3.0.0" },
{ name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
{ name = "docling-parse", specifier = ">=4.7.0,<5.0.0" },
{ name = "easyocr", marker = "extra == 'easyocr'", specifier = ">=1.7,<2.0" },
@@ -1119,7 +1119,7 @@ examples = [
[[package]]
name = "docling-core"
-version = "2.60.2"
+version = "2.62.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jsonref" },
@@ -1133,9 +1133,9 @@ dependencies = [
{ name = "typer" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/7d/e6/7ed57bc580f136db0a7457305ec63366f22c999b674ef5f7c0abe452d79f/docling_core-2.60.2.tar.gz", hash = "sha256:7a99e1671e796e39d0c735b7ae3833766a97ad287e15d434dfa417917e3b0e6d", size = 231978, upload-time = "2026-01-23T12:29:18.506Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/21/20d58a48f4baa9e16d49aaccf3048346a8e7833b65b09144315bf1d956db/docling_core-2.62.0.tar.gz", hash = "sha256:147c958fe3b552db5e78b5a301dba19349820066ec5ef189b67eb5ed00306a07", size = 250107, upload-time = "2026-01-30T14:01:44.448Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/fa/5f/d39dd904b602f3a4072f1a7c38636702c32ed36d49aaafb21ea059face28/docling_core-2.60.2-py3-none-any.whl", hash = "sha256:63aee783f06240455c12c30e9af383b80d7ade80c896f81d68a4aff6cde2e2a1", size = 222319, upload-time = "2026-01-23T12:29:17.109Z" },
+ { url = "https://files.pythonhosted.org/packages/c5/89/e5204af5669e6b73bfdf304fc3e4c6b4b98b10d06b8bd7dc186b5190c9f3/docling_core-2.62.0-py3-none-any.whl", hash = "sha256:0073ccbd0c9cf514b38be7d53ccd78ee7b92723294a623a3f36eb7a7aea67bf0", size = 238084, upload-time = "2026-01-30T14:01:43.059Z" },
]
[package.optional-dependencies]