docling-project · ceberam · Jan 30, 2026 · Dec 8, 2025 · Dec 12, 2025 · Dec 14, 2025
diff --git a/docling/backend/webvtt_backend.py b/docling/backend/webvtt_backend.py
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
@@ -1,47 +1,35 @@
 import logging
-import os
-import re
 import sys
 import tempfile
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Union, cast
-
-from docling_core.types.doc import DoclingDocument, DocumentOrigin
-
-# import whisper  # type: ignore
-# import librosa
-# import numpy as np
-# import soundfile as sf  # type: ignore
-from docling_core.types.doc.labels import DocItemLabel
-from pydantic import BaseModel, Field, validator
+from typing import Optional, Union
+
+from docling_core.types.doc import (
+    ContentLayer,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    TrackSource,
+)
+from pydantic import BaseModel, Field
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.noop_backend import NoOpBackend
-
-# from pydub import AudioSegment  # type: ignore
-# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
 )
 from docling.datamodel.base_models import (
     ConversionStatus,
-    FormatToMimeType,
 )
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     AsrPipelineOptions,
 )
 from docling.datamodel.pipeline_options_asr_model import (
     InlineAsrMlxWhisperOptions,
     InlineAsrNativeWhisperOptions,
-    # AsrResponseFormat,
-    InlineAsrOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import (
-    InferenceFramework,
-)
-from docling.datamodel.settings import settings
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -190,8 +178,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
             )
 
             for citem in conversation:
+                track: TrackSource = TrackSource(
+                    start_time=citem.start_time,
+                    end_time=citem.end_time,
+                    voice=citem.speaker,
+                )
                 conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=citem.to_string()
+                    label=DocItemLabel.TEXT,
+                    text=citem.text,
+                    content_layer=ContentLayer.BODY,
+                    source=track,
                 )
 
             return conv_res
@@ -299,8 +295,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
             )
 
             for citem in conversation:
+                track: TrackSource = TrackSource(
+                    start_time=citem.start_time,
+                    end_time=citem.end_time,
+                    voice=citem.speaker,
+                )
                 conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=citem.to_string()
+                    label=DocItemLabel.TEXT,
+                    text=citem.text,
+                    content_layer=ContentLayer.BODY,
+                    source=track,
                 )
 
             conv_res.status = ConversionStatus.SUCCESS

diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
@@ -2,7 +2,7 @@
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import List, Optional, Union, cast
+from typing import List, Union, cast
 
 from docling_core.types.doc import (
     BoundingBox,
@@ -12,16 +12,13 @@
     ImageRef,
     PictureItem,
     ProvenanceItem,
-    TableCell,
-    TableData,
     TextItem,
 )
 from docling_core.types.doc.base import (
     BoundingBox,
     Size,
 )
 from docling_core.types.doc.document import DocTagsDocument
-from lxml import etree
 from PIL import Image as PILImage
 
 from docling.backend.abstract_backend import (
@@ -42,7 +39,6 @@
     InlineVlmOptions,
     ResponseFormat,
 )
-from docling.datamodel.settings import settings
 from docling.models.vlm_pipeline_models.api_vlm_model import ApiVlmModel
 from docling.models.vlm_pipeline_models.hf_transformers_model import (
     HuggingFaceTransformersVlmModel,

diff --git a/pyproject.toml b/pyproject.toml
@@ -44,7 +44,7 @@ authors = [
 requires-python = '>=3.10,<4.0'
 dependencies = [
   'pydantic (>=2.0.0,<3.0.0)',
-  'docling-core[chunking] (>=2.58.0,<3.0.0)',
+  'docling-core[chunking] (>=2.62.0,<3.0.0)',
   'docling-parse (>=4.7.0,<5.0.0)',
   "docling-ibm-models>=3.9.1,<4",
   'filetype (>=1.2.0,<2.0.0)',

diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -1,66 +1,14 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: section: group WebVTT cue block
-    item-2 at level 2: text: 00:11.000 --> 00:13.000
-    item-3 at level 2: inline: group WebVTT cue voice span
-      item-4 at level 3: text: Roger Bingham: 
-      item-5 at level 3: text: We are in New York City
-  item-6 at level 1: section: group WebVTT cue block
-    item-7 at level 2: text: 00:13.000 --> 00:16.000
-    item-8 at level 2: inline: group WebVTT cue voice span
-      item-9 at level 3: text: Roger Bingham: 
-      item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
-  item-11 at level 1: section: group WebVTT cue block
-    item-12 at level 2: text: 00:16.000 --> 00:18.000
-    item-13 at level 2: inline: group WebVTT cue voice span
-      item-14 at level 3: text: Roger Bingham: 
-      item-15 at level 3: text: from the American Museum of Natural History
-  item-16 at level 1: section: group WebVTT cue block
-    item-17 at level 2: text: 00:18.000 --> 00:20.000
-    item-18 at level 2: inline: group WebVTT cue voice span
-      item-19 at level 3: text: Roger Bingham: 
-      item-20 at level 3: text: And with me is Neil deGrasse Tyson
-  item-21 at level 1: section: group WebVTT cue block
-    item-22 at level 2: text: 00:20.000 --> 00:22.000
-    item-23 at level 2: inline: group WebVTT cue voice span
-      item-24 at level 3: text: Roger Bingham: 
-      item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
-  item-26 at level 1: section: group WebVTT cue block
-    item-27 at level 2: text: 00:22.000 --> 00:24.000
-    item-28 at level 2: inline: group WebVTT cue voice span
-      item-29 at level 3: text: Roger Bingham: 
-      item-30 at level 3: text: at the AMNH.
-  item-31 at level 1: section: group WebVTT cue block
-    item-32 at level 2: text: 00:24.000 --> 00:26.000
-    item-33 at level 2: inline: group WebVTT cue voice span
-      item-34 at level 3: text: Roger Bingham: 
-      item-35 at level 3: text: Thank you for walking down here.
-  item-36 at level 1: section: group WebVTT cue block
-    item-37 at level 2: text: 00:27.000 --> 00:30.000
-    item-38 at level 2: inline: group WebVTT cue voice span
-      item-39 at level 3: text: Roger Bingham: 
-      item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
-  item-41 at level 1: section: group WebVTT cue block
-    item-42 at level 2: text: 00:30.000 --> 00:31.500
-    item-43 at level 2: inline: group WebVTT cue voice span
-      item-44 at level 3: text: Roger Bingham: 
-      item-45 at level 3: text: When we e-mailed—
-  item-46 at level 1: section: group WebVTT cue block
-    item-47 at level 2: text: 00:30.500 --> 00:32.500
-    item-48 at level 2: inline: group WebVTT cue voice span
-      item-49 at level 3: text: Neil deGrasse Tyson: 
-      item-50 at level 3: text: Didn’t we talk about enough in that conversation?
-  item-51 at level 1: section: group WebVTT cue block
-    item-52 at level 2: text: 00:32.000 --> 00:35.500
-    item-53 at level 2: inline: group WebVTT cue voice span
-      item-54 at level 3: text: Roger Bingham: 
-      item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
-  item-56 at level 1: section: group WebVTT cue block
-    item-57 at level 2: text: 00:32.500 --> 00:33.500
-    item-58 at level 2: inline: group WebVTT cue voice span
-      item-59 at level 3: text: Neil deGrasse Tyson: 
-      item-60 at level 3: text: Laughs
-  item-61 at level 1: section: group WebVTT cue block
-    item-62 at level 2: text: 00:35.500 --> 00:38.000
-    item-63 at level 2: inline: group WebVTT cue voice span
-      item-64 at level 3: text: Roger Bingham: 
-      item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
+  item-1 at level 1: text: We are in New York City
+  item-2 at level 1: text: We’re actually at the Lucern Hotel, just down the street
+  item-3 at level 1: text: from the American Museum of Natural History
+  item-4 at level 1: text: And with me is Neil deGrasse Tyson
+  item-5 at level 1: text: Astrophysicist, Director of the Hayden Planetarium
+  item-6 at level 1: text: at the AMNH.
+  item-7 at level 1: text: Thank you for walking down here.
+  item-8 at level 1: text: And I want to do a follow-up on the last conversation we did.
+  item-9 at level 1: text: When we e-mailed—
+  item-10 at level 1: text: Didn’t we talk about enough in that conversation?
+  item-11 at level 1: text: No! No no no no; 'cos 'cos obviously 'cos
+  item-12 at level 1: text: Laughs
+  item-13 at level 1: text: You know I’m so excited my glasses are falling off here.