Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
594 changes: 115 additions & 479 deletions docling/backend/webvtt_backend.py

Large diffs are not rendered by default.

54 changes: 29 additions & 25 deletions docling/pipeline/asr_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,35 @@
import logging
import os
import re
import sys
import tempfile
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Union, cast

from docling_core.types.doc import DoclingDocument, DocumentOrigin

# import whisper # type: ignore
# import librosa
# import numpy as np
# import soundfile as sf # type: ignore
from docling_core.types.doc.labels import DocItemLabel
from pydantic import BaseModel, Field, validator
from typing import Optional, Union

from docling_core.types.doc import (
ContentLayer,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
TrackSource,
)
from pydantic import BaseModel, Field

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.noop_backend import NoOpBackend

# from pydub import AudioSegment # type: ignore
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import (
ConversionStatus,
FormatToMimeType,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
)
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrMlxWhisperOptions,
InlineAsrNativeWhisperOptions,
# AsrResponseFormat,
InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
InferenceFramework,
)
from docling.datamodel.settings import settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import ProfilingScope, TimeRecorder
Expand Down Expand Up @@ -190,8 +178,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
)

for citem in conversation:
track: TrackSource = TrackSource(
start_time=citem.start_time,
end_time=citem.end_time,
voice=citem.speaker,
)
conv_res.document.add_text(
label=DocItemLabel.TEXT, text=citem.to_string()
label=DocItemLabel.TEXT,
text=citem.text,
content_layer=ContentLayer.BODY,
source=track,
)

return conv_res
Expand Down Expand Up @@ -299,8 +295,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
)

for citem in conversation:
track: TrackSource = TrackSource(
start_time=citem.start_time,
end_time=citem.end_time,
voice=citem.speaker,
)
conv_res.document.add_text(
label=DocItemLabel.TEXT, text=citem.to_string()
label=DocItemLabel.TEXT,
text=citem.text,
content_layer=ContentLayer.BODY,
source=track,
)

conv_res.status = ConversionStatus.SUCCESS
Expand Down
6 changes: 1 addition & 5 deletions docling/pipeline/vlm_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast
from typing import List, Union, cast

from docling_core.types.doc import (
BoundingBox,
Expand All @@ -12,16 +12,13 @@
ImageRef,
PictureItem,
ProvenanceItem,
TableCell,
TableData,
TextItem,
)
from docling_core.types.doc.base import (
BoundingBox,
Size,
)
from docling_core.types.doc.document import DocTagsDocument
from lxml import etree
from PIL import Image as PILImage

from docling.backend.abstract_backend import (
Expand All @@ -42,7 +39,6 @@
InlineVlmOptions,
ResponseFormat,
)
from docling.datamodel.settings import settings
from docling.models.vlm_pipeline_models.api_vlm_model import ApiVlmModel
from docling.models.vlm_pipeline_models.hf_transformers_model import (
HuggingFaceTransformersVlmModel,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ authors = [
requires-python = '>=3.10,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
'docling-core[chunking] (>=2.58.0,<3.0.0)',
'docling-core[chunking] (>=2.62.0,<3.0.0)',
'docling-parse (>=4.7.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)',
Expand Down
78 changes: 13 additions & 65 deletions tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
Original file line number Diff line number Diff line change
@@ -1,66 +1,14 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group WebVTT cue block
item-2 at level 2: text: 00:11.000 --> 00:13.000
item-3 at level 2: inline: group WebVTT cue voice span
item-4 at level 3: text: Roger Bingham:
item-5 at level 3: text: We are in New York City
item-6 at level 1: section: group WebVTT cue block
item-7 at level 2: text: 00:13.000 --> 00:16.000
item-8 at level 2: inline: group WebVTT cue voice span
item-9 at level 3: text: Roger Bingham:
item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
item-11 at level 1: section: group WebVTT cue block
item-12 at level 2: text: 00:16.000 --> 00:18.000
item-13 at level 2: inline: group WebVTT cue voice span
item-14 at level 3: text: Roger Bingham:
item-15 at level 3: text: from the American Museum of Natural History
item-16 at level 1: section: group WebVTT cue block
item-17 at level 2: text: 00:18.000 --> 00:20.000
item-18 at level 2: inline: group WebVTT cue voice span
item-19 at level 3: text: Roger Bingham:
item-20 at level 3: text: And with me is Neil deGrasse Tyson
item-21 at level 1: section: group WebVTT cue block
item-22 at level 2: text: 00:20.000 --> 00:22.000
item-23 at level 2: inline: group WebVTT cue voice span
item-24 at level 3: text: Roger Bingham:
item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
item-26 at level 1: section: group WebVTT cue block
item-27 at level 2: text: 00:22.000 --> 00:24.000
item-28 at level 2: inline: group WebVTT cue voice span
item-29 at level 3: text: Roger Bingham:
item-30 at level 3: text: at the AMNH.
item-31 at level 1: section: group WebVTT cue block
item-32 at level 2: text: 00:24.000 --> 00:26.000
item-33 at level 2: inline: group WebVTT cue voice span
item-34 at level 3: text: Roger Bingham:
item-35 at level 3: text: Thank you for walking down here.
item-36 at level 1: section: group WebVTT cue block
item-37 at level 2: text: 00:27.000 --> 00:30.000
item-38 at level 2: inline: group WebVTT cue voice span
item-39 at level 3: text: Roger Bingham:
item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
item-41 at level 1: section: group WebVTT cue block
item-42 at level 2: text: 00:30.000 --> 00:31.500
item-43 at level 2: inline: group WebVTT cue voice span
item-44 at level 3: text: Roger Bingham:
item-45 at level 3: text: When we e-mailed—
item-46 at level 1: section: group WebVTT cue block
item-47 at level 2: text: 00:30.500 --> 00:32.500
item-48 at level 2: inline: group WebVTT cue voice span
item-49 at level 3: text: Neil deGrasse Tyson:
item-50 at level 3: text: Didn’t we talk about enough in that conversation?
item-51 at level 1: section: group WebVTT cue block
item-52 at level 2: text: 00:32.000 --> 00:35.500
item-53 at level 2: inline: group WebVTT cue voice span
item-54 at level 3: text: Roger Bingham:
item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
item-56 at level 1: section: group WebVTT cue block
item-57 at level 2: text: 00:32.500 --> 00:33.500
item-58 at level 2: inline: group WebVTT cue voice span
item-59 at level 3: text: Neil deGrasse Tyson:
item-60 at level 3: text: Laughs
item-61 at level 1: section: group WebVTT cue block
item-62 at level 2: text: 00:35.500 --> 00:38.000
item-63 at level 2: inline: group WebVTT cue voice span
item-64 at level 3: text: Roger Bingham:
item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
item-1 at level 1: text: We are in New York City
item-2 at level 1: text: We’re actually at the Lucern Hotel, just down the street
item-3 at level 1: text: from the American Museum of Natural History
item-4 at level 1: text: And with me is Neil deGrasse Tyson
item-5 at level 1: text: Astrophysicist, Director of the Hayden Planetarium
item-6 at level 1: text: at the AMNH.
item-7 at level 1: text: Thank you for walking down here.
item-8 at level 1: text: And I want to do a follow-up on the last conversation we did.
item-9 at level 1: text: When we e-mailed—
item-10 at level 1: text: Didn’t we talk about enough in that conversation?
item-11 at level 1: text: No! No no no no; 'cos 'cos obviously 'cos
item-12 at level 1: text: Laughs
item-13 at level 1: text: You know I’m so excited my glasses are falling off here.
Loading