Skip to content

Commit

Permalink
feat: update parser with bytesio interface and set as new default backend (#32)
Browse files Browse the repository at this point in the history

* update parser with bytesio interface

Signed-off-by: Michele Dolfi <[email protected]>

* change default backend

Signed-off-by: Michele Dolfi <[email protected]>

* update DEFAULT_BACKEND

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Aug 14, 2024
1 parent 61be78a commit 90dd676
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 32 deletions.
7 changes: 4 additions & 3 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,11 @@ def __init__(self, path_or_stream: Union[BytesIO, Path]):
super().__init__(path_or_stream)
self._pdoc = pdfium.PdfDocument(path_or_stream)
# Parsing cells with docling_parser call
if isinstance(path_or_stream, BytesIO):
raise NotImplemented("This backend does not support byte streams yet.")
parser = pdf_parser()
self._parser_doc = parser.find_cells(str(path_or_stream))
if isinstance(path_or_stream, BytesIO):
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
else:
self._parser_doc = parser.find_cells(str(path_or_stream))

def page_count(self) -> int:
return len(self._parser_doc["pages"])
Expand Down
6 changes: 3 additions & 3 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pydantic import BaseModel

from docling.backend.abstract_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
Expand Down Expand Up @@ -64,7 +64,7 @@ def __init__(
path_or_stream: Union[BytesIO, Path],
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
pdf_backend=PyPdfiumDocumentBackend,
pdf_backend=DoclingParseDocumentBackend,
):
super().__init__()

Expand Down Expand Up @@ -308,7 +308,7 @@ class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
limits: Optional[DocumentLimits] = DocumentLimits()

DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend

def docs(
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
Expand Down
103 changes: 78 additions & 25 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1"
requests = "^2.32.3"
easyocr = { version = "^1.7", optional = true }
docling-parse = "^0.0.1"
docling-parse = "^0.2.0"
certifi = ">=2024.7.4"

[tool.poetry.group.dev.dependencies]
Expand Down

0 comments on commit 90dd676

Please sign in to comment.