docling-project · dolfim-ibm · Aug 7, 2024 · Aug 7, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -15,7 +15,6 @@ COPY examples/minimal.py /root/minimal.py
 
 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
 RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
-RUN wget "https://www.ibm.com/docs/en/SSQRB8/com.ibm.spectrum.si.pdfs/IBM_Storage_Insights_Fact_Sheet.pdf" -O /root/factsheet.pdf
 
 # On container shell:
 # > cd /root/

diff --git a/README.md b/README.md
@@ -56,17 +56,21 @@ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotate
 
 ### Convert a batch of documents
 
-For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
 
 From a local repo clone, you can run it with:
 
 ```
-python examples/convert.py
+python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
 
 ### Adjust pipeline features
 
+The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) shows several ways
+to adjust the conversion pipeline and its features.
+
+
 #### Control pipeline options
 
 You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:

diff --git a/examples/convert.py → examples/batch_convert.py b/examples/convert.py → examples/batch_convert.py
@@ -4,9 +4,7 @@
 from pathlib import Path
 from typing import Iterable
 
-# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
@@ -52,16 +50,7 @@ def main():
         Path("./test/data/2305.03393v1.pdf"),
     ]
 
-    artifacts_path = DocumentConverter.download_models_hf()
-
-    pipeline_options = PipelineOptions(do_table_structure=True)
-    pipeline_options.table_structure_options.do_cell_matching = True
-
-    doc_converter = DocumentConverter(
-        artifacts_path=artifacts_path,
-        pipeline_options=pipeline_options,
-        pdf_backend=DoclingParseDocumentBackend,
-    )
+    doc_converter = DocumentConverter()
 
     input = DocumentConversionInput.from_paths(input_doc_paths)
 

diff --git a/examples/custom_convert.py b/examples/custom_convert.py
@@ -0,0 +1,125 @@
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Iterable
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+
+def export_documents(
+    converted_docs: Iterable[ConvertedDocument],
+    output_dir: Path,
+):
+    """Write every successfully converted document into ``output_dir``.
+
+    For each document whose status is ``ConversionStatus.SUCCESS`` two files
+    named after the stem of the input file are written: ``<stem>.json`` (the
+    dict rendering serialized as JSON) and ``<stem>.md`` (the Markdown
+    rendering). Documents with any other status are logged and counted as
+    failures. A summary line is logged once all documents were visited.
+
+    Args:
+        converted_docs: Conversion results to export; iterated exactly once.
+        output_dir: Target directory; created (including parents) if missing.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    success_count = 0
+    failure_count = 0
+
+    for doc in converted_docs:
+        if doc.status == ConversionStatus.SUCCESS:
+            success_count += 1
+            doc_filename = doc.input.file.stem
+
+            # The files are written as UTF-8 explicitly so the result does not
+            # depend on the platform's locale encoding (extracted PDF text
+            # routinely contains non-ASCII characters).
+
+            # Export Deep Search document JSON format:
+            json_path = output_dir / f"{doc_filename}.json"
+            json_path.write_text(json.dumps(doc.render_as_dict()), encoding="utf-8")
+
+            # Export Markdown format:
+            md_path = output_dir / f"{doc_filename}.md"
+            md_path.write_text(doc.render_as_markdown(), encoding="utf-8")
+        else:
+            _log.info(f"Document {doc.input.file} failed to convert.")
+            failure_count += 1
+
+    _log.info(
+        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
+    )
+
+
+def main():
+    """Convert the sample PDFs with a custom pipeline and export to ``./scratch``.
+
+    The body contains several alternative configuration sections, each pairing
+    a ``PipelineOptions`` setup with a PDF backend. Exactly one section should
+    be active (uncommented) at a time; the active one defines ``doc_converter``.
+    The total conversion and export time is logged at the end.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./test/data/2206.01062.pdf"),
+        Path("./test/data/2203.01017v2.pdf"),
+        Path("./test/data/2305.03393v1.pdf"),
+    ]
+
+    ###########################################################################
+
+    # The following sections contain a combination of PipelineOptions
+    # and PDF Backends for various configurations.
+    # Uncomment one section at a time to see the differences in the output.
+
+    # PyPdfium without OCR
+    # --------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = False
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = False
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=PyPdfiumDocumentBackend,
+    # )
+
+    # PyPdfium with OCR
+    # -----------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=PyPdfiumDocumentBackend,
+    # )
+
+    # Docling Parse without OCR
+    # -------------------------
+    pipeline_options = PipelineOptions()
+    pipeline_options.do_ocr = False
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
+    )
+
+    # Docling Parse with OCR
+    # ----------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=DoclingParseDocumentBackend,
+    # )
+
+    ###########################################################################
+
+    # Define input files (named so that the builtin `input` is not shadowed)
+    conversion_input = DocumentConversionInput.from_paths(input_doc_paths)
+
+    # perf_counter is monotonic, so the measured duration is unaffected by
+    # system clock adjustments that happen while the conversion runs.
+    start_time = time.perf_counter()
+
+    converted_docs = doc_converter.convert(conversion_input)
+    # The export is timed as well: it is the step that iterates converted_docs.
+    export_documents(converted_docs, output_dir=Path("./scratch"))
+
+    elapsed = time.perf_counter() - start_time
+
+    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/minimal.py b/examples/minimal.py
@@ -1,11 +1,8 @@
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
-artifacts_path = DocumentConverter.download_models_hf()
-doc_converter = DocumentConverter(artifacts_path=artifacts_path)
-
-input = DocumentConversionInput.from_paths(["factsheet.pdf"])
-converted_docs = doc_converter.convert(input)
-
-for d in converted_docs:
-    print(d.render_as_dict())
+# Minimal example: convert one document with a default-constructed converter and print it as Markdown.
+source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+converter = DocumentConverter()
+doc = converter.convert_single(source)
+print(
+    doc.export_to_markdown()
+)  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"