fix: Add unit tests (#51)

* add the pytests
  Signed-off-by: Peter Staar <[email protected]>
* renamed the test folder and added the toplevel test
  Signed-off-by: Peter Staar <[email protected]>
* updated the toplevel function test
  Signed-off-by: Peter Staar <[email protected]>
* need to start running all tests successfully
  Signed-off-by: Peter Staar <[email protected]>
* added the reference converted documents
  Signed-off-by: Peter Staar <[email protected]>
* added first test for json and md output
  Signed-off-by: Peter Staar <[email protected]>
* ran pre-commit
  Signed-off-by: Peter Staar <[email protected]>
* replaced deprecated json function with model_dump_json
  Signed-off-by: Peter Staar <[email protected]>
* replaced deprecated json function with model_dump_json
  Signed-off-by: Peter Staar <[email protected]>
* reformatted code
  Signed-off-by: Peter Staar <[email protected]>
* Fix backend tests
  Signed-off-by: Christoph Auer <[email protected]>
* commented out the drawing
  Signed-off-by: Peter Staar <[email protected]>
* ci: avoid duplicate runs
  Signed-off-by: Michele Dolfi <[email protected]>
* commented out json verification for now
  Signed-off-by: Peter Staar <[email protected]>
* added verification of input cells
  Signed-off-by: Peter Staar <[email protected]>
* reformat code
  Signed-off-by: Peter Staar <[email protected]>
* added test to verify the cells in the pages
  Signed-off-by: Peter Staar <[email protected]>
* added test to verify the cells in the pages (2)
  Signed-off-by: Peter Staar <[email protected]>
* added test to verify the cells in the pages (3)
  Signed-off-by: Peter Staar <[email protected]>
* run all examples in CI
  Signed-off-by: Michele Dolfi <[email protected]>
* make sure examples return failures
  Signed-off-by: Michele Dolfi <[email protected]>
* raise a failure if examples fail
  Signed-off-by: Michele Dolfi <[email protected]>
* fix examples
  Signed-off-by: Michele Dolfi <[email protected]>
* run examples after tests
  Signed-off-by: Michele Dolfi <[email protected]>
* Add tests and update top_level_tests using only datamodels
  Signed-off-by: Christoph Auer <[email protected]>
* Remove unnecessary code
  Signed-off-by: Christoph Auer <[email protected]>
* Validate conversion status on e2e test
  Signed-off-by: Christoph Auer <[email protected]>
* package verify utils and add more tests
  Signed-off-by: Michele Dolfi <[email protected]>
* reduce docs in example, since they are already in the tests
  Signed-off-by: Michele Dolfi <[email protected]>
* skip batch_convert
  Signed-off-by: Michele Dolfi <[email protected]>
* pin docling-parse 1.1.2
  Signed-off-by: Michele Dolfi <[email protected]>
* updated the error messages
  Signed-off-by: Peter Staar <[email protected]>
* commented out the json verification for now
  Signed-off-by: Peter Staar <[email protected]>
* bumped GLM version
  Signed-off-by: Peter Staar <[email protected]>
* Fix lockfile
  Signed-off-by: Christoph Auer <[email protected]>
* Pin new docling-parse v1.1.3
  Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
DS4SD · Aug 30, 2024 · 48f4d1b · 48f4d1b
1 parent 256f4d5
commit 48f4d1b
Show file tree

Hide file tree

Showing 43 changed files with 4,999 additions and 353 deletions.
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
@@ -14,3 +14,22 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Run styling check
         run: poetry run pre-commit run --all-files
+      - name: Install with poetry
+        run: poetry install --all-extras
+      - name: Testing
+        run: |
+          poetry run pytest -v tests
+      - name: Run examples
+        run: |
+          for file in examples/*.py; do
+            # Skip batch_convert.py
+            if [[ "$(basename "$file")" == "batch_convert.py" ]]; then
+                echo "Skipping $file"
+                continue
+            fi
+
+            echo "Running example $file"
+            poetry run python "$file" || exit 1
+          done
+      - name: Build with poetry
+        run: poetry build
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@ name: "Run CI"
 
 on:
   pull_request:
-    types: [opened, reopened, synchronize, ready_for_review]
+    types: [opened, reopened]
   push:
     branches:
       - "**"
@@ -25,4 +25,4 @@ jobs:
   #     - uses: ./.github/actions/setup-poetry
   #     - name: Build docs
   #       run: poetry run mkdocs build --verbose --clean
-
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -4,15 +4,15 @@ repos:
     hooks:
       - id: system
         name: Black
-        entry: poetry run black docling examples
+        entry: poetry run black docling examples tests
         pass_filenames: false
         language: system
         files: '\.py$'
   - repo: local
     hooks:
       - id: system
         name: isort
-        entry: poetry run isort docling examples
+        entry: poetry run isort docling examples tests
         pass_filenames: false
         language: system
         files: '\.py$'

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -238,9 +238,9 @@ class EquationPrediction(BaseModel):
 
 class PagePredictions(BaseModel):
     layout: LayoutPrediction = None
-    tablestructure: TableStructurePrediction = None
-    figures_classification: FigureClassificationPrediction = None
-    equations_prediction: EquationPrediction = None
+    tablestructure: Optional[TableStructurePrediction] = None
+    figures_classification: Optional[FigureClassificationPrediction] = None
+    equations_prediction: Optional[EquationPrediction] = None
 
 
 PageElement = Union[TextElement, TableElement, FigureElement]

diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py
@@ -16,8 +16,12 @@
 class GlmModel:
     def __init__(self, config):
         self.config = config
+        self.model_names = self.config.get(
+            "model_names", ""
+        )  # "language;term;reference"
         load_pretrained_nlp_models()
-        model = init_nlp_model(model_names="language;term;reference")
+        # model = init_nlp_model(model_names="language;term;reference")
+        model = init_nlp_model(model_names=self.model_names)
         self.model = model
 
     def __call__(self, conv_res: ConversionResult) -> DsDocument:

diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
@@ -44,7 +44,16 @@ def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
 
             for tc in table_element.table_cells:
                 x0, y0, x1, y1 = tc.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="blue")
+                if tc.column_header:
+                    width = 3
+                else:
+                    width = 1
+                draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
+                draw.text(
+                    (x0 + 3, y0 + 3),
+                    text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
+                    fill="black",
+                )
 
         image.show()
 

diff --git a/examples/batch_convert.py b/examples/batch_convert.py
@@ -49,17 +49,18 @@ def export_documents(
         f"of which {failure_count} failed "
         f"and {partial_success_count} were partially converted."
     )
+    return success_count, partial_success_count, failure_count
 
 
 def main():
     logging.basicConfig(level=logging.INFO)
 
     input_doc_paths = [
-        Path("./test/data/2206.01062.pdf"),
-        Path("./test/data/2203.01017v2.pdf"),
-        Path("./test/data/2305.03393v1.pdf"),
-        Path("./test/data/redp5110.pdf"),
-        Path("./test/data/redp5695.pdf"),
+        Path("./tests/data/2206.01062.pdf"),
+        Path("./tests/data/2203.01017v2.pdf"),
+        Path("./tests/data/2305.03393v1.pdf"),
+        Path("./tests/data/redp5110.pdf"),
+        Path("./tests/data/redp5695.pdf"),
     ]
 
     # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
@@ -73,12 +74,19 @@ def main():
     start_time = time.time()
 
     conv_results = doc_converter.convert(input)
-    export_documents(conv_results, output_dir=Path("./scratch"))
+    success_count, partial_success_count, failure_count = export_documents(
+        conv_results, output_dir=Path("./scratch")
+    )
 
     end_time = time.time() - start_time
 
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")
 
+    if failure_count > 0:
+        raise RuntimeError(
+            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
+        )
+
 
 if __name__ == "__main__":
     main()
diff --git a/examples/custom_convert.py b/examples/custom_convert.py
@@ -42,14 +42,14 @@ def export_documents(
         f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
     )
 
+    return success_count, failure_count
+
 
 def main():
     logging.basicConfig(level=logging.INFO)
 
     input_doc_paths = [
-        Path("./test/data/2206.01062.pdf"),
-        Path("./test/data/2203.01017v2.pdf"),
-        Path("./test/data/2305.03393v1.pdf"),
+        Path("./tests/data/2206.01062.pdf"),
     ]
 
     ###########################################################################
@@ -114,12 +114,19 @@ def main():
     start_time = time.time()
 
     conv_results = doc_converter.convert(input)
-    export_documents(conv_results, output_dir=Path("./scratch"))
+    success_count, failure_count = export_documents(
+        conv_results, output_dir=Path("./scratch")
+    )
 
     end_time = time.time() - start_time
 
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")
 
+    if failure_count > 0:
+        raise RuntimeError(
+            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
+        )
+
 
 if __name__ == "__main__":
     main()
diff --git a/examples/export_figures.py b/examples/export_figures.py
@@ -22,7 +22,7 @@ def main():
     logging.basicConfig(level=logging.INFO)
 
     input_doc_paths = [
-        Path("./test/data/2206.01062.pdf"),
+        Path("./tests/data/2206.01062.pdf"),
     ]
     output_dir = Path("./scratch")
 
@@ -41,10 +41,13 @@ def main():
 
     conv_results = doc_converter.convert(input_files)
 
+    success_count = 0
+    failure_count = 0
     output_dir.mkdir(parents=True, exist_ok=True)
     for conv_res in conv_results:
         if conv_res.status != ConversionStatus.SUCCESS:
             _log.info(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
             continue
 
         doc_filename = conv_res.input.file.stem
@@ -66,10 +69,17 @@ def main():
             with element_image_filename.open("wb") as fp:
                 image.save(fp, "PNG")
 
+        success_count += 1
+
     end_time = time.time() - start_time
 
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")
 
+    if failure_count > 0:
+        raise RuntimeError(
+            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
+        )
+
 
 if __name__ == "__main__":
     main()