fix: Correct text extraction for table cells (#21)

* - Fixes for scaling transformation for table cell bounding boxes when using do_cell_matching = False
  - Corrected examples/convert.py with appropriate parameter, for good quality example conversion

  Signed-off-by: Maxim Lysak <[email protected]>

* Completed checks

  Signed-off-by: Maxim Lysak <[email protected]>

---------

Signed-off-by: Maxim Lysak <[email protected]>
Co-authored-by: Maxim Lysak <[email protected]>
DS4SD · Jul 30, 2024 · f4bf3d2 · f4bf3d2
1 parent b07c4a7
commit f4bf3d2
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 3 deletions.
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
@@ -114,12 +114,15 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
                     for element in table_out["tf_responses"]:
 
                         if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(element["bbox"])
+                            the_bbox = BoundingBox.model_validate(
+                                element["bbox"]
+                            ).scaled(1 / self.scale)
                             text_piece = page._backend.get_text_in_rect(the_bbox)
                             element["bbox"]["token"] = text_piece
 
                         tc = TableCell.model_validate(element)
-                        tc.bbox = tc.bbox.scaled(1 / self.scale)
+                        if self.do_cell_matching:
+                            tc.bbox = tc.bbox.scaled(1 / self.scale)
                         table_cells.append(tc)
 
                     # Retrieving cols/rows, after post processing:

diff --git a/examples/convert.py b/examples/convert.py
@@ -53,7 +53,13 @@ def main():
 
     artifacts_path = DocumentConverter.download_models_hf()
 
-    doc_converter = DocumentConverter(artifacts_path=artifacts_path)
+    pipeline_options = PipelineOptions(do_table_structure=True)
+    # use text cells predicted from table structure model, instead of matching with pdf cells
+    pipeline_options.table_structure_options.do_cell_matching = False
+
+    doc_converter = DocumentConverter(
+        artifacts_path=artifacts_path, pipeline_options=pipeline_options
+    )
 
     input = DocumentConversionInput.from_paths(input_doc_paths)