Skip to content

Commit

Permalink
Heuristic fallback for edge case tables
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 14, 2024
1 parent 97b8341 commit 859302c
Show file tree
Hide file tree
Showing 11 changed files with 279 additions and 25 deletions.
35 changes: 27 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Tabled is a small library for detecting and extracting tables. It uses [surya](

## Community

[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.

# Hosted API

Expand Down Expand Up @@ -87,26 +87,45 @@ pip install streamlit
tabled_gui
```

## From python

```python
from tabled.extract import extract_tables
from tabled.fileinput import load_pdfs_images
from tabled.inference.models import load_detection_models, load_recognition_models

det_models, rec_models = load_detection_models(), load_recognition_models()
images, highres_images, names, text_lines = load_pdfs_images(IN_PATH)

page_results = extract_tables(images, highres_images, text_lines, det_models, rec_models)
```

# Benchmarks

| Avg score | Time per table (s) | Total tables |
|-------------|--------------------|----------------|
| 0.91 | 0.03 | 688 |
| Avg score | Time per table | Total tables |
|-------------|------------------|----------------|
| 0.847 | 0.029 | 688 |

## Quality

Getting good ground truth data for tables is hard, since you're either constrained to simple layouts that can be heuristically parsed and rendered, or you need to use LLMs, which make mistakes. I chose to use GPT-4 table predictions as a pseudo-ground-truth.

Tabled gets a `.91` alignment score when compared to GPT-4, which indicates alignment between the text in table rows/cells. Some of the misalignments are due to GPT-4 mistakes, or small inconsistencies in what GPT-4 considered the borders of the table. In general, extraction quality is quite high.
Tabled gets a `.847` alignment score when compared to GPT-4, which indicates alignment between the text in table rows/cells. Some of the misalignments are due to GPT-4 mistakes, or small inconsistencies in what GPT-4 considered the borders of the table. In general, extraction quality is quite high.

## Performance

Running on an A10G with 10GB of VRAM usage and batch size `64`, tabled takes `.03` seconds per table.
Running on an A10G with 10GB of VRAM usage and batch size `64`, tabled takes `.029` seconds per table.

## Running your own
## Running the benchmark

Run the benchmark with:

```shell
python benchmarks/benchmark.py out.json
```
```

# Acknowledgements

- Thank you to [Peter Jansen](https://cognitiveai.org/) for the benchmarking dataset, and for discussion about table parsing.
- Huggingface for inference code and model hosting
- PyTorch for training/inference
4 changes: 3 additions & 1 deletion benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def main():
results = []
table_imgs = []
table_blocks = []
image_sizes = []
for i in range(len(ds)):
row = ds[i]
line_data = json.loads(row["text_lines"])
Expand All @@ -37,11 +38,12 @@ def main():
table_block = get_table_blocks([table_bbox], line_data, image_size)[0]
table_imgs.append(table_img)
table_blocks.append(table_block)
image_sizes.append(image_size)

start = time.time()
table_rec = recognize_tables(table_imgs, table_blocks, [False] * len(table_imgs), rec_models)
total_time = time.time() - start
cells = [assign_rows_columns(tr) for tr in table_rec]
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, image_sizes)]

for i in range(len(ds)):
row = ds[i]
Expand Down
9 changes: 2 additions & 7 deletions benchmarks/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,13 @@ def align_rows(hypothesis, ref_row):
best_alignment = []
best_alignment_score = 0
for j in range(0, len(hypothesis)):
hyp_row = hypothesis[j]
alignments = []
for i in range(len(ref_row)):
if i >= len(hypothesis[j]):
alignments.append(0)
continue
max_cell_align = 0
for k in range(0, len(hyp_row)):
cell_align = fuzz.ratio(hyp_row[k], ref_row[i], score_cutoff=30) / 100
if cell_align > max_cell_align:
max_cell_align = cell_align
alignments.append(max_cell_align)
alignment = fuzz.ratio(hypothesis[j][i], ref_row[i], score_cutoff=30) / 100
alignments.append(alignment)
if len(alignments) == 0:
continue
alignment_score = sum(alignments) / len(alignments)
Expand Down
5 changes: 0 additions & 5 deletions extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,8 @@

from tabled.extract import extract_tables
from tabled.formats import formatter
from tabled.formats.markdown import markdown_format
from tabled.inference.detection import detect_tables

from tabled.assignment import assign_rows_columns
from tabled.fileinput import load_pdfs_images
from tabled.inference.models import load_detection_models, load_recognition_models
from tabled.inference.recognition import get_cells, recognize_tables


@click.command()
Expand Down
119 changes: 118 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pydantic-settings = "^2.5.2"
pydantic = "^2.9.2"
python-dotenv = "^1.0.1"
tabulate = "^0.9.0"
scikit-learn = "^1.5.2"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.1.1"
Expand Down
2 changes: 1 addition & 1 deletion table_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def run_table_rec(image, highres_image, text_line, models, skip_detection=False,
cells, needs_ocr = get_cells(table_imgs, table_bboxes, highres_image_sizes, table_text_lines, models[0][:2], detect_boxes=detect_boxes)

table_rec = recognize_tables(table_imgs, cells, needs_ocr, models[1])
cells = [assign_rows_columns(tr) for tr in table_rec]
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, highres_image_sizes)]

out_data = []
for idx, (cell, pred, table_img) in enumerate(zip(cells, table_rec, table_imgs)):
Expand Down
10 changes: 9 additions & 1 deletion tabled/assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
from surya.schema import TableResult, Bbox

from tabled.heuristics import heuristic_layout
from tabled.schema import SpanTableCell


Expand Down Expand Up @@ -227,11 +228,18 @@ def find_row_gap(r1, r2):
detection_result.rows = new_rows


def assign_rows_columns(detection_result: TableResult) -> List[SpanTableCell]:
def assign_rows_columns(detection_result: TableResult, image_size: list, heuristic_thresh=.6) -> List[SpanTableCell]:
    """Assign each detected cell to table rows and columns.

    Runs the standard assignment pipeline (initial assignment, multiline-row
    merging, re-assignment, overlap resolution). If too large a fraction of
    cells still lacks a row or column id afterwards, the detection output is
    presumed unreliable and a heuristic grid layout is used instead.

    Args:
        detection_result: Table structure detection output (rows/columns).
        image_size: Size of the source table image, passed to the heuristic
            fallback.
        heuristic_thresh: Fraction of unassigned cells above which the
            heuristic layout replaces the detection-based assignment.

    Returns:
        The list of cells with row/column ids populated.
    """
    cells = initial_assignment(detection_result)
    merge_multiline_rows(detection_result, cells)
    # Re-run the initial assignment now that multiline rows are merged.
    cells = initial_assignment(detection_result)
    assign_overlappers(cells, detection_result)

    unassigned_count = sum(
        1 for cell in cells
        if cell.row_ids[0] is None or cell.col_ids[0] is None
    )
    # max(..., 1) guards against division by zero for an empty cell list.
    if unassigned_count / max(len(cells), 1) > heuristic_thresh:
        # Detection left too many cells unplaced — fall back to heuristics.
        return heuristic_layout(cells, image_size)

    assign_unassigned(cells, detection_result)
    handle_rowcol_spans(cells, detection_result)
    return cells
2 changes: 1 addition & 1 deletion tabled/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def extract_tables(images, highres_images, text_lines, det_models, rec_models, s
cells, needs_ocr = get_cells(table_imgs, table_bboxes, highres_image_sizes, table_text_lines, det_models[:2], detect_boxes=detect_boxes)

table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models)
cells = [assign_rows_columns(tr) for tr in table_rec]
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, highres_image_sizes)]

results = []
counter = 0
Expand Down
Loading

0 comments on commit 859302c

Please sign in to comment.