Skip to content

Commit

Permalink
Heuristic fallback for edge case tables
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 14, 2024
1 parent 97b8341 commit 859302c
Show file tree
Hide file tree
Showing 11 changed files with 279 additions and 25 deletions.
35 changes: 27 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Tabled is a small library for detecting and extracting tables. It uses [surya](

## Community

[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.

# Hosted API

Expand Down Expand Up @@ -87,26 +87,45 @@ pip install streamlit
tabled_gui
```

## From python

```python
from tabled.extract import extract_tables
from tabled.fileinput import load_pdfs_images
from tabled.inference.models import load_detection_models, load_recognition_models

det_models, rec_models = load_detection_models(), load_recognition_models()
images, highres_images, names, text_lines = load_pdfs_images(IN_PATH)

page_results = extract_tables(images, highres_images, text_lines, det_models, rec_models)
```

# Benchmarks

| Avg score | Time per table (s) | Total tables |
|-------------|--------------------|----------------|
| 0.91 | 0.03 | 688 |
| Avg score | Time per table | Total tables |
|-------------|------------------|----------------|
| 0.847 | 0.029 | 688 |

## Quality

Getting good ground truth data for tables is hard, since you're either constrained to simple layouts that can be heuristically parsed and rendered, or you need to use LLMs, which make mistakes. I chose to use GPT-4 table predictions as a pseudo-ground-truth.

Tabled gets a `.91` alignment score when compared to GPT-4, which indicates alignment between the text in table rows/cells. Some of the misalignments are due to GPT-4 mistakes, or small inconsistencies in what GPT-4 considered the borders of the table. In general, extraction quality is quite high.
Tabled gets a `.847` alignment score when compared to GPT-4, which indicates alignment between the text in table rows/cells. Some of the misalignments are due to GPT-4 mistakes, or small inconsistencies in what GPT-4 considered the borders of the table. In general, extraction quality is quite high.

## Performance

Running on an A10G with 10GB of VRAM usage and batch size `64`, tabled takes `.03` seconds per table.
Running on an A10G with 10GB of VRAM usage and batch size `64`, tabled takes `.029` seconds per table.

## Running your own
## Running the benchmark

Run the benchmark with:

```shell
python benchmarks/benchmark.py out.json
```
```

# Acknowledgements

- Thank you to [Peter Jansen](https://cognitiveai.org/) for the benchmarking dataset, and for discussion about table parsing.
- Huggingface for inference code and model hosting
- PyTorch for training/inference
4 changes: 3 additions & 1 deletion benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def main():
results = []
table_imgs = []
table_blocks = []
image_sizes = []
for i in range(len(ds)):
row = ds[i]
line_data = json.loads(row["text_lines"])
Expand All @@ -37,11 +38,12 @@ def main():
table_block = get_table_blocks([table_bbox], line_data, image_size)[0]
table_imgs.append(table_img)
table_blocks.append(table_block)
image_sizes.append(image_size)

start = time.time()
table_rec = recognize_tables(table_imgs, table_blocks, [False] * len(table_imgs), rec_models)
total_time = time.time() - start
cells = [assign_rows_columns(tr) for tr in table_rec]
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, image_sizes)]

for i in range(len(ds)):
row = ds[i]
Expand Down
9 changes: 2 additions & 7 deletions benchmarks/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,13 @@ def align_rows(hypothesis, ref_row):
best_alignment = []
best_alignment_score = 0
for j in range(0, len(hypothesis)):
hyp_row = hypothesis[j]
alignments = []
for i in range(len(ref_row)):
if i >= len(hypothesis[j]):
alignments.append(0)
continue
max_cell_align = 0
for k in range(0, len(hyp_row)):
cell_align = fuzz.ratio(hyp_row[k], ref_row[i], score_cutoff=30) / 100
if cell_align > max_cell_align:
max_cell_align = cell_align
alignments.append(max_cell_align)
alignment = fuzz.ratio(hypothesis[j][i], ref_row[i], score_cutoff=30) / 100
alignments.append(alignment)
if len(alignments) == 0:
continue
alignment_score = sum(alignments) / len(alignments)
Expand Down
5 changes: 0 additions & 5 deletions extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,8 @@

from tabled.extract import extract_tables
from tabled.formats import formatter
from tabled.formats.markdown import markdown_format
from tabled.inference.detection import detect_tables

from tabled.assignment import assign_rows_columns
from tabled.fileinput import load_pdfs_images
from tabled.inference.models import load_detection_models, load_recognition_models
from tabled.inference.recognition import get_cells, recognize_tables


@click.command()
Expand Down
119 changes: 118 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pydantic-settings = "^2.5.2"
pydantic = "^2.9.2"
python-dotenv = "^1.0.1"
tabulate = "^0.9.0"
scikit-learn = "^1.5.2"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.1.1"
Expand Down
2 changes: 1 addition & 1 deletion table_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def run_table_rec(image, highres_image, text_line, models, skip_detection=False,
cells, needs_ocr = get_cells(table_imgs, table_bboxes, highres_image_sizes, table_text_lines, models[0][:2], detect_boxes=detect_boxes)

table_rec = recognize_tables(table_imgs, cells, needs_ocr, models[1])
cells = [assign_rows_columns(tr) for tr in table_rec]
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, highres_image_sizes)]

out_data = []
for idx, (cell, pred, table_img) in enumerate(zip(cells, table_rec, table_imgs)):
Expand Down
10 changes: 9 additions & 1 deletion tabled/assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
from surya.schema import TableResult, Bbox

from tabled.heuristics import heuristic_layout
from tabled.schema import SpanTableCell


Expand Down Expand Up @@ -227,11 +228,18 @@ def find_row_gap(r1, r2):
detection_result.rows = new_rows


def assign_rows_columns(detection_result: TableResult) -> List[SpanTableCell]:
def assign_rows_columns(detection_result: TableResult, image_size: list, heuristic_thresh=.6) -> List[SpanTableCell]:
    """Assign each detected cell to table rows and columns.

    Runs the standard assignment pipeline (initial assignment, multiline-row
    merging, re-assignment, overlap resolution). If too large a fraction of
    cells still lacks a row or column id afterwards, the detection output is
    presumed unreliable and a heuristic grid layout is used instead.

    Args:
        detection_result: Table structure detection output (rows/columns).
        image_size: Size of the source table image, passed to the heuristic
            fallback.
        heuristic_thresh: Fraction of unassigned cells above which the
            heuristic layout replaces the detection-based assignment.

    Returns:
        The list of cells with row/column ids populated.
    """
    cells = initial_assignment(detection_result)
    merge_multiline_rows(detection_result, cells)
    # Re-run the initial assignment now that multiline rows are merged.
    cells = initial_assignment(detection_result)
    assign_overlappers(cells, detection_result)

    unassigned_count = sum(
        1 for cell in cells
        if cell.row_ids[0] is None or cell.col_ids[0] is None
    )
    # max(..., 1) guards against division by zero for an empty cell list.
    if unassigned_count / max(len(cells), 1) > heuristic_thresh:
        # Detection left too many cells unplaced — fall back to heuristics.
        return heuristic_layout(cells, image_size)

    assign_unassigned(cells, detection_result)
    handle_rowcol_spans(cells, detection_result)
    return cells
2 changes: 1 addition & 1 deletion tabled/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def extract_tables(images, highres_images, text_lines, det_models, rec_models, s
cells, needs_ocr = get_cells(table_imgs, table_bboxes, highres_image_sizes, table_text_lines, det_models[:2], detect_boxes=detect_boxes)

table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models)
cells = [assign_rows_columns(tr) for tr in table_rec]
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, highres_image_sizes)]

results = []
counter = 0
Expand Down
Loading

0 comments on commit 859302c

Please sign in to comment.