From 0499a3f6219da2d2ba70ec4ee72a55832c410390 Mon Sep 17 00:00:00 2001 From: KevinHuSh Date: Thu, 18 Apr 2024 12:09:56 +0800 Subject: [PATCH] rm page number exception for pdf parser (#424) ### What problem does this PR solve? #423 — `_line_tag` indexed `self.page_images` with a page number beyond the loaded pages; it now returns an empty string in that case instead of raising. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/pdf_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 6c3324544d..c9a200374d 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -830,6 +830,7 @@ def _line_tag(self, bx, ZM): pn = [bx["page_number"]] top = bx["top"] - self.page_cum_height[pn[0] - 1] bott = bx["bottom"] - self.page_cum_height[pn[0] - 1] + if pn[-1] - 1 >= len(self.page_images): return "" while bott * ZM > self.page_images[pn[-1] - 1].size[1]: bott -= self.page_images[pn[-1] - 1].size[1] / ZM pn.append(pn[-1] + 1)