Skip to content

Commit c196456

Browse files
committed
pdfrenderer.cpp: Ignore non-text blocks
Fix tesseract-ocr#3957.
1 parent 490611e commit c196456

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

src/api/pdfrenderer.cpp

+7
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
25 25
26 26   #include <allheaders.h>
27 27   #include <tesseract/baseapi.h>
   28 + #include <tesseract/publictypes.h> // for PTIsTextType()
28 29   #include <tesseract/renderer.h>
29 30   #include <cmath>
30 31   #include <cstring>
@@ -354,6 +355,12 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
354 355     const std::unique_ptr</*non-const*/ ResultIterator> res_it(api->GetIterator());
355 356     while (!res_it->Empty(RIL_BLOCK)) {
356 357       if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
    358 +       auto block_type = res_it->BlockType();
    359 +       if (!PTIsTextType(block_type)) {
    360 +         // ignore non-text blocks
    361 +         res_it->Next(RIL_BLOCK);
    362 +         continue;
    363 +       }
357 364         pdf_str << "BT\n3 Tr"; // Begin text object, use invisible ink
358 365         old_fontsize = 0; // Every block will declare its fontsize
359 366         new_block = true; // Every block will declare its affine matrix

0 commit comments

Comments
 (0)