From fbbbdb45659ef8ea6e92419c02a9ec072da0d5ca Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Wed, 12 Dec 2018 17:39:54 +0100 Subject: [PATCH] Use std::stringstream to generate ALTO output and add <SP> element Using std::stringstream simplifies the code. The <SP> element is needed between two <String> elements. Remove also some unneeded spaces in the ALTO output. Signed-off-by: Stefan Weil --- src/api/altorenderer.cpp | 131 ++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 78 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 67b7c1fc7e..497eb21394 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -14,17 +14,17 @@ // limitations under the License. #include +#include // for std::stringstream #include "baseapi.h" #include "renderer.h" namespace tesseract { -/// -/// Add coordinates to specified TextBlock, TextLine, or String bounding box -/// Add word confidence if adding to a String bounding box +/// Add coordinates to specified TextBlock, TextLine or String bounding box. +/// Add word confidence if adding to a String bounding box. 
/// static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level, - STRING* alto_str) { + std::stringstream& alto_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); @@ -33,43 +33,19 @@ static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level, int height = bottom - top; int width = right - left; - *alto_str += " HPOS=\""; - alto_str->add_str_int("", hpos); - *alto_str += "\""; - *alto_str += " VPOS=\""; - alto_str->add_str_int("", vpos); - *alto_str += "\""; - *alto_str += " WIDTH=\""; - alto_str->add_str_int("", width); - *alto_str += "\""; - *alto_str += " HEIGHT=\""; - alto_str->add_str_int("", height); - *alto_str += "\""; + alto_str << " HPOS=\"" << hpos << "\""; + alto_str << " VPOS=\"" << vpos << "\""; + alto_str << " WIDTH=\"" << width << "\""; + alto_str << " HEIGHT=\"" << height << "\""; if (level == RIL_WORD) { int wc = it->Confidence(RIL_WORD); - *alto_str += " WC=\"0."; - alto_str->add_str_int("", wc); - *alto_str += "\""; - } - if (level != RIL_WORD) { - *alto_str += ">"; + alto_str << " WC=\"0." 
<< wc << "\""; + } else { + alto_str << ">"; } } -/// -/// Add a unique ID to an ALTO element -/// -static void AddIdToAlto(STRING* alto_str, const std::string base, int num1) { - const size_t BUFSIZE = 64; - char id_buffer[BUFSIZE]; - snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1); - id_buffer[BUFSIZE - 1] = '\0'; - *alto_str += " ID=\""; - *alto_str += id_buffer; - *alto_str += "\""; -} - /// /// Append the ALTO XML for the beginning of the document /// @@ -111,10 +87,10 @@ bool TessAltoRenderer::BeginDocumentHandler() { /// Append the ALTO XML for the layout of the image /// bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) { - const std::unique_ptr hocr(api->GetAltoText(imagenum())); - if (hocr == nullptr) return false; + const std::unique_ptr text(api->GetAltoText(imagenum())); + if (text == nullptr) return false; - AppendString(hocr.get()); + AppendString(text.get()); return true; } @@ -150,8 +126,6 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { int lcnt = 0, bcnt = 0, wcnt = 0; int page_id = page_number; - STRING alto_str(""); - if (input_file_ == nullptr) SetInputName(nullptr); #ifdef _WIN32 @@ -171,23 +145,16 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { delete[] utf8_str; #endif - alto_str += "\t\t\n"; + std::stringstream alto_str; + alto_str + << "\t\t\n" + << "\t\t\t\n"; ResultIterator* res_it = GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { @@ -197,58 +164,66 @@ char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { } if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - alto_str += "\t\t\t\tIsAtBeginningOf(RIL_TEXTLINE)) { - alto_str += "\t\t\t\t\tIsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); + int left, top, right, bottom; + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + do { const std::unique_ptr grapheme( res_it->GetUTF8Text(RIL_SYMBOL)); if (grapheme && grapheme[0] != 0) { - 
alto_str += HOcrEscape(grapheme.get()); + alto_str << HOcrEscape(grapheme.get()).c_str(); } res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - alto_str += "\"/>\n"; + alto_str << "\"/>"; wcnt++; if (last_word_in_line) { - alto_str += "\t\t\t\t\t\n"; + alto_str << "\n\t\t\t\t\t\n"; lcnt++; + } else { + int hpos = right; + int vpos = top; + res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); + int width = left - hpos; + alto_str << "\n"; } if (last_word_in_block) { - alto_str += "\t\t\t\t\n"; + alto_str << "\t\t\t\t\n"; bcnt++; } } - alto_str += "\t\t\t\n"; - alto_str += "\t\t\n"; + alto_str << "\t\t\t\n" + << "\t\t\n"; + const std::string& text = alto_str.str(); - char* ret = new char[alto_str.length() + 1]; - strcpy(ret, alto_str.string()); + char* result = new char[text.length() + 1]; + strcpy(result, text.c_str()); delete res_it; - return ret; + return result; } } // namespace tesseract