diff --git a/INSTALL.GIT.md b/INSTALL.GIT.md index a9cc30077d..25f3873666 100644 --- a/INSTALL.GIT.md +++ b/INSTALL.GIT.md @@ -38,7 +38,6 @@ all languages). git clone https://github.com/tesseract-ocr/tessdata.git tesseract-ocr.tessdata - You need an Internet connection and [curl](https://curl.haxx.se/) to compile `ScrollView.jar` because the build will automatically download [piccolo2d-core-3.0.1.jar](https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0.1/piccolo2d-core-3.0.1.jar) and diff --git a/src/api/pdfrenderer.cpp b/src/api/pdfrenderer.cpp index 3f6290152f..88383bb7bf 100644 --- a/src/api/pdfrenderer.cpp +++ b/src/api/pdfrenderer.cpp @@ -242,13 +242,13 @@ static void GetWordBaseline(int writing_direction, int ppi, int height, int word double word_length; double x, y; { - int px = word_x1; - int py = word_y1; double l2 = dist2(line_x1, line_y1, line_x2, line_y2); if (l2 == 0) { x = line_x1; y = line_y1; } else { + int px = word_x1; + int py = word_y1; double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2; x = line_x2 + t * (line_x2 - line_x1); y = line_y2 + t * (line_y2 - line_y1); diff --git a/src/ccmain/applybox.cpp b/src/ccmain/applybox.cpp index d550adfe3b..e50bda50f6 100644 --- a/src/ccmain/applybox.cpp +++ b/src/ccmain/applybox.cpp @@ -258,10 +258,10 @@ void Tesseract::MaximallyChopWord(const std::vector &boxes, BLOCK *block, } const double e = exp(1.0); // The base of natural logs. unsigned blob_number; - int right_chop_index = 0; if (!assume_fixed_pitch_char_segment) { // We only chop if the language is not fixed pitch like CJK. SEAM *seam = nullptr; + int right_chop_index = 0; while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) { word_res->InsertSeam(blob_number, seam); BLOB_CHOICE *left_choice = blob_choices[blob_number]; @@ -685,6 +685,7 @@ void Tesseract::SearchForText(const std::vector *choices, in void Tesseract::TidyUp(PAGE_RES *page_res) { int ok_blob_count = 0; int bad_blob_count = 0; + // TODO: check usage of ok_word_count. int ok_word_count = 0; int unlabelled_words = 0; PAGE_RES_IT pr_it(page_res); diff --git a/src/ccmain/control.cpp b/src/ccmain/control.cpp index 0551be0ca8..30afc47763 100644 --- a/src/ccmain/control.cpp +++ b/src/ccmain/control.cpp @@ -949,6 +949,7 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next } real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr); AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs); + // TODO: check code. int non_overlapped = 0; int non_overlapped_used = 0; for (unsigned i = 0; i < word_wanted.size(); ++i) { @@ -1121,9 +1122,9 @@ bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, C_BLOB *blob, const std::vector &outlines, int num_outlines, std::vector *ok_outlines) { - std::string best_str; float target_cert = certainty_threshold; if (blob != nullptr) { + std::string best_str; float target_c2; target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2); if (debug_noise_removal) { @@ -1797,9 +1798,6 @@ Allow a single hyphen in a lower case word } bool Tesseract::check_debug_pt(WERD_RES *word, int location) { - bool show_map_detail = false; - int16_t i; - if (!test_pt) { return false; } @@ -1811,6 +1809,7 @@ bool Tesseract::check_debug_pt(WERD_RES *word, int location) { if (location < 0) { return true; // For breakpoint use } + bool show_map_detail = false; tessedit_rejection_debug.set_value(true); debug_x_ht_level.set_value(2); tprintf("\n\nTESTWD::"); @@ -1864,7 +1863,7 @@ bool Tesseract::check_debug_pt(WERD_RES *word, int location) { tprintf("\n"); if (show_map_detail) { tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str()); - for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { + for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); word->reject_map[i].full_print(debug_fp); } @@ -1891,13 +1890,12 @@ static void find_modal_font( // good chars in word int16_t *font_out, // output font int8_t *font_count // output count ) { - int16_t font; // font index - int32_t count; // pile count - if (fonts->get_total() > 0) { - font = static_cast(fonts->mode()); + // font index + int16_t font = static_cast(fonts->mode()); *font_out = font; - count = fonts->pile_count(font); + // pile count + int32_t count = fonts->pile_count(font); *font_count = count < INT8_MAX ? count : INT8_MAX; fonts->add(font, -*font_count); } else { diff --git a/src/ccmain/docqual.cpp b/src/ccmain/docqual.cpp index 8941f8f0d6..15e802d0f3 100644 --- a/src/ccmain/docqual.cpp +++ b/src/ccmain/docqual.cpp @@ -60,10 +60,10 @@ int16_t Tesseract::word_blob_quality(WERD_RES *word) { } int16_t Tesseract::word_outline_errs(WERD_RES *word) { - int16_t i = 0; int16_t err_count = 0; if (word->rebuild_word != nullptr) { + int16_t i = 0; for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) { TBLOB *blob = word->rebuild_word->blobs[b]; err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines()); @@ -209,13 +209,8 @@ void Tesseract::unrej_good_quality_words( // unreject potential void Tesseract::doc_and_block_rejection( // reject big chunks PAGE_RES_IT &page_res_it, bool good_quality_doc) { - int16_t block_no = 0; - int16_t row_no = 0; BLOCK_RES *current_block; - ROW_RES *current_row; - bool rej_word; - bool prev_word_rejected; int16_t char_quality = 0; int16_t accepted_char_quality; @@ -238,7 +233,7 @@ void Tesseract::doc_and_block_rejection( // reject big chunks WERD_RES *word; while ((word = page_res_it.word()) != nullptr) { current_block = page_res_it.block(); - block_no = current_block->block->pdblk.index(); + int16_t block_no = current_block->block->pdblk.index(); if (current_block->char_count > 0 && (current_block->rej_count * 100.0 / current_block->char_count) > tessedit_reject_block_percent) { @@ -246,8 +241,9 @@ void Tesseract::doc_and_block_rejection( // reject big chunks tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no, current_block->char_count, current_block->rej_count); } - prev_word_rejected = false; + bool prev_word_rejected = false; while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) { + bool rej_word; if (tessedit_preserve_blk_rej_perfect_wds) { rej_word = word->reject_map.reject_count() > 0 || word->reject_map.length() < tessedit_preserve_min_wd_len; @@ -284,9 +280,9 @@ void Tesseract::doc_and_block_rejection( // reject big chunks } /* Walk rows in block testing for row rejection */ - row_no = 0; + int16_t row_no = 0; while (page_res_it.word() != nullptr && page_res_it.block() == current_block) { - current_row = page_res_it.row(); + ROW_RES *current_row = page_res_it.row(); row_no++; /* Reject whole row if: fraction of chars on row which are rejected exceed a limit AND @@ -302,9 +298,10 @@ void Tesseract::doc_and_block_rejection( // reject big chunks tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no, current_row->char_count, current_row->rej_count); } - prev_word_rejected = false; + bool prev_word_rejected = false; while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) { /* Preserve words on good docs unless they are mostly rejected*/ + bool rej_word; if (!tessedit_row_rej_good_docs && good_quality_doc) { rej_word = word->reject_map.reject_count() / static_cast(word->reject_map.length()) > @@ -448,8 +445,6 @@ void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) { } bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) { - float rating_per_ch; - int adjusted_len; int crunch_mode = 0; if (word->best_choice->unichar_string().empty() || @@ -457,11 +452,11 @@ bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level word->best_choice->unichar_string().size())) { crunch_mode = 1; } else { - adjusted_len = word->reject_map.length(); + int adjusted_len = word->reject_map.length(); if (adjusted_len > crunch_rating_max) { adjusted_len = crunch_rating_max; } - rating_per_ch = word->best_choice->rating() / adjusted_len; + float rating_per_ch = word->best_choice->rating() / adjusted_len; if (rating_per_ch > crunch_terrible_rating) { crunch_mode = 2; @@ -528,7 +523,6 @@ bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_leve } void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { - WERD_RES *word; PAGE_RES_IT copy_it; bool deleting_from_bol = false; bool marked_delete_point = false; @@ -539,7 +533,7 @@ void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { page_res_it.restart_page(); while (page_res_it.word() != nullptr) { - word = page_res_it.word(); + WERD_RES *word = page_res_it.word(); delete_mode = word_deletable(word, debug_delete_mode); if (delete_mode != CR_NONE) { diff --git a/src/ccmain/fixspace.cpp b/src/ccmain/fixspace.cpp index dee79395b3..7f1b166dfc 100644 --- a/src/ccmain/fixspace.cpp +++ b/src/ccmain/fixspace.cpp @@ -171,7 +171,6 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_R void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) { int16_t best_score; WERD_RES_LIST current_perm; - int16_t current_score; bool improved = false; best_score = eval_word_spacing(best_perm); // default score @@ -183,7 +182,7 @@ void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK * while ((best_score != PERFECT_WERDS) && !current_perm.empty()) { match_current_words(current_perm, row, block); - current_score = eval_word_spacing(current_perm); + int16_t current_score = eval_word_spacing(current_perm); dump_words(current_perm, current_score, 2, improved); if (current_score > best_score) { best_perm.clear(); @@ -201,11 +200,10 @@ void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK * void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) { WERD_RES_IT src_it(&src_list); WERD_RES_IT new_it(&new_list); - WERD_RES *src_wd; WERD_RES *new_wd; for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { - src_wd = src_it.data(); + WERD_RES *src_wd = src_it.data(); if (!src_wd->combination) { new_wd = WERD_RES::deep_copy(src_wd); new_wd->combination = false; @@ -393,8 +391,6 @@ void transform_to_next_perm(WERD_RES_LIST &words) { WERD_RES_IT prev_word_it(&words); WERD_RES *word; WERD_RES *prev_word; - WERD_RES *combo; - WERD *copy_word; int16_t prev_right = -INT16_MAX; TBOX box; int16_t gap; @@ -425,12 +421,13 @@ void transform_to_next_perm(WERD_RES_LIST &words) { gap = box.left() - prev_right; if (gap <= min_gap) { prev_word = prev_word_it.data(); + WERD_RES *combo; if (prev_word->combination) { combo = prev_word; } else { /* Make a new combination and insert before * the first word being joined. */ - copy_word = new WERD; + auto *copy_word = new WERD; *copy_word = *(prev_word->word); // deep copy combo = new WERD_RES(copy_word); @@ -546,7 +543,6 @@ void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) WERD_RES *word_res; WERD_RES_LIST sub_word_list; WERD_RES_IT sub_word_list_it(&sub_word_list); - int16_t blob_index; int16_t new_length; float junk; @@ -556,7 +552,7 @@ void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) return; } - blob_index = worst_noise_blob(word_res, &junk); + auto blob_index = worst_noise_blob(word_res, &junk); if (blob_index < 0) { return; } @@ -623,7 +619,6 @@ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) { WERD_RES_IT worst_word_it; float worst_noise_score = 9999; int worst_blob_index = -1; // Noisiest blob of noisiest wd - int blob_index; // of wds noisiest blob float noise_score; // of wds noisiest blob WERD_RES *word_res; C_BLOB_IT blob_it; @@ -636,7 +631,7 @@ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) { int16_t i; for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { - blob_index = worst_noise_blob(word_it.data(), &noise_score); + auto blob_index = worst_noise_blob(word_it.data(), &noise_score); if (blob_index > -1 && worst_noise_score > noise_score) { worst_noise_score = noise_score; worst_blob_index = blob_index; @@ -806,7 +801,6 @@ float Tesseract::blob_noise_score(TBLOB *blob) { void fixspace_dbg(WERD_RES *word) { TBOX box = word->word->bounding_box(); const bool show_map_detail = false; - int16_t i; box.print(); tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str()); @@ -816,7 +810,7 @@ void fixspace_dbg(WERD_RES *word) { tprintf("\n"); if (show_map_detail) { tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str()); - for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { + for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); word->reject_map[i].full_print(debug_fp); } diff --git a/src/ccmain/output.cpp b/src/ccmain/output.cpp index 73f732945f..a530c15e40 100644 --- a/src/ccmain/output.cpp +++ b/src/ccmain/output.cpp @@ -101,11 +101,11 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it, bool force_eol) { // override tilde crunch? WERD_RES *word = page_res_it.word(); const UNICHARSET &uchset = *word->uch_set; - bool need_reject = false; UNICHAR_ID space = uchset.unichar_to_id(" "); if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { + bool need_reject = false; if ((word->unlv_crunch_mode != CR_DELETE) && (!stats_.tilde_crunch_written || ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) && diff --git a/src/ccmain/paragraphs.cpp b/src/ccmain/paragraphs.cpp index f3d9aa92cd..ed05b73f42 100644 --- a/src/ccmain/paragraphs.cpp +++ b/src/ccmain/paragraphs.cpp @@ -2407,8 +2407,8 @@ static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowI // Set up text, lword_text, and rword_text (mostly for debug printing). std::string fake_text; PageIterator pit(static_cast(it)); - bool first_word = true; if (!pit.Empty(RIL_WORD)) { + bool first_word = true; do { fake_text += "x"; if (first_word) { diff --git a/src/ccmain/pgedit.cpp b/src/ccmain/pgedit.cpp index dd239851f5..27f9b2151c 100644 --- a/src/ccmain/pgedit.cpp +++ b/src/ccmain/pgedit.cpp @@ -703,9 +703,7 @@ bool Tesseract::word_display(PAGE_RES_IT *pr_it) { WERD_RES *word_res = pr_it->word(); WERD *word = word_res->word; TBOX word_bb; // word bounding box - int word_height; // ht of word BB bool displayed_something = false; - float shift; // from bot left if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) { # ifndef DISABLED_LEGACY_ENGINE @@ -842,13 +840,14 @@ bool Tesseract::word_display(PAGE_RES_IT *pr_it) { if (text.length() > 0) { word_bb = word->bounding_box(); image_win->Pen(ScrollView::RED); - word_height = word_bb.height(); - int text_height = 0.50 * word_height; + auto word_height = word_bb.height(); + int text_height = word_height / 2; if (text_height > 20) { text_height = 20; } image_win->TextAttributes("Arial", text_height, false, false, false); - shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f; + // from bot left + float shift = (word_height < word_bb.width()) ? 0.25f * word_height : 0.0f; image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height, text.c_str()); if (blame.length() > 0) { image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height - text_height, diff --git a/src/ccmain/reject.cpp b/src/ccmain/reject.cpp index e3d53925ef..1448f8e137 100644 --- a/src/ccmain/reject.cpp +++ b/src/ccmain/reject.cpp @@ -293,8 +293,6 @@ bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) { int16_t i; int16_t offset; bool non_conflict_set_char; // non conf set a/n? - bool conflict = false; - bool allow_1s; ACCEPTABLE_WERD_TYPE word_type; bool dict_perm_type; bool dict_word_ok; @@ -411,11 +409,11 @@ bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) { Else reject all conflict chs */ if (word_contains_non_1_digit(word, lengths)) { - allow_1s = + bool allow_1s = (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM); int16_t offset; - conflict = false; + bool conflict = false; for (i = 0, offset = 0; word[offset] != '\0'; offset += word_res->best_choice->unichar_lengths()[i++]) { if ((!allow_1s || (word[offset] != '1')) && diff --git a/unittest/README.md b/unittest/README.md index 02303b04ec..64a409689f 100644 --- a/unittest/README.md +++ b/unittest/README.md @@ -1,9 +1,9 @@ # Unit Testing for Tesseract - ## Requirements ### Files and structure + ``` ├── langdata_lstm