Fixed issue #633 (multi-language mode)

tesseract-ocr · Jan 25, 2017 · b453f74 · stweil · Feb 18, 2017 · theraysmith
1 parent ca16a08
commit b453f74
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 43 deletions.
diff --git a/ccmain/control.cpp b/ccmain/control.cpp
@@ -754,29 +754,39 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) {
   }
 }
 
-// Factored helper considers the indexed word and updates all the pointed
-// values.
-static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
-                         float* rating, float* certainty, bool* bad,
-                         bool* valid_permuter, int* right, int* next_left) {
+// Helper finds the gap between the index word and the next.
+static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
+                    int* next_left) {
   *right = -MAX_INT32;
   *next_left = MAX_INT32;
   if (index < words.size()) {
+    *right = words[index]->word->bounding_box().right();
+    if (index + 1 < words.size())
+      *next_left = words[index + 1]->word->bounding_box().left();
+  }
+}
+
+// Factored helper computes the rating, certainty, badness and validity of
+// the permuter of the words in [first_index, end_index).
+static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
+                             int first_index, int end_index, float* rating,
+                             float* certainty, bool* bad,
+                             bool* valid_permuter) {
+  if (end_index <= first_index) {
+    *bad = true;
+    *valid_permuter = false;
+  }
+  for (int index = first_index; index < end_index && index < words.size();
+       ++index) {
     WERD_CHOICE* choice = words[index]->best_choice;
-    if (choice == NULL) {
+    if (choice == nullptr) {
       *bad = true;
     } else {
       *rating += choice->rating();
       *certainty = MIN(*certainty, choice->certainty());
       if (!Dict::valid_word_permuter(choice->permuter(), false))
         *valid_permuter = false;
     }
-    *right = words[index]->word->bounding_box().right();
-    if (index + 1 < words.size())
-      *next_left = words[index + 1]->word->bounding_box().left();
-  } else {
-    *valid_permuter = false;
-    *bad = true;
   }
 }
 
@@ -801,24 +811,13 @@ static int SelectBestWords(double rating_ratio,
   while (b < best_words->size() || n < new_words->size()) {
     // Start of the current run in each.
     int start_b = b, start_n = n;
-    // Rating of the current run in each.
-    float b_rating = 0.0f, n_rating = 0.0f;
-    // Certainty of the current run in each.
-    float b_certainty = 0.0f, n_certainty = 0.0f;
-    // True if any word is missing its best choice.
-    bool b_bad = false, n_bad = false;
-    // True if all words have a valid permuter.
-    bool b_valid_permuter = true, n_valid_permuter = true;
-
     while (b < best_words->size() || n < new_words->size()) {
       int b_right = -MAX_INT32;
       int next_b_left = MAX_INT32;
-      EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
-                   &b_valid_permuter, &b_right, &next_b_left);
+      WordGap(*best_words, b, &b_right, &next_b_left);
       int n_right = -MAX_INT32;
       int next_n_left = MAX_INT32;
-      EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
-                   &n_valid_permuter, &n_right, &next_n_left);
+      WordGap(*new_words, n, &n_right, &next_n_left);
       if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
         // The word breaks overlap. [start_b,b] and [start_n, n] match.
         break;
@@ -830,29 +829,41 @@ static int SelectBestWords(double rating_ratio,
       else
         ++n;
     }
+    // Rating of the current run in each.
+    float b_rating = 0.0f, n_rating = 0.0f;
+    // Certainty of the current run in each.
+    float b_certainty = 0.0f, n_certainty = 0.0f;
+    // True if any word is missing its best choice.
+    bool b_bad = false, n_bad = false;
+    // True if all words have a valid permuter.
+    bool b_valid_permuter = true, n_valid_permuter = true;
+    int end_b = b < best_words->size() ? b + 1 : b;
+    int end_n = n < new_words->size() ? n + 1 : n;
+    EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
+                     &b_bad, &b_valid_permuter);
+    EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
+                     &n_bad, &n_valid_permuter);
     bool new_better = false;
     if (!n_bad && (b_bad || (n_certainty > b_certainty &&
                              n_rating < b_rating) ||
                             (!b_valid_permuter && n_valid_permuter &&
                              n_rating < b_rating * rating_ratio &&
                              n_certainty > b_certainty - certainty_margin))) {
       // New is better.
-      for (int i = start_n; i <= n; ++i) {
+      for (int i = start_n; i < end_n; ++i) {
         out_words.push_back((*new_words)[i]);
         (*new_words)[i] = NULL;
         ++num_new;
       }
       new_better = true;
     } else if (!b_bad) {
       // Current best is better.
-      for (int i = start_b; i <= b; ++i) {
+      for (int i = start_b; i < end_b; ++i) {
         out_words.push_back((*best_words)[i]);
         (*best_words)[i] = NULL;
         ++num_best;
       }
     }
-    int end_b = b < best_words->size() ? b + 1 : b;
-    int end_n = n < new_words->size() ? n + 1 : n;
     if (debug) {
       tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
               " valid dict: %d v %d\n",
@@ -875,10 +886,9 @@ static int SelectBestWords(double rating_ratio,
 // Returns positive if this recognizer found more new best words than the
 // number kept from best_words.
 int Tesseract::RetryWithLanguage(const WordData& word_data,
-                                 WordRecognizer recognizer,
+                                 WordRecognizer recognizer, bool debug,
                                  WERD_RES** in_word,
                                  PointerVector<WERD_RES>* best_words) {
-  bool debug = classify_debug_level;
   if (debug) {
     tprintf("Trying word using lang %s, oem %d\n",
             lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
@@ -1281,7 +1291,8 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
   // Points to the best result. May be word or in lang_words.
   WERD_RES* word = word_data->word;
   clock_t start_t = clock();
-  if (classify_debug_level) {
+  bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
+  if (debug) {
     tprintf("%s word with lang %s at:",
             word->done ? "Already done" : "Processing",
             most_recently_used_->lang.string());
@@ -1300,20 +1311,20 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
          most_recently_used_ != sub_langs_[sub]; ++sub) {}
   }
   most_recently_used_->RetryWithLanguage(
-      *word_data, recognizer, &word_data->lang_words[sub], &best_words);
+      *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
   Tesseract* best_lang_tess = most_recently_used_;
   if (!WordsAcceptable(best_words)) {
     // Try all the other languages to see if they are any better.
     if (most_recently_used_ != this &&
-        this->RetryWithLanguage(*word_data, recognizer,
+        this->RetryWithLanguage(*word_data, recognizer, debug,
                                 &word_data->lang_words[sub_langs_.size()],
                                 &best_words) > 0) {
       best_lang_tess = this;
     }
     for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
          ++i) {
       if (most_recently_used_ != sub_langs_[i] &&
-          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
+          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
                                            &word_data->lang_words[i],
                                            &best_words) > 0) {
         best_lang_tess = sub_langs_[i];

diff --git a/ccmain/linerec.cpp b/ccmain/linerec.cpp
@@ -309,6 +309,7 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
                 word_certainty);
         word->best_choice->print();
       }
+      word->best_choice->set_certainty(word_certainty);
       // Discard words that are impossibly bad, but allow a bit more for
       // dictionary words, and keep bad words in non-space-delimited langs.
       if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
@@ -324,7 +325,6 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
         // It is a dud.
         word->SetupFake(lstm_recognizer_->GetUnicharset());
       }
-      word->best_choice->set_certainty(word_certainty);
     }
   }
 }

diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp
@@ -214,6 +214,8 @@ Tesseract::Tesseract()
       BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
       double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
       double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
+      INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
+                 this->params()),
       INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
                  this->params()),
       BOOL_MEMBER(paragraph_text_based, true,
@@ -636,6 +638,8 @@ Tesseract::~Tesseract() {
 }
 
 void Tesseract::Clear() {
+  STRING debug_name = imagebasename + "_debug.pdf";
+  pixa_debug_.WritePDF(debug_name.string());
   pixDestroy(&pix_binary_);
   pixDestroy(&pix_grey_);
   pixDestroy(&pix_thresholds_);
@@ -703,7 +707,7 @@ void Tesseract::PrepareForPageseg() {
   // the newly splitted image.
   splitter_.set_orig_pix(pix_binary());
   splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
-  if (splitter_.Split(true)) {
+  if (splitter_.Split(true, &pixa_debug_)) {
     ASSERT_HOST(splitter_.splitted_image());
     pixDestroy(&pix_binary_);
     pix_binary_ = pixClone(splitter_.splitted_image());
@@ -732,7 +736,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
   splitter_.set_segmentation_block_list(block_list);
   splitter_.set_ocr_split_strategy(max_ocr_strategy);
   // Run the splitter for OCR
-  bool split_for_ocr = splitter_.Split(false);
+  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
   // Restore pix_binary to the binarized original pix for future reference.
   ASSERT_HOST(splitter_.orig_pix());
   pixDestroy(&pix_binary_);

diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h
@@ -28,11 +28,12 @@
 
 #include "allheaders.h"
 #include "control.h"
-#include "docqual.h"
+#include "debugpixa.h"
 #include "devanagari_processing.h"
+#include "docqual.h"
 #include "genericvector.h"
-#include "params.h"
 #include "ocrclass.h"
+#include "params.h"
 #include "textord.h"
 #include "wordrec.h"
 
@@ -372,9 +373,8 @@ class Tesseract : public Wordrec {
   // Helper to recognize the word using the given (language-specific) tesseract.
   // Returns positive if this recognizer found more new best words than the
   // number kept from best_words.
-  int RetryWithLanguage(const WordData& word_data,
-                        WordRecognizer recognizer,
-                        WERD_RES** in_word,
+  int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
+                        bool debug, WERD_RES** in_word,
                         PointerVector<WERD_RES>* best_words);
   // Moves good-looking "noise"/diacritics from the reject list to the main
   // blob list on the current word. Returns true if anything was done, and
@@ -907,6 +907,7 @@ class Tesseract : public Wordrec {
   BOOL_VAR_H(test_pt, false, "Test for point");
   double_VAR_H(test_pt_x, 99999.99, "xcoord");
   double_VAR_H(test_pt_y, 99999.99, "ycoord");
+  INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
   INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
   BOOL_VAR_H(paragraph_text_based, true,
              "Run paragraph detection on the post-text-recognition "
@@ -1194,6 +1195,8 @@ class Tesseract : public Wordrec {
   Pix* pix_original_;
   // Thresholds that were used to generate the thresholded image from grey.
   Pix* pix_thresholds_;
+  // Debug images. If non-empty, will be written on destruction.
+  DebugPixa pixa_debug_;
   // Input image resolution after any scaling. The resolution is not well
   // transmitted by operations on Pix, so we keep an independent record here.
   int source_resolution_;

diff --git a/lstm/recodebeam.cpp b/lstm/recodebeam.cpp
@@ -276,6 +276,15 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
     }
     if (t < width) {
       int unichar_id = best_nodes[t]->unichar_id;
+      if (unichar_id == UNICHAR_SPACE && !certs->empty() &&
+          best_nodes[t]->permuter != NO_PERM) {
+        // All the rating and certainty go on the previous character except
+        // for the space itself.
+        if (certainty < certs->back()) certs->back() = certainty;
+        ratings->back() += rating;
+        certainty = 0.0;
+        rating = 0.0;
+      }
       unichar_ids->push_back(unichar_id);
       xcoords->push_back(t);
       do {