Merge pull request #3592 from stweil/unsigned

Fix compiler warnings (mainly signed / unsigned mismatches) and modernize some code
tesseract-ocr · Oct 10, 2021 · 5a36943
2 parents 0aad8b8 + d935502
commit 5a36943
Showing 86 changed files with 685 additions and 777 deletions.
diff --git a/src/ccmain/applybox.cpp b/src/ccmain/applybox.cpp
@@ -243,7 +243,7 @@ void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block,
   std::vector<BLOB_CHOICE *> blob_choices;
   ASSERT_HOST(!word_res->chopped_word->blobs.empty());
   auto rating = static_cast<float>(INT8_MAX);
-  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
+  for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
     // The rating and certainty are not quite arbitrary. Since
     // select_blob_to_chop uses the worst certainty to choose, they all have
     // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
@@ -257,7 +257,7 @@ void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block,
     rating -= 0.125f;
   }
   const double e = exp(1.0); // The base of natural logs.
-  int blob_number;
+  unsigned blob_number;
   int right_chop_index = 0;
   if (!assume_fixed_pitch_char_segment) {
     // We only chop if the language is not fixed pitch like CJK.
@@ -613,8 +613,8 @@ bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WER
 /// @param best_rating
 /// @param best_segmentation
 void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
-                              int choices_length, const std::vector<UNICHAR_ID> &target_text,
-                              int text_index, float rating, std::vector<int> *segmentation,
+                              unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
+                              unsigned text_index, float rating, std::vector<int> *segmentation,
                               float *best_rating, std::vector<int> *best_segmentation) {
   const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
   for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
@@ -625,12 +625,12 @@ void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, in
     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
       const BLOB_CHOICE *choice = choice_it.data();
       choice_rating = choice->rating();
-      UNICHAR_ID class_id = choice->unichar_id();
+      auto class_id = choice->unichar_id();
       if (class_id == target_text[text_index]) {
         break;
       }
       // Search ambigs table.
-      if (class_id < table.size() && table[class_id] != nullptr) {
+      if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
         AmbigSpec_IT spec_it(table[class_id]);
         for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
           const AmbigSpec *ambig_spec = spec_it.data();

diff --git a/src/ccmain/control.cpp b/src/ccmain/control.cpp
@@ -227,7 +227,7 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT
       }
     }
     if (word->word->tess_failed) {
-      int s;
+      unsigned s;
       for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
       }
       // If all are failed, skip it. Image words are skipped by this test.
@@ -727,7 +727,7 @@ void Tesseract::script_pos_pass(PAGE_RES *page_res) {
       // Scan for upper/lower.
       int num_upper = 0;
       int num_lower = 0;
-      for (int i = 0; i < word->best_choice->length(); ++i) {
+      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
         if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
           ++num_upper;
         } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
@@ -743,7 +743,7 @@ void Tesseract::script_pos_pass(PAGE_RES *page_res) {
 }
 
 // Helper finds the gap between the index word and the next.
-static void WordGap(const PointerVector<WERD_RES> &words, int index, int *right, int *next_left) {
+static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {
   *right = -INT32_MAX;
   *next_left = INT32_MAX;
   if (index < words.size()) {
@@ -756,13 +756,13 @@ static void WordGap(const PointerVector<WERD_RES> &words, int index, int *right,
 
 // Factored helper computes the rating, certainty, badness and validity of
 // the permuter of the words in [first_index, end_index).
-static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, int first_index, int end_index,
+static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index,
                              float *rating, float *certainty, bool *bad, bool *valid_permuter) {
   if (end_index <= first_index) {
     *bad = true;
     *valid_permuter = false;
   }
-  for (int index = first_index; index < end_index && index < words.size(); ++index) {
+  for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {
     WERD_CHOICE *choice = words[index]->best_choice;
     if (choice == nullptr) {
       *bad = true;
@@ -790,11 +790,11 @@ static int SelectBestWords(double rating_ratio, double certainty_margin, bool de
   // boundary at the end.
   std::vector<WERD_RES *> out_words;
   // Index into each word vector (best, new).
-  int b = 0, n = 0;
+  unsigned b = 0, n = 0;
   int num_best = 0, num_new = 0;
   while (b < best_words->size() || n < new_words->size()) {
     // Start of the current run in each.
-    int start_b = b, start_n = n;
+    auto start_b = b, start_n = n;
     while (b < best_words->size() || n < new_words->size()) {
       int b_right = -INT32_MAX;
       int next_b_left = INT32_MAX;
@@ -884,7 +884,7 @@ int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recog
     *in_word = nullptr;
   }
   if (debug) {
-    for (int i = 0; i < new_words.size(); ++i) {
+    for (unsigned i = 0; i < new_words.size(); ++i) {
       new_words[i]->DebugTopChoice("Lang result");
     }
   }
@@ -896,7 +896,7 @@ int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recog
 
 // Helper returns true if all the words are acceptable.
 static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {
-  for (int w = 0; w < words.size(); ++w) {
+  for (unsigned w = 0; w < words.size(); ++w) {
     if (words[w]->tess_failed || !words[w]->tess_accepted) {
       return false;
     }
@@ -1597,10 +1597,10 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *b
         word->fix_hyphens();
       }
       /* Don't trust fix_quotes! - though I think I've fixed the bug */
-      if (word->best_choice->length() != word->box_word->length()) {
+      if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
         tprintf(
             "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
-            " #Blobs=%d\n",
+            " #Blobs=%u\n",
             word->best_choice->debug_string().c_str(), word->best_choice->length(),
             word->box_word->length());
       }
@@ -1621,7 +1621,7 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *b
 static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
   // Find the corresponding best BLOB_CHOICE from any position in the word_res.
   BLOB_CHOICE *best_choice = nullptr;
-  for (int i = 0; i < word_res->best_choice->length(); ++i) {
+  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
     BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
     if (choice != nullptr) {
       if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
@@ -1637,7 +1637,7 @@ static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_re
 // in the best_choice.
 static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
   WERD_CHOICE *word = word_res->best_choice;
-  for (int i = 0; i < word_res->best_choice->length(); ++i) {
+  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
     BLOB_CHOICE *choice =
         FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
     if (choice == nullptr) {
@@ -1646,7 +1646,7 @@ static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res)
     }
   }
   // Correct any incorrect results in word.
-  for (int i = 0; i < word->length(); ++i) {
+  for (unsigned i = 0; i < word->length(); ++i) {
     if (word->unichar_id(i) != blob_choice->unichar_id()) {
       word->set_unichar_id(blob_choice->unichar_id(), i);
     }
@@ -1666,7 +1666,7 @@ void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {
 
   // Find the frequency of each unique character in the word.
   SortHelper<UNICHAR_ID> rep_ch(word.length());
-  for (int i = 0; i < word.length(); ++i) {
+  for (unsigned i = 0; i < word.length(); ++i) {
     rep_ch.Add(word.unichar_id(i), 1);
   }
 
@@ -1951,7 +1951,7 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
   if (tessedit_debug_fonts) {
     tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
   }
-  for (int b = 0; b < word->best_choice->length(); ++b) {
+  for (unsigned b = 0; b < word->best_choice->length(); ++b) {
     const BLOB_CHOICE *choice = word->GetBlobChoice(b);
     if (choice == nullptr) {
       continue;

diff --git a/src/ccmain/docqual.cpp b/src/ccmain/docqual.cpp
@@ -64,7 +64,7 @@ int16_t Tesseract::word_outline_errs(WERD_RES *word) {
   int16_t err_count = 0;
 
   if (word->rebuild_word != nullptr) {
-    for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
+    for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
       TBLOB *blob = word->rebuild_word->blobs[b];
       err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
       i++;
@@ -911,7 +911,7 @@ bool Tesseract::noise_outlines(TWERD *word) {
   int16_t max_dimension;
   float small_limit = kBlnXHeight * crunch_small_outlines_size;
 
-  for (int b = 0; b < word->NumBlobs(); ++b) {
+  for (unsigned b = 0; b < word->NumBlobs(); ++b) {
     TBLOB *blob = word->blobs[b];
     for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
       outline_count++;

diff --git a/src/ccmain/equationdetect.cpp b/src/ccmain/equationdetect.cpp
@@ -742,7 +742,7 @@ int EquationDetect::CountAlignment(const std::vector<int> &sorted_vec, const int
 
   // Search right side.
   index = pos + 1 - sorted_vec.begin();
-  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
+  while (static_cast<size_t>(index) < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
     count++;
   }
 

diff --git a/src/ccmain/fixspace.cpp b/src/ccmain/fixspace.cpp
@@ -262,7 +262,7 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
   int16_t total_score = 0;
   int16_t word_count = 0;
   int16_t done_word_count = 0;
-  int16_t i;
+  int i;
   int16_t offset;
   int16_t prev_word_score = 0;
   bool prev_word_done = false;
@@ -684,7 +684,6 @@ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
 
 int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
   float noise_score[512];
-  int i;
   int min_noise_blob; // 1st contender
   int max_noise_blob; // last contender
   int non_noise_count;
@@ -697,7 +696,7 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score
   }
 
   // Normalised.
-  int blob_count = word_res->box_word->length();
+  auto blob_count = word_res->box_word->length();
   ASSERT_HOST(blob_count <= 512);
   if (blob_count < 5) {
     return -1; // too short to split
@@ -712,7 +711,7 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score
   }
 #endif
 
-  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
+  for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
     TBLOB *blob = word_res->rebuild_word->blobs[i];
     if (word_res->reject_map[i].accepted()) {
       noise_score[i] = non_noise_limit;
@@ -731,7 +730,8 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score
   /* Now find the worst one which is far enough away from the end of the word */
 
   non_noise_count = 0;
-  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
+  int i;
+  for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
     if (noise_score[i] >= non_noise_limit) {
       non_noise_count++;
     }
@@ -760,7 +760,7 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score
 
   *worst_noise_score = small_limit;
   worst_noise_blob = -1;
-  for (i = min_noise_blob; i <= max_noise_blob; i++) {
+  for (auto i = min_noise_blob; i <= max_noise_blob; i++) {
     if (noise_score[i] < *worst_noise_score) {
       worst_noise_blob = i;
       *worst_noise_score = noise_score[i];
@@ -838,7 +838,6 @@ int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
   WERD_RES_IT word_it(&word_res_list);
   WERD_RES *word;
   int16_t score = 0;
-  int16_t i;
   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
 
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
@@ -849,9 +848,9 @@ int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
     if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
         word->best_choice->permuter() == FREQ_DAWG_PERM ||
         word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
-      int num_blobs = word->rebuild_word->NumBlobs();
+      auto num_blobs = word->rebuild_word->NumBlobs();
       UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
-      for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
+      for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
         TBLOB *blob = word->rebuild_word->blobs[i];
         if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
           score -= 1; // penalise possibly erroneous non-space

diff --git a/src/ccmain/linerec.cpp b/src/ccmain/linerec.cpp
@@ -269,22 +269,14 @@ void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
   if (stopper_dict == nullptr) {
     stopper_dict = &getDict();
   }
-  bool any_nonspace_delimited = false;
-  for (int w = 0; w < words->size(); ++w) {
-    WERD_RES *word = (*words)[w];
-    if (word->best_choice != nullptr && word->best_choice->ContainsAnyNonSpaceDelimited()) {
-      any_nonspace_delimited = true;
-      break;
-    }
-  }
-  for (int w = 0; w < words->size(); ++w) {
+  for (unsigned w = 0; w < words->size(); ++w) {
     WERD_RES *word = (*words)[w];
     if (word->best_choice == nullptr) {
       // It is a dud.
       word->SetupFake(lstm_recognizer_->GetUnicharset());
     } else {
       // Set the best state.
-      for (int i = 0; i < word->best_choice->length(); ++i) {
+      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
         int length = word->best_choice->state(i);
         word->best_state.push_back(length);
       }

diff --git a/src/ccmain/ltrresultiterator.cpp b/src/ccmain/ltrresultiterator.cpp
@@ -335,10 +335,10 @@ char *LTRResultIterator::WordNormedUTF8Text() const {
   WERD_CHOICE *best_choice = it_->word()->best_choice;
   const UNICHARSET *unicharset = it_->word()->uch_set;
   ASSERT_HOST(best_choice != nullptr);
-  for (int i = 0; i < best_choice->length(); ++i) {
+  for (unsigned i = 0; i < best_choice->length(); ++i) {
     ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
   }
-  int length = ocr_text.length() + 1;
+  auto length = ocr_text.length() + 1;
   char *result = new char[length];
   strncpy(result, ocr_text.c_str(), length);
   return result;
@@ -404,7 +404,7 @@ ChoiceIterator::ChoiceIterator(const LTRResultIterator &result_it) {
         strcmp(word_res_->CTC_symbol_choices[0][0].first, " ")) {
       blanks_before_word_ = 0;
     }
-    auto index = *tstep_index_;
+    unsigned index = *tstep_index_;
     index += blanks_before_word_;
     if (index < word_res_->CTC_symbol_choices.size()) {
       LSTM_choices_ = &word_res_->CTC_symbol_choices[index];
@@ -484,7 +484,7 @@ float ChoiceIterator::Confidence() const {
 
 // Returns the set of timesteps which belong to the current symbol
 std::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {
-  int offset = *tstep_index_ + blanks_before_word_;
+  unsigned offset = *tstep_index_ + blanks_before_word_;
   if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {
     return nullptr;
   }

diff --git a/src/ccmain/osdetect.cpp b/src/ccmain/osdetect.cpp
@@ -381,7 +381,7 @@ bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
         for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() && choice == nullptr;
              choice_it.forward()) {
           int choice_script = choice_it.data()->script_id();
-          int s = 0;
+          unsigned s = 0;
           for (s = 0; s < allowed_scripts_->size(); ++s) {
             if ((*allowed_scripts_)[s] == choice_script) {
               choice = choice_it.data();
@@ -477,7 +477,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
       int id = choice->script_id();
       if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
         // Check that the choice is in an allowed script.
-        int s = 0;
+        size_t s = 0;
         for (s = 0; s < allowed_scripts_->size(); ++s) {
           if ((*allowed_scripts_)[s] == id) {
             break;

diff --git a/src/ccmain/output.cpp b/src/ccmain/output.cpp
@@ -101,7 +101,6 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                               bool force_eol) {  // override tilde crunch?
   WERD_RES *word = page_res_it.word();
   const UNICHARSET &uchset = *word->uch_set;
-  int i;
   bool need_reject = false;
   UNICHAR_ID space = uchset.unichar_to_id(" ");
 
@@ -181,15 +180,15 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
   if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
     if (tessedit_zero_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
-      for (i = 0; i < word->best_choice->length(); ++i) {
+      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
         if (word->reject_map[i].rejected()) {
           word->reject_map[i].setrej_minimal_rej_accept();
         }
       }
     }
     if (tessedit_minimal_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
-      for (i = 0; i < word->best_choice->length(); ++i) {
+      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
         if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
           word->reject_map[i].setrej_minimal_rej_accept();
         }
@@ -365,7 +364,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
 
 int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
   int count = 0;
-  for (int i = 0; i < word.length(); ++i) {
+  for (unsigned i = 0; i < word.length(); ++i) {
     if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
       count++;
     }
@@ -375,7 +374,7 @@ int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
 
 int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
   int count = 0;
-  for (int i = 0; i < word.length(); ++i) {
+  for (unsigned i = 0; i < word.length(); ++i) {
     if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
         word.unicharset()->get_isdigit(word.unichar_id(i))) {
       count++;