Integrated accumulated symbol choices into the ChoiceIterator and made the API independent of lstm_choice_mode

noahmetzger · noahmetzger · commit 5b3e2fe812d8 · 2019-03-12T09:15:10.000+01:00
Signed-off-by: Noah Metzger <noah.metzger@bib.uni-mannheim.de>
diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp
@@ -213,13 +213,17 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
     }
 
     // Now, process the word...
-    std::vector<std::vector<std::pair<const char*, float>>>* confidencemap =
+    std::vector<std::vector<std::pair<const char*, float>>>* rawTimestepMap =
+        nullptr;
+    std::vector<std::vector<std::pair<const char*, float>>>* choiceMap =
         nullptr;
     std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
         symbolMap = nullptr;
     if (tesseract_->lstm_choice_mode) {
-      confidencemap = res_it->GetBestLSTMSymbolChoices();
-      symbolMap = res_it->GetBestSegmentedLSTMSymbolChoices();
+
+      choiceMap = res_it->GetBestLSTMSymbolChoices();
+      symbolMap = res_it->GetSegmentedLSTMTimesteps();
+      rawTimestepMap = res_it->GetRawLSTMTimesteps();
     }
     hocr_str << "\n      <span class='ocrx_word'"
              << " id='"
@@ -285,14 +289,14 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
     if (italic) hocr_str << "</em>";
     if (bold) hocr_str << "</strong>";
     // If the lstm choice mode is required it is added here
-    if (tesseract_->lstm_choice_mode == 1 && confidencemap != nullptr) {
-      for (size_t i = 0; i < confidencemap->size(); i++) {
+    if (tesseract_->lstm_choice_mode == 1 && rawTimestepMap != nullptr) {
+      for (size_t i = 0; i < rawTimestepMap->size(); i++) {
         hocr_str << "\n       <span class='ocrx_cinfo'"
                  << " id='"
                  << "timestep_" << page_id << "_" << wcnt << "_" << tcnt << "'"
                  << ">";
         std::vector<std::pair<const char*, float>> timestep =
-            (*confidencemap)[i];
+            (*rawTimestepMap)[i];
         for (std::pair<const char*, float> conf : timestep) {
           hocr_str << "<span class='ocr_glyph'"
                    << " id='"
@@ -304,17 +308,16 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
         hocr_str << "</span>";
         tcnt++;
       }
-    } else if (tesseract_->lstm_choice_mode == 2 && confidencemap != nullptr) {
-      for (size_t i = 0; i < confidencemap->size(); i++) {
+    } else if (tesseract_->lstm_choice_mode == 2 && choiceMap != nullptr) {
+      for (size_t i = 0; i < choiceMap->size(); i++) {
         std::vector<std::pair<const char*, float>> timestep =
-            (*confidencemap)[i];
+            (*choiceMap)[i];
         if (timestep.size() > 0) {
           hocr_str << "\n       <span class='ocrx_cinfo'"
                    << " id='"
                    << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
-                   << "'"
-                   << " chosen='" << timestep[0].first << "'>";
-          for (size_t j = 1; j < timestep.size(); j++) {
+                   << "'>";
+          for (size_t j = 0; j < timestep.size(); j++) {
             hocr_str << "<span class='ocr_glyph'"
                      << " id='"
                      << "choice_" << page_id << "_" << wcnt << "_" << gcnt
@@ -333,10 +336,9 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
             (*symbolMap)[j];
         hocr_str << "\n       <span class='ocr_symbol'"
                  << " id='"
-                 << "symbolstep_" << page_id << "_" << wcnt << "_" << scnt
-                 << "'>"
-                 << timesteps[0][0].first;
-        for (size_t i = 1; i < timesteps.size(); i++) {
+                 << "symbol_" << page_id << "_" << wcnt << "_" << scnt
+                 << "'>";
+        for (size_t i = 0; i < timesteps.size(); i++) {
           hocr_str << "\n        <span class='ocrx_cinfo'"
                    << " id='"
                    << "timestep_" << page_id << "_" << wcnt << "_" << tcnt
diff --git a/src/ccmain/ltrresultiterator.cpp b/src/ccmain/ltrresultiterator.cpp
@@ -358,7 +358,17 @@ bool LTRResultIterator::SymbolIsDropcap() const {
 ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
   ASSERT_HOST(result_it.it_->word() != nullptr);
   word_res_ = result_it.it_->word();
+  oemLSTM_ = word_res_->tesseract->AnyLSTMLang();
+  oemLegacy_ = word_res_->tesseract->AnyTessLang();
   BLOB_CHOICE_LIST* choices = nullptr;
+  tstep_index_ = &result_it.blob_index_;
+  if (oemLSTM_ && !oemLegacy_ && &word_res_->accumulated_timesteps != nullptr) {
+    if (word_res_->leadingSpace)
+      LSTM_choices_ = &word_res_->accumulated_timesteps[(*tstep_index_) + 1];
+    else
+      LSTM_choices_ = &word_res_->accumulated_timesteps[*tstep_index_];
+    filterSpaces();
+  }
   if (word_res_->ratings != nullptr)
     choices = word_res_->GetBlobChoices(result_it.blob_index_);
   if (choices != nullptr && !choices->empty()) {
@@ -367,49 +377,93 @@ ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
   } else {
     choice_it_ = nullptr;
   }
-  if (&word_res_->symbol_steps != nullptr && !word_res_->symbol_steps.empty()) {
-    symbol_step_it_ = word_res_->symbol_steps.begin();
+  if (LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+    LSTM_mode_ = true;
+    LSTM_choice_it_ = LSTM_choices_->begin();
   }
 }
-
 ChoiceIterator::~ChoiceIterator() { delete choice_it_; }
 
 // Moves to the next choice for the symbol and returns false if there
 // are none left.
 bool ChoiceIterator::Next() {
-  if (choice_it_ == nullptr) return false;
-  if (&word_res_->symbol_steps != nullptr) {
-    if (symbol_step_it_ == word_res_->symbol_steps.end()) {
-      symbol_step_it_ = word_res_->symbol_steps.begin();
+  if (LSTM_mode_) {
+    if (LSTM_choice_it_ != LSTM_choices_->end() &&
+        next(LSTM_choice_it_) == LSTM_choices_->end()) {
+      return false;
     } else {
-      symbol_step_it_++;
-    }   
+      ++LSTM_choice_it_;
+      return true;
+    }
+  } else {
+    if (choice_it_ == nullptr) return false;
+    choice_it_->forward();
+    return !choice_it_->cycled_list();
   }
-  choice_it_->forward();
-  return !choice_it_->cycled_list();
 }
 
 // Returns the null terminated UTF-8 encoded text string for the current
 // choice. Do NOT use delete [] to free after use.
 const char* ChoiceIterator::GetUTF8Text() const {
-  if (choice_it_ == nullptr) return nullptr;
-  UNICHAR_ID id = choice_it_->data()->unichar_id();
-  return word_res_->uch_set->id_to_unichar_ext(id);
+  if (LSTM_mode_) {
+    std::pair<const char*, float> choice = *LSTM_choice_it_;
+    return choice.first;
+  } else {
+    if (choice_it_ == nullptr) return nullptr;
+    UNICHAR_ID id = choice_it_->data()->unichar_id();
+    return word_res_->uch_set->id_to_unichar_ext(id);
+  }
 }
 
-// Returns the confidence of the current choice.
-// The number should be interpreted as a percent probability. (0.0f-100.0f)
+// Returns the confidence of the current choice depending on the used language
+// data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All 
+// choices for one symbol should roughly add up to 1.0f.
+// If only traineddata of the legacy engine is used, the number should be 
+// interpreted as a percent probability. (0.0f-100.0f) In this case probabilities
+// won't add up to 100. Each one stands on its own.
 float ChoiceIterator::Confidence() const {
-  if (choice_it_ == nullptr) return 0.0f;
-  float confidence = 100 + 5 * choice_it_->data()->certainty();
-  if (confidence < 0.0f) confidence = 0.0f;
-  if (confidence > 100.0f) confidence = 100.0f;
-  return confidence;
+  if (LSTM_mode_) {
+    std::pair<const char*, float> choice = *LSTM_choice_it_;
+    return choice.second;
+  } else {
+    if (choice_it_ == nullptr) return 0.0f;
+    float confidence = 100 + 5 * choice_it_->data()->certainty();
+    if (confidence < 0.0f) confidence = 0.0f;
+    if (confidence > 100.0f) confidence = 100.0f;
+    return confidence;
+  }
 }
 
+// Returns the set of timesteps which belong to the current symbol
 std::vector<std::vector<std::pair<const char*, float>>>*
 ChoiceIterator::Timesteps() const {
-  if (&word_res_->symbol_steps == nullptr) return nullptr;
-  return &*symbol_step_it_;
+  if (&word_res_->symbol_steps == nullptr || !LSTM_mode_) return nullptr;
+  if (word_res_->leadingSpace) {
+    return &word_res_->symbol_steps[*(tstep_index_) + 1];
+  } else {
+    return &word_res_->symbol_steps[*tstep_index_];
+  }
+}
+
+void ChoiceIterator::filterSpaces() {
+  if (LSTM_choices_->empty()) return;
+  std::vector<std::pair<const char*, float>>::iterator it =
+      LSTM_choices_->begin();
+  bool found_space = false;
+  float sum = 0;
+  for (it; it != LSTM_choices_->end();) {
+    if (!strcmp(it->first, " ")) {
+      it = LSTM_choices_->erase(it);
+      found_space = true;
+    } else {
+      sum += it->second;
+      ++it;
+    }
+  }
+  if (found_space) {
+    for (it = LSTM_choices_->begin(); it != LSTM_choices_->end(); ++it) {
+      it->second /= sum;
+    }
+  }
 }
 }  // namespace tesseract.
diff --git a/src/ccmain/ltrresultiterator.h b/src/ccmain/ltrresultiterator.h
@@ -208,25 +208,36 @@ class ChoiceIterator {
   // internal structure and should NOT be delete[]ed to free after use.
   const char* GetUTF8Text() const;
 
-  // Returns the confidence of the current choice.
-  // The number should be interpreted as a percent probability. (0.0f-100.0f)
+  // Returns the confidence of the current choice depending on the used language
+  // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
+  // choices for one symbol should roughly add up to 1.0f.
+  // If only traineddata of the legacy engine is used, the number should be
+  // interpreted as a percent probability. (0.0f-100.0f) In this case
+  // probabilities won't add up to 100. Each one stands on its own.
   float Confidence() const;
 
   // Returns a vector containing all timesteps, which belong to the currently
   // selected symbol. A timestep is a vector containing pairs of symbols and
   // floating point numbers. The number states the probability for the
   // corresponding symbol.
-  std::vector<std::vector<std::pair<const char*, float>>>*
-  Timesteps() const;
+  std::vector<std::vector<std::pair<const char*, float>>>* Timesteps() const;
 
  private:
+  // Clears the remaining spaces out of the results and adapts the probabilities.
+  void filterSpaces();
   // Pointer to the WERD_RES object owned by the API.
   WERD_RES* word_res_;
   // Iterator over the blob choices.
   BLOB_CHOICE_IT* choice_it_;
-  //Iterator over the symbol steps.
-  std::vector<std::vector<std::vector<std::pair<const char*, float>>>>::iterator
-      symbol_step_it_;
+  std::vector<std::pair<const char*, float>>* LSTM_choices_ = nullptr;
+  std::vector<std::pair<const char*, float>>::iterator LSTM_choice_it_;
+
+  const int* tstep_index_;
+  bool LSTM_mode_ = false;
+  // true when there is LSTM engine related trained data
+  bool oemLSTM_;
+  // true when there is legacy engine related trained data
+  bool oemLegacy_;
 };
 
 }  // namespace tesseract.
diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp
@@ -604,18 +604,26 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
   strncpy(result, text.string(), length);
   return result;
 }
+std::vector<std::vector<std::pair<const char*, float>>>*
+ResultIterator::GetRawLSTMTimesteps() const {
+  if (it_->word() != nullptr) {
+    return &it_->word()->raw_timesteps;
+  } else {
+    return nullptr;
+  }
+}
 
 std::vector<std::vector<std::pair<const char*, float>>>*
   ResultIterator::GetBestLSTMSymbolChoices() const {
   if (it_->word() != nullptr) {
-    return &it_->word()->timesteps;
+    return &it_->word()->accumulated_timesteps;
   } else {
     return nullptr;
   }
 }
 
 std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
-  ResultIterator::GetBestSegmentedLSTMSymbolChoices() const {
+  ResultIterator::GetSegmentedLSTMTimesteps() const {
   if (it_->word() != nullptr) {
     return &it_->word()->symbol_steps;
   } else {
diff --git a/src/ccmain/resultiterator.h b/src/ccmain/resultiterator.h
@@ -100,10 +100,12 @@ class TESS_API ResultIterator : public LTRResultIterator {
   /**
    * Returns the LSTM choices for every LSTM timestep for the current word.
   */
+  virtual std::vector<std::vector<std::pair<const char*, float>>>*
+  GetRawLSTMTimesteps() const;
   virtual std::vector<std::vector<std::pair<const char*, float>>>*
     GetBestLSTMSymbolChoices() const;
   virtual std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
-    GetBestSegmentedLSTMSymbolChoices() const;
+    GetSegmentedLSTMTimesteps() const;
 
   /**
    * Return whether the current paragraph's dominant reading direction
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
@@ -524,11 +524,12 @@ Tesseract::Tesseract()
                     this->params()),
       INT_MEMBER(lstm_choice_mode, 0,
           "Allows to include alternative symbols choices in the hOCR output. "
-          "Valid input values are 0, 1 and 2. 0 is the default value. "
+          "Valid input values are 0, 1, 2 and 3. 0 is the default value. "
           "With 1 the alternative symbol choices per timestep are included. "
-          "With 2 the alternative symbol choices are accumulated per character."
-          "With 3 the alternative symbol choices per timestep are included and "
-          "separated by the suggested segmentation of Tesseract",
+          "With 2 the alternative symbol choices are accumulated per "
+          "character. "
+          "With 3 the alternative symbol choices per timestep are included "
+          "and separated by the suggested segmentation of Tesseract",
           this->params()),
 
       backup_config_file_(nullptr),
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
@@ -1124,12 +1124,14 @@ class Tesseract : public Wordrec {
   STRING_VAR_H(page_separator, "\f",
                "Page separator (default is form feed control character)");
   INT_VAR_H(lstm_choice_mode, 0,
-            "Allows to include alternative symbols choices in the hOCR output. "
-            "Valid input values are 0, 1 and 2. 0 is the default value. "
+            "Allows to include alternative symbols choices in the hOCR "
+            "output. "
+            "Valid input values are 0, 1, 2 and 3. 0 is the default value. "
             "With 1 the alternative symbol choices per timestep are included. "
-            "With 2 the alternative symbol choices are accumulated per character."
-            "With 3 the alternative symbol choices per timestep are included and "
-            "separated by the suggested segmentation of Tesseract");
+            "With 2 the alternative symbol choices are accumulated per "
+            "character. "
+            "With 3 the alternative symbol choices per timestep are included "
+            "and separated by the suggested segmentation of Tesseract");
 
   //// ambigsrecog.cpp /////////////////////////////////////////////////////////
   FILE *init_recog_training(const STRING &fname);
diff --git a/src/ccstruct/pageres.h b/src/ccstruct/pageres.h
@@ -221,9 +221,12 @@ class WERD_RES : public ELIST_LINK {
   // blob i and blob i+1.
   GenericVector<int> blob_gaps;
   // Stores the lstm choices of every timestep
-  std::vector<std::vector<std::pair<const char*, float>>> timesteps;
+  std::vector<std::vector<std::pair<const char*, float>>> raw_timesteps;
+  std::vector<std::vector<std::pair<const char*, float>>> accumulated_timesteps;
   std::vector<std::vector<std::vector<std::pair<const char*, float>>>>
       symbol_steps;
+  // Stores whether the timestep vector starts with a space
+  bool leadingSpace = false;
   // Ratings matrix contains classifier choices for each classified combination
   // of blobs. The dimension is the same as the number of blobs in chopped_word
   // and the leading diagonal corresponds to classifier results of the blobs
diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp
diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h