tesseract-ocr
diff --git a/‎api/baseapi.cpp
+25-4 b/‎api/baseapi.cpp
+25-4
diff --git a/‎api/baseapi.h
+5-3 b/‎api/baseapi.h
+5-3
diff --git a/‎ccmain/applybox.cpp
+3-3 b/‎ccmain/applybox.cpp
+3-3
diff --git a/‎classify/adaptmatch.cpp
+17-20 b/‎classify/adaptmatch.cpp
+17-20
diff --git a/‎classify/blobclass.cpp
+57-79 b/‎classify/blobclass.cpp
+57-79
@@ -51,6 +51,7 @@
 #include "allheaders.h"
 
 #include "baseapi.h"
+#include "blobclass.h"
 #include "resultiterator.h"
 #include "mutableiterator.h"
 #include "thresholder.h"
@@ -870,7 +871,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
     page_res_ = NULL;
     return -1;
   } else if (tesseract_->tessedit_train_from_boxes) {
-    tesseract_->ApplyBoxTraining(*output_file_, page_res_);
+    STRING fontname;
+    ExtractFontName(*output_file_, &fontname);
+    tesseract_->ApplyBoxTraining(fontname, page_res_);
   } else if (tesseract_->tessedit_ambigs_training) {
     FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
     // OCR the page segmented into words by tesseract.
@@ -1051,6 +1054,23 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
   return true;
 }
 
+// Top-level entry point for page processing: delegates the actual work to
+// ProcessPagesInternal, and afterwards, when training from boxes, flushes the
+// stored training data to a .tr file based on the output file name.
+// Returns false if processing fails or the .tr file cannot be written.
+bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
+                               int timeout_millisec,
+                               TessResultRenderer* renderer) {
+  const bool pages_ok =
+      ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
+  if (!pages_ok) return false;
+  // Only the box-training mode has anything to write out afterwards.
+  if (!tesseract_->tessedit_train_from_boxes) return true;
+  if (tesseract_->WriteTRFile(*output_file_)) return true;
+  tprintf("Write of TR file failed: %s\n", output_file_->string());
+  return false;
+}
+
 // In the ideal scenario, Tesseract will start working on data as soon
 // as it can. For example, if you steam a filelist through stdin, we
 // should start the OCR process as soon as the first filename is
@@ -1063,9 +1083,10 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
 // identify the scenario that really matters: filelists on
 // stdin. We'll still do our best if the user likes pipes.  That means
 // piling up any data coming into stdin into a memory buffer.
-bool TessBaseAPI::ProcessPages(const char* filename,
-                               const char* retry_config, int timeout_millisec,
-                               TessResultRenderer* renderer) {
+bool TessBaseAPI::ProcessPagesInternal(const char* filename,
+                                       const char* retry_config,
+                                       int timeout_millisec,
+                                       TessResultRenderer* renderer) {
   PERF_COUNT_START("ProcessPages")
   bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
   if (stdInput) {
 
@@ -538,9 +538,11 @@ class TESS_API TessBaseAPI {
    *
    * Returns true if successful, false on error.
    */
-  bool ProcessPages(const char* filename,
-                    const char* retry_config, int timeout_millisec,
-                    TessResultRenderer* renderer);
+  bool ProcessPages(const char* filename, const char* retry_config,
+                    int timeout_millisec, TessResultRenderer* renderer);
+  // Does the real work of ProcessPages.
+  bool ProcessPagesInternal(const char* filename, const char* retry_config,
+                            int timeout_millisec, TessResultRenderer* renderer);
 
   /**
    * Turn a single image into symbolic text.
 
@@ -775,13 +775,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
 }
 
 // Calls LearnWord to extract features for labelled blobs within each word.
-// Features are written to the given filename.
-void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
+// Features are stored in an internal buffer.
+void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
   PAGE_RES_IT pr_it(page_res);
   int word_count = 0;
   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
        word_res = pr_it.forward()) {
-    LearnWord(filename.string(), word_res);
+    LearnWord(fontname.string(), word_res);
     ++word_count;
   }
   tprintf("Generated training data for %d words\n", word_count);
 
@@ -220,17 +220,15 @@ void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
 
 // Learns the given word using its chopped_word, seam_array, denorm,
 // box_word, best_state, and correct_text to learn both correctly and
-// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
-// is called and the data will be written to a file for static training.
+// incorrectly segmented blobs. If fontname is not NULL, then LearnBlob
+// is called and the data will be saved in an internal buffer.
 // Otherwise AdaptToBlob is called for adaption within a document.
-// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
-// be learned, otherwise all chars with good correct_text are learned.
-void Classify::LearnWord(const char* filename, WERD_RES *word) {
+void Classify::LearnWord(const char* fontname, WERD_RES* word) {
   int word_len = word->correct_text.size();
   if (word_len == 0) return;
 
   float* thresholds = NULL;
-  if (filename == NULL) {
+  if (fontname == NULL) {
     // Adaption mode.
     if (!EnableLearning || word->best_choice == NULL)
       return;  // Can't or won't adapt.
@@ -267,8 +265,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
     if (word->correct_text[ch].length() > 0) {
       float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;
 
-      LearnPieces(filename, start_blob, word->best_state[ch],
-                  threshold, CST_WHOLE, word->correct_text[ch].string(), word);
+      LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
+                  CST_WHOLE, word->correct_text[ch].string(), word);
 
       if (word->best_state[ch] > 1 && !disable_character_fragments) {
         // Check that the character breaks into meaningful fragments
@@ -301,8 +299,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
                 if (i != tokens.size() - 1)
                   full_string += ' ';
               }
-              LearnPieces(filename, start_blob + frag, 1,
-                          threshold, CST_FRAGMENT, full_string.string(), word);
+              LearnPieces(fontname, start_blob + frag, 1, threshold,
+                          CST_FRAGMENT, full_string.string(), word);
             }
           }
         }
@@ -314,13 +312,13 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
       if (word->best_state[ch] > 1) {
         // If the next blob is good, make junk with the rightmost fragment.
         if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
-          LearnPieces(filename, start_blob + word->best_state[ch] - 1,
+          LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
                       word->best_state[ch + 1] + 1,
                       threshold, CST_IMPROPER, INVALID_UNICHAR, word);
         }
         // If the previous blob is good, make junk with the leftmost fragment.
         if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
-          LearnPieces(filename, start_blob - word->best_state[ch - 1],
+          LearnPieces(fontname, start_blob - word->best_state[ch - 1],
                       word->best_state[ch - 1] + 1,
                       threshold, CST_IMPROPER, INVALID_UNICHAR, word);
         }
@@ -329,7 +327,7 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
       if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
         STRING joined_text = word->correct_text[ch];
         joined_text += word->correct_text[ch + 1];
-        LearnPieces(filename, start_blob,
+        LearnPieces(fontname, start_blob,
                     word->best_state[ch] + word->best_state[ch + 1],
                     threshold, CST_NGRAM, joined_text.string(), word);
       }
@@ -342,16 +340,16 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
 
 // Builds a blob of length fragments, from the word, starting at start,
 // and then learns it, as having the given correct_text.
-// If filename is not NULL, then LearnBlob
-// is called and the data will be written to a file for static training.
+// If fontname is not NULL, then LearnBlob is called and the data will be
+// saved in an internal buffer for static training.
 // Otherwise AdaptToBlob is called for adaption within a document.
 // threshold is a magic number required by AdaptToChar and generated by
 // ComputeAdaptionThresholds.
 // Although it can be partly inferred from the string, segmentation is
 // provided to explicitly clarify the character segmentation.
-void Classify::LearnPieces(const char* filename, int start, int length,
+void Classify::LearnPieces(const char* fontname, int start, int length,
                            float threshold, CharSegmentationType segmentation,
-                           const char* correct_text, WERD_RES *word) {
+                           const char* correct_text, WERD_RES* word) {
   // TODO(daria) Remove/modify this if/when we want
   // to train and/or adapt to n-grams.
   if (segmentation != CST_WHOLE &&
@@ -385,16 +383,15 @@ void Classify::LearnPieces(const char* filename, int start, int length,
   }
   #endif  // GRAPHICS_DISABLED
 
-  if (filename != NULL) {
+  if (fontname != NULL) {
     classify_norm_method.set_value(character);  // force char norm spc 30/11/93
     tess_bn_matching.set_value(false);    // turn it off
     tess_cn_matching.set_value(false);
     DENORM bl_denorm, cn_denorm;
     INT_FX_RESULT_STRUCT fx_info;
     SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm,
                      &bl_denorm, &cn_denorm, &fx_info);
-    LearnBlob(feature_defs_, filename, rotated_blob, bl_denorm, cn_denorm,
-              fx_info, correct_text);
+    LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
   } else if (unicharset.contains_unichar(correct_text)) {
     UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
     int font_id = word->fontinfo != NULL
 
@@ -20,111 +20,89 @@
       Include Files and Type Defines
 ----------------------------------------------------------------------------**/
 #include "blobclass.h"
-#include "extract.h"
-#include "efio.h"
-#include "featdefs.h"
-#include "callcpp.h"
 
-#include <math.h>
 #include <stdio.h>
-#include <signal.h>
 
-#define MAXFILENAME             80
-#define MAXMATCHES              10
+#include "classify.h"
+#include "efio.h"
+#include "featdefs.h"
+#include "mf.h"
+#include "normfeat.h"
 
 static const char kUnknownFontName[] = "UnknownFont";
 
 STRING_VAR(classify_font_name, kUnknownFontName,
            "Default font name to be used in training");
 
-/**----------------------------------------------------------------------------
-        Global Data Definitions and Declarations
-----------------------------------------------------------------------------**/
-/* name of current image file being processed */
-extern char imagefile[];
-
+namespace tesseract {
 /**----------------------------------------------------------------------------
             Public Code
 ----------------------------------------------------------------------------**/
-
-/*---------------------------------------------------------------------------*/
-// As all TBLOBs, Blob is in baseline normalized coords.
-// See SetupBLCNDenorms in intfx.cpp for other args.
-void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
-               TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
-               const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) {
-/*
- **      Parameters:
- **              Blob            blob whose micro-features are to be learned
- **              Row             row of text that blob came from
- **              BlobText        text that corresponds to blob
- **              TextLength      number of characters in blob
- **      Globals:
- **              imagefile       base filename of the page being learned
- **              classify_font_name
- **                              name of font currently being trained on
- **      Operation:
- **              Extract micro-features from the specified blob and append
- **              them to the appropriate file.
- **      Return: none
- **      Exceptions: none
- **      History: 7/28/89, DSJ, Created.
- */
-#define TRAIN_SUFFIX    ".tr"
-  static FILE *FeatureFile = NULL;
-  STRING Filename(filename);
-
-  // If no fontname was set, try to extract it from the filename
-  STRING CurrFontName = classify_font_name;
-  if (CurrFontName == kUnknownFontName) {
+// Finds the name of the training font and returns it in fontname, by cutting
+// it out based on the expectation that the filename is of the form:
+// /path/to/dir/[lang].[fontname].exp[num]
+// The [lang], [fontname] and [num] fields should not have '.' characters.
+// If the global parameter classify_font_name is set, its value is used instead.
+void ExtractFontName(const STRING& filename, STRING* fontname) {
+  *fontname = classify_font_name;
+  if (*fontname == kUnknownFontName) {
     // filename is expected to be of the form [lang].[fontname].exp[num]
     // The [lang], [fontname] and [num] fields should not have '.' characters.
     const char *basename = strrchr(filename.string(), '/');
     const char *firstdot = strchr(basename ? basename : filename.string(), '.');
     const char *lastdot  = strrchr(filename.string(), '.');
     if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
       ++firstdot;
-      CurrFontName = firstdot;
-      CurrFontName[lastdot - firstdot] = '\0';
+      *fontname = firstdot;
+      fontname->truncate_at(lastdot - firstdot);
     }
   }
+}
 
-  // if a feature file is not yet open, open it
-  // the name of the file is the name of the image plus TRAIN_SUFFIX
-  if (FeatureFile == NULL) {
-    Filename += TRAIN_SUFFIX;
-    FeatureFile = Efopen(Filename.string(), "wb");
-    cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
-  }
-
-  LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info,
-            BlobText, CurrFontName.string());
-}                                // LearnBlob
-
-void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
-               TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
-               const INT_FX_RESULT_STRUCT& fx_info,
-               const char* BlobText, const char* FontName) {
-  CHAR_DESC CharDesc;
-
-  ASSERT_HOST(FeatureFile != NULL);
-
-  CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info,
-                                 Blob);
-  if (CharDesc == NULL) {
-    cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
-    return;
-  }
-
-  if (ValidCharDescription(FeatureDefs, CharDesc)) {
-    // label the features with a class name and font name
-    fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
+/*---------------------------------------------------------------------------*/
+// Extracts features from the given blob and saves them in the tr_file_data_
+// member variable.
+// fontname:  Name of font that this blob was printed in.
+// cn_denorm: Character normalization transformation to apply to the blob.
+// fx_info:   Character normalization parameters computed with cn_denorm.
+// blob_text: Ground truth text for the blob.
+void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
+                         const DENORM& cn_denorm,
+                         const INT_FX_RESULT_STRUCT& fx_info,
+                         const char* blob_text) {
+  CHAR_DESC CharDesc = NewCharDescription(feature_defs_);
+  CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
+  CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
+  CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
+  CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
+
+  if (ValidCharDescription(feature_defs_, CharDesc)) {
+    // Label the features with a class name and font name.
+    tr_file_data_ += "\n";
+    tr_file_data_ += fontname;
+    tr_file_data_ += " ";
+    tr_file_data_ += blob_text;
+    tr_file_data_ += "\n";
 
     // write micro-features to file and clean up
-    WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
+    WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
   } else {
     tprintf("Blob learned was invalid!\n");
   }
   FreeCharDescription(CharDesc);
-
 }                                // LearnBlob
+
+// Writes the training data accumulated in tr_file_data_ to <filename>.tr and
+// then empties the buffer, whether or not the write succeeded.
+// filename: Base name of the output file; ".tr" is appended to it.
+// Returns false if the file cannot be opened, if fewer bytes than expected
+// are written, or if closing (and hence flushing) the file fails.
+bool Classify::WriteTRFile(const STRING& filename) {
+  STRING tr_filename = filename + ".tr";
+  bool result = false;
+  FILE* fp = Efopen(tr_filename.string(), "wb");
+  // Guard against a NULL handle so that fwrite/fclose are never called on it.
+  if (fp != NULL) {
+    int len = tr_file_data_.length();
+    // fwrite returns a size_t, so compare in that type.
+    result = fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) ==
+             static_cast<size_t>(len);
+    // Buffered data may only reach the disk at close time, so a failed
+    // fclose means the .tr file is incomplete.
+    if (fclose(fp) != 0) result = false;
+  }
+  tr_file_data_.truncate_at(0);
+  return result;
+}
+
+}  // namespace tesseract.