Skip to content

Commit 53fc445

Browse files
committed
Fixed issue 1252: Refactored LearnBlob and its call hierarchy to make it a member of Classify.
Eliminated the flexfx scheme for calling global feature extractor functions through an array of function pointers. Deleted dead code I found as a by-product. This CL does not change BlobToTrainingSample or ExtractFeatures to be full members of Classify (the eventual goal) as that would make it even bigger, since there are a lot of callers to these functions. When ExtractFeatures and BlobToTrainingSample are members of Classify they will be able to access control parameters in Classify, which will greatly simplify developing variations to the feature extraction process.
1 parent e735a90 commit 53fc445

31 files changed

+220
-745
lines changed

api/baseapi.cpp

+25-4
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@
5151
#include "allheaders.h"
5252

5353
#include "baseapi.h"
54+
#include "blobclass.h"
5455
#include "resultiterator.h"
5556
#include "mutableiterator.h"
5657
#include "thresholder.h"
@@ -870,7 +871,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
870871
page_res_ = NULL;
871872
return -1;
872873
} else if (tesseract_->tessedit_train_from_boxes) {
873-
tesseract_->ApplyBoxTraining(*output_file_, page_res_);
874+
STRING fontname;
875+
ExtractFontName(*output_file_, &fontname);
876+
tesseract_->ApplyBoxTraining(fontname, page_res_);
874877
} else if (tesseract_->tessedit_ambigs_training) {
875878
FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
876879
// OCR the page segmented into words by tesseract.
@@ -1051,6 +1054,23 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
10511054
return true;
10521055
}
10531056

1057+
// Master ProcessPages calls ProcessPagesInternal and then does any post-
1058+
// processing required due to being in a training mode.
1059+
bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
1060+
int timeout_millisec,
1061+
TessResultRenderer* renderer) {
1062+
bool result =
1063+
ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1064+
if (result) {
1065+
if (tesseract_->tessedit_train_from_boxes &&
1066+
!tesseract_->WriteTRFile(*output_file_)) {
1067+
tprintf("Write of TR file failed: %s\n", output_file_->string());
1068+
return false;
1069+
}
1070+
}
1071+
return result;
1072+
}
1073+
10541074
// In the ideal scenario, Tesseract will start working on data as soon
10551075
// as it can. For example, if you stream a filelist through stdin, we
10561076
// should start the OCR process as soon as the first filename is
@@ -1063,9 +1083,10 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
10631083
// identify the scenario that really matters: filelists on
10641084
// stdin. We'll still do our best if the user likes pipes. That means
10651085
// piling up any data coming into stdin into a memory buffer.
1066-
bool TessBaseAPI::ProcessPages(const char* filename,
1067-
const char* retry_config, int timeout_millisec,
1068-
TessResultRenderer* renderer) {
1086+
bool TessBaseAPI::ProcessPagesInternal(const char* filename,
1087+
const char* retry_config,
1088+
int timeout_millisec,
1089+
TessResultRenderer* renderer) {
10691090
PERF_COUNT_START("ProcessPages")
10701091
bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
10711092
if (stdInput) {

api/baseapi.h

+5-3
Original file line number | Diff line number | Diff line change
@@ -538,9 +538,11 @@ class TESS_API TessBaseAPI {
538538
*
539539
* Returns true if successful, false on error.
540540
*/
541-
bool ProcessPages(const char* filename,
542-
const char* retry_config, int timeout_millisec,
543-
TessResultRenderer* renderer);
541+
bool ProcessPages(const char* filename, const char* retry_config,
542+
int timeout_millisec, TessResultRenderer* renderer);
543+
// Does the real work of ProcessPages.
544+
bool ProcessPagesInternal(const char* filename, const char* retry_config,
545+
int timeout_millisec, TessResultRenderer* renderer);
544546

545547
/**
546548
* Turn a single image into symbolic text.

ccmain/applybox.cpp

+3-3
Original file line number | Diff line number | Diff line change
@@ -775,13 +775,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
775775
}
776776

777777
// Calls LearnWord to extract features for labelled blobs within each word.
778-
// Features are written to the given filename.
779-
void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
778+
// Features are stored in an internal buffer.
779+
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
780780
PAGE_RES_IT pr_it(page_res);
781781
int word_count = 0;
782782
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
783783
word_res = pr_it.forward()) {
784-
LearnWord(filename.string(), word_res);
784+
LearnWord(fontname.string(), word_res);
785785
++word_count;
786786
}
787787
tprintf("Generated training data for %d words\n", word_count);

classify/adaptmatch.cpp

+17-20
Original file line number | Diff line number | Diff line change
@@ -220,17 +220,15 @@ void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
220220

221221
// Learns the given word using its chopped_word, seam_array, denorm,
222222
// box_word, best_state, and correct_text to learn both correctly and
223-
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
224-
// is called and the data will be written to a file for static training.
223+
// incorrectly segmented blobs. If fontname is not NULL, then LearnBlob
224+
// is called and the data will be saved in an internal buffer.
225225
// Otherwise AdaptToBlob is called for adaption within a document.
226-
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
227-
// be learned, otherwise all chars with good correct_text are learned.
228-
void Classify::LearnWord(const char* filename, WERD_RES *word) {
226+
void Classify::LearnWord(const char* fontname, WERD_RES* word) {
229227
int word_len = word->correct_text.size();
230228
if (word_len == 0) return;
231229

232230
float* thresholds = NULL;
233-
if (filename == NULL) {
231+
if (fontname == NULL) {
234232
// Adaption mode.
235233
if (!EnableLearning || word->best_choice == NULL)
236234
return; // Can't or won't adapt.
@@ -267,8 +265,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
267265
if (word->correct_text[ch].length() > 0) {
268266
float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;
269267

270-
LearnPieces(filename, start_blob, word->best_state[ch],
271-
threshold, CST_WHOLE, word->correct_text[ch].string(), word);
268+
LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
269+
CST_WHOLE, word->correct_text[ch].string(), word);
272270

273271
if (word->best_state[ch] > 1 && !disable_character_fragments) {
274272
// Check that the character breaks into meaningful fragments
@@ -301,8 +299,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
301299
if (i != tokens.size() - 1)
302300
full_string += ' ';
303301
}
304-
LearnPieces(filename, start_blob + frag, 1,
305-
threshold, CST_FRAGMENT, full_string.string(), word);
302+
LearnPieces(fontname, start_blob + frag, 1, threshold,
303+
CST_FRAGMENT, full_string.string(), word);
306304
}
307305
}
308306
}
@@ -314,13 +312,13 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
314312
if (word->best_state[ch] > 1) {
315313
// If the next blob is good, make junk with the rightmost fragment.
316314
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
317-
LearnPieces(filename, start_blob + word->best_state[ch] - 1,
315+
LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
318316
word->best_state[ch + 1] + 1,
319317
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
320318
}
321319
// If the previous blob is good, make junk with the leftmost fragment.
322320
if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
323-
LearnPieces(filename, start_blob - word->best_state[ch - 1],
321+
LearnPieces(fontname, start_blob - word->best_state[ch - 1],
324322
word->best_state[ch - 1] + 1,
325323
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
326324
}
@@ -329,7 +327,7 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
329327
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
330328
STRING joined_text = word->correct_text[ch];
331329
joined_text += word->correct_text[ch + 1];
332-
LearnPieces(filename, start_blob,
330+
LearnPieces(fontname, start_blob,
333331
word->best_state[ch] + word->best_state[ch + 1],
334332
threshold, CST_NGRAM, joined_text.string(), word);
335333
}
@@ -342,16 +340,16 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
342340

343341
// Builds a blob of length fragments, from the word, starting at start,
344342
// and then learns it, as having the given correct_text.
345-
// If filename is not NULL, then LearnBlob
346-
// is called and the data will be written to a file for static training.
343+
// If fontname is not NULL, then LearnBlob is called and the data will be
344+
// saved in an internal buffer for static training.
347345
// Otherwise AdaptToBlob is called for adaption within a document.
348346
// threshold is a magic number required by AdaptToChar and generated by
349347
// ComputeAdaptionThresholds.
350348
// Although it can be partly inferred from the string, segmentation is
351349
// provided to explicitly clarify the character segmentation.
352-
void Classify::LearnPieces(const char* filename, int start, int length,
350+
void Classify::LearnPieces(const char* fontname, int start, int length,
353351
float threshold, CharSegmentationType segmentation,
354-
const char* correct_text, WERD_RES *word) {
352+
const char* correct_text, WERD_RES* word) {
355353
// TODO(daria) Remove/modify this if/when we want
356354
// to train and/or adapt to n-grams.
357355
if (segmentation != CST_WHOLE &&
@@ -385,16 +383,15 @@ void Classify::LearnPieces(const char* filename, int start, int length,
385383
}
386384
#endif // GRAPHICS_DISABLED
387385

388-
if (filename != NULL) {
386+
if (fontname != NULL) {
389387
classify_norm_method.set_value(character); // force char norm spc 30/11/93
390388
tess_bn_matching.set_value(false); // turn it off
391389
tess_cn_matching.set_value(false);
392390
DENORM bl_denorm, cn_denorm;
393391
INT_FX_RESULT_STRUCT fx_info;
394392
SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm,
395393
&bl_denorm, &cn_denorm, &fx_info);
396-
LearnBlob(feature_defs_, filename, rotated_blob, bl_denorm, cn_denorm,
397-
fx_info, correct_text);
394+
LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
398395
} else if (unicharset.contains_unichar(correct_text)) {
399396
UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
400397
int font_id = word->fontinfo != NULL

classify/blobclass.cpp

+57-79
Original file line number | Diff line number | Diff line change
@@ -20,111 +20,89 @@
2020
Include Files and Type Defines
2121
----------------------------------------------------------------------------**/
2222
#include "blobclass.h"
23-
#include "extract.h"
24-
#include "efio.h"
25-
#include "featdefs.h"
26-
#include "callcpp.h"
2723

28-
#include <math.h>
2924
#include <stdio.h>
30-
#include <signal.h>
3125

32-
#define MAXFILENAME 80
33-
#define MAXMATCHES 10
26+
#include "classify.h"
27+
#include "efio.h"
28+
#include "featdefs.h"
29+
#include "mf.h"
30+
#include "normfeat.h"
3431

3532
static const char kUnknownFontName[] = "UnknownFont";
3633

3734
STRING_VAR(classify_font_name, kUnknownFontName,
3835
"Default font name to be used in training");
3936

40-
/**----------------------------------------------------------------------------
41-
Global Data Definitions and Declarations
42-
----------------------------------------------------------------------------**/
43-
/* name of current image file being processed */
44-
extern char imagefile[];
45-
37+
namespace tesseract {
4638
/**----------------------------------------------------------------------------
4739
Public Code
4840
----------------------------------------------------------------------------**/
49-
50-
/*---------------------------------------------------------------------------*/
51-
// As all TBLOBs, Blob is in baseline normalized coords.
52-
// See SetupBLCNDenorms in intfx.cpp for other args.
53-
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
54-
TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
55-
const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) {
56-
/*
57-
** Parameters:
58-
** Blob blob whose micro-features are to be learned
59-
** Row row of text that blob came from
60-
** BlobText text that corresponds to blob
61-
** TextLength number of characters in blob
62-
** Globals:
63-
** imagefile base filename of the page being learned
64-
** classify_font_name
65-
** name of font currently being trained on
66-
** Operation:
67-
** Extract micro-features from the specified blob and append
68-
** them to the appropriate file.
69-
** Return: none
70-
** Exceptions: none
71-
** History: 7/28/89, DSJ, Created.
72-
*/
73-
#define TRAIN_SUFFIX ".tr"
74-
static FILE *FeatureFile = NULL;
75-
STRING Filename(filename);
76-
77-
// If no fontname was set, try to extract it from the filename
78-
STRING CurrFontName = classify_font_name;
79-
if (CurrFontName == kUnknownFontName) {
41+
// Finds the name of the training font and returns it in fontname, by cutting
42+
// it out based on the expectation that the filename is of the form:
43+
// /path/to/dir/[lang].[fontname].exp[num]
44+
// The [lang], [fontname] and [num] fields should not have '.' characters.
45+
// If the global parameter classify_font_name is set, its value is used instead.
46+
void ExtractFontName(const STRING& filename, STRING* fontname) {
47+
*fontname = classify_font_name;
48+
if (*fontname == kUnknownFontName) {
8049
// filename is expected to be of the form [lang].[fontname].exp[num]
8150
// The [lang], [fontname] and [num] fields should not have '.' characters.
8251
const char *basename = strrchr(filename.string(), '/');
8352
const char *firstdot = strchr(basename ? basename : filename.string(), '.');
8453
const char *lastdot = strrchr(filename.string(), '.');
8554
if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
8655
++firstdot;
87-
CurrFontName = firstdot;
88-
CurrFontName[lastdot - firstdot] = '\0';
56+
*fontname = firstdot;
57+
fontname->truncate_at(lastdot - firstdot);
8958
}
9059
}
60+
}
9161

92-
// if a feature file is not yet open, open it
93-
// the name of the file is the name of the image plus TRAIN_SUFFIX
94-
if (FeatureFile == NULL) {
95-
Filename += TRAIN_SUFFIX;
96-
FeatureFile = Efopen(Filename.string(), "wb");
97-
cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
98-
}
99-
100-
LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info,
101-
BlobText, CurrFontName.string());
102-
} // LearnBlob
103-
104-
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
105-
TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
106-
const INT_FX_RESULT_STRUCT& fx_info,
107-
const char* BlobText, const char* FontName) {
108-
CHAR_DESC CharDesc;
109-
110-
ASSERT_HOST(FeatureFile != NULL);
111-
112-
CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info,
113-
Blob);
114-
if (CharDesc == NULL) {
115-
cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
116-
return;
117-
}
118-
119-
if (ValidCharDescription(FeatureDefs, CharDesc)) {
120-
// label the features with a class name and font name
121-
fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
62+
/*---------------------------------------------------------------------------*/
63+
// Extracts features from the given blob and saves them in the tr_file_data_
64+
// member variable.
65+
// fontname: Name of font that this blob was printed in.
66+
// cn_denorm: Character normalization transformation to apply to the blob.
67+
// fx_info: Character normalization parameters computed with cn_denorm.
68+
// blob_text: Ground truth text for the blob.
69+
void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
70+
const DENORM& cn_denorm,
71+
const INT_FX_RESULT_STRUCT& fx_info,
72+
const char* blob_text) {
73+
CHAR_DESC CharDesc = NewCharDescription(feature_defs_);
74+
CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
75+
CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
76+
CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
77+
CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
78+
79+
if (ValidCharDescription(feature_defs_, CharDesc)) {
80+
// Label the features with a class name and font name.
81+
tr_file_data_ += "\n";
82+
tr_file_data_ += fontname;
83+
tr_file_data_ += " ";
84+
tr_file_data_ += blob_text;
85+
tr_file_data_ += "\n";
12286

12387
// Append the features to the tr_file_data_ buffer and clean up.
124-
WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
88+
WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
12589
} else {
12690
tprintf("Blob learned was invalid!\n");
12791
}
12892
FreeCharDescription(CharDesc);
129-
13093
} // LearnBlob
94+
95+
// Writes stored training data to a .tr file based on the given filename.
96+
// Returns false on error.
97+
bool Classify::WriteTRFile(const STRING& filename) {
98+
STRING tr_filename = filename + ".tr";
99+
FILE* fp = Efopen(tr_filename.string(), "wb");
100+
int len = tr_file_data_.length();
101+
bool result =
102+
fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len;
103+
fclose(fp);
104+
tr_file_data_.truncate_at(0);
105+
return result;
106+
}
107+
108+
} // namespace tesseract.

0 commit comments

Comments
 (0)