Skip to content

Commit f24ef67

Browse files
committed
Limited max height to 48 even for variable-height input; enabled neural nets via OCR engine mode
1 parent c1c1e42 commit f24ef67

File tree

8 files changed

+61
-21
lines changed

8 files changed

+61
-21
lines changed

ChangeLog

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
2016-11-11 - V4.00.00
2+
* Added new neural network system based on LSTMs, with major accuracy gains.
3+
* Improvements to PDF rendering.
4+
* Fixes to training data rendering.
5+
16
2016-02-17 - V3.04.01
27
* Added OSD renderer for psm 0. Works for single page and multi-page images.
38
* Improve tesstrain.sh script.

api/tesseractmain.cpp

+30-6
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ void PrintVersionInfo() {
9090
void PrintUsage(const char* program) {
9191
printf(
9292
"Usage:\n"
93-
" %s --help | --help-psm | --version\n"
93+
" %s --help | --help-psm | --help-oem | --version\n"
9494
" %s --list-langs [--tessdata-dir PATH]\n"
9595
" %s --print-parameters [options...] [configfile...]\n"
9696
" %s imagename|stdin outputbase|stdout [options...] [configfile...]\n",
@@ -120,6 +120,18 @@ void PrintHelpForPSM() {
120120
printf("%s", msg);
121121
}
122122

123+
// Prints the numbered list of OCR engine modes (0-4) to stdout.
// Takes no arguments and returns nothing.
void PrintHelpForOEM() {
124+
const char* msg =
125+
"OCR Engine modes:\n"
126+
" 0 Original Tesseract only.\n"
127+
" 1 Cube only.\n"
128+
" 2 Tesseract + cube.\n"
129+
" 3 Default, based on what is available.\n"
130+
" 4 Neural nets (LSTM) only.\n";
131+
132+
// The text is passed as an argument to "%s" rather than as the format string.
printf("%s", msg);
133+
}
134+
123135
void PrintHelpMessage(const char* program) {
124136
PrintUsage(program);
125137

@@ -132,15 +144,18 @@ void PrintHelpMessage(const char* program) {
132144
" -c VAR=VALUE Set value for config variables.\n"
133145
" Multiple -c arguments are allowed.\n"
134146
" -psm NUM Specify page segmentation mode.\n"
147+
" -oem NUM Specify OCR Engine mode.\n"
135148
"NOTE: These options must occur before any configfile.\n";
136149

137150
printf("\n%s\n", ocr_options);
138151
PrintHelpForPSM();
152+
PrintHelpForOEM();
139153

140154
const char* single_options =
141155
"Single options:\n"
142156
" -h, --help Show this help message.\n"
143157
" --help-psm Show page segmentation modes.\n"
158+
" --help-oem Show OCR Engine modes.\n"
144159
" -v, --version Show version information.\n"
145160
" --list-langs List available languages for tesseract engine.\n"
146161
" --print-parameters Print tesseract parameters to stdout.\n";
@@ -214,7 +229,8 @@ void ParseArgs(const int argc, char** argv, const char** lang,
214229
const char** datapath, bool* list_langs, bool* print_parameters,
215230
GenericVector<STRING>* vars_vec,
216231
GenericVector<STRING>* vars_values, int* arg_i,
217-
tesseract::PageSegMode* pagesegmode) {
232+
tesseract::PageSegMode* pagesegmode,
233+
tesseract::OcrEngineMode* enginemode) {
218234
if (argc == 1) {
219235
PrintHelpMessage(argv[0]);
220236
exit(0);
@@ -229,6 +245,10 @@ void ParseArgs(const int argc, char** argv, const char** lang,
229245
PrintHelpForPSM();
230246
exit(0);
231247
}
248+
if ((strcmp(argv[1], "--help-oem") == 0)) {
249+
PrintHelpForOEM();
250+
exit(0);
251+
}
232252
if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) {
233253
PrintVersionInfo();
234254
exit(0);
@@ -258,6 +278,9 @@ void ParseArgs(const int argc, char** argv, const char** lang,
258278
} else if (strcmp(argv[i], "-psm") == 0 && i + 1 < argc) {
259279
*pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[i + 1]));
260280
++i;
281+
} else if (strcmp(argv[i], "-oem") == 0 && i + 1 < argc) {
282+
*enginemode = static_cast<tesseract::OcrEngineMode>(atoi(argv[i + 1]));
283+
++i;
261284
} else if (strcmp(argv[i], "--print-parameters") == 0) {
262285
noocr = true;
263286
*print_parameters = true;
@@ -355,6 +378,7 @@ int main(int argc, char** argv) {
355378
bool print_parameters = false;
356379
int arg_i = 1;
357380
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
381+
tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT;
358382
/* main() calls functions like ParseArgs which call exit().
359383
* This results in memory leaks if vars_vec and vars_values are
360384
* declared as auto variables (destructor is not called then). */
@@ -367,7 +391,8 @@ int main(int argc, char** argv) {
367391
#endif /* HAVE_TIFFIO_H && _WIN32 */
368392

369393
ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
370-
&print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode);
394+
&print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode,
395+
&enginemode);
371396

372397
bool banner = false;
373398
if (outputbase != NULL && strcmp(outputbase, "-") &&
@@ -380,9 +405,8 @@ int main(int argc, char** argv) {
380405

381406
api.SetOutputName(outputbase);
382407

383-
int init_failed =
384-
api.Init(datapath, lang, tesseract::OEM_DEFAULT, &(argv[arg_i]),
385-
argc - arg_i, &vars_vec, &vars_values, false);
408+
int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
409+
argc - arg_i, &vars_vec, &vars_values, false);
386410
if (init_failed) {
387411
fprintf(stderr, "Could not initialize tesseract.\n");
388412
exit(1);

ccmain/tessedit.cpp

+5-1
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,11 @@ bool Tesseract::init_tesseract_lang_data(
218218
if (tessdata_manager_debug_level)
219219
tprintf("Loaded Cube with combiner\n");
220220
} else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
221-
if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
221+
if (tessdata_manager.swap()) {
222+
tprintf("Error: LSTM requested on big-endian hardware!!\n");
223+
tprintf("Big-endian not yet supported! Loading tesseract.\n");
224+
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
225+
} else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
222226
lstm_recognizer_ = new LSTMRecognizer;
223227
TFile fp;
224228
fp.Open(tessdata_manager.GetDataFilePtr(), -1);

ccstruct/imagedata.cpp

+7-3
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ Pix* ImageData::GetPix() const {
217217
// The return value is the scaled Pix, which must be pixDestroyed after use,
218218
// and scale_factor (if not NULL) is set to the scale factor that was applied
219219
// to the image to achieve the target_height.
220-
Pix* ImageData::PreScale(int target_height, float* scale_factor,
220+
Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
221221
int* scaled_width, int* scaled_height,
222222
GenericVector<TBOX>* boxes) const {
223223
int input_width = 0;
@@ -226,8 +226,12 @@ Pix* ImageData::PreScale(int target_height, float* scale_factor,
226226
ASSERT_HOST(src_pix != NULL);
227227
input_width = pixGetWidth(src_pix);
228228
input_height = pixGetHeight(src_pix);
229-
if (target_height == 0)
230-
target_height = input_height;
229+
if (target_height == 0) {
230+
if (input_height > max_height)
231+
target_height = max_height;
232+
else
233+
target_height = input_height;
234+
}
231235
float im_factor = static_cast<float>(target_height) / input_height;
232236
if (scaled_width != NULL)
233237
*scaled_width = IntCastRounded(im_factor * input_width);

ccstruct/imagedata.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,9 @@ class ImageData {
165165
// The return value is the scaled Pix, which must be pixDestroyed after use,
166166
// and scale_factor (if not NULL) is set to the scale factor that was applied
167167
// to the image to achieve the target_height.
168-
Pix* PreScale(int target_height, float* scale_factor, int* scaled_width,
169-
int* scaled_height, GenericVector<TBOX>* boxes) const;
168+
Pix* PreScale(int target_height, int max_height, float* scale_factor,
169+
int* scaled_width, int* scaled_height,
170+
GenericVector<TBOX>* boxes) const;
170171

171172
int MemoryUsed() const;
172173

lstm/input.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525

2626
namespace tesseract {
2727

28+
// Max height for variable height inputs before scaling anyway.
29+
const int kMaxInputHeight = 48;
30+
2831
Input::Input(const STRING& name, int ni, int no)
2932
: Network(NT_INPUT, name, ni, no), cached_x_scale_(1) {}
3033
Input::Input(const STRING& name, const StaticShape& shape)
@@ -92,8 +95,8 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
9295
// Note that NumInputs() is defined as input image height.
9396
int target_height = network->NumInputs();
9497
int width, height;
95-
Pix* pix =
96-
image_data.PreScale(target_height, image_scale, &width, &height, nullptr);
98+
Pix* pix = image_data.PreScale(target_height, kMaxInputHeight, image_scale,
99+
&width, &height, nullptr);
97100
if (pix == nullptr) {
98101
tprintf("Bad pix from ImageData!\n");
99102
return nullptr;

lstm/lstmtrainer.cpp

+4-6
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@
3434

3535
#include "callcpp.h"
3636

37-
using std::string;
38-
3937
namespace tesseract {
4038

4139
// Min actual error rate increase to constitute divergence.
@@ -203,7 +201,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,
203201

204202
// Initializes a trainer from a serialized TFNetworkModel proto.
205203
// Returns the global step of TensorFlow graph or 0 if failed.
206-
int LSTMTrainer::InitTensorFlowNetwork(const string& tf_proto) {
204+
int LSTMTrainer::InitTensorFlowNetwork(const std::string& tf_proto) {
207205
#ifdef INCLUDE_TENSORFLOW
208206
delete network_;
209207
TFNetwork* tf_net = new TFNetwork("TensorFlow");
@@ -1199,22 +1197,22 @@ double LSTMTrainer::ComputeCharError(const GenericVector<int>& truth_str,
11991197
// Computes a very simple bag of words word recall error rate.
12001198
// NOTE that this is destructive on both input strings.
12011199
double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
1202-
typedef TessHashMap<string, int, std::hash<string> > StrMap;
1200+
typedef TessHashMap<std::string, int, std::hash<std::string> > StrMap;
12031201
GenericVector<STRING> truth_words, ocr_words;
12041202
truth_str->split(' ', &truth_words);
12051203
if (truth_words.empty()) return 0.0;
12061204
ocr_str->split(' ', &ocr_words);
12071205
StrMap word_counts;
12081206
for (int i = 0; i < truth_words.size(); ++i) {
1209-
string truth_word(truth_words[i].string());
1207+
std::string truth_word(truth_words[i].string());
12101208
StrMap::iterator it = word_counts.find(truth_word);
12111209
if (it == word_counts.end())
12121210
word_counts.insert(make_pair(truth_word, 1));
12131211
else
12141212
++it->second;
12151213
}
12161214
for (int i = 0; i < ocr_words.size(); ++i) {
1217-
string ocr_word(ocr_words[i].string());
1215+
std::string ocr_word(ocr_words[i].string());
12181216
StrMap::iterator it = word_counts.find(ocr_word);
12191217
if (it == word_counts.end())
12201218
word_counts.insert(make_pair(ocr_word, -1));

training/pango_font_info.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ string PangoFontInfo::DescriptionName() const {
127127
/* static */
128128
void PangoFontInfo::SoftInitFontConfig() {
129129
if (fonts_dir_.empty()) {
130-
HardInitFontConfig(FLAGS_fonts_dir.c_str(), FLAGS_fontconfig_tmpdir.c_str());
130+
HardInitFontConfig(FLAGS_fonts_dir.c_str(),
131+
FLAGS_fontconfig_tmpdir.c_str());
131132
}
132133
}
133134

0 commit comments

Comments
 (0)