Skip to content

Commit 77c44cd

Browse files
committed
Added convert to int and directory listing to combine_tessdata
1 parent 2ef1aea commit 77c44cd

File tree

3 files changed

+42
-8
lines changed

3 files changed

+42
-8
lines changed

lstm/lstmrecognizer.h

+8
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@ class LSTMRecognizer {
127127
series->ScaleLayerLearningRate(&id[1], factor);
128128
}
129129

130+
// Converts the network to int mode once; does nothing if TF_INT_MODE is already set.
131+
void ConvertToInt() {
132+
if ((training_flags_ & TF_INT_MODE) == 0) {  // Flag clear: not yet converted.
133+
network_->ConvertToInt();
134+
training_flags_ |= TF_INT_MODE;  // Record the conversion so repeat calls are no-ops.
135+
}
136+
}
137+
130138
// Provides access to the UNICHARSET that this classifier works with.
131139
const UNICHARSET& GetUnicharset() const { return ccutil_.unicharset; }
132140
// Provides access to the UnicharCompress that this classifier works with.

lstm/lstmtrainer.h

-8
Original file line numberDiff line numberDiff line change
@@ -251,14 +251,6 @@ class LSTMTrainer : public LSTMRecognizer {
251251
const UnicharCompress* recoder, bool simple_text,
252252
int null_char, GenericVector<int>* labels);
253253

254-
// Converts the network to int mode once; does nothing if TF_INT_MODE is already set.
255-
void ConvertToInt() {
256-
if ((training_flags_ & TF_INT_MODE) == 0) {  // Flag clear: not yet converted.
257-
network_->ConvertToInt();
258-
training_flags_ |= TF_INT_MODE;  // Record the conversion so repeat calls are no-ops.
259-
}
260-
}
261-
262254
// Performs forward-backward on the given trainingdata.
263255
// Returns the sample that was used or NULL if the next sample was deemed
264256
// unusable. samples_trainer could be this or an alternative trainer that

training/combine_tessdata.cpp

+34
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
//
1919
///////////////////////////////////////////////////////////////////////
2020

21+
#include "lstmrecognizer.h"
2122
#include "tessdatamanager.h"
2223

2324
// Main program to combine/extract/overwrite tessdata components
@@ -122,6 +123,31 @@ int main(int argc, char **argv) {
122123

123124
// Write the updated traineddata file.
124125
tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
126+
} else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
127+
tm.Init(argv[2]);
128+
tesseract::TFile fp;
129+
if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
130+
tprintf("No LSTM Component found in %s!\n", argv[2]);
131+
exit(1);
132+
}
133+
tesseract::LSTMRecognizer recognizer;
134+
if (!recognizer.DeSerialize(&tm, &fp)) {
135+
tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
136+
exit(1);
137+
}
138+
recognizer.ConvertToInt();
139+
GenericVector<char> lstm_data;
140+
fp.OpenWrite(&lstm_data);
141+
ASSERT_HOST(recognizer.Serialize(&tm, &fp));
142+
tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
143+
lstm_data.size());
144+
if (!tm.SaveFile(argv[2], nullptr)) {
145+
tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
146+
exit(1);
147+
}
148+
} else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
149+
// Initialize TessdataManager with the data in the given traineddata file.
150+
tm.Init(argv[2]);
125151
} else {
126152
printf("Usage for combining tessdata components:\n"
127153
" %s language_data_path_prefix\n"
@@ -137,6 +163,14 @@ int main(int argc, char **argv) {
137163
printf("Usage for unpacking all tessdata components:\n"
138164
" %s -u traineddata_file output_path_prefix\n"
139165
" (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
166+
printf(
167+
"Usage for listing directory of components:\n"
168+
" %s -d traineddata_file\n",
169+
argv[0]);
170+
printf(
171+
"Usage for compacting LSTM component to int:\n"
172+
" %s -c traineddata_file\n",
173+
argv[0]);
140174
return 1;
141175
}
142176
tm.Directory();

0 commit comments

Comments
 (0)