Skip to content

Commit dc8745e

Browse files
committed
Move LSTM unicharset and recoder to traineddata with version string part1. Backwards compatible - maybe.
1 parent 7588540 commit dc8745e

14 files changed

+257
-130
lines changed

api/apitypes.h

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#define TESSERACT_API_APITYPES_H_
2222

2323
#include "publictypes.h"
24+
#include "version.h"
2425

2526
// The types used by the API and Page/ResultIterator can be found in:
2627
// ccstruct/publictypes.h

api/baseapi.h

-4
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,6 @@
2020
#ifndef TESSERACT_API_BASEAPI_H_
2121
#define TESSERACT_API_BASEAPI_H_
2222

23-
#define TESSERACT_VERSION_STR "4.00.00alpha"
24-
#define TESSERACT_VERSION 0x040000
25-
#define MAKE_VERSION(major, minor, patch) (((major) << 16) | ((minor) << 8) | \
26-
(patch))
2723
#include <stdio.h>
2824
// To avoid collision with other typenames include the ABSOLUTE MINIMUM
2925
// complexity of includes here. Use forward declarations wherever possible

ccmain/tessedit.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -188,10 +188,10 @@ bool Tesseract::init_tesseract_lang_data(
188188
#ifndef ANDROID_BUILD
189189
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
190190
tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
191-
if (mgr->GetComponent(TESSDATA_LSTM, &fp)) {
191+
if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
192192
lstm_recognizer_ = new LSTMRecognizer;
193-
ASSERT_HOST(lstm_recognizer_->DeSerialize(&fp));
194-
if (lstm_use_matrix) lstm_recognizer_->LoadDictionary(language, mgr);
193+
ASSERT_HOST(
194+
lstm_recognizer_->Load(lstm_use_matrix ? language : nullptr, mgr));
195195
} else {
196196
tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
197197
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);

ccutil/Makefile.am

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ endif
1515
include_HEADERS = \
1616
basedir.h errcode.h fileerr.h genericvector.h helpers.h host.h memry.h \
1717
ndminx.h params.h ocrclass.h platform.h serialis.h strngs.h \
18-
tesscallback.h unichar.h unicharcompress.h unicharmap.h unicharset.h
18+
tesscallback.h unichar.h unicharcompress.h unicharmap.h unicharset.h \
19+
version.h
1920

2021
noinst_HEADERS = \
2122
ambigs.h bits16.h bitvector.h ccutil.h clst.h doubleptr.h elst2.h \

ccutil/tessdatamanager.cpp

+24
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
7878
if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
7979
}
8080
}
81+
if (entries_[TESSDATA_VERSION].empty()) {
82+
SetVersionString("Pre-4.0.0");
83+
}
8184
is_loaded_ = true;
8285
return true;
8386
}
@@ -139,6 +142,7 @@ void TessdataManager::Clear() {
139142

140143
// Prints a directory of contents.
141144
void TessdataManager::Directory() const {
145+
tprintf("Version string:%s\n", VersionString().c_str());
142146
int offset = TESSDATA_NUM_ENTRIES * sizeof(inT64);
143147
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
144148
if (!entries_[i].empty()) {
@@ -153,12 +157,32 @@ void TessdataManager::Directory() const {
153157
// Returns false in case of failure.
154158
bool TessdataManager::GetComponent(TessdataType type, TFile *fp) {
  // Not loaded yet: try Init on the stored data file name, and report failure
  // if that does not succeed.
  if (!is_loaded_) {
    if (!Init(data_file_name_.string())) return false;
  }
  // Forward to the const overload, which performs the actual lookup.
  return static_cast<const TessdataManager *>(this)->GetComponent(type, fp);
}
163+
164+
// As non-const version except it can't load the component if not already
165+
// loaded.
166+
bool TessdataManager::GetComponent(TessdataType type, TFile *fp) const {
167+
ASSERT_HOST(is_loaded_);
156168
if (entries_[type].empty()) return false;
157169
fp->Open(&entries_[type][0], entries_[type].size());
158170
fp->set_swap(swap_);
159171
return true;
160172
}
161173

174+
// Returns the current version string.
175+
string TessdataManager::VersionString() const {
176+
return string(&entries_[TESSDATA_VERSION][0],
177+
entries_[TESSDATA_VERSION].size());
178+
}
179+
180+
// Sets the version string to the given v_str.
181+
void TessdataManager::SetVersionString(const string &v_str) {
182+
entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
183+
memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
184+
}
185+
162186
bool TessdataManager::CombineDataFiles(
163187
const char *language_data_path_prefix,
164188
const char *output_filename) {

ccutil/tessdatamanager.h

+28-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "host.h"
2626
#include "strngs.h"
2727
#include "tprintf.h"
28+
#include "version.h"
2829

2930
static const char kTrainedDataSuffix[] = "traineddata";
3031

@@ -51,6 +52,9 @@ static const char kLSTMModelFileSuffix[] = "lstm";
5152
static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
5253
static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
5354
static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
55+
static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
56+
static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";
57+
static const char kVersionFileSuffix[] = "version";
5458

5559
namespace tesseract {
5660

@@ -76,6 +80,9 @@ enum TessdataType {
7680
TESSDATA_LSTM_PUNC_DAWG, // 18
7781
TESSDATA_LSTM_SYSTEM_DAWG, // 19
7882
TESSDATA_LSTM_NUMBER_DAWG, // 20
83+
TESSDATA_LSTM_UNICHARSET, // 21
84+
TESSDATA_LSTM_RECODER, // 22
85+
TESSDATA_VERSION, // 23
7986

8087
TESSDATA_NUM_ENTRIES
8188
};
@@ -106,6 +113,9 @@ static const char *const kTessdataFileSuffixes[] = {
106113
kLSTMPuncDawgFileSuffix, // 18
107114
kLSTMSystemDawgFileSuffix, // 19
108115
kLSTMNumberDawgFileSuffix, // 20
116+
kLSTMUnicharsetFileSuffix, // 21
117+
kLSTMRecoderFileSuffix, // 22
118+
kVersionFileSuffix, // 23
109119
};
110120

111121
/**
@@ -120,9 +130,13 @@ static const int kMaxNumTessdataEntries = 1000;
120130

121131
class TessdataManager {
122132
public:
123-
TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {}
133+
TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
134+
SetVersionString(TESSERACT_VERSION_STR);
135+
}
124136
explicit TessdataManager(FileReader reader)
125-
: reader_(reader), is_loaded_(false), swap_(false) {}
137+
: reader_(reader), is_loaded_(false), swap_(false) {
138+
SetVersionString(TESSERACT_VERSION_STR);
139+
}
126140
~TessdataManager() {}
127141

128142
bool swap() const { return swap_; }
@@ -152,9 +166,21 @@ class TessdataManager {
152166
// Prints a directory of contents.
153167
void Directory() const;
154168

169+
// Returns true if the component requested is present.
170+
bool IsComponentAvailable(TessdataType type) const {
171+
return !entries_[type].empty();
172+
}
155173
// Opens the given TFile pointer to the given component type.
156174
// Returns false in case of failure.
157175
bool GetComponent(TessdataType type, TFile *fp);
176+
// As the non-const version, except that it cannot load the data on demand:
177+
// the manager must already be loaded.
178+
bool GetComponent(TessdataType type, TFile *fp) const;
179+
180+
// Returns the current version string.
181+
string VersionString() const;
182+
// Sets the version string to the given v_str.
183+
void SetVersionString(const string &v_str);
158184

159185
// Returns true if the base Tesseract components are present.
160186
bool IsBaseAvailable() const {

ccutil/version.h

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#ifndef TESSERACT_CCUTIL_VERSION_H_
2+
#define TESSERACT_CCUTIL_VERSION_H_
3+
4+
#define TESSERACT_VERSION_STR "4.00.00alpha"
5+
#define TESSERACT_VERSION 0x040000
6+
#define MAKE_VERSION(major, minor, patch) \
7+
(((major) << 16) | ((minor) << 8) | (patch))
8+
9+
#endif // TESSERACT_CCUTIL_VERSION_H_

lstm/lstmrecognizer.cpp

+42-7
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,24 @@ LSTMRecognizer::~LSTMRecognizer() {
6868
delete search_;
6969
}
7070

71+
// Loads a model from mgr, including the dictionary only if lang is not null.
72+
bool LSTMRecognizer::Load(const char* lang, TessdataManager* mgr) {
73+
TFile fp;
74+
if (!mgr->GetComponent(TESSDATA_LSTM, &fp)) return false;
75+
if (!DeSerialize(mgr, &fp)) return false;
76+
if (lang == nullptr) return true;
77+
// Allow it to run without a dictionary.
78+
LoadDictionary(lang, mgr);
79+
return true;
80+
}
81+
7182
// Writes to the given file. Returns false in case of error.
72-
bool LSTMRecognizer::Serialize(TFile* fp) const {
83+
bool LSTMRecognizer::Serialize(const TessdataManager* mgr, TFile* fp) const {
84+
bool include_charsets = mgr == nullptr ||
85+
!mgr->IsComponentAvailable(TESSDATA_LSTM_RECODER) ||
86+
!mgr->IsComponentAvailable(TESSDATA_LSTM_UNICHARSET);
7387
if (!network_->Serialize(fp)) return false;
74-
if (!GetUnicharset().save_to_file(fp)) return false;
88+
if (include_charsets && !GetUnicharset().save_to_file(fp)) return false;
7589
if (!network_str_.Serialize(fp)) return false;
7690
if (fp->FWrite(&training_flags_, sizeof(training_flags_), 1) != 1)
7791
return false;
@@ -83,16 +97,20 @@ bool LSTMRecognizer::Serialize(TFile* fp) const {
8397
if (fp->FWrite(&weight_range_, sizeof(weight_range_), 1) != 1) return false;
8498
if (fp->FWrite(&learning_rate_, sizeof(learning_rate_), 1) != 1) return false;
8599
if (fp->FWrite(&momentum_, sizeof(momentum_), 1) != 1) return false;
86-
if (IsRecoding() && !recoder_.Serialize(fp)) return false;
100+
if (include_charsets && IsRecoding() && !recoder_.Serialize(fp)) return false;
87101
return true;
88102
}
89103

90104
// Reads from the given file. Returns false in case of error.
91-
bool LSTMRecognizer::DeSerialize(TFile* fp) {
105+
bool LSTMRecognizer::DeSerialize(const TessdataManager* mgr, TFile* fp) {
92106
delete network_;
93107
network_ = Network::CreateFromFile(fp);
94108
if (network_ == NULL) return false;
95-
if (!ccutil_.unicharset.load_from_file(fp, false)) return false;
109+
bool include_charsets = mgr == nullptr ||
110+
!mgr->IsComponentAvailable(TESSDATA_LSTM_RECODER) ||
111+
!mgr->IsComponentAvailable(TESSDATA_LSTM_UNICHARSET);
112+
if (include_charsets && !ccutil_.unicharset.load_from_file(fp, false))
113+
return false;
96114
if (!network_str_.DeSerialize(fp)) return false;
97115
if (fp->FReadEndian(&training_flags_, sizeof(training_flags_), 1) != 1)
98116
return false;
@@ -107,6 +125,25 @@ bool LSTMRecognizer::DeSerialize(TFile* fp) {
107125
if (fp->FReadEndian(&learning_rate_, sizeof(learning_rate_), 1) != 1)
108126
return false;
109127
if (fp->FReadEndian(&momentum_, sizeof(momentum_), 1) != 1) return false;
128+
if (include_charsets && !LoadRecoder(fp)) return false;
129+
if (!include_charsets && !LoadCharsets(mgr)) return false;
130+
network_->SetRandomizer(&randomizer_);
131+
network_->CacheXScaleFactor(network_->XScaleFactor());
132+
return true;
133+
}
134+
135+
// Loads the charsets from mgr.
136+
bool LSTMRecognizer::LoadCharsets(const TessdataManager* mgr) {
137+
TFile fp;
138+
if (!mgr->GetComponent(TESSDATA_LSTM_UNICHARSET, &fp)) return false;
139+
if (!ccutil_.unicharset.load_from_file(&fp, false)) return false;
140+
if (!mgr->GetComponent(TESSDATA_LSTM_RECODER, &fp)) return false;
141+
if (!LoadRecoder(&fp)) return false;
142+
return true;
143+
}
144+
145+
// Loads the Recoder.
146+
bool LSTMRecognizer::LoadRecoder(TFile* fp) {
110147
if (IsRecoding()) {
111148
if (!recoder_.DeSerialize(fp)) return false;
112149
RecodedCharID code;
@@ -119,8 +156,6 @@ bool LSTMRecognizer::DeSerialize(TFile* fp) {
119156
recoder_.SetupPassThrough(GetUnicharset());
120157
training_flags_ |= TF_COMPRESS_UNICHARSET;
121158
}
122-
network_->SetRandomizer(&randomizer_);
123-
network_->CacheXScaleFactor(network_->XScaleFactor());
124159
return true;
125160
}
126161

lstm/lstmrecognizer.h

+12-2
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,20 @@ class LSTMRecognizer {
155155
}
156156
int null_char() const { return null_char_; }
157157

158+
// Loads a model from mgr, including the dictionary only if lang is not null.
159+
bool Load(const char* lang, TessdataManager* mgr);
160+
158161
// Writes to the given file. Returns false in case of error.
159-
bool Serialize(TFile* fp) const;
162+
// If mgr contains a unicharset and recoder, then they are not encoded to fp.
163+
bool Serialize(const TessdataManager* mgr, TFile* fp) const;
160164
// Reads from the given file. Returns false in case of error.
161-
bool DeSerialize(TFile* fp);
165+
// If mgr contains a unicharset and recoder, then they are taken from there,
166+
// otherwise, they are part of the serialization in fp.
167+
bool DeSerialize(const TessdataManager* mgr, TFile* fp);
168+
// Loads the charsets from mgr.
169+
bool LoadCharsets(const TessdataManager* mgr);
170+
// Loads the Recoder.
171+
bool LoadRecoder(TFile* fp);
162172
// Loads the dictionary if possible from the traineddata file.
163173
// Prints a warning message, and returns false but otherwise fails silently
164174
// and continues to work without it if loading fails.

0 commit comments

Comments
 (0)