Skip to content

Commit 6be2515

Browse files
committed
Major updates to training system as a result of extensive testing on 100 languages
1 parent 21805e6 commit 6be2515

11 files changed

+2104
-732
lines changed

training/language-specific.sh

+1,131
Large diffs are not rendered by default.

training/ligature_table.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ static string EncodeAsUTF8(const char32 ch32) {
4343
// from. Note that this range does not contain the custom ligatures that we
4444
// encode in the private use area.
4545
const int kMinLigature = 0xfb00;
46-
const int kMaxLigature = 0xfb4f;
46+
const int kMaxLigature = 0xfb17; // Don't put the wide Hebrew letters in.
4747

4848
/* static */
4949
SmartPtr<LigatureTable> LigatureTable::instance_;

training/pango_font_info.cpp

+103-44
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
5151
BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
5252
"Does a one-time deletion of cache files from the "
5353
"fontconfig_tmpdir before initializing fontconfig.");
54+
BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true,
55+
"Does a one-time reset of the fontconfig config file to point"
56+
" to fonts_dir before initializing fontconfig. Set to true"
57+
" if fontconfig_refresh_cache is true. Set it to false to use"
58+
" multiple instances in separate processes without having to"
59+
" rescan the fonts_dir, using a previously setup font cache");
5460

5561
#ifndef USE_STD_NAMESPACE
5662
#include "ocr/trainingdata/typesetting/legacy_fonts.h"
@@ -67,6 +73,8 @@ namespace tesseract {
6773
// in pixels.
6874
const int kDefaultResolution = 300;
6975

76+
bool PangoFontInfo::fontconfig_initialized_ = false;
77+
7078
PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
7179
Clear();
7280
}
@@ -103,34 +111,35 @@ string PangoFontInfo::DescriptionName() const {
103111

104112
// Initializes Fontconfig for use by writing a fake fonts.conf file into the
105113
// FLAGS_fontconfig_tmpdir directory, that points to the supplied
106-
// FLAGS_fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
107-
// to point to this fonts.conf file.
108-
static void InitFontconfig() {
109-
static bool init_fontconfig = false;
110-
if (init_fontconfig || FLAGS_fonts_dir.empty()) {
111-
init_fontconfig = true;
114+
// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
115+
// to point to this fonts.conf file. If force_clear, the cache is refreshed
116+
// even if it has already been initialized.
117+
void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) {
118+
if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) {
119+
fontconfig_initialized_ = true;
112120
return;
113121
}
114-
if (FLAGS_fontconfig_refresh_cache) {
115-
tprintf("Deleting cache files from %s\n", FLAGS_fontconfig_tmpdir.c_str());
122+
if (FLAGS_fontconfig_refresh_cache || force_clear) {
116123
File::DeleteMatchingFiles(File::JoinPath(
117-
FLAGS_fontconfig_tmpdir.c_str(), "*cache-2").c_str());
118-
}
119-
tprintf("Initializing fontconfig\n");
120-
const int MAX_FONTCONF_FILESIZE = 1024;
121-
char fonts_conf_template[MAX_FONTCONF_FILESIZE];
122-
snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
123-
"<?xml version=\"1.0\"?>\n"
124-
"<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
125-
"<fontconfig>\n"
126-
"<dir>%s</dir>\n"
127-
"<cachedir>%s</cachedir>\n"
128-
"<config></config>\n"
129-
"</fontconfig>", FLAGS_fonts_dir.c_str(),
130-
FLAGS_fontconfig_tmpdir.c_str());
131-
string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
132-
"fonts.conf");
133-
File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
124+
FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str());
125+
}
126+
if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache ||
127+
force_clear) {
128+
const int MAX_FONTCONF_FILESIZE = 1024;
129+
char fonts_conf_template[MAX_FONTCONF_FILESIZE];
130+
snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
131+
"<?xml version=\"1.0\"?>\n"
132+
"<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
133+
"<fontconfig>\n"
134+
"<dir>%s</dir>\n"
135+
"<cachedir>%s</cachedir>\n"
136+
"<config></config>\n"
137+
"</fontconfig>", fonts_dir.c_str(),
138+
FLAGS_fontconfig_tmpdir.c_str());
139+
string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
140+
"fonts.conf");
141+
File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
142+
}
134143
#ifdef _WIN32
135144
std::string env("FONTCONFIG_PATH=");
136145
env.append(FLAGS_fontconfig_tmpdir.c_str());
@@ -141,12 +150,18 @@ static void InitFontconfig() {
141150
// Fix the locale so that the reported font names are consistent.
142151
setenv("LANG", "en_US.utf8", true);
143152
#endif // _WIN32
144-
init_fontconfig = true;
153+
if (!fontconfig_initialized_ || force_clear) {
154+
if (FcInitReinitialize() != FcTrue) {
155+
tprintf("FcInitiReinitialize failed!!\n");
156+
}
157+
}
158+
fontconfig_initialized_ = true;
159+
FontUtils::ReInit();
145160
}
146161

147162
static void ListFontFamilies(PangoFontFamily*** families,
148163
int* n_families) {
149-
InitFontconfig();
164+
PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir);
150165
PangoFontMap* font_map = pango_cairo_font_map_get_default();
151166
DISABLE_HEAP_LEAK_CHECK;
152167
pango_font_map_list_families(font_map, families, n_families);
@@ -220,7 +235,7 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
220235
// in the font map. Note that if the font is wholly missing, this could
221236
// correspond to a completely different font family and face.
222237
PangoFont* PangoFontInfo::ToPangoFont() const {
223-
InitFontconfig();
238+
InitFontConfig(false, FLAGS_fonts_dir);
224239
PangoFontMap* font_map = pango_cairo_font_map_get_default();
225240
PangoContext* context = pango_context_new();
226241
pango_cairo_context_set_resolution(context, resolution_);
@@ -253,6 +268,28 @@ bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const
253268
return true;
254269
}
255270

271+
// This variant of strncpy permits src and dest to overlap, as long as dest
// does not start after src: bytes are copied one at a time in ascending
// address order (first byte first), so every source byte is read before the
// write that could overwrite it.
//
// Like strncpy, at most n bytes are written to dest. Bytes are copied from src
// until n bytes have been written or a nul is found in src; if a nul is found
// first, the remainder of the n bytes in dest is filled with nuls. As with
// strncpy, dest is NOT nul-terminated when src holds n or more non-nul bytes.
//
// Returns the original value of dest.
static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* const ret = dest;

  // Test n and the current source byte BEFORE copying. This makes n == 0 a
  // no-op (decrementing an unsigned zero would wrap around to a huge count)
  // and guarantees that no byte beyond the source terminator is ever read.
  while (n != 0 && *src != '\0') {
    *dest = *src;
    ++dest;
    ++src;
    --n;
  }

  // If we reached a nul byte and there are more 'n' left, zero them out.
  // src is no longer read at this point, so padding over an overlapping
  // source region is harmless.
  while (n != 0) {
    *dest = '\0';
    ++dest;
    --n;
  }
  return ret;
}
292+
256293
int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
257294
PangoFont* font = ToPangoFont();
258295
PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
@@ -265,23 +302,30 @@ int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
265302
UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
266303
const UNICHAR::const_iterator it_end =
267304
UNICHAR::end(utf8_text->c_str(), utf8_text->length());
268-
for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
305+
for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
269306
// Skip bad utf-8.
270-
if (!it.is_legal())
271-
continue; // One suitable error message will still be issued.
272-
if (!IsWhitespace(*it) && !pango_is_zero_width(*it) &&
273-
pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
307+
if (!it.is_legal()) {
308+
++it; // One suitable error message will still be issued.
309+
continue;
310+
}
311+
int unicode = *it;
312+
int utf8_len = it.utf8_len();
313+
const char* utf8_char = it.utf8_data();
314+
// Move it forward before the data gets modified.
315+
++it;
316+
if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
317+
pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
274318
if (TLOG_IS_ON(2)) {
275-
char tmp[5];
276-
int len = it.get_utf8(tmp);
277-
tmp[len] = '\0';
278-
tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
319+
UNICHAR unichar(unicode);
320+
char* str = unichar.utf8_str();
321+
tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
322+
delete[] str;
279323
}
280324
++num_dropped_chars;
281325
continue;
282326
}
283-
strncpy(out, it.utf8_data(), it.utf8_len());
284-
out += it.utf8_len();
327+
my_strnmove(out, utf8_char, utf8_len);
328+
out += utf8_len;
285329
}
286330
utf8_text->resize(out - utf8_text->c_str());
287331
return num_dropped_chars;
@@ -438,6 +482,7 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
438482

439483

440484
// ------------------------ FontUtils ------------------------------------
485+
vector<string> FontUtils::available_fonts_; // cache list
441486

442487
// Returns whether the specified font description is available in the fonts
443488
// directory.
@@ -449,7 +494,8 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
449494
// from the font_map, and then check what we loaded to see if it has the
450495
// description we expected. If it is not, then the font is deemed unavailable.
451496
/* static */
452-
bool FontUtils::IsAvailableFont(const char* input_query_desc) {
497+
bool FontUtils::IsAvailableFont(const char* input_query_desc,
498+
string* best_match) {
453499
string query_desc(input_query_desc);
454500
if (PANGO_VERSION <= 12005) {
455501
// Strip commas and any ' Medium' substring in the name.
@@ -466,7 +512,7 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) {
466512
query_desc.c_str());
467513
PangoFont* selected_font = NULL;
468514
{
469-
InitFontconfig();
515+
PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir);
470516
PangoFontMap* font_map = pango_cairo_font_map_get_default();
471517
PangoContext* context = pango_context_new();
472518
pango_context_set_font_map(context, font_map);
@@ -490,7 +536,16 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) {
490536
char* selected_desc_str = pango_font_description_to_string(selected_desc);
491537
tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc.c_str(),
492538
selected_desc_str);
493-
539+
if (!equal && best_match != NULL) {
540+
*best_match = selected_desc_str;
541+
// Clip the ending ' 0' if there is one. It seems that, if there is no
542+
// point size on the end of the fontname, then Pango always appends ' 0'.
543+
int len = best_match->size();
544+
if (len > 2 && best_match->at(len - 1) == '0' &&
545+
best_match->at(len - 2) == ' ') {
546+
*best_match = best_match->substr(0, len - 2);
547+
}
548+
}
494549
g_free(selected_desc_str);
495550
pango_font_description_free(selected_desc);
496551
g_object_unref(selected_font);
@@ -512,7 +567,6 @@ static bool ShouldIgnoreFontFamilyName(const char* query) {
512567
// Outputs description names of available fonts.
513568
/* static */
514569
const vector<string>& FontUtils::ListAvailableFonts() {
515-
static vector<string> available_fonts_; // cache list
516570
if (available_fonts_.size()) {
517571
return available_fonts_;
518572
}
@@ -536,8 +590,9 @@ const vector<string>& FontUtils::ListAvailableFonts() {
536590
for (int i = 0; i < n_families; ++i) {
537591
const char* family_name = pango_font_family_get_name(families[i]);
538592
tlog(2, "Listing family %s\n", family_name);
539-
if (ShouldIgnoreFontFamilyName(family_name))
593+
if (ShouldIgnoreFontFamilyName(family_name)) {
540594
continue;
595+
}
541596

542597
int n_faces;
543598
PangoFontFace** faces = NULL;
@@ -733,4 +788,8 @@ bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
733788
return false;
734789
}
735790

791+
// PangoFontInfo is reinitialized, so clear the static list of fonts.
792+
/* static */
793+
void FontUtils::ReInit() { available_fonts_.clear(); }
794+
736795
} // namespace tesseract

training/pango_font_info.h

+22-1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ class PangoFontInfo {
8383
bool GetSpacingProperties(const string& utf8_char,
8484
int* x_bearing, int* x_advance) const;
8585

86+
// Initializes FontConfig by setting its environment variable and creating
87+
// a fonts.conf file that points to the given fonts_dir. Once initialized,
88+
// it is not re-initialized unless force_clear is true.
89+
static void InitFontConfig(bool force_clear, const string& fonts_dir);
90+
8691
// Accessors
8792
string DescriptionName() const;
8893
// Font Family name eg. "Arial"
@@ -123,6 +128,10 @@ class PangoFontInfo {
123128
// Default output resolution to assume for GetSpacingProperties() and any
124129
// other methods that returns pixel values.
125130
int resolution_;
131+
// Fontconfig operates through an environment variable, so it intrinsically
132+
// cannot be thread-friendly, but you can serialize multiple independent
133+
// font configurations by calling InitFontConfig(true, path).
134+
static bool fontconfig_initialized_;
126135

127136
private:
128137
PangoFontInfo(const PangoFontInfo&);
@@ -135,7 +144,13 @@ class FontUtils {
135144
public:
136145
// Returns true if the font of the given description name is available in the
137146
// target directory specified by --fonts_dir
138-
static bool IsAvailableFont(const char* font_desc);
147+
static bool IsAvailableFont(const char* font_desc) {
148+
return IsAvailableFont(font_desc, NULL);
149+
}
150+
// Returns true if the font of the given description name is available in the
151+
// target directory specified by --fonts_dir. If false is returned, and
152+
// best_match is not NULL, the closest matching font is returned there.
153+
static bool IsAvailableFont(const char* font_desc, string* best_match);
139154
// Outputs description names of available fonts.
140155
static const vector<string>& ListAvailableFonts();
141156

@@ -181,6 +196,12 @@ class FontUtils {
181196
static int FontScore(const unordered_map<char32, inT64>& ch_map,
182197
const string& fontname, int* raw_score,
183198
vector<bool>* ch_flags);
199+
200+
// PangoFontInfo is reinitialized, so clear the static list of fonts.
201+
static void ReInit();
202+
203+
private:
204+
static vector<string> available_fonts_; // cache list
184205
};
185206
} // namespace tesseract
186207

0 commit comments

Comments
 (0)