Skip to content

Commit d74c625

Browse files
committed
Fixed blob division params to fix CJK training speed.
1 parent 4c7ab0c commit d74c625

File tree

3 files changed

+128
-114
lines changed

3 files changed

+128
-114
lines changed

classify/classify.cpp

+121-113
Original file line numberDiff line numberDiff line change
@@ -33,127 +33,135 @@
3333

3434
namespace tesseract {
3535
Classify::Classify()
36-
: BOOL_MEMBER(prioritize_division, FALSE,
37-
"Prioritize blob division over chopping", this->params()),
38-
INT_MEMBER(tessedit_single_match, FALSE,
39-
"Top choice only from CP", this->params()),
40-
BOOL_MEMBER(classify_enable_learning, true,
41-
"Enable adaptive classifier", this->params()),
42-
INT_MEMBER(classify_debug_level, 0, "Classify debug level",
43-
this->params()),
44-
INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
45-
this->params()),
46-
double_MEMBER(classify_char_norm_range, 0.2,
47-
"Character Normalization Range ...", this->params()),
48-
double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
49-
this->params()), /* PREV DEFAULT 0.1 */
50-
double_MEMBER(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...",
51-
this->params()), /* PREV DEFAULT 0.3 */
52-
double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
53-
this->params()), /* PREV DEFAULT 0.1 */
54-
double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...",
55-
this->params()), /* PREV DEFAULT 0.3 */
56-
double_MEMBER(classify_max_rating_ratio, 1.5,
57-
"Veto ratio between classifier ratings", this->params()),
58-
double_MEMBER(classify_max_certainty_margin, 5.5,
59-
"Veto difference between classifier certainties",
36+
: BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
6037
this->params()),
61-
BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
62-
this->params()),
63-
BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
64-
this->params()),
65-
BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
66-
"Enable adaptive classifier",
67-
this->params()),
68-
BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
69-
"Use pre-adapted classifier templates", this->params()),
70-
BOOL_MEMBER(classify_save_adapted_templates, 0,
71-
"Save adapted templates to a file", this->params()),
72-
BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
73-
this->params()),
74-
BOOL_MEMBER(classify_nonlinear_norm, 0,
75-
"Non-linear stroke-density normalization", this->params()),
76-
INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
77-
INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
78-
INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
79-
this->params()),
80-
double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
38+
BOOL_MEMBER(prioritize_division, FALSE,
39+
"Prioritize blob division over chopping", this->params()),
40+
INT_MEMBER(tessedit_single_match, FALSE, "Top choice only from CP",
41+
this->params()),
42+
BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
8143
this->params()),
82-
double_MEMBER(matcher_great_threshold, 0.0, "Great Match (0-1)",
44+
INT_MEMBER(classify_debug_level, 0, "Classify debug level",
45+
this->params()),
46+
INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
47+
this->params()),
48+
double_MEMBER(classify_char_norm_range, 0.2,
49+
"Character Normalization Range ...", this->params()),
50+
double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
51+
this->params()), /* PREV DEFAULT 0.1 */
52+
double_MEMBER(classify_max_norm_scale_x, 0.325,
53+
"Max char x-norm scale ...",
54+
this->params()), /* PREV DEFAULT 0.3 */
55+
double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
56+
this->params()), /* PREV DEFAULT 0.1 */
57+
double_MEMBER(classify_max_norm_scale_y, 0.325,
58+
"Max char y-norm scale ...",
59+
this->params()), /* PREV DEFAULT 0.3 */
60+
double_MEMBER(classify_max_rating_ratio, 1.5,
61+
"Veto ratio between classifier ratings", this->params()),
62+
double_MEMBER(classify_max_certainty_margin, 5.5,
63+
"Veto difference between classifier certainties",
64+
this->params()),
65+
BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
8366
this->params()),
84-
double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
67+
BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
8568
this->params()),
86-
double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
69+
BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
70+
"Enable adaptive classifier", this->params()),
71+
BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
72+
"Use pre-adapted classifier templates", this->params()),
73+
BOOL_MEMBER(classify_save_adapted_templates, 0,
74+
"Save adapted templates to a file", this->params()),
75+
BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
8776
this->params()),
88-
double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
77+
BOOL_MEMBER(classify_nonlinear_norm, 0,
78+
"Non-linear stroke-density normalization", this->params()),
79+
INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
80+
INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
81+
INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
82+
this->params()),
83+
double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
84+
this->params()),
85+
double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
86+
this->params()),
87+
double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
88+
this->params()),
89+
double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
90+
this->params()),
91+
double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
92+
this->params()),
93+
double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
94+
this->params()),
95+
INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
96+
this->params()),
97+
INT_MEMBER(matcher_min_examples_for_prototyping, 3,
98+
"Reliable Config Threshold", this->params()),
99+
INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
100+
"Enable adaption even if the ambiguities have not been seen",
101+
this->params()),
102+
double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
103+
"Maximum angle delta for prototype clustering",
104+
this->params()),
105+
double_MEMBER(classify_misfit_junk_penalty, 0.0,
106+
"Penalty to apply when a non-alnum is vertically out of "
107+
"its expected textline position",
108+
this->params()),
109+
double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
110+
double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
111+
this->params()),
112+
double_MEMBER(tessedit_class_miss_scale, 0.00390625,
113+
"Scale factor for features not used", this->params()),
114+
double_MEMBER(
115+
classify_adapted_pruning_factor, 2.5,
116+
"Prune poor adapted results this much worse than best result",
117+
this->params()),
118+
double_MEMBER(classify_adapted_pruning_threshold, -1.0,
119+
"Threshold at which classify_adapted_pruning_factor starts",
120+
this->params()),
121+
INT_MEMBER(classify_adapt_proto_threshold, 230,
122+
"Threshold for good protos during adaptive 0-255",
123+
this->params()),
124+
INT_MEMBER(classify_adapt_feature_threshold, 230,
125+
"Threshold for good features during adaptive 0-255",
126+
this->params()),
127+
BOOL_MEMBER(disable_character_fragments, TRUE,
128+
"Do not include character fragments in the"
129+
" results of the classifier",
89130
this->params()),
90-
double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
131+
double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
132+
-3.0,
133+
"Exclude fragments that do not look like whole"
134+
" characters from training and adaption",
135+
this->params()),
136+
BOOL_MEMBER(classify_debug_character_fragments, FALSE,
137+
"Bring up graphical debugging windows for fragments training",
91138
this->params()),
92-
INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
93-
this->params()),
94-
INT_MEMBER(matcher_min_examples_for_prototyping, 3,
95-
"Reliable Config Threshold", this->params()),
96-
INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
97-
"Enable adaption even if the ambiguities have not been seen",
98-
this->params()),
99-
double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
100-
"Maximum angle delta for prototype clustering",
139+
BOOL_MEMBER(matcher_debug_separate_windows, FALSE,
140+
"Use two different windows for debugging the matching: "
141+
"One for the protos and one for the features.",
101142
this->params()),
102-
double_MEMBER(classify_misfit_junk_penalty, 0.0,
103-
"Penalty to apply when a non-alnum is vertically out of "
104-
"its expected textline position",
105-
this->params()),
106-
double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
107-
double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
108-
this->params()),
109-
double_MEMBER(tessedit_class_miss_scale, 0.00390625,
110-
"Scale factor for features not used", this->params()),
111-
double_MEMBER(classify_adapted_pruning_factor, 2.5,
112-
"Prune poor adapted results this much worse than best result",
113-
this->params()),
114-
double_MEMBER(classify_adapted_pruning_threshold, -1.0,
115-
"Threshold at which classify_adapted_pruning_factor starts",
116-
this->params()),
117-
INT_MEMBER(classify_adapt_proto_threshold, 230,
118-
"Threshold for good protos during adaptive 0-255",
119-
this->params()),
120-
INT_MEMBER(classify_adapt_feature_threshold, 230,
121-
"Threshold for good features during adaptive 0-255",
122-
this->params()),
123-
BOOL_MEMBER(disable_character_fragments, TRUE,
124-
"Do not include character fragments in the"
125-
" results of the classifier", this->params()),
126-
double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
127-
-3.0, "Exclude fragments that do not look like whole"
128-
" characters from training and adaption", this->params()),
129-
BOOL_MEMBER(classify_debug_character_fragments, FALSE,
130-
"Bring up graphical debugging windows for fragments training",
131-
this->params()),
132-
BOOL_MEMBER(matcher_debug_separate_windows, FALSE,
133-
"Use two different windows for debugging the matching: "
134-
"One for the protos and one for the features.", this->params()),
135-
STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
136-
this->params()),
137-
INT_MEMBER(classify_class_pruner_threshold, 229,
138-
"Class Pruner Threshold 0-255", this->params()),
139-
INT_MEMBER(classify_class_pruner_multiplier, 15,
140-
"Class Pruner Multiplier 0-255: ", this->params()),
141-
INT_MEMBER(classify_cp_cutoff_strength, 7,
142-
"Class Pruner CutoffStrength: ", this->params()),
143-
INT_MEMBER(classify_integer_matcher_multiplier, 10,
144-
"Integer Matcher Multiplier 0-255: ", this->params()),
145-
EnableLearning(true),
146-
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
147-
this->params()),
148-
BOOL_MEMBER(classify_bln_numeric_mode, 0,
149-
"Assume the input is numbers [0-9].", this->params()),
150-
double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
151-
this->params()),
152-
double_MEMBER(speckle_rating_penalty, 10.0,
153-
"Penalty to add to worst rating for noise", this->params()),
154-
shape_table_(NULL),
155-
dict_(this),
156-
static_classifier_(NULL) {
143+
STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
144+
this->params()),
145+
INT_MEMBER(classify_class_pruner_threshold, 229,
146+
"Class Pruner Threshold 0-255", this->params()),
147+
INT_MEMBER(classify_class_pruner_multiplier, 15,
148+
"Class Pruner Multiplier 0-255: ", this->params()),
149+
INT_MEMBER(classify_cp_cutoff_strength, 7,
150+
"Class Pruner CutoffStrength: ", this->params()),
151+
INT_MEMBER(classify_integer_matcher_multiplier, 10,
152+
"Integer Matcher Multiplier 0-255: ", this->params()),
153+
EnableLearning(true),
154+
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
155+
this->params()),
156+
BOOL_MEMBER(classify_bln_numeric_mode, 0,
157+
"Assume the input is numbers [0-9].", this->params()),
158+
double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
159+
this->params()),
160+
double_MEMBER(speckle_rating_penalty, 10.0,
161+
"Penalty to add to worst rating for noise", this->params()),
162+
shape_table_(NULL),
163+
dict_(this),
164+
static_classifier_(NULL) {
157165
fontinfo_table_.set_compare_callback(
158166
NewPermanentTessCallback(CompareFontInfo));
159167
fontinfo_table_.set_clear_callback(

classify/classify.h

+6
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,12 @@ class Classify : public CCStruct {
374374
// Member variables.
375375

376376
// Parameters.
377+
// Set during training (in lang.config) to indicate whether the divisible
378+
// blobs chopper should be used (true for Latin script).
379+
BOOL_VAR_H(allow_blob_division, true, "Use divisible blobs chopping");
380+
// Set during training (in lang.config) to indicate whether the divisible
381+
// blobs chopper should be used in preference to chopping. Set to true for
382+
// southern Indic scripts.
377383
BOOL_VAR_H(prioritize_division, FALSE,
378384
"Prioritize blob division over chopping");
379385
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");

wordrec/chopper.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
200200
if (seam == NULL) {
201201
if (repair_unchopped_blobs)
202202
restore_outline_tree(blob->outlines);
203-
if (word->latin_script) {
203+
if (allow_blob_division && !prioritize_division) {
204204
// If the blob can simply be divided into outlines, then do that.
205205
TPOINT location;
206206
if (divisible_blob(blob, italic_blob, &location)) {

0 commit comments

Comments
 (0)