@@ -26,6 +26,7 @@ OVERWRITE=0
26
26
RUN_SHAPE_CLUSTERING=0
27
27
EXTRACT_FONT_PROPERTIES=1
28
28
WORKSPACE_DIR=" /tmp/tesstrain"
29
+ EXPOSURES=0
29
30
30
31
# Logging helper functions.
31
32
tlog () {
@@ -98,6 +99,16 @@ parse_flags() {
98
99
FONTS=( ${ARGV[$j]} )
99
100
IFS=$ofs
100
101
i=$j ;;
102
+ --exposures)
103
+ exp=" "
104
+ while test $j -lt ${# ARGV[@]} ; do
105
+ test -z ${ARGV[$j]} && break
106
+ test ` echo ${ARGV[$j]} | cut -c -2` = " --" && break
107
+ exp=" $exp ${ARGV[$j]} "
108
+ j=$(( j+ 1 ))
109
+ done
110
+ parse_value " EXPOSURES" " $exp "
111
+ i=$(( j- 1 )) ;;
101
112
--fonts_dir)
102
113
parse_value " FONTS_DIR" ${ARGV[$j]}
103
114
i=$j ;;
@@ -226,35 +237,36 @@ phase_I_generate_image() {
226
237
err_exit " Could not find training text file ${TRAINING_TEXT} "
227
238
fi
228
239
CHAR_SPACING=" 0.0"
229
- EXPOSURE=" 0"
230
-
231
- if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
232
- # Parse .bigram_freqs file and compose a .train_ngrams file with text
233
- # for tesseract to recognize during training. Take only the ngrams whose
234
- # combined weight accounts for 95% of all the bigrams in the language.
235
- NGRAM_FRAC=$( cat ${BIGRAM_FREQS_FILE} \
236
- | awk ' {s=s+$2}; END {print (s/100)*p}' p=99)
237
- cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
238
- | awk ' {s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
239
- x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
240
- check_file_readable ${TRAIN_NGRAMS_FILE}
241
- fi
242
240
243
- local counter=0
244
- for font in " ${FONTS[@]} " ; do
245
- generate_font_image " ${font} " &
246
- let counter=counter+1
247
- let rem=counter%par_factor
248
- if [[ " ${rem} " -eq 0 ]]; then
249
- wait
241
+ for EXPOSURE in $EXPOSURES ; do
242
+ if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
243
+ # Parse .bigram_freqs file and compose a .train_ngrams file with text
244
+ # for tesseract to recognize during training. Take only the ngrams whose
245
+ # combined weight accounts for 99% of all the bigrams in the language.
246
+ NGRAM_FRAC=$( cat ${BIGRAM_FREQS_FILE} \
247
+ | awk ' {s=s+$2}; END {print (s/100)*p}' p=99)
248
+ cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
249
+ | awk ' {s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
250
+ x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
251
+ check_file_readable ${TRAIN_NGRAMS_FILE}
250
252
fi
251
- done
252
- wait
253
- # Check that each process was successful.
254
- for font in " ${FONTS[@]} " ; do
255
- local fontname=$( echo ${font} | tr ' ' ' _' | sed ' s/,//g' )
256
- local outbase=${TRAINING_DIR} /${LANG_CODE} .${fontname} .exp${EXPOSURE}
257
- check_file_readable ${outbase} .box ${outbase} .tif
253
+
254
+ local counter=0
255
+ for font in " ${FONTS[@]} " ; do
256
+ generate_font_image " ${font} " &
257
+ let counter=counter+1
258
+ let rem=counter%par_factor
259
+ if [[ " ${rem} " -eq 0 ]]; then
260
+ wait
261
+ fi
262
+ done
263
+ wait
264
+ # Check that each process was successful.
265
+ for font in " ${FONTS[@]} " ; do
266
+ local fontname=$( echo ${font} | tr ' ' ' _' | sed ' s/,//g' )
267
+ local outbase=${TRAINING_DIR} /${LANG_CODE} .${fontname} .exp${EXPOSURE}
268
+ check_file_readable ${outbase} .box ${outbase} .tif
269
+ done
258
270
done
259
271
}
260
272
@@ -359,10 +371,9 @@ phase_E_extract_features() {
359
371
par_factor=1
360
372
fi
361
373
tlog " \n=== Phase E: Extracting features ==="
362
- TRAIN_EXPOSURES=' 0'
363
374
364
375
local img_files=" "
365
- for exposure in ${TRAIN_EXPOSURES } ; do
376
+ for exposure in ${EXPOSURES } ; do
366
377
img_files=${img_files} ' ' $( ls ${TRAINING_DIR} /* .exp${exposure} .tif)
367
378
done
368
379
0 commit comments