Skip to content

Commit c0133ec

Browse files
committed
Add --exposures option to tesstrain.sh
This flag can be used to specify multiple exposure levels for a single training run. tesstrain_utils.sh already contained some code for handling multiple exposure levels, so this functionality appears to have been intended from the start. The default behavior does not change: if --exposures is not given, exposure level 0 is the only one used.
1 parent 8e71c79 commit c0133ec

File tree

2 files changed

+41
-29
lines changed

2 files changed

+41
-29
lines changed

training/tesstrain.sh

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
# --output_dir OUTPUTDIR # Location of output traineddata file.
2525
# --overwrite # Safe to overwrite files in output_dir.
2626
# --run_shape_clustering # Run shape clustering (use for Indic langs).
27+
# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1").
2728
#
2829
# OPTIONAL flags for input data. If unspecified we will look for them in
2930
# the langdata_dir directory.

training/tesstrain_utils.sh

+40-29
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ OVERWRITE=0
2626
RUN_SHAPE_CLUSTERING=0
2727
EXTRACT_FONT_PROPERTIES=1
2828
WORKSPACE_DIR="/tmp/tesstrain"
29+
EXPOSURES=0
2930

3031
# Logging helper functions.
3132
tlog() {
@@ -98,6 +99,16 @@ parse_flags() {
9899
FONTS=( ${ARGV[$j]} )
99100
IFS=$ofs
100101
i=$j ;;
102+
--exposures)
103+
exp=""
104+
while test $j -lt ${#ARGV[@]}; do
105+
test -z ${ARGV[$j]} && break
106+
test `echo ${ARGV[$j]} | cut -c -2` = "--" && break
107+
exp="$exp ${ARGV[$j]}"
108+
j=$((j+1))
109+
done
110+
parse_value "EXPOSURES" "$exp"
111+
i=$((j-1)) ;;
101112
--fonts_dir)
102113
parse_value "FONTS_DIR" ${ARGV[$j]}
103114
i=$j ;;
@@ -226,35 +237,36 @@ phase_I_generate_image() {
226237
err_exit "Could not find training text file ${TRAINING_TEXT}"
227238
fi
228239
CHAR_SPACING="0.0"
229-
EXPOSURE="0"
230-
231-
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
232-
# Parse .bigram_freqs file and compose a .train_ngrams file with text
233-
# for tesseract to recognize during training. Take only the ngrams whose
234-
# combined weight accounts for 99% of all the bigrams in the language.
235-
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
236-
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
237-
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
238-
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
239-
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
240-
check_file_readable ${TRAIN_NGRAMS_FILE}
241-
fi
242240

243-
local counter=0
244-
for font in "${FONTS[@]}"; do
245-
generate_font_image "${font}" &
246-
let counter=counter+1
247-
let rem=counter%par_factor
248-
if [[ "${rem}" -eq 0 ]]; then
249-
wait
241+
for EXPOSURE in $EXPOSURES; do
242+
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
243+
# Parse .bigram_freqs file and compose a .train_ngrams file with text
244+
# for tesseract to recognize during training. Take only the ngrams whose
245+
# combined weight accounts for 99% of all the bigrams in the language.
246+
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
247+
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
248+
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
249+
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
250+
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
251+
check_file_readable ${TRAIN_NGRAMS_FILE}
250252
fi
251-
done
252-
wait
253-
# Check that each process was successful.
254-
for font in "${FONTS[@]}"; do
255-
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
256-
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
257-
check_file_readable ${outbase}.box ${outbase}.tif
253+
254+
local counter=0
255+
for font in "${FONTS[@]}"; do
256+
generate_font_image "${font}" &
257+
let counter=counter+1
258+
let rem=counter%par_factor
259+
if [[ "${rem}" -eq 0 ]]; then
260+
wait
261+
fi
262+
done
263+
wait
264+
# Check that each process was successful.
265+
for font in "${FONTS[@]}"; do
266+
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
267+
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
268+
check_file_readable ${outbase}.box ${outbase}.tif
269+
done
258270
done
259271
}
260272

@@ -359,10 +371,9 @@ phase_E_extract_features() {
359371
par_factor=1
360372
fi
361373
tlog "\n=== Phase E: Extracting features ==="
362-
TRAIN_EXPOSURES='0'
363374

364375
local img_files=""
365-
for exposure in ${TRAIN_EXPOSURES}; do
376+
for exposure in ${EXPOSURES}; do
366377
img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
367378
done
368379

0 commit comments

Comments
 (0)