Skip to content

Commit b34cf9d

Browse files
committed
Javanese script training
1 parent e1c387c commit b34cf9d

6 files changed

+31
-14
lines changed

src/training/language-specific.sh

+10-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
2222
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
2323
ell eng enm epo est eus fas fil fin fra frk frm gle glg
2424
grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
25-
jav jpn kan kat kat_old kaz khm kir kor kur lao lat
25+
jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat
2626
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
2727
pan pol por pus ron rus san sin slk slv snd spa spa_old
2828
sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur
@@ -603,6 +603,10 @@ BURMESE_FONTS=( \
603603
"Padauk" \
604604
"TharLon" \
605605
)
606+
607+
JAVANESE_FONTS=( \
608+
"Prada" \
609+
)
606610

607611
NORTH_AMERICAN_ABORIGINAL_FONTS=( \
608612
"Aboriginal Sans" \
@@ -1065,6 +1069,10 @@ set_lang_specific_parameters() {
10651069
test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
10661070

10671071
# SouthEast Asian scripts.
1072+
jav_java ) MEAN_COUNT="15"
1073+
WORD_DAWG_FACTOR=0.15
1074+
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
1075+
test -z "$FONTS" && FONTS=( "${JAVANESE_FONTS[@]}" ) ;;
10681076
khm ) MEAN_COUNT="15"
10691077
WORD_DAWG_FACTOR=0.15
10701078
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
@@ -1172,7 +1180,7 @@ set_lang_specific_parameters() {
11721180
LANG_IS_RTL="1"
11731181
NORM_MODE="2" ;;
11741182
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
1175-
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
1183+
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav | jav_java)
11761184
LANG_IS_RTL="0"
11771185
NORM_MODE="2" ;;
11781186
* )

src/training/tesstrain.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ initialize_fontconfig
6161
phase_I_generate_image 8
6262
phase_UP_generate_unicharset
6363
if ((LINEDATA)); then
64-
phase_E_extract_features "lstm.train" 8 "lstmf"
64+
phase_E_extract_features " --psm 6 lstm.train " 8 "lstmf"
6565
make__lstmdata
6666
tlog "\nCreated starter traineddata for language '${LANG_CODE}'\n"
6767
tlog "\nRun lstmtraining to do the LSTM training for language '${LANG_CODE}'\n"

src/training/tesstrain_utils.sh

+6-3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ else
2323
FONTS_DIR="/usr/share/fonts/"
2424
FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
2525
fi
26+
MAX_PAGES=0
2627
OUTPUT_DIR="/tmp/tesstrain/tessdata"
2728
OVERWRITE=0
2829
LINEDATA=0
@@ -130,6 +131,9 @@ parse_flags() {
130131
--langdata_dir)
131132
parse_value "LANGDATA_ROOT" ${ARGV[$j]}
132133
i=$j ;;
134+
--maxpages)
135+
parse_value "MAX_PAGES" ${ARGV[$j]}
136+
i=$j ;;
133137
--output_dir)
134138
parse_value "OUTPUT_DIR" ${ARGV[$j]}
135139
i=$j ;;
@@ -221,7 +225,7 @@ generate_font_image() {
221225
common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
222226
common_args+=" --leading=${LEADING}"
223227
common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
224-
common_args+=" --outputbase=${outbase} --max_pages=0"
228+
common_args+=" --outputbase=${outbase} --max_pages=${MAX_PAGES}"
225229

226230
# add --writing_mode=vertical-upright to common_args if the font is
227231
# specified to be rendered vertically.
@@ -233,7 +237,7 @@ generate_font_image() {
233237
done
234238

235239
run_command text2image ${common_args} --font="${font}" \
236-
--text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
240+
--text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
237241
check_file_readable ${outbase}.box ${outbase}.tif
238242

239243
if ((EXTRACT_FONT_PROPERTIES)) &&
@@ -246,7 +250,6 @@ generate_font_image() {
246250
fi
247251
}
248252

249-
250253
# Phase I : Generate (I)mages from training text for each font.
251254
phase_I_generate_image() {
252255
local par_factor=$1

src/training/validate_indic.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
6565
return CharClass::kConsonant;
6666
// Sinhala doesn't have Nukta or Avagraha.
6767
if (off == 0x3c) return CharClass::kNukta;
68-
if (off == 0x3d) return CharClass::kVowel;
68+
if (off == 0x3d) return CharClass::kVowel; // avagraha
6969
if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
7070
if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
7171
if (off == 0x4d) return CharClass::kVirama;

src/training/validate_javanese.cpp

+12-6
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,13 @@ namespace tesseract {
2626
// Taken from unicode standard:
2727
// http://www.unicode.org/charts/PDF/UA980.pdf
2828
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
29+
// The Consonant class here includes independent vowels.
2930
// The order of components in an orthographic syllable as expressed in BNF is:
3031
// {C F} C {{R}Y} {V{A}} {Z}
3132
// Translated to the codes used by the CharClass enum:
32-
// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D]
33-
// Also the Consonant class here includes independent vowels, as they are
34-
// treated the same anyway.
33+
// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
34+
// Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
35+
// Validation rules copied from validate_indic.cpp and modified for Javanese.
3536
// Indic - for reference
3637
// + vowel Grapheme: V[D](v)*
3738
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
@@ -63,7 +64,6 @@ bool ValidateJavanese::ConsumeGraphemeIfValid() {
6364
}
6465

6566
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
66-
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
6767
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
6868
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
6969
// Offset from the start of the relevant unicode code block aka code page.
@@ -74,6 +74,8 @@ Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
7474
if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
7575
if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
7676
if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
77+
if (off <= 0x39) return CharClass::kMatra;
78+
if (off <= 0x3a) return CharClass::kMatraPiece; // A9BA TALING
7779
if (off <= 0x3d) return CharClass::kMatra;
7880
if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
7981
if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
@@ -229,6 +231,11 @@ bool ValidateJavanese::ConsumeConsonantTailIfValid() {
229231
if (UseMultiCode(1)) return true;
230232
}
231233
}
234+
// Tarung is also used for the long versions of the u and o vowels and vocalic r.
235+
// Taling + Tarung is valid, e.g. ꦏ + ◌ꦺ + ◌ꦴ
236+
while (codes_[codes_used_].first == CharClass::kMatraPiece) {
237+
if (UseMultiCode(1)) return true;
238+
}
232239
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
233240
if (UseMultiCode(1)) return true;
234241
}
@@ -259,5 +266,4 @@ bool ValidateJavanese::ConsumeVowelIfValid() {
259266
return true;
260267
}
261268

262-
} // namespace tesseract
263-
269+
} // namespace tesseract

src/training/validate_javanese.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,4 @@ class ValidateJavanese : public Validator {
6060

6161
} // namespace tesseract
6262

63-
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
63+
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_

0 commit comments

Comments
 (0)