@@ -26,12 +26,13 @@ namespace tesseract {
26
26
// Taken from unicode standard:
27
27
// http://www.unicode.org/charts/PDF/UA980.pdf
28
28
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
29
+ // The Consonant class here includes independent vowels.
29
30
// The order of components in an orthographic syllable as expressed in BNF is:
30
31
// {C F} C {{R}Y} {V{A}} {Z}
31
32
// Translated to the codes used by the CharClass enum:
32
- // [(V|C[N])(H)] (V|C[N]) [[R]Y ] [M[D]] [D ]
33
- // Also the Consonant class here includes independent vowels, as they are
34
- // treated the same anyway .
33
+ // [(V|C[N])(H)] (V|C[N]) [[N]N ] [M[D]] [v ]
34
+ // Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
35
+ // Validation rules copied from validate_indic.cpp and modified for Javanese.
35
36
// Indic - for reference
36
37
// + vowel Grapheme: V[D](v)*
37
38
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
@@ -63,7 +64,6 @@ bool ValidateJavanese::ConsumeGraphemeIfValid() {
63
64
}
64
65
65
66
Validator::CharClass ValidateJavanese::UnicodeToCharClass (char32 ch) const {
66
- if (IsVedicAccent (ch)) return CharClass::kVedicMark ;
67
67
if (ch == kZeroWidthNonJoiner ) return CharClass::kZeroWidthNonJoiner ;
68
68
if (ch == kZeroWidthJoiner ) return CharClass::kZeroWidthJoiner ;
69
69
// Offset from the start of the relevant unicode code block aka code page.
@@ -74,6 +74,8 @@ Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
74
74
if (off <= 0x32 ) return CharClass::kConsonant ; // includes independent vowels
75
75
if (off == 0x33 ) return CharClass::kNukta ; // A9B3 CECAK TELU
76
76
if (off == 0x34 ) return CharClass::kMatraPiece ; // A9B4 TARUNG two part vowels
77
+ if (off <= 0x39 ) return CharClass::kMatra ;
78
+ if (off <= 0x3a ) return CharClass::kMatraPiece ; // A9BA TALING
77
79
if (off <= 0x3d ) return CharClass::kMatra ;
78
80
if (off <= 0x3f ) return CharClass::kNukta ; // A9BE-A9BF PENGKAL-CAKRA medial consonants
79
81
if (off == 0x40 ) return CharClass::kVirama ; // A9C0 PANGKON
@@ -229,6 +231,11 @@ bool ValidateJavanese::ConsumeConsonantTailIfValid() {
229
231
if (UseMultiCode (1 )) return true ;
230
232
}
231
233
}
234
+ // Tarung is also used for the long versions of the u and o vowels and vocalic r.
235
+ // Taling + Tarung is valid, e.g., ꦏ + ◌ꦺ + ◌ꦴ
236
+ while (codes_[codes_used_].first == CharClass::kMatraPiece ) {
237
+ if (UseMultiCode (1 )) return true ;
238
+ }
232
239
while (codes_[codes_used_].first == CharClass::kVowelModifier ) {
233
240
if (UseMultiCode (1 )) return true ;
234
241
}
@@ -259,5 +266,4 @@ bool ValidateJavanese::ConsumeVowelIfValid() {
259
266
return true ;
260
267
}
261
268
262
- } // namespace tesseract
263
-
269
+ } // namespace tesseract
0 commit comments