Skip to content

Commit a912967

Browse files
author
Ray Smith
committed
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files.
1 parent c773eb5 commit a912967

File tree

7 files changed

+120
-166
lines changed

7 files changed

+120
-166
lines changed

ccstruct/boxread.cpp

+8-3
Original file line number | Diff line number | Diff line change
@@ -57,12 +57,14 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
5757
return false;
5858
// Convert the array of bytes to a string, so it can be used by the parser.
5959
box_data.push_back('\0');
60-
return ReadMemBoxes(target_page, skip_blanks, &box_data[0], boxes, texts,
61-
box_texts, pages);
60+
return ReadMemBoxes(target_page, skip_blanks, &box_data[0],
61+
/*continue_on_failure*/ true, boxes, texts, box_texts,
62+
pages);
6263
}
6364

6465
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
6566
bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
67+
bool continue_on_failure,
6668
GenericVector<TBOX>* boxes,
6769
GenericVector<STRING>* texts,
6870
GenericVector<STRING>* box_texts,
@@ -77,7 +79,10 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
7779
STRING utf8_str;
7880
TBOX box;
7981
if (!ParseBoxFileStr(lines[i].string(), &page, &utf8_str, &box)) {
80-
continue;
82+
if (continue_on_failure)
83+
continue;
84+
else
85+
return false;
8186
}
8287
if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) continue;
8388
if (target_page >= 0 && page != target_page) continue;

ccstruct/boxread.h

+4
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,11 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
4848
GenericVector<int>* pages);
4949

5050
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
51+
// continue_on_failure allows reading to continue even if an invalid box is
52+
// encountered and will return true if it succeeds in reading some boxes.
53+
// It otherwise gives up and returns false on encountering an invalid box.
5154
bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
55+
bool continue_on_failure,
5256
GenericVector<TBOX>* boxes,
5357
GenericVector<STRING>* texts,
5458
GenericVector<STRING>* box_texts,

ccstruct/imagedata.cpp

+3-2
Original file line number | Diff line number | Diff line change
@@ -353,8 +353,9 @@ bool ImageData::AddBoxes(const char* box_text) {
353353
GenericVector<TBOX> boxes;
354354
GenericVector<STRING> texts;
355355
GenericVector<int> box_pages;
356-
if (ReadMemBoxes(page_number_, false, box_text, &boxes,
357-
&texts, NULL, &box_pages)) {
356+
if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,
357+
/*continue_on_failure*/ true, &boxes, &texts, NULL,
358+
&box_pages)) {
358359
AddBoxes(boxes, texts, box_pages);
359360
return true;
360361
} else {

ccutil/unicharset.cpp

+3-4
Original file line number | Diff line number | Diff line change
@@ -879,10 +879,9 @@ bool UNICHARSET::load_via_fgets(
879879
this->set_bearing_stats(id, bearing, bearing_sd);
880880
this->set_advance_stats(id, advance, advance_sd);
881881
this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
882-
ASSERT_HOST(other_case < unicharset_size);
883-
this->set_other_case(id, (v>3) ? other_case : id);
884-
ASSERT_HOST(mirror < unicharset_size);
885-
this->set_mirror(id, (v>8) ? mirror : id);
882+
this->set_other_case(
883+
id, (v > 3 && other_case < unicharset_size) ? other_case : id);
884+
this->set_mirror(id, (v > 8 && mirror < unicharset_size) ? mirror : id);
886885
this->set_normed(id, (v>16) ? normed : unichar);
887886
}
888887
post_load_setup();

training/language-specific.sh

+14-5
Original file line number | Diff line number | Diff line change
@@ -29,11 +29,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
2929
uig ukr urd uzb uzb_cyrl vie yid"
3030

3131
# Codes for which we have webtext but no fonts:
32-
# armenian, dhivehi, mongolian (we support mongolian cyrillic as in the webtext,
33-
# but not mongolian script with vertical writing direction), sindhi (for which
34-
# we have persian script webtext, but real sindhi text can be in persian OR
35-
# devanagari script)
36-
UNUSABLE_LANGUAGE_CODES="hye div mon snd"
32+
UNUSABLE_LANGUAGE_CODES=""
3733

3834
FRAKTUR_FONTS=(
3935
"CaslonishFraxx Medium" \
@@ -1142,6 +1138,19 @@ set_lang_specific_parameters() {
11421138

11431139
# Default to 0 exposure if it hasn't been set
11441140
test -z "$EXPOSURES" && EXPOSURES=0
1141+
# Set right-to-left and normalization mode.
1142+
case "${LANG_CODE}" in
1143+
ara | div| fas | pus | snd | syr | uig | urd | kur_ara | heb | yid )
1144+
LANG_IS_RTL="1"
1145+
NORM_MODE="2" ;;
1146+
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
1147+
dzo | sin | san | bod | ori | khm | mya | tha | lao )
1148+
LANG_IS_RTL="0"
1149+
NORM_MODE="2" ;;
1150+
* )
1151+
LANG_IS_RTL="0"
1152+
NORM_MODE="1" ;;
1153+
esac
11451154
}
11461155

11471156
#=============================================================================

training/tesstrain_utils.sh

+12-24
Original file line number | Diff line number | Diff line change
@@ -287,14 +287,12 @@ phase_UP_generate_unicharset() {
287287
tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
288288

289289
local box_files=$(ls ${TRAINING_DIR}/*.box)
290-
run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files}
291-
local outfile=${TRAINING_DIR}/unicharset
292290
UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
293-
check_file_readable ${outfile}
294-
mv ${outfile} ${UNICHARSET_FILE}
291+
run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \
292+
--norm_mode "${NORM_MODE}" ${box_files}
293+
check_file_readable ${UNICHARSET_FILE}
295294

296295
XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
297-
check_file_readable ${UNICHARSET_FILE}
298296
run_command set_unicharset_properties \
299297
-U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
300298
--script_dir=${LANGDATA_ROOT}
@@ -347,11 +345,9 @@ phase_D_generate_dawg() {
347345
# 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
348346
# 2/RRP_FORCE_REVERSE for the punctuation DAWG.
349347
local punc_reverse_policy=0;
350-
case ${LANG_CODE} in
351-
ara | div| fas | pus | snd | syr | uig | urd | heb | yid )
352-
punc_reverse_policy=2 ;;
353-
* ) ;;
354-
esac
348+
if [[ "${LANG_IS_RTL}" == "1" ]]; then
349+
punc_reverse_policy=2
350+
fi
355351
if [[ ! -s ${PUNC_FILE} ]]; then
356352
PUNC_FILE="${LANGDATA_ROOT}/common.punc"
357353
fi
@@ -504,21 +500,13 @@ make__lstmdata() {
504500
mkdir -p "${OUTPUT_DIR}"
505501
fi
506502
local lang_is_rtl=""
507-
# TODO(rays) set using script lang lists.
508-
case "${LANG_CODE}" in
509-
ara | div| fas | pus | snd | syr | uig | urd | kur_ara | heb | yid )
510-
lang_is_rtl="--lang_is_rtl" ;;
511-
* ) ;;
512-
esac
503+
if [[ "${LANG_IS_RTL}" == "1" ]]; then
504+
lang_is_rtl="--lang_is_rtl"
505+
fi
513506
local pass_through=""
514-
# TODO(rays) set using script lang lists.
515-
case "${LANG_CODE}" in
516-
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
517-
dzo | sin | san | bod | ori | khm | mya | tha | lao | heb | yid | ara | \
518-
fas | pus | snd | urd | div | syr | uig | kur_ara )
519-
pass_through="--pass_through_recoder" ;;
520-
* ) ;;
521-
esac
507+
if [[ "${NORM_MODE}" -ge "2" ]]; then
508+
pass_through="--pass_through_recoder"
509+
fi
522510

523511
# Build the starter traineddata from the inputs.
524512
run_command combine_lang_model \

training/unicharset_extractor.cpp

+76-128
Original file line number | Diff line number | Diff line change
@@ -17,146 +17,94 @@
1717
//
1818
///////////////////////////////////////////////////////////////////////
1919

20-
// Given a list of box files on the command line, this program generates a file
21-
// containing a unicharset, a list of all the characters used by Tesseract
22-
//
23-
// The file contains the size of the set on the first line, and then one
24-
// unichar per line.
25-
26-
#include <stdio.h>
27-
#if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3)
28-
#include <wchar.h>
29-
#include <wctype.h>
30-
#define USING_WCTYPE
31-
#endif
32-
#include <locale.h>
20+
// Given a list of box files or text files on the command line, this program
21+
// normalizes the text according to command-line options and generates
22+
// a unicharset.
3323

24+
#include <cstdlib>
3425
#include "boxread.h"
35-
#include "rect.h"
26+
#include "commandlineflags.h"
27+
#include "genericvector.h"
28+
#include "lang_model_helpers.h"
29+
#include "normstrngs.h"
3630
#include "strngs.h"
37-
#include "tessopt.h"
38-
#include "unichar.h"
31+
#include "tprintf.h"
3932
#include "unicharset.h"
40-
41-
using tesseract::UNICHAR;
42-
43-
static const char* const kUnicharsetFileName = "unicharset";
44-
45-
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
46-
UNICHAR uch(wc);
47-
char *unichar = uch.utf8_str();
48-
UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar);
49-
delete[] unichar;
50-
return unichar_id;
51-
}
52-
53-
// Set character properties using wctype if we have it.
54-
// Contributed by piggy@gmail.com.
55-
// Modified by Ray to use UNICHAR for unicode conversion
56-
// and to check for wctype using autoconf/presence of windows.
57-
void set_properties(UNICHARSET *unicharset, const char* const c_string) {
58-
#ifdef USING_WCTYPE
59-
UNICHAR_ID id;
60-
int wc;
61-
62-
// Convert the string to a unichar id.
63-
id = unicharset->unichar_to_id(c_string);
64-
65-
// Set the other_case property to be this unichar id by default.
66-
unicharset->set_other_case(id, id);
67-
68-
int step = UNICHAR::utf8_step(c_string);
69-
if (step == 0)
70-
return; // Invalid utf-8.
71-
72-
// Get the next Unicode code point in the string.
73-
UNICHAR ch(c_string, step);
74-
wc = ch.first_uni();
75-
76-
/* Copy the properties. */
77-
if (iswalpha(wc)) {
78-
unicharset->set_isalpha(id, 1);
79-
if (iswlower(wc)) {
80-
unicharset->set_islower(id, 1);
81-
unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
82-
towupper(wc)));
83-
}
84-
if (iswupper(wc)) {
85-
unicharset->set_isupper(id, 1);
86-
unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
87-
towlower(wc)));
33+
#include "unicharset_training_utils.h"
34+
35+
STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path");
36+
INT_PARAM_FLAG(norm_mode, 1,
37+
"Normalization mode: 1=Combine graphemes, "
38+
"2=Split graphemes, 3=Pure unicode");
39+
40+
namespace tesseract {
41+
42+
// Helper normalizes and segments the given strings according to norm_mode, and
43+
// adds the segmented parts to unicharset.
44+
static void AddStringsToUnicharset(const GenericVector<STRING>& strings,
45+
int norm_mode, UNICHARSET* unicharset) {
46+
for (int i = 0; i < strings.size(); ++i) {
47+
std::vector<string> normalized;
48+
if (NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
49+
static_cast<GraphemeNormMode>(norm_mode),
50+
/*report_errors*/ true,
51+
strings[i].string(), &normalized)) {
52+
for (const string& normed : normalized) {
53+
if (normed.empty() || IsWhitespace(normed[0])) continue;
54+
unicharset->unichar_insert(normed.c_str());
55+
}
56+
} else {
57+
tprintf("Normalization failed for string '%s'\n", strings[i].c_str());
8858
}
8959
}
90-
if (iswdigit(wc))
91-
unicharset->set_isdigit(id, 1);
92-
if(iswpunct(wc))
93-
unicharset->set_ispunctuation(id, 1);
94-
95-
#endif
9660
}
9761

98-
int main(int argc, char** argv) {
99-
int option;
100-
const char* output_directory = ".";
101-
STRING unicharset_file_name;
102-
// Special characters are now included by default.
62+
int Main(int argc, char** argv) {
10363
UNICHARSET unicharset;
104-
105-
setlocale(LC_ALL, "");
106-
107-
// Print usage
108-
if (argc <= 1) {
109-
printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
110-
#ifdef USING_WCTYPE
111-
printf("Character properties using wctype is enabled\n");
112-
#else
113-
printf("WARNING: Character properties using wctype is DISABLED\n");
114-
#endif
115-
exit(1);
116-
117-
}
118-
119-
// Parse arguments
120-
while ((option = tessopt(argc, argv, "D" )) != EOF) {
121-
switch (option) {
122-
case 'D':
123-
output_directory = tessoptarg;
124-
++tessoptind;
125-
break;
64+
// Load input files
65+
for (int arg = 1; arg < argc; ++arg) {
66+
STRING file_data = tesseract::ReadFile(argv[arg], /*reader*/ nullptr);
67+
if (file_data.length() == 0) continue;
68+
GenericVector<STRING> texts;
69+
if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
70+
/*continue_on_failure*/ false, /*boxes*/ nullptr,
71+
&texts, /*box_texts*/ nullptr, /*pages*/ nullptr)) {
72+
tprintf("Extracting unicharset from box file %s\n", argv[arg]);
73+
} else {
74+
tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);
75+
texts.truncate(0);
76+
file_data.split('\n', &texts);
12677
}
78+
AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
12779
}
128-
129-
// Save file name
130-
unicharset_file_name = output_directory;
131-
unicharset_file_name += "/";
132-
unicharset_file_name += kUnicharsetFileName;
133-
134-
// Load box files
135-
for (; tessoptind < argc; ++tessoptind) {
136-
printf("Extracting unicharset from %s\n", argv[tessoptind]);
137-
138-
FILE* box_file = fopen(argv[tessoptind], "rb");
139-
if (box_file == nullptr) {
140-
printf("Cannot open box file %s\n", argv[tessoptind]);
141-
return -1;
142-
}
143-
144-
TBOX box;
145-
STRING unichar_string;
146-
int line_number = 0;
147-
while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
148-
unicharset.unichar_insert(unichar_string.string());
149-
set_properties(&unicharset, unichar_string.string());
150-
}
80+
SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false,
81+
&unicharset);
82+
// Write unicharset file.
83+
if (unicharset.save_to_file(FLAGS_output_unicharset.c_str())) {
84+
tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
85+
} else {
86+
tprintf("Cannot save unicharset file %s\n",
87+
FLAGS_output_unicharset.c_str());
88+
return EXIT_FAILURE;
15189
}
90+
return EXIT_SUCCESS;
91+
}
15292

153-
// Write unicharset file
154-
if (unicharset.save_to_file(unicharset_file_name.string())) {
155-
printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
156-
}
157-
else {
158-
printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
159-
return -1;
93+
} // namespace tesseract
94+
95+
int main(int argc, char** argv) {
96+
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
97+
if (argc < 2) {
98+
tprintf(
99+
"Usage: %s [--output_unicharset filename] [--norm_mode mode]"
100+
" box_or_text_file [...]\n",
101+
argv[0]);
102+
tprintf("Where mode means:\n");
103+
tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
104+
tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
105+
tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
106+
tprintf("Reads box or plain text files to extract the unicharset.\n");
107+
return EXIT_FAILURE;
160108
}
161-
return 0;
109+
return tesseract::Main(argc, argv);
162110
}

0 commit comments

Comments
 (0)