|
17 | 17 | //
|
18 | 18 | ///////////////////////////////////////////////////////////////////////
|
19 | 19 |
|
20 |
| -// Given a list of box files on the command line, this program generates a file |
21 |
| -// containing a unicharset, a list of all the characters used by Tesseract |
22 |
| -// |
23 |
| -// The file contains the size of the set on the first line, and then one |
24 |
| -// unichar per line. |
25 |
| - |
26 |
| -#include <stdio.h> |
27 |
| -#if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3) |
28 |
| -#include <wchar.h> |
29 |
| -#include <wctype.h> |
30 |
| -#define USING_WCTYPE |
31 |
| -#endif |
32 |
| -#include <locale.h> |
| 20 | +// Given a list of box files or text files on the command line, this program |
| 21 | +// normalizes the text according to command-line options and generates |
| 22 | +// a unicharset. |
33 | 23 |
|
| 24 | +#include <cstdlib> |
34 | 25 | #include "boxread.h"
|
35 |
| -#include "rect.h" |
| 26 | +#include "commandlineflags.h" |
| 27 | +#include "genericvector.h" |
| 28 | +#include "lang_model_helpers.h" |
| 29 | +#include "normstrngs.h" |
36 | 30 | #include "strngs.h"
|
37 |
| -#include "tessopt.h" |
38 |
| -#include "unichar.h" |
| 31 | +#include "tprintf.h" |
39 | 32 | #include "unicharset.h"
|
40 |
| - |
41 |
| -using tesseract::UNICHAR; |
42 |
| - |
43 |
| -static const char* const kUnicharsetFileName = "unicharset"; |
44 |
| - |
45 |
| -UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) { |
46 |
| - UNICHAR uch(wc); |
47 |
| - char *unichar = uch.utf8_str(); |
48 |
| - UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar); |
49 |
| - delete[] unichar; |
50 |
| - return unichar_id; |
51 |
| -} |
52 |
| - |
53 |
| -// Set character properties using wctype if we have it. |
54 |
| -// Contributed by piggy@gmail.com. |
55 |
| -// Modified by Ray to use UNICHAR for unicode conversion |
56 |
| -// and to check for wctype using autoconf/presence of windows. |
57 |
| -void set_properties(UNICHARSET *unicharset, const char* const c_string) { |
58 |
| -#ifdef USING_WCTYPE |
59 |
| - UNICHAR_ID id; |
60 |
| - int wc; |
61 |
| - |
62 |
| - // Convert the string to a unichar id. |
63 |
| - id = unicharset->unichar_to_id(c_string); |
64 |
| - |
65 |
| - // Set the other_case property to be this unichar id by default. |
66 |
| - unicharset->set_other_case(id, id); |
67 |
| - |
68 |
| - int step = UNICHAR::utf8_step(c_string); |
69 |
| - if (step == 0) |
70 |
| - return; // Invalid utf-8. |
71 |
| - |
72 |
| - // Get the next Unicode code point in the string. |
73 |
| - UNICHAR ch(c_string, step); |
74 |
| - wc = ch.first_uni(); |
75 |
| - |
76 |
| - /* Copy the properties. */ |
77 |
| - if (iswalpha(wc)) { |
78 |
| - unicharset->set_isalpha(id, 1); |
79 |
| - if (iswlower(wc)) { |
80 |
| - unicharset->set_islower(id, 1); |
81 |
| - unicharset->set_other_case(id, wc_to_unichar_id(*unicharset, |
82 |
| - towupper(wc))); |
83 |
| - } |
84 |
| - if (iswupper(wc)) { |
85 |
| - unicharset->set_isupper(id, 1); |
86 |
| - unicharset->set_other_case(id, wc_to_unichar_id(*unicharset, |
87 |
| - towlower(wc))); |
| 33 | +#include "unicharset_training_utils.h" |
| 34 | + |
| 35 | +STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path"); |
| 36 | +INT_PARAM_FLAG(norm_mode, 1, |
| 37 | + "Normalization mode: 1=Combine graphemes, " |
| 38 | + "2=Split graphemes, 3=Pure unicode"); |
| 39 | + |
| 40 | +namespace tesseract { |
| 41 | + |
| 42 | +// Helper normalizes and segments the given strings according to norm_mode, and |
| 43 | +// adds the segmented parts to unicharset. |
| 44 | +static void AddStringsToUnicharset(const GenericVector<STRING>& strings, |
| 45 | + int norm_mode, UNICHARSET* unicharset) { |
| 46 | + for (int i = 0; i < strings.size(); ++i) { |
| 47 | + std::vector<string> normalized; |
| 48 | + if (NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone, |
| 49 | + static_cast<GraphemeNormMode>(norm_mode), |
| 50 | + /*report_errors*/ true, |
| 51 | + strings[i].string(), &normalized)) { |
| 52 | + for (const string& normed : normalized) { |
| 53 | + if (normed.empty() || IsWhitespace(normed[0])) continue; |
| 54 | + unicharset->unichar_insert(normed.c_str()); |
| 55 | + } |
| 56 | + } else { |
| 57 | + tprintf("Normalization failed for string '%s'\n", strings[i].c_str()); |
88 | 58 | }
|
89 | 59 | }
|
90 |
| - if (iswdigit(wc)) |
91 |
| - unicharset->set_isdigit(id, 1); |
92 |
| - if(iswpunct(wc)) |
93 |
| - unicharset->set_ispunctuation(id, 1); |
94 |
| - |
95 |
| -#endif |
96 | 60 | }
|
97 | 61 |
|
98 |
| -int main(int argc, char** argv) { |
99 |
| - int option; |
100 |
| - const char* output_directory = "."; |
101 |
| - STRING unicharset_file_name; |
102 |
| - // Special characters are now included by default. |
| 62 | +int Main(int argc, char** argv) { |
103 | 63 | UNICHARSET unicharset;
|
104 |
| - |
105 |
| - setlocale(LC_ALL, ""); |
106 |
| - |
107 |
| - // Print usage |
108 |
| - if (argc <= 1) { |
109 |
| - printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]); |
110 |
| -#ifdef USING_WCTYPE |
111 |
| - printf("Character properties using wctype is enabled\n"); |
112 |
| -#else |
113 |
| - printf("WARNING: Character properties using wctype is DISABLED\n"); |
114 |
| -#endif |
115 |
| - exit(1); |
116 |
| - |
117 |
| - } |
118 |
| - |
119 |
| - // Parse arguments |
120 |
| - while ((option = tessopt(argc, argv, "D" )) != EOF) { |
121 |
| - switch (option) { |
122 |
| - case 'D': |
123 |
| - output_directory = tessoptarg; |
124 |
| - ++tessoptind; |
125 |
| - break; |
| 64 | + // Load input files |
| 65 | + for (int arg = 1; arg < argc; ++arg) { |
| 66 | + STRING file_data = tesseract::ReadFile(argv[arg], /*reader*/ nullptr); |
| 67 | + if (file_data.length() == 0) continue; |
| 68 | + GenericVector<STRING> texts; |
| 69 | + if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0], |
| 70 | + /*continue_on_failure*/ false, /*boxes*/ nullptr, |
| 71 | + &texts, /*box_texts*/ nullptr, /*pages*/ nullptr)) { |
| 72 | + tprintf("Extracting unicharset from box file %s\n", argv[arg]); |
| 73 | + } else { |
| 74 | + tprintf("Extracting unicharset from plain text file %s\n", argv[arg]); |
| 75 | + texts.truncate(0); |
| 76 | + file_data.split('\n', &texts); |
126 | 77 | }
|
| 78 | + AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset); |
127 | 79 | }
|
128 |
| - |
129 |
| - // Save file name |
130 |
| - unicharset_file_name = output_directory; |
131 |
| - unicharset_file_name += "/"; |
132 |
| - unicharset_file_name += kUnicharsetFileName; |
133 |
| - |
134 |
| - // Load box files |
135 |
| - for (; tessoptind < argc; ++tessoptind) { |
136 |
| - printf("Extracting unicharset from %s\n", argv[tessoptind]); |
137 |
| - |
138 |
| - FILE* box_file = fopen(argv[tessoptind], "rb"); |
139 |
| - if (box_file == nullptr) { |
140 |
| - printf("Cannot open box file %s\n", argv[tessoptind]); |
141 |
| - return -1; |
142 |
| - } |
143 |
| - |
144 |
| - TBOX box; |
145 |
| - STRING unichar_string; |
146 |
| - int line_number = 0; |
147 |
| - while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) { |
148 |
| - unicharset.unichar_insert(unichar_string.string()); |
149 |
| - set_properties(&unicharset, unichar_string.string()); |
150 |
| - } |
| 80 | + SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false, |
| 81 | + &unicharset); |
| 82 | + // Write unicharset file. |
| 83 | + if (unicharset.save_to_file(FLAGS_output_unicharset.c_str())) { |
| 84 | + tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str()); |
| 85 | + } else { |
| 86 | + tprintf("Cannot save unicharset file %s\n", |
| 87 | + FLAGS_output_unicharset.c_str()); |
| 88 | + return EXIT_FAILURE; |
151 | 89 | }
|
| 90 | + return EXIT_SUCCESS; |
| 91 | +} |
152 | 92 |
|
153 |
| - // Write unicharset file |
154 |
| - if (unicharset.save_to_file(unicharset_file_name.string())) { |
155 |
| - printf("Wrote unicharset file %s.\n", unicharset_file_name.string()); |
156 |
| - } |
157 |
| - else { |
158 |
| - printf("Cannot save unicharset file %s.\n", unicharset_file_name.string()); |
159 |
| - return -1; |
| 93 | +} // namespace tesseract |
| 94 | + |
| 95 | +int main(int argc, char** argv) { |
| 96 | + tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); |
| 97 | + if (argc < 2) { |
| 98 | + tprintf( |
| 99 | + "Usage: %s [--output_unicharset filename] [--norm_mode mode]" |
| 100 | + " box_or_text_file [...]\n", |
| 101 | + argv[0]); |
| 102 | + tprintf("Where mode means:\n"); |
| 103 | + tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n"); |
| 104 | + tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n"); |
| 105 | + tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n"); |
| 106 | + tprintf("Reads box or plain text files to extract the unicharset.\n"); |
| 107 | + return EXIT_FAILURE; |
160 | 108 | }
|
161 |
| - return 0; |
| 109 | + return tesseract::Main(argc, argv); |
162 | 110 | }
|
0 commit comments