Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files.

Ray Smith · Ray Smith · commit a912967cc3e9 · 2017-09-08T11:49:57.000+01:00
diff --git a/ccstruct/boxread.cpp b/ccstruct/boxread.cpp
@@ -57,12 +57,14 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
     return false;
   // Convert the array of bytes to a string, so it can be used by the parser.
   box_data.push_back('\0');
-  return ReadMemBoxes(target_page, skip_blanks, &box_data[0], boxes, texts,
-                      box_texts, pages);
+  return ReadMemBoxes(target_page, skip_blanks, &box_data[0],
+                      /*continue_on_failure*/ true, boxes, texts, box_texts,
+                      pages);
 }
 
 // Reads all boxes from the string. Otherwise, as ReadAllBoxes.
 bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
+                  bool continue_on_failure,
                   GenericVector<TBOX>* boxes,
                   GenericVector<STRING>* texts,
                   GenericVector<STRING>* box_texts,
@@ -77,7 +79,10 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
     STRING utf8_str;
     TBOX box;
     if (!ParseBoxFileStr(lines[i].string(), &page, &utf8_str, &box)) {
-      continue;
+      if (continue_on_failure)
+        continue;
+      else
+        return false;
     }
     if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) continue;
     if (target_page >= 0 && page != target_page) continue;
diff --git a/ccstruct/boxread.h b/ccstruct/boxread.h
@@ -48,7 +48,11 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
                   GenericVector<int>* pages);
 
 // Reads all boxes from the string. Otherwise, as ReadAllBoxes.
+// If continue_on_failure is true, reading continues past any invalid box line
+// and the function returns true if it succeeds in reading some boxes.
+// If false, it gives up and returns false at the first invalid box line.
 bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
+                  bool continue_on_failure,
                   GenericVector<TBOX>* boxes,
                   GenericVector<STRING>* texts,
                   GenericVector<STRING>* box_texts,
diff --git a/ccstruct/imagedata.cpp b/ccstruct/imagedata.cpp
@@ -353,8 +353,9 @@ bool ImageData::AddBoxes(const char* box_text) {
     GenericVector<TBOX> boxes;
     GenericVector<STRING> texts;
     GenericVector<int> box_pages;
-    if (ReadMemBoxes(page_number_, false, box_text, &boxes,
-                     &texts, NULL, &box_pages)) {
+    if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,
+                     /*continue_on_failure*/ true, &boxes, &texts, NULL,
+                     &box_pages)) {
       AddBoxes(boxes, texts, box_pages);
       return true;
     } else {
diff --git a/ccutil/unicharset.cpp b/ccutil/unicharset.cpp
@@ -879,10 +879,9 @@ bool UNICHARSET::load_via_fgets(
     this->set_bearing_stats(id, bearing, bearing_sd);
     this->set_advance_stats(id, advance, advance_sd);
     this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
-    ASSERT_HOST(other_case < unicharset_size);
-    this->set_other_case(id, (v>3) ? other_case : id);
-    ASSERT_HOST(mirror < unicharset_size);
-    this->set_mirror(id, (v>8) ? mirror : id);
+    this->set_other_case(
+        id, (v > 3 && other_case < unicharset_size) ? other_case : id);
+    this->set_mirror(id, (v > 8 && mirror < unicharset_size) ? mirror : id);
     this->set_normed(id, (v>16) ? normed : unichar);
   }
   post_load_setup();
diff --git a/training/language-specific.sh b/training/language-specific.sh
@@ -29,11 +29,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
                       uig ukr urd uzb uzb_cyrl vie yid"
 
 # Codes for which we have webtext but no fonts:
-# armenian, dhivehi, mongolian (we support mongolian cyrillic as in the webtext,
-# but not mongolian script with vertical writing direction), sindhi (for which
-# we have persian script webtext, but real sindhi text can be in persian OR
-# devanagari script)
-UNUSABLE_LANGUAGE_CODES="hye div mon snd"
+UNUSABLE_LANGUAGE_CODES=""
 
 FRAKTUR_FONTS=(
     "CaslonishFraxx Medium" \
@@ -1142,6 +1138,19 @@ set_lang_specific_parameters() {
 
   # Default to 0 exposure if it hasn't been set
   test -z "$EXPOSURES" && EXPOSURES=0
+  # Set right-to-left and normalization mode.
+  case "${LANG_CODE}" in
+    ara | div| fas | pus | snd | syr | uig | urd | kur_ara | heb | yid )
+      LANG_IS_RTL="1"
+      NORM_MODE="2" ;;
+    asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
+    dzo | sin | san | bod | ori | khm | mya | tha | lao )
+      LANG_IS_RTL="0"
+      NORM_MODE="2" ;;
+    * )
+      LANG_IS_RTL="0"
+      NORM_MODE="1" ;;
+  esac
 }
 
 #=============================================================================
diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh
@@ -287,14 +287,12 @@ phase_UP_generate_unicharset() {
     tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
 
     local box_files=$(ls ${TRAINING_DIR}/*.box)
-    run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files}
-    local outfile=${TRAINING_DIR}/unicharset
     UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
-    check_file_readable ${outfile}
-    mv ${outfile} ${UNICHARSET_FILE}
+    run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \
+      --norm_mode "${NORM_MODE}" ${box_files}
+    check_file_readable ${UNICHARSET_FILE}
 
     XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
-    check_file_readable ${UNICHARSET_FILE}
     run_command set_unicharset_properties \
         -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
         --script_dir=${LANGDATA_ROOT}
@@ -347,11 +345,9 @@ phase_D_generate_dawg() {
     # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
     # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
     local punc_reverse_policy=0;
-    case ${LANG_CODE} in
-      ara | div| fas | pus | snd | syr | uig | urd | heb | yid )
-        punc_reverse_policy=2 ;;
-      * ) ;;
-    esac
+    if [[ "${LANG_IS_RTL}" == "1" ]]; then
+      punc_reverse_policy=2
+    fi
     if [[ ! -s ${PUNC_FILE} ]]; then
         PUNC_FILE="${LANGDATA_ROOT}/common.punc"
     fi
@@ -504,21 +500,13 @@ make__lstmdata() {
       mkdir -p "${OUTPUT_DIR}"
   fi
   local lang_is_rtl=""
-  # TODO(rays) set using script lang lists.
-  case "${LANG_CODE}" in
-    ara | div| fas | pus | snd | syr | uig | urd | kur_ara | heb | yid )
-      lang_is_rtl="--lang_is_rtl" ;;
-    * ) ;;
-  esac
+  if [[ "${LANG_IS_RTL}" == "1" ]]; then
+    lang_is_rtl="--lang_is_rtl"
+  fi
   local pass_through=""
-  # TODO(rays) set using script lang lists.
-  case "${LANG_CODE}" in
-    asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
-    dzo | sin | san | bod | ori | khm | mya | tha | lao | heb | yid | ara | \
-    fas | pus | snd | urd | div | syr | uig | kur_ara )
-      pass_through="--pass_through_recoder" ;;
-    * ) ;;
-  esac
+  if [[ "${NORM_MODE}" -ge "2" ]]; then
+    pass_through="--pass_through_recoder"
+  fi
 
   # Build the starter traineddata from the inputs.
   run_command combine_lang_model \
diff --git a/training/unicharset_extractor.cpp b/training/unicharset_extractor.cpp
@@ -17,146 +17,94 @@
 //
 ///////////////////////////////////////////////////////////////////////
 
-// Given a list of box files on the command line, this program generates a file
-// containing a unicharset, a list of all the characters used by Tesseract
-//
-// The file contains the size of the set on the first line, and then one
-// unichar per line.
-
-#include <stdio.h>
-#if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3)
-#include <wchar.h>
-#include <wctype.h>
-#define USING_WCTYPE
-#endif
-#include <locale.h>
+// Given a list of box files or text files on the command line, this program
+// normalizes the text according to command-line options and generates
+// a unicharset.
 
+#include <cstdlib>
 #include "boxread.h"
-#include "rect.h"
+#include "commandlineflags.h"
+#include "genericvector.h"
+#include "lang_model_helpers.h"
+#include "normstrngs.h"
 #include "strngs.h"
-#include "tessopt.h"
-#include "unichar.h"
+#include "tprintf.h"
 #include "unicharset.h"
-
-using tesseract::UNICHAR;
-
-static const char* const kUnicharsetFileName = "unicharset";
-
-UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
-  UNICHAR uch(wc);
-  char *unichar = uch.utf8_str();
-  UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar);
-  delete[] unichar;
-  return unichar_id;
-}
-
-// Set character properties using wctype if we have it.
-// Contributed by piggy@gmail.com.
-// Modified by Ray to use UNICHAR for unicode conversion
-// and to check for wctype using autoconf/presence of windows.
-void set_properties(UNICHARSET *unicharset, const char* const c_string) {
-#ifdef USING_WCTYPE
-  UNICHAR_ID id;
-  int wc;
-
-  // Convert the string to a unichar id.
-  id = unicharset->unichar_to_id(c_string);
-
-  // Set the other_case property to be this unichar id by default.
-  unicharset->set_other_case(id, id);
-
-  int step = UNICHAR::utf8_step(c_string);
-  if (step == 0)
-    return; // Invalid utf-8.
-
-  // Get the next Unicode code point in the string.
-  UNICHAR ch(c_string, step);
-  wc = ch.first_uni();
-
-  /* Copy the properties. */
-  if (iswalpha(wc)) {
-    unicharset->set_isalpha(id, 1);
-    if (iswlower(wc)) {
-      unicharset->set_islower(id, 1);
-      unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
-                                                      towupper(wc)));
-    }
-    if (iswupper(wc)) {
-      unicharset->set_isupper(id, 1);
-      unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
-                                                      towlower(wc)));
+#include "unicharset_training_utils.h"
+
+STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path");
+INT_PARAM_FLAG(norm_mode, 1,
+               "Normalization mode: 1=Combine graphemes, "
+               "2=Split graphemes, 3=Pure unicode");
+
+namespace tesseract {
+
+// Helper normalizes and segments the given strings according to norm_mode, and
+// adds the segmented parts to unicharset.
+static void AddStringsToUnicharset(const GenericVector<STRING>& strings,
+                                   int norm_mode, UNICHARSET* unicharset) {
+  for (int i = 0; i < strings.size(); ++i) {
+    std::vector<string> normalized;
+    if (NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
+                                     static_cast<GraphemeNormMode>(norm_mode),
+                                     /*report_errors*/ true,
+                                     strings[i].string(), &normalized)) {
+      for (const string& normed : normalized) {
+        if (normed.empty() || IsWhitespace(normed[0])) continue;
+        unicharset->unichar_insert(normed.c_str());
+      }
+    } else {
+      tprintf("Normalization failed for string '%s'\n", strings[i].c_str());
     }
   }
-  if (iswdigit(wc))
-    unicharset->set_isdigit(id, 1);
-  if(iswpunct(wc))
-    unicharset->set_ispunctuation(id, 1);
-
-#endif
 }
 
-int main(int argc, char** argv) {
-  int option;
-  const char* output_directory = ".";
-  STRING unicharset_file_name;
-  // Special characters are now included by default.
+int Main(int argc, char** argv) {
   UNICHARSET unicharset;
-
-  setlocale(LC_ALL, "");
-
-  // Print usage
-  if (argc <= 1) {
-    printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
-#ifdef USING_WCTYPE
-    printf("Character properties using wctype is enabled\n");
-#else
-    printf("WARNING: Character properties using wctype is DISABLED\n");
-#endif
-    exit(1);
-
-  }
-
-  // Parse arguments
-  while ((option = tessopt(argc, argv, "D" )) != EOF) {
-    switch (option) {
-      case 'D':
-        output_directory = tessoptarg;
-        ++tessoptind;
-        break;
+  // Load input files
+  for (int arg = 1; arg < argc; ++arg) {
+    STRING file_data = tesseract::ReadFile(argv[arg], /*reader*/ nullptr);
+    if (file_data.length() == 0) continue;
+    GenericVector<STRING> texts;
+    if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
+                     /*continue_on_failure*/ false, /*boxes*/ nullptr,
+                     &texts, /*box_texts*/ nullptr, /*pages*/ nullptr)) {
+      tprintf("Extracting unicharset from box file %s\n", argv[arg]);
+    } else {
+      tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);
+      texts.truncate(0);
+      file_data.split('\n', &texts);
     }
+    AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
   }
-
-  // Save file name
-  unicharset_file_name = output_directory;
-  unicharset_file_name += "/";
-  unicharset_file_name += kUnicharsetFileName;
-
-  // Load box files
-  for (; tessoptind < argc; ++tessoptind) {
-    printf("Extracting unicharset from %s\n", argv[tessoptind]);
-
-    FILE* box_file = fopen(argv[tessoptind], "rb");
-    if (box_file == nullptr) {
-      printf("Cannot open box file %s\n", argv[tessoptind]);
-      return -1;
-    }
-
-    TBOX box;
-    STRING unichar_string;
-    int line_number = 0;
-    while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
-      unicharset.unichar_insert(unichar_string.string());
-      set_properties(&unicharset, unichar_string.string());
-    }
+  SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false,
+                       &unicharset);
+  // Write unicharset file.
+  if (unicharset.save_to_file(FLAGS_output_unicharset.c_str())) {
+    tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
+  } else {
+    tprintf("Cannot save unicharset file %s\n",
+            FLAGS_output_unicharset.c_str());
+    return EXIT_FAILURE;
   }
+  return EXIT_SUCCESS;
+}
 
-  // Write unicharset file
-  if (unicharset.save_to_file(unicharset_file_name.string())) {
-    printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
-  }
-  else {
-    printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
-    return -1;
+}  // namespace tesseract
+
+int main(int argc, char** argv) {
+  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
+  if (argc < 2) {
+    tprintf(
+        "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
+        " box_or_text_file [...]\n",
+        argv[0]);
+    tprintf("Where mode means:\n");
+    tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
+    tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
+    tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
+    tprintf("Reads box or plain text files to extract the unicharset.\n");
+    return EXIT_FAILURE;
   }
-  return 0;
+  return tesseract::Main(argc, argv);
 }