Fix baseapi_test with locale de_DE.UTF-8

stweil · stweil · commit 36ed6da3499c · 2019-05-16T11:05:09.000+02:00
The unittest failed with LANG=de_DE.UTF-8:

    $ unittest/baseapi_test
    Running main() from ../../../../unittest/../googletest/googletest/src/gtest_main.cc
    [==========] Running 12 tests from 2 test suites.
    [----------] Global test environment set-up.
    [----------] 10 tests from TesseractTest
    [ RUN      ] TesseractTest.ArraySizeTest
    [       OK ] TesseractTest.ArraySizeTest (0 ms)
    [ RUN      ] TesseractTest.BasicTesseractTest
    [       OK ] TesseractTest.BasicTesseractTest (1251 ms)
    [ RUN      ] TesseractTest.IteratesParagraphsEvenIfNotDetected
    [       OK ] TesseractTest.IteratesParagraphsEvenIfNotDetected (347 ms)
    [ RUN      ] TesseractTest.HOCRWorksWithoutSetInputName
    [       OK ] TesseractTest.HOCRWorksWithoutSetInputName (403 ms)
    [ RUN      ] TesseractTest.HOCRContainsBaseline
    [       OK ] TesseractTest.HOCRContainsBaseline (389 ms)
    [ RUN      ] TesseractTest.RickSnyderNotFuckSnyder
    [       OK ] TesseractTest.RickSnyderNotFuckSnyder (346 ms)
    [ RUN      ] TesseractTest.AdaptToWordStrTest
    Trying to adapt "136
    " to "1 3 6"
    Trying to adapt "256
    " to "2 5 6"
    Trying to adapt "410
    " to "4 1 0"
    Trying to adapt "432
    " to "4 3 2"
    Trying to adapt "540
    " to "5 4 0"
    Trying to adapt "692
    " to "6 9 2"
    Trying to adapt "779
    " to "7 7 9"
    Trying to adapt "793
    " to "7 9 3"
    Trying to adapt "808
    " to "8 0 8"
    Trying to adapt "815
    " to "8 1 5"
    Trying to adapt "12
    " to "1 2"
    Trying to adapt "12
    " to "1 2"
    [       OK ] TesseractTest.AdaptToWordStrTest (788 ms)
    [ RUN      ] TesseractTest.BasicLSTMTest
    [       OK ] TesseractTest.BasicLSTMTest (4525 ms)
    [ RUN      ] TesseractTest.LSTMGeometryTest
    [       OK ] TesseractTest.LSTMGeometryTest (615 ms)
    [ RUN      ] TesseractTest.InitConfigOnlyTest
    Error: unichar ? in normproto file is not in unichar set.
    Error: unichar 0.232621 in normproto file is not in unichar set.
    Error: unichar 0.000400 in normproto file is not in unichar set.
    Error: unichar 0.231864 in normproto file is not in unichar set.
    [...]
    Error: unichar ? in normproto file is not in unichar set.
    Error: unichar 0.233915 in normproto file is not in unichar set.
    Error: unichar 0.000400 in normproto file is not in unichar set.
    Error: unichar 0.221755 in normproto file is not in unichar set.
    Error: unichar 0.000400 in normproto file is not in unichar set.
    Error: unichar ? in normproto file is not in unichar set.
    baseapi_test(21845,0x1134c45c0) malloc: *** error for object 0x927f96c28005e0: pointer being freed was not allocated
    baseapi_test(21845,0x1134c45c0) malloc: *** set a breakpoint in malloc_error_break to debug
    [INFO]  Lang eng took 327ms in regular init
    [INFO]  Lang chi_tra took 1422ms in regular init
    Abort trap: 6

TesseractTest.InitConfigOnlyTest is fixed by using std::istringstream
instead of sscanf.

Signed-off-by: Stefan Weil &lt;sw@weilnetz.de&gt;
diff --git a/src/classify/normmatch.cpp b/src/classify/normmatch.cpp
@@ -21,6 +21,7 @@
 
 #include <cstdio>
 #include <cmath>
+#include <sstream>          // for std::istringstream
 
 #include "classify.h"
 #include "clusttool.h"
@@ -113,7 +114,7 @@ float Classify::ComputeNormMatch(CLASS_ID ClassId,
       feature.Params[CharNormRx] * 8000.0 +
       feature.Params[CharNormRy] *
       feature.Params[CharNormRy] * 8000.0);
-    return (1.0 - NormEvidenceOf (Match));
+    return (1.0 - NormEvidenceOf(Match));
   }
 
   BestMatch = FLT_MAX;
@@ -209,7 +210,11 @@ NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
   const int kMaxLineSize = 100;
   char line[kMaxLineSize];
   while (fp->FGets(line, kMaxLineSize) != nullptr) {
-    if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue;
+    std::istringstream stream(line);
+    stream >> unichar >> NumProtos;
+    if (stream.fail()) {
+      continue;
+    }
     if (unicharset.contains_unichar(unichar)) {
       unichar_id = unicharset.unichar_to_id(unichar);
       Protos = NormProtos->Protos[unichar_id];