Skip to content

Commit 11f82f5

Browse files
committed
unittest: Add baseapi_test
* Add Abseil sources to build process.
* Add copyright comment.
* InitConfigOnlyTest no longer tests hin.traineddata because it is LSTM only.
* Fix std::string.
* Deactivate tests with missing test data.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
1 parent db16fea commit 11f82f5

File tree

2 files changed

+81
-35
lines changed

2 files changed

+81
-35
lines changed

unittest/Makefile.am

+22
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/training
2626
endif
2727
AM_CPPFLAGS += -I$(top_srcdir)/src/viewer
2828
AM_CPPFLAGS += -I$(top_srcdir)/src/wordrec
29+
AM_CPPFLAGS += -I$(top_srcdir)/abseil
2930

3031
# Build googletest:
3132
check_LTLIBRARIES = libgtest.la libgtest_main.la libgmock.la libgmock_main.la
@@ -34,6 +35,22 @@ libgtest_la_CPPFLAGS = -I$(top_srcdir)/googletest/googletest/include -I$(top_src
3435
libgtest_main_la_SOURCES = ../googletest/googletest/src/gtest_main.cc
3536
## libgtest_main_la_LIBADD = libgtest.la
3637

38+
# Build Abseil (needed for some unit tests).
39+
check_LTLIBRARIES += libabseil.la
40+
libabseil_la_SOURCES =
41+
libabseil_la_SOURCES += ../abseil/absl/base/internal/cycleclock.cc
42+
libabseil_la_SOURCES += ../abseil/absl/base/internal/raw_logging.cc
43+
libabseil_la_SOURCES += ../abseil/absl/base/internal/spinlock.cc
44+
libabseil_la_SOURCES += ../abseil/absl/base/internal/spinlock_wait.cc
45+
libabseil_la_SOURCES += ../abseil/absl/base/internal/sysinfo.cc
46+
libabseil_la_SOURCES += ../abseil/absl/base/internal/throw_delegate.cc
47+
libabseil_la_SOURCES += ../abseil/absl/base/internal/unscaledcycleclock.cc
48+
libabseil_la_SOURCES += ../abseil/absl/numeric/int128.cc
49+
libabseil_la_SOURCES += ../abseil/absl/strings/ascii.cc
50+
libabseil_la_SOURCES += ../abseil/absl/strings/str_cat.cc
51+
libabseil_la_SOURCES += ../abseil/absl/time/clock.cc
52+
libabseil_la_SOURCES += ../abseil/absl/time/duration.cc
53+
3754
GMOCK_INCLUDES = -I$(top_srcdir)/googletest/googlemock/include \
3855
-I$(top_srcdir)/googletest/googlemock \
3956
-I$(top_srcdir)/googletest/googletest/include \
@@ -47,6 +64,7 @@ libgmock_main_la_CPPFLAGS = $(GMOCK_INCLUDES) \
4764
-pthread
4865

4966
# Build unittests
67+
ABSEIL_LIBS = libabseil.la
5068
GTEST_LIBS = libgtest.la libgtest_main.la
5169
GMOCK_LIBS = libgmock.la libgmock_main.la
5270
TESS_LIBS = $(top_builddir)/src/api/libtesseract.la
@@ -59,6 +77,7 @@ AM_CPPFLAGS += -isystem $(top_srcdir)/googletest/googletest/include \
5977
check_PROGRAMS = \
6078
apiexample_test \
6179
applybox_test \
80+
baseapi_test \
6281
bitvector_test \
6382
cleanapi_test \
6483
colpartition_test \
@@ -97,6 +116,9 @@ apiexample_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
97116
applybox_test_SOURCES = applybox_test.cc
98117
applybox_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
99118

119+
baseapi_test_SOURCES = baseapi_test.cc
120+
baseapi_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
121+
100122
bitvector_test_SOURCES = bitvector_test.cc
101123
bitvector_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
102124

unittest/baseapi_test.cc

+59-35
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,40 @@
1+
// (C) Copyright 2017, Google Inc.
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
// http://www.apache.org/licenses/LICENSE-2.0
6+
// Unless required by applicable law or agreed to in writing, software
7+
// distributed under the License is distributed on an "AS IS" BASIS,
8+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
// See the License for the specific language governing permissions and
10+
// limitations under the License.
111

2-
#include "tesseract/api/baseapi.h"
312
#include <memory>
413
#include <string>
514
#include <vector>
6-
#include "leptonica/include/allheaders.h"
7-
#include "tesseract/ccstruct/pageres.h"
15+
16+
#include "absl/strings/ascii.h"
17+
#include "absl/strings/str_cat.h"
18+
#include "allheaders.h"
19+
20+
#include "include_gunit.h"
21+
#include "gmock/gmock-matchers.h"
22+
23+
#include "baseapi.h"
24+
#include "cycletimer.h" // for CycleTimer
25+
#include "log.h" // for LOG
26+
#include "ocrblock.h" // for class BLOCK
27+
#include "pageres.h"
828

929
namespace {
1030

1131
using ::testing::ContainsRegex;
1232
using ::testing::HasSubstr;
1333

14-
const char* langs[] = {"eng", "vie", "hin", "ara", nullptr};
15-
const char* image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif",
34+
static const char* langs[] = {"eng", "vie", "hin", "ara", nullptr};
35+
static const char* image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif",
1636
"arabic.tif", nullptr};
17-
const char* gt_text[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67",
37+
static const char* gt_text[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67",
1838
"\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
1939
"\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a",
2040
nullptr};
@@ -23,10 +43,10 @@ class FriendlyTessBaseAPI : public tesseract::TessBaseAPI {
2343
FRIEND_TEST(TesseractTest, LSTMGeometryTest);
2444
};
2545

26-
string GetCleanedTextResult(tesseract::TessBaseAPI* tess, Pix* pix) {
46+
std::string GetCleanedTextResult(tesseract::TessBaseAPI* tess, Pix* pix) {
2747
tess->SetImage(pix);
2848
char* result = tess->GetUTF8Text();
29-
string ocr_result = result;
49+
std::string ocr_result = result;
3050
delete[] result;
3151
absl::StripAsciiWhitespace(&ocr_result);
3252
return ocr_result;
@@ -35,11 +55,11 @@ string GetCleanedTextResult(tesseract::TessBaseAPI* tess, Pix* pix) {
3555
// The fixture for testing Tesseract.
3656
class TesseractTest : public testing::Test {
3757
protected:
38-
string TestDataNameToPath(const string& name) {
39-
return file::JoinPath(FLAGS_test_srcdir, "testdata/" + name);
58+
static std::string TestDataNameToPath(const std::string& name) {
59+
return file::JoinPath(TESTING_DIR, name);
4060
}
41-
string TessdataPath() {
42-
return file::JoinPath(FLAGS_test_srcdir, "tessdata");
61+
static std::string TessdataPath() {
62+
return TESSDATA_DIR;
4363
}
4464
};
4565

@@ -54,8 +74,8 @@ TEST_F(TesseractTest, ArraySizeTest) {
5474
// Tests that Tesseract gets exactly the right answer on phototest.
5575
TEST_F(TesseractTest, BasicTesseractTest) {
5676
tesseract::TessBaseAPI api;
57-
string truth_text;
58-
string ocr_text;
77+
std::string truth_text;
78+
std::string ocr_text;
5979
api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
6080
Pix* src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str());
6181
CHECK(src_pix);
@@ -74,6 +94,7 @@ TEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) {
7494
api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
7595
api.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
7696
api.SetVariable("paragraph_debug_level", "3");
97+
#if 0 // TODO: b622.png is missing
7798
Pix* src_pix = pixRead(TestDataNameToPath("b622.png").c_str());
7899
CHECK(src_pix);
79100
api.SetImage(src_pix);
@@ -88,6 +109,7 @@ TEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) {
88109
boxaDestroy(&block_boxes);
89110
boxaDestroy(&para_boxes);
90111
pixDestroy(&src_pix);
112+
#endif
91113
}
92114

93115
// We should get hOCR output and not seg fault, even if the api caller doesn't
@@ -130,6 +152,7 @@ TEST_F(TesseractTest, HOCRContainsBaseline) {
130152
TEST_F(TesseractTest, RickSnyderNotFuckSnyder) {
131153
tesseract::TessBaseAPI api;
132154
api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
155+
#if 0 // TODO: rick_snyder.jpeg is missing
133156
Pix* src_pix = pixRead(TestDataNameToPath("rick_snyder.jpeg").c_str());
134157
CHECK(src_pix);
135158
api.SetImage(src_pix);
@@ -138,6 +161,7 @@ TEST_F(TesseractTest, RickSnyderNotFuckSnyder) {
138161
EXPECT_THAT(result, Not(HasSubstr("FUCK")));
139162
delete[] result;
140163
pixDestroy(&src_pix);
164+
#endif
141165
}
142166

143167
// Tests that Tesseract gets exactly the right answer on some page numbers.
@@ -152,14 +176,14 @@ TEST_F(TesseractTest, AdaptToWordStrTest) {
152176
static const char* kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr};
153177
static const char* kTestText[] = {"324", "433", "12", nullptr};
154178
tesseract::TessBaseAPI api;
155-
string truth_text;
156-
string ocr_text;
179+
std::string truth_text;
180+
std::string ocr_text;
157181
api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
158182
api.SetVariable("matcher_sufficient_examples_for_prototyping", "1");
159183
api.SetVariable("classify_class_pruner_threshold", "220");
160184
// Train on the training text.
161185
for (int i = 0; kTrainingPages[i] != nullptr; ++i) {
162-
string image_file = TestDataNameToPath(kTrainingPages[i]);
186+
std::string image_file = TestDataNameToPath(kTrainingPages[i]);
163187
Pix* src_pix = pixRead(image_file.c_str());
164188
CHECK(src_pix);
165189
api.SetImage(src_pix);
@@ -185,8 +209,8 @@ TEST_F(TesseractTest, AdaptToWordStrTest) {
185209
// Tests that LSTM gets exactly the right answer on phototest.
186210
TEST_F(TesseractTest, BasicLSTMTest) {
187211
tesseract::TessBaseAPI api;
188-
string truth_text;
189-
string ocr_text;
212+
std::string truth_text;
213+
std::string ocr_text;
190214
api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY);
191215
Pix* src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str());
192216
CHECK(src_pix);
@@ -247,7 +271,7 @@ TEST_F(TesseractTest, LSTMGeometryTest) {
247271

248272
TEST_F(TesseractTest, InitConfigOnlyTest) {
249273
// Languages for testing initialization.
250-
const char* langs[] = {"eng", "chi_tra", "jpn", "vie", "hin"};
274+
const char* langs[] = {"eng", "chi_tra", "jpn", "vie"};
251275
std::unique_ptr<tesseract::TessBaseAPI> api;
252276
CycleTimer timer;
253277
for (int i = 0; i < ARRAYSIZE(langs); ++i) {
@@ -286,24 +310,24 @@ TEST(TesseractInstanceTest, TestMultipleTessInstances) {
286310
int num_langs = 0;
287311
while (langs[num_langs] != nullptr) ++num_langs;
288312

289-
const string kTessdataPath = file::JoinPath(FLAGS_test_srcdir, "tessdata");
313+
const std::string kTessdataPath = TESSDATA_DIR;
290314

291315
// Preload images and verify that OCR is correct on them individually.
292316
std::vector<Pix*> pix(num_langs);
293317
for (int i = 0; i < num_langs; ++i) {
294318
SCOPED_TRACE(absl::StrCat("Single instance test with lang = ", langs[i]));
295-
string path = FLAGS_test_srcdir + "/testdata/" + image_files[i];
319+
std::string path = file::JoinPath(TESTING_DIR, image_files[i]);
296320
pix[i] = pixRead(path.c_str());
297321
QCHECK(pix[i] != nullptr) << "Could not read " << path;
298322

299323
tesseract::TessBaseAPI tess;
300324
EXPECT_EQ(0, tess.Init(kTessdataPath.c_str(), langs[i]));
301-
string ocr_result = GetCleanedTextResult(&tess, pix[i]);
325+
std::string ocr_result = GetCleanedTextResult(&tess, pix[i]);
302326
EXPECT_STREQ(gt_text[i], ocr_result.c_str());
303327
}
304328

305329
// Process the images in all pairwise combinations of associated languages.
306-
string ocr_result[2];
330+
std::string ocr_result[2];
307331
for (int i = 0; i < num_langs; ++i) {
308332
for (int j = i + 1; j < num_langs; ++j) {
309333
tesseract::TessBaseAPI tess1, tess2;
@@ -324,21 +348,21 @@ TEST(TesseractInstanceTest, TestMultipleTessInstances) {
324348

325349
// Tests whether Tesseract parameters are correctly set for the two instances.
326350
TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) {
327-
string illegal_name = "an_illegal_name";
328-
string langs[2] = {"eng", "hin"};
329-
string int_param_name = "tessedit_pageseg_mode";
351+
std::string illegal_name = "an_illegal_name";
352+
std::string langs[2] = {"eng", "hin"};
353+
std::string int_param_name = "tessedit_pageseg_mode";
330354
int int_param[2] = {1, 2};
331-
string int_param_str[2] = {"1", "2"};
332-
string bool_param_name = "tessedit_ambigs_training";
355+
std::string int_param_str[2] = {"1", "2"};
356+
std::string bool_param_name = "tessedit_ambigs_training";
333357
bool bool_param[2] = {false, true};
334-
string bool_param_str[2] = {"F", "T"};
335-
string str_param_name = "tessedit_char_blacklist";
336-
string str_param[2] = {"abc", "def"};
337-
string double_param_name = "segment_penalty_dict_frequent_word";
338-
string double_param_str[2] = {"0.01", "2"};
358+
std::string bool_param_str[2] = {"F", "T"};
359+
std::string str_param_name = "tessedit_char_blacklist";
360+
std::string str_param[2] = {"abc", "def"};
361+
std::string double_param_name = "segment_penalty_dict_frequent_word";
362+
std::string double_param_str[2] = {"0.01", "2"};
339363
double double_param[2] = {0.01, 2};
340364

341-
const string kTessdataPath = file::JoinPath(FLAGS_test_srcdir, "tessdata");
365+
const std::string kTessdataPath = TESSDATA_DIR;
342366

343367
tesseract::TessBaseAPI tess1, tess2;
344368
for (int i = 0; i < 2; ++i) {

0 commit comments

Comments
 (0)