Skip to content

Commit c1c1e42

Browse files
committed
Added new LSTM-based neural network line recognizer
1 parent 5d21ecf commit c1c1e42

File tree

107 files changed

+15410
-354
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

107 files changed

+15410
-354
lines changed

Makefile.am

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ endif
1616

1717
.PHONY: install-langs ScrollView.jar install-jars training
1818

19-
SUBDIRS = ccutil viewer cutil opencl ccstruct dict classify wordrec textord
19+
SUBDIRS = arch ccutil viewer cutil opencl ccstruct dict classify wordrec textord lstm
2020
if !NO_CUBE_BUILD
2121
SUBDIRS += neural_networks/runtime cube
2222
endif

api/Makefile.am

+7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
AM_CPPFLAGS += -DLOCALEDIR=\"$(localedir)\"\
22
-DUSE_STD_NAMESPACE \
3+
-I$(top_srcdir)/arch -I$(top_srcdir)/lstm \
34
-I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct -I$(top_srcdir)/cube \
45
-I$(top_srcdir)/viewer \
56
-I$(top_srcdir)/textord -I$(top_srcdir)/dict \
@@ -27,6 +28,9 @@ libtesseract_api_la_LIBADD = \
2728
../wordrec/libtesseract_wordrec.la \
2829
../classify/libtesseract_classify.la \
2930
../dict/libtesseract_dict.la \
31+
../arch/libtesseract_avx.la \
32+
../arch/libtesseract_sse.la \
33+
../lstm/libtesseract_lstm.la \
3034
../ccstruct/libtesseract_ccstruct.la \
3135
../cutil/libtesseract_cutil.la \
3236
../viewer/libtesseract_viewer.la \
@@ -57,6 +61,9 @@ libtesseract_la_LIBADD = \
5761
../wordrec/libtesseract_wordrec.la \
5862
../classify/libtesseract_classify.la \
5963
../dict/libtesseract_dict.la \
64+
../arch/libtesseract_avx.la \
65+
../arch/libtesseract_sse.la \
66+
../lstm/libtesseract_lstm.la \
6067
../ccstruct/libtesseract_ccstruct.la \
6168
../cutil/libtesseract_cutil.la \
6269
../viewer/libtesseract_viewer.la \

api/baseapi.cpp

+24-39
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@ TessBaseAPI::TessBaseAPI()
121121
block_list_(NULL),
122122
page_res_(NULL),
123123
input_file_(NULL),
124-
input_image_(NULL),
125124
output_file_(NULL),
126125
datapath_(NULL),
127126
language_(NULL),
@@ -515,19 +514,19 @@ void TessBaseAPI::ClearAdaptiveClassifier() {
515514

516515
/**
517516
* Provide an image for Tesseract to recognize. Format is as
518-
* TesseractRect above. Does not copy the image buffer, or take
519-
* ownership. The source image may be destroyed after Recognize is called,
520-
* either explicitly or implicitly via one of the Get*Text functions.
517+
* TesseractRect above. Copies the image buffer and converts to Pix.
521518
* SetImage clears all recognition results, and sets the rectangle to the
522519
* full image, so it may be followed immediately by a GetUTF8Text, and it
523520
* will automatically perform recognition.
524521
*/
525522
void TessBaseAPI::SetImage(const unsigned char* imagedata,
526523
int width, int height,
527524
int bytes_per_pixel, int bytes_per_line) {
528-
if (InternalSetImage())
525+
if (InternalSetImage()) {
529526
thresholder_->SetImage(imagedata, width, height,
530527
bytes_per_pixel, bytes_per_line);
528+
SetInputImage(thresholder_->GetPixRect());
529+
}
531530
}
532531

533532
void TessBaseAPI::SetSourceResolution(int ppi) {
@@ -539,18 +538,17 @@ void TessBaseAPI::SetSourceResolution(int ppi) {
539538

540539
/**
541540
* Provide an image for Tesseract to recognize. As with SetImage above,
542-
* Tesseract doesn't take a copy or ownership or pixDestroy the image, so
543-
* it must persist until after Recognize.
541+
* Tesseract takes its own copy of the image, so it need not persist until
542+
* after Recognize.
544543
* Pix vs raw, which to use?
545-
* Use Pix where possible. A future version of Tesseract may choose to use Pix
546-
* as its internal representation and discard IMAGE altogether.
547-
* Because of that, an implementation that sources and targets Pix may end up
548-
* with less copies than an implementation that does not.
544+
* Use Pix where possible. Tesseract uses Pix as its internal representation
545+
* and it is therefore more efficient to provide a Pix directly.
549546
*/
550547
void TessBaseAPI::SetImage(Pix* pix) {
551-
if (InternalSetImage())
548+
if (InternalSetImage()) {
552549
thresholder_->SetImage(pix);
553-
SetInputImage(pix);
550+
SetInputImage(thresholder_->GetPixRect());
551+
}
554552
}
555553

556554
/**
@@ -693,8 +691,8 @@ Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level,
693691
if (pixa != NULL) {
694692
Pix* pix = NULL;
695693
if (raw_image) {
696-
pix = page_it->GetImage(level, raw_padding, input_image_,
697-
&left, &top);
694+
pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left,
695+
&top);
698696
} else {
699697
pix = page_it->GetBinaryImage(level);
700698
}
@@ -849,13 +847,17 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
849847
} else if (tesseract_->tessedit_resegment_from_boxes) {
850848
page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
851849
} else {
852-
// TODO(rays) LSTM here.
853-
page_res_ = new PAGE_RES(false,
850+
page_res_ = new PAGE_RES(tesseract_->AnyLSTMLang(),
854851
block_list_, &tesseract_->prev_word_best_choice_);
855852
}
856853
if (page_res_ == NULL) {
857854
return -1;
858855
}
856+
if (tesseract_->tessedit_train_line_recognizer) {
857+
tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_);
858+
tesseract_->CorrectClassifyWords(page_res_);
859+
return 0;
860+
}
859861
if (tesseract_->tessedit_make_boxes_from_boxes) {
860862
tesseract_->CorrectClassifyWords(page_res_);
861863
return 0;
@@ -938,17 +940,10 @@ int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
938940
return 0;
939941
}
940942

941-
void TessBaseAPI::SetInputImage(Pix *pix) {
942-
if (input_image_)
943-
pixDestroy(&input_image_);
944-
input_image_ = NULL;
945-
if (pix)
946-
input_image_ = pixCopy(NULL, pix);
947-
}
943+
// Takes ownership of the input pix.
944+
void TessBaseAPI::SetInputImage(Pix* pix) { tesseract_->set_pix_original(pix); }
948945

949-
Pix* TessBaseAPI::GetInputImage() {
950-
return input_image_;
951-
}
946+
Pix* TessBaseAPI::GetInputImage() { return tesseract_->pix_original(); }
952947

953948
const char * TessBaseAPI::GetInputName() {
954949
if (input_file_)
@@ -992,8 +987,7 @@ bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
992987
}
993988

994989
// Begin producing output
995-
const char* kUnknownTitle = "";
996-
if (renderer && !renderer->BeginDocument(kUnknownTitle)) {
990+
if (renderer && !renderer->BeginDocument(unknown_title_)) {
997991
return false;
998992
}
999993

@@ -1105,7 +1099,6 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
11051099
const char* retry_config,
11061100
int timeout_millisec,
11071101
TessResultRenderer* renderer) {
1108-
#ifndef ANDROID_BUILD
11091102
PERF_COUNT_START("ProcessPages")
11101103
bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
11111104
if (stdInput) {
@@ -1162,8 +1155,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
11621155
}
11631156

11641157
// Begin the output
1165-
const char* kUnknownTitle = "";
1166-
if (renderer && !renderer->BeginDocument(kUnknownTitle)) {
1158+
if (renderer && !renderer->BeginDocument(unknown_title_)) {
11671159
pixDestroy(&pix);
11681160
return false;
11691161
}
@@ -1185,9 +1177,6 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
11851177
}
11861178
PERF_COUNT_END
11871179
return true;
1188-
#else
1189-
return false;
1190-
#endif
11911180
}
11921181

11931182
bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
@@ -2107,10 +2096,6 @@ void TessBaseAPI::End() {
21072096
delete input_file_;
21082097
input_file_ = NULL;
21092098
}
2110-
if (input_image_ != NULL) {
2111-
pixDestroy(&input_image_);
2112-
input_image_ = NULL;
2113-
}
21142099
if (output_file_ != NULL) {
21152100
delete output_file_;
21162101
output_file_ = NULL;

api/baseapi.h

+14-12
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
#ifndef TESSERACT_API_BASEAPI_H__
2121
#define TESSERACT_API_BASEAPI_H__
2222

23-
#define TESSERACT_VERSION_STR "3.05.00dev"
24-
#define TESSERACT_VERSION 0x030500
23+
#define TESSERACT_VERSION_STR "4.00.00alpha"
24+
#define TESSERACT_VERSION 0x040000
2525
#define MAKE_VERSION(major, minor, patch) (((major) << 16) | ((minor) << 8) | \
2626
(patch))
2727

@@ -142,6 +142,7 @@ class TESS_API TessBaseAPI {
142142
* is stored in the PDF so we need that as well.
143143
*/
144144
const char* GetInputName();
145+
// Takes ownership of the input pix.
145146
void SetInputImage(Pix *pix);
146147
Pix* GetInputImage();
147148
int GetSourceYResolution();
@@ -333,9 +334,7 @@ class TESS_API TessBaseAPI {
333334

334335
/**
335336
* Provide an image for Tesseract to recognize. Format is as
336-
* TesseractRect above. Does not copy the image buffer, or take
337-
* ownership. The source image may be destroyed after Recognize is called,
338-
* either explicitly or implicitly via one of the Get*Text functions.
337+
* TesseractRect above. Copies the image buffer and converts to Pix.
339338
* SetImage clears all recognition results, and sets the rectangle to the
340339
* full image, so it may be followed immediately by a GetUTF8Text, and it
341340
* will automatically perform recognition.
@@ -345,13 +344,11 @@ class TESS_API TessBaseAPI {
345344

346345
/**
347346
* Provide an image for Tesseract to recognize. As with SetImage above,
348-
* Tesseract doesn't take a copy or ownership or pixDestroy the image, so
349-
* it must persist until after Recognize.
347+
* Tesseract takes its own copy of the image, so it need not persist until
348+
* after Recognize.
350349
* Pix vs raw, which to use?
351-
* Use Pix where possible. A future version of Tesseract may choose to use Pix
352-
* as its internal representation and discard IMAGE altogether.
353-
* Because of that, an implementation that sources and targets Pix may end up
354-
* with less copies than an implementation that does not.
350+
* Use Pix where possible. Tesseract uses Pix as its internal representation
351+
* and it is therefore more efficient to provide a Pix directly.
355352
*/
356353
void SetImage(Pix* pix);
357354

@@ -866,7 +863,6 @@ class TESS_API TessBaseAPI {
866863
BLOCK_LIST* block_list_; ///< The page layout.
867864
PAGE_RES* page_res_; ///< The page-level data.
868865
STRING* input_file_; ///< Name used by training code.
869-
Pix* input_image_; ///< Image used for searchable PDF
870866
STRING* output_file_; ///< Name used by debug code.
871867
STRING* datapath_; ///< Current location of tessdata.
872868
STRING* language_; ///< Last initialized language.
@@ -902,6 +898,12 @@ class TESS_API TessBaseAPI {
902898
int timeout_millisec,
903899
TessResultRenderer* renderer,
904900
int tessedit_page_number);
901+
// There's currently no way to pass a document title from the
902+
// Tesseract command line, and we have multiple places that choose
903+
// to set the title to an empty string. Using a single named
904+
// variable will hopefully reduce confusion if the situation changes
905+
// in the future.
906+
const char *unknown_title_ = "";
905907
}; // class TessBaseAPI.
906908

907909
/** Escape a char string - remove &<>"' with HTML codes. */

api/pdfrenderer.cpp

+3-4
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,6 @@ bool TessPDFRenderer::BeginDocumentHandler() {
620620
AppendPDFObject(buf);
621621

622622
// FONT DESCRIPTOR
623-
const int kCharHeight = 2; // Effect: highlights are half height
624623
n = snprintf(buf, sizeof(buf),
625624
"7 0 obj\n"
626625
"<<\n"
@@ -636,10 +635,10 @@ bool TessPDFRenderer::BeginDocumentHandler() {
636635
" /Type /FontDescriptor\n"
637636
">>\n"
638637
"endobj\n",
639-
1000 / kCharHeight,
640-
1000 / kCharHeight,
638+
1000,
639+
1000,
641640
1000 / kCharWidth,
642-
1000 / kCharHeight,
641+
1000,
643642
8L // Font data
644643
);
645644
if (n >= sizeof(buf)) return false;

api/renderer.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ class TESS_API TessResultRenderer {
7777
bool EndDocument();
7878

7979
const char* file_extension() const { return file_extension_; }
80-
const char* title() const { return title_; }
80+
const char* title() const { return title_.c_str(); }
8181

8282
/**
8383
* Returns the index of the last image given to AddImage
@@ -126,7 +126,7 @@ class TESS_API TessResultRenderer {
126126

127127
private:
128128
const char* file_extension_; // standard extension for generated output
129-
const char* title_; // title of document being rendered
129+
STRING title_; // title of document being rendered
130130
int imagenum_; // index of last image added
131131

132132
FILE* fout_; // output file pointer

arch/Makefile.am

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
AM_CPPFLAGS += -I$(top_srcdir)/ccutil
2+
AUTOMAKE_OPTIONS = subdir-objects
3+
SUBDIRS =
4+
AM_CXXFLAGS =
5+
6+
if VISIBILITY
7+
AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
8+
AM_CPPFLAGS += -DTESS_EXPORTS
9+
endif
10+
11+
include_HEADERS = \
12+
dotproductavx.h dotproductsse.h
13+
14+
noinst_HEADERS =
15+
16+
if !USING_MULTIPLELIBS
17+
noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
18+
else
19+
lib_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
20+
libtesseract_avx_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
21+
libtesseract_sse_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
22+
endif
23+
libtesseract_avx_la_CXXFLAGS = -mavx
24+
libtesseract_sse_la_CXXFLAGS = -msse4.1
25+
26+
libtesseract_avx_la_SOURCES = dotproductavx.cpp
27+
28+
libtesseract_sse_la_SOURCES = dotproductsse.cpp
29+

0 commit comments

Comments
 (0)