Skip to content

Commit 9c89cd5

Browse files
committed
Add a new renderer to create box files from images for LSTM training
(cherry picked from commit 921da6b) fix typo (cherry picked from commit 7bd1a0c) Add lstmboxrenderer to CMakeLists (cherry picked from commit cfef3a8) fix formatting (cherry picked from commit 7ba2b01)
1 parent 56725de commit 9c89cd5

10 files changed

+150
-1
lines changed

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ set(tesseract_src ${tesseract_src}
252252
src/api/renderer.cpp
253253
src/api/altorenderer.cpp
254254
src/api/hocrrenderer.cpp
255+
src/api/lstmboxrenderer.cpp
255256
src/api/pdfrenderer.cpp
256257
)
257258

src/api/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ endif
3535
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
3636
libtesseract_api_la_SOURCES += altorenderer.cpp
3737
libtesseract_api_la_SOURCES += hocrrenderer.cpp
38+
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
3839
libtesseract_api_la_SOURCES += pdfrenderer.cpp
3940
libtesseract_api_la_SOURCES += renderer.cpp
4041

src/api/baseapi.h

+8
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,14 @@ class TESS_API TessBaseAPI {
613613
* Returned string must be freed with the delete [] operator.
614614
*/
615615
char* GetTSVText(int page_number);
616+
617+
/**
618+
* Make a box file for LSTM training from the internal data structures.
619+
* Constructs coordinates in the original image - not just the rectangle.
620+
* page_number is a 0-based page index that will appear in the box file.
621+
* Returned string must be freed with the delete [] operator.
622+
*/
623+
char* GetLSTMBOXText(int page_number);
616624

617625
/**
618626
* The recognized text is returned as a char* which is coded in the same

src/api/hocrrenderer.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
268268
if (grapheme && grapheme[0] != 0) {
269269
if (hocr_boxes) {
270270
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
271-
hocr_str << "<span class='ocrx_cinfo' title='x_bboxes "
271+
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
272272
<< left << " " << top << " " << right << " " << bottom
273273
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
274274
}

src/api/lstmboxrenderer.cpp

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/**********************************************************************
2+
* File: lstmboxrenderer.cpp
3+
* Description: Renderer for creating box file for LSTM training.
4+
* based on the tsv renderer.
5+
*
6+
* (C) Copyright 2006, Google Inc.
7+
** Licensed under the Apache License, Version 2.0 (the "License");
8+
** you may not use this file except in compliance with the License.
9+
** You may obtain a copy of the License at
10+
** http://www.apache.org/licenses/LICENSE-2.0
11+
** Unless required by applicable law or agreed to in writing, software
12+
** distributed under the License is distributed on an "AS IS" BASIS,
13+
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
** See the License for the specific language governing permissions and
15+
** limitations under the License.
16+
*
17+
**********************************************************************/
18+
19+
20+
#include <locale> // for std::locale::classic
21+
#include <memory> // for std::unique_ptr
22+
#include <sstream> // for std::stringstream
23+
#include "baseapi.h" // for TessBaseAPI
24+
#include "renderer.h"
25+
#include "tesseractclass.h" // for Tesseract
26+
27+
namespace tesseract {
28+
29+
/**
30+
* Create a UTF8 box file for LSTM training from the internal data structures.
31+
* page_number is a 0-base page index that will appear in the box file.
32+
* Returned string must be freed with the delete [] operator.
33+
*/
34+
35+
char* TessBaseAPI::GetLSTMBOXText(int page_number) {
36+
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
37+
return nullptr;
38+
39+
STRING lstm_box_str("");
40+
41+
int page_num = page_number;
42+
bool first_word = true;
43+
44+
LTRResultIterator* res_it = GetLTRIterator();
45+
while (!res_it->Empty(RIL_BLOCK)) {
46+
if (res_it->Empty(RIL_SYMBOL)) {
47+
res_it->Next(RIL_SYMBOL);
48+
continue;
49+
}
50+
51+
int left, top, right, bottom;
52+
53+
if (!first_word) {
54+
if (res_it->IsAtBeginningOf(RIL_WORD)) {
55+
lstm_box_str.add_str_int(" ", left);
56+
lstm_box_str.add_str_int(" ", image_height_ - bottom);
57+
lstm_box_str.add_str_int(" ", right + 2);
58+
lstm_box_str.add_str_int(" ", image_height_ - top);
59+
lstm_box_str.add_str_int(" ", page_num); // level 5 - word
60+
lstm_box_str += "\n"; // end of row for word
61+
}
62+
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
63+
lstm_box_str.add_str_int("\t ", left);
64+
lstm_box_str.add_str_int(" ", image_height_ - bottom);
65+
lstm_box_str.add_str_int(" ", right + 5);
66+
lstm_box_str.add_str_int(" ", image_height_ - top);
67+
lstm_box_str.add_str_int(" ", page_num); // level 4 - line
68+
lstm_box_str += "\n"; // end of row for line
69+
}
70+
}
71+
first_word=false;
72+
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
73+
74+
do {
75+
lstm_box_str +=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
76+
res_it->Next(RIL_SYMBOL);
77+
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
78+
79+
lstm_box_str.add_str_int(" ", left);
80+
lstm_box_str.add_str_int(" ", image_height_ - bottom);
81+
lstm_box_str.add_str_int(" ", right);
82+
lstm_box_str.add_str_int(" ", image_height_ - top);
83+
lstm_box_str.add_str_int(" ", page_num); // level 6 - symbol
84+
lstm_box_str += "\n"; // end of row
85+
86+
}
87+
88+
char* ret = new char[lstm_box_str.length() + 1];
89+
strcpy(ret, lstm_box_str.string());
90+
delete res_it;
91+
return ret;
92+
}
93+
94+
/**********************************************************************
95+
* LSTMBOX Renderer interface implementation
96+
**********************************************************************/
97+
/**
 * Construct the LSTM box renderer by forwarding outputbase and the literal
 * "box" to the TessResultRenderer base (presumably the output file
 * extension - confirm against TessResultRenderer).
 */
TessLSTMBOXRenderer::TessLSTMBOXRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {}
100+
101+
/**
 * Fetch the LSTM box text for the current image from the API and append it
 * to this renderer's output.
 *
 * @param api API object queried via GetLSTMBOXText(imagenum()).
 * @return false if the API returned no text (nullptr), true otherwise.
 */
bool TessLSTMBOXRenderer::AddImageHandler(TessBaseAPI* api) {
  // The returned buffer is released with delete [] when box_text goes out of
  // scope.
  const std::unique_ptr<const char[]> box_text(
      api->GetLSTMBOXText(imagenum()));
  if (!box_text) {
    return false;
  }

  AppendString(box_text.get());
  return true;
}
109+
110+
} // namespace tesseract.

src/api/renderer.h

+11
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,17 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
247247
virtual bool AddImageHandler(TessBaseAPI* api);
248248
};
249249

250+
/**
251+
* Renders tesseract output into a plain UTF-8 text string for LSTMBOX
252+
*/
253+
class TESS_API TessLSTMBOXRenderer : public TessResultRenderer {
254+
public:
255+
explicit TessLSTMBOXRenderer(const char *outputbase);
256+
257+
protected:
258+
virtual bool AddImageHandler(TessBaseAPI* api);
259+
};
260+
250261
/**
251262
* Renders tesseract output into a plain UTF-8 text string
252263
*/

src/api/tesseractmain.cpp

+14
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,20 @@ static void PreloadRenderers(
494494
}
495495
}
496496

497+
api->GetBoolVariable("tessedit_create_lstmbox", &b);
498+
if (b) {
499+
tesseract::TessLSTMBOXRenderer* renderer =
500+
new tesseract::TessLSTMBOXRenderer(outputbase);
501+
if (renderer->happy()) {
502+
renderers->push_back(renderer);
503+
} else {
504+
delete renderer;
505+
tprintf("Error, could not create LSTM BOX output file: %s\n",
506+
strerror(errno));
507+
error = true;
508+
}
509+
}
510+
497511
api->GetBoolVariable("tessedit_create_boxfile", &b);
498512
if (b) {
499513
tesseract::TessBoxTextRenderer* renderer =

src/ccmain/tesseractclass.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,8 @@ Tesseract::Tesseract()
391391
this->params()),
392392
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
393393
this->params()),
394+
BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
395+
this->params()),
394396
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
395397
this->params()),
396398
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",

src/ccmain/tesseractclass.h

+1
Original file line numberDiff line numberDiff line change
@@ -1040,6 +1040,7 @@ class Tesseract : public Wordrec {
10401040
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
10411041
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
10421042
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
1043+
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
10431044
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
10441045
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
10451046
BOOL_VAR_H(textonly_pdf, false,

tessdata/configs/lstmbox

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
tessedit_create_lstmbox 1

0 commit comments

Comments
 (0)