Skip to content

Commit d7cee03

Browse files
jakesebright authored and stweil committed
Add support for ALTO output
1 parent 685b136 commit d7cee03

File tree

12 files changed

+309
-2
lines changed

12 files changed

+309
-2
lines changed

CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ set(tesseract_src ${tesseract_src}
215215
src/api/capi.cpp
216216
src/api/renderer.cpp
217217
src/api/pdfrenderer.cpp
218+
src/api/altorenderer.cpp
218219
)
219220

220221
if (WIN32)
@@ -223,7 +224,7 @@ if (WIN32)
223224
set(tesseract_hdr
224225
${tesseract_hdr}
225226
${CMAKE_CURRENT_SOURCE_DIR}/src/vs2010/tesseract/resource.h)
226-
set(tesseract_rsc ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/libtesseract.rc)
227+
set(tesseract_rsc ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/libtesseract.rc src/api/altorenderer.cpp)
227228
set_source_files_properties(
228229
${CMAKE_CURRENT_SOURCE_DIR}/src/arch/dotproductsse.cpp
229230
PROPERTIES COMPILE_DEFINITIONS __SSE4_1__)

android/jni/Android.mk

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ LOCAL_SRC_FILES := $(wildcard $(LOCAL_PATH)/../../api/*.cpp $(LOCAL_PATH)/../../
3131

3232
EXPLICIT_SRC_EXCLUDES := \
3333
$(LOCAL_PATH)/../../api/pdfrenderer.cpp \
34+
$(LOCAL_PATH)/../../api/altorenderer.cpp \
3435
$(LOCAL_PATH)/../../api/tesseractmain.cpp \
3536

3637
LOCAL_SRC_FILES := $(filter-out $(EXPLICIT_SRC_EXCLUDES), $(LOCAL_SRC_FILES))

src/api/Makefile.am

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
3232
if VISIBILITY
3333
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
3434
endif
35-
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp
35+
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp altorenderer.cpp
3636

3737
lib_LTLIBRARIES += libtesseract.la
3838
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS)

src/api/altorenderer.cpp

+252
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
// File: altorenderer.cpp
2+
// Description: ALTO rendering interface
3+
// Author: Jake Sebright
4+
5+
// (C) Copyright 2018
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
16+
#include "baseapi.h"
17+
#include <memory>
18+
#include "renderer.h"
19+
20+
namespace tesseract {
21+
22+
///
23+
/// Add coordinates to specified TextBlock, TextLine, or String bounding box
24+
/// Add word confidence if adding to a String bounding box
25+
///
26+
static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
27+
STRING *alto_str) {
28+
int left, top, right, bottom;
29+
it->BoundingBox(level, &left, &top, &right, &bottom);
30+
31+
int hpos = left;
32+
int vpos = top;
33+
int height = bottom - top;
34+
int width = right - left;
35+
36+
*alto_str += " HPOS=\"";
37+
alto_str->add_str_int("", hpos);
38+
*alto_str += "\"";
39+
*alto_str += " VPOS=\"";
40+
alto_str->add_str_int("", vpos);
41+
*alto_str += "\"";
42+
*alto_str += " WIDTH=\"";
43+
alto_str->add_str_int("", width);
44+
*alto_str += "\"";
45+
*alto_str += " HEIGHT=\"";
46+
alto_str->add_str_int("", height);
47+
*alto_str += "\"";
48+
49+
if (level == RIL_WORD) {
50+
int wc = it->Confidence(RIL_WORD);
51+
*alto_str += " WC=\"0.";
52+
alto_str->add_str_int("", wc);
53+
*alto_str += "\"";
54+
}
55+
if (level != RIL_WORD) {
56+
57+
*alto_str += ">";
58+
}
59+
}
60+
61+
///
62+
/// Add a unique ID to an ALTO element
63+
///
64+
static void AddIdToAlto(STRING *alto_str, const std::string base, int num1) {
65+
const size_t BUFSIZE = 64;
66+
char id_buffer[BUFSIZE];
67+
snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
68+
id_buffer[BUFSIZE - 1] = '\0';
69+
*alto_str += " ID=\"";
70+
*alto_str += id_buffer;
71+
*alto_str += "\"";
72+
}
73+
74+
///
75+
/// Append the ALTO XML for the beginning of the document
76+
///
77+
bool TessAltoRenderer::BeginDocumentHandler() {
78+
AppendString(
79+
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
80+
"<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
81+
"\t<Description>\n"
82+
"\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
83+
"\t\t<sourceImageInformation>\n"
84+
"\t\t\t<fileName>");
85+
86+
AppendString(title());
87+
88+
AppendString("\t\t\t</fileName>\n"
89+
"\t\t</sourceImageInformation>\n"
90+
"\t\t<OCRProcessing ID=\"OCR_0\">\n"
91+
"\t\t\t<ocrProcessingStep>\n"
92+
"\t\t\t\t<processingSoftware>\n"
93+
"\t\t\t\t\t<softwareName>tesseract ");
94+
AppendString(TessBaseAPI::Version());
95+
AppendString("</softwareName>\n"
96+
"\t\t\t\t</processingSoftware>\n"
97+
"\t\t\t</ocrProcessingStep>\n"
98+
"\t\t</OCRProcessing>\n"
99+
"\t</Description>\n"
100+
"\t<Layout>\n");
101+
102+
return true;
103+
}
104+
105+
///
106+
/// Append the ALTO XML for the layout of the image
107+
///
108+
bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) {
109+
const std::unique_ptr<const char[]> hocr(api->GetAltoText(imagenum()));
110+
if (hocr == nullptr) return false;
111+
112+
AppendString(hocr.get());
113+
114+
return true;
115+
}
116+
117+
///
118+
/// Append the ALTO XML for the end of the document
119+
///
120+
bool TessAltoRenderer::EndDocumentHandler() {
121+
AppendString("\t</Layout>\n</alto>\n");
122+
123+
return true;
124+
}
125+
126+
///
/// Construct an ALTO renderer; the base renderer is given the output base
/// name together with the "xml" extension.
///
TessAltoRenderer::TessAltoRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "xml") {}
129+
130+
///
131+
/// Make an XML-formatted string with ALTO markup from the internal
132+
/// data structures.
133+
///
134+
char *TessBaseAPI::GetAltoText(int page_number) {
135+
return GetAltoText(nullptr, page_number);
136+
}
137+
138+
///
139+
/// Make an XML-formatted string with ALTO markup from the internal
140+
/// data structures.
141+
///
142+
char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
143+
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
144+
return nullptr;
145+
146+
int lcnt = 0, bcnt = 0, wcnt = 0;
147+
int page_id = page_number;
148+
149+
STRING alto_str("");
150+
151+
if (input_file_ == nullptr)
152+
SetInputName(nullptr);
153+
154+
#ifdef _WIN32
155+
// convert input name from ANSI encoding to utf-8
156+
int str16_len =
157+
MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
158+
wchar_t *uni16_str = new WCHAR[str16_len];
159+
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
160+
uni16_str, str16_len);
161+
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0,
162+
nullptr, nullptr);
163+
char *utf8_str = new char[utf8_len];
164+
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
165+
utf8_len, nullptr, nullptr);
166+
*input_file_ = utf8_str;
167+
delete[] uni16_str;
168+
delete[] utf8_str;
169+
#endif
170+
171+
alto_str += "\t\t<Page WIDTH=\"";
172+
alto_str.add_str_int("", rect_width_);
173+
alto_str += "\" HEIGHT=\"";
174+
alto_str.add_str_int("", rect_height_);
175+
alto_str += "\" PHYSICAL_IMG_NR=\"";
176+
alto_str.add_str_int("", rect_height_);
177+
alto_str += "\"";
178+
AddIdToAlto(&alto_str, "page", page_id);
179+
alto_str += ">\n";
180+
alto_str += ("\t\t\t<PrintSpace HPOS=\"0\" "
181+
"VPOS=\"0\""
182+
" WIDTH=\"");
183+
alto_str.add_str_int("", rect_width_);
184+
alto_str += "\" HEIGHT=\"";
185+
alto_str.add_str_int("", rect_height_);
186+
alto_str += "\">\n";
187+
188+
ResultIterator *res_it = GetIterator();
189+
while (!res_it->Empty(RIL_BLOCK)) {
190+
if (res_it->Empty(RIL_WORD)) {
191+
res_it->Next(RIL_WORD);
192+
continue;
193+
}
194+
195+
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
196+
alto_str += "\t\t\t\t<TextBlock ";
197+
AddIdToAlto(&alto_str, "block", bcnt);
198+
AddBoxToAlto(res_it, RIL_BLOCK, &alto_str);
199+
alto_str += "\n";
200+
}
201+
202+
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
203+
204+
alto_str += "\t\t\t\t\t<TextLine ";
205+
AddIdToAlto(&alto_str, "line", lcnt);
206+
AddBoxToAlto(res_it, RIL_TEXTLINE, &alto_str);
207+
alto_str += "\n";
208+
}
209+
210+
alto_str += "\t\t\t\t\t\t<String ";
211+
AddIdToAlto(&alto_str, "string", wcnt);
212+
AddBoxToAlto(res_it, RIL_WORD, &alto_str);
213+
alto_str += " CONTENT=\"";
214+
215+
216+
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
217+
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
218+
219+
do {
220+
const std::unique_ptr<const char[]> grapheme(
221+
res_it->GetUTF8Text(RIL_SYMBOL));
222+
if (grapheme && grapheme[0] != 0) {
223+
alto_str += HOcrEscape(grapheme.get());
224+
}
225+
res_it->Next(RIL_SYMBOL);
226+
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
227+
228+
alto_str += "\"/>\n";
229+
230+
wcnt++;
231+
232+
if (last_word_in_line) {
233+
alto_str += "\t\t\t\t\t</TextLine>\n";
234+
lcnt++;
235+
}
236+
237+
if (last_word_in_block) {
238+
alto_str += "\t\t\t\t</TextBlock>\n";
239+
bcnt++;
240+
}
241+
}
242+
243+
alto_str += "\t\t\t</PrintSpace>\n";
244+
alto_str += "\t\t</Page>\n";
245+
246+
char *ret = new char[alto_str.length() + 1];
247+
strcpy(ret, alto_str.string());
248+
delete res_it;
249+
return ret;
250+
}
251+
252+
}

src/api/baseapi.h

+13
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,19 @@ class TESS_API TessBaseAPI {
594594
*/
595595
char* GetHOCRText(int page_number);
596596

597+
/**
598+
* Make an XML-formatted string with Alto markup from the internal
599+
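* data structures. If recognition has not been run yet, monitor is passed
* to Recognize(). Returns nullptr on failure; otherwise the caller owns the
* returned buffer and must release it with delete [].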
600+
*/
601+
char* GetAltoText(ETEXT_DESC* monitor, int page_number);
602+
603+
604+
/**
605+
* Make an XML-formatted string with Alto markup from the internal
606+
* data structures. Equivalent to GetAltoText(nullptr, page_number).
607+
*/
608+
char* GetAltoText(int page_number);
609+
597610
/**
598611
* Make a TSV-formatted string from the internal data structures.
599612
* page_number is 0-based but will appear in the output as 1-based.

src/api/capi.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outpu
6666
return new TessHOcrRenderer(outputbase, font_info);
6767
}
6868

69+
/* Create a renderer that produces ALTO XML output for the given output base
 * name. The caller receives ownership of the new renderer. */
TESS_API TessResultRenderer* TESS_CALL TessAltoRendererCreate(const char* outputbase)
{
    return new TessAltoRenderer(outputbase);
}

/* C wrapper for TessBaseAPI::GetAltoText(int). capi.h declares this entry
 * point, so it needs a definition here or C API users get an unresolved
 * symbol at link time. Returns the new[]-allocated buffer produced by
 * GetAltoText, or nullptr on failure. */
TESS_API char* TESS_CALL TessBaseAPIGetAltoText(TessBaseAPI* handle, int page_number)
{
    return handle->GetAltoText(page_number);
}
73+
6974
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir,
7075
BOOL textonly)
7176
{

src/api/capi.h

+4
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ extern "C" {
5656
typedef tesseract::TessResultRenderer TessResultRenderer;
5757
typedef tesseract::TessTextRenderer TessTextRenderer;
5858
typedef tesseract::TessHOcrRenderer TessHOcrRenderer;
59+
typedef tesseract::TessAltoRenderer TessAltoRenderer;
5960
typedef tesseract::TessPDFRenderer TessPDFRenderer;
6061
typedef tesseract::TessUnlvRenderer TessUnlvRenderer;
6162
typedef tesseract::TessBoxTextRenderer TessBoxTextRenderer;
@@ -126,6 +127,7 @@ TESS_API void TESS_CALL TessDeleteIntArray(int* arr);
126127
TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase);
127128
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase);
128129
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info);
130+
TESS_API TessResultRenderer* TESS_CALL TessAltoRendererCreate(const char* outputbase);
129131
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir,
130132
BOOL textonly);
131133
TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase);
@@ -277,6 +279,8 @@ TESS_API TessMutableIterator*
277279
TESS_API char* TESS_CALL TessBaseAPIGetUTF8Text(TessBaseAPI* handle);
278280
TESS_API char* TESS_CALL TessBaseAPIGetHOCRText(TessBaseAPI* handle, int page_number);
279281

282+
TESS_API char* TESS_CALL TessBaseAPIGetAltoText(TessBaseAPI* handle, int page_number);
283+
280284
TESS_API char* TESS_CALL TessBaseAPIGetBoxText(TessBaseAPI* handle, int page_number);
281285

282286
TESS_API char* TESS_CALL TessBaseAPIGetUNLVText(TessBaseAPI* handle);

src/api/renderer.h

+14
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,20 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
166166
bool font_info_; // whether to print font information
167167
};
168168

169+
/**
170+
* Renders tesseract output into an alto text string
171+
*/
172+
class TESS_API TessAltoRenderer : public TessResultRenderer {
173+
public:
174+
explicit TessAltoRenderer(const char *outputbase);
175+
176+
protected:
177+
virtual bool BeginDocumentHandler();
178+
virtual bool AddImageHandler(TessBaseAPI* api);
179+
virtual bool EndDocumentHandler();
180+
181+
};
182+
169183
/**
170184
* Renders Tesseract output into a TSV string
171185
*/

src/api/tesseractmain.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,19 @@ static void PreloadRenderers(
419419
}
420420
}
421421

422+
api->GetBoolVariable("tessedit_create_alto", &b);
423+
if (b) {
424+
tesseract::TessAltoRenderer* renderer =
425+
new tesseract::TessAltoRenderer(outputbase);
426+
if (renderer->happy()) {
427+
renderers->push_back(renderer);
428+
} else {
429+
delete renderer;
430+
tprintf("Error, could not create ALTO output file: %s\n",
431+
strerror(errno));
432+
}
433+
}
434+
422435
api->GetBoolVariable("tessedit_create_tsv", &b);
423436
if (b) {
424437
bool font_info;

src/ccmain/tesseractclass.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,8 @@ Tesseract::Tesseract()
387387
this->params()),
388388
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
389389
this->params()),
390+
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
391+
this->params()),
390392
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
391393
this->params()),
392394
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",

0 commit comments

Comments
 (0)