Skip to content

Commit b1e4a82

Browse files
sundarc authored and tfmorris committed
Render output in TSV format.
1 parent 738fe4f commit b1e4a82

File tree

2 files changed

+67
-91
lines changed

2 files changed

+67
-91
lines changed

api/baseapi.cpp

+62-63
Original file line numberDiff line numberDiff line change
@@ -1417,6 +1417,19 @@ static void AddBoxTohOCR(const ResultIterator *it,
14171417
*hocr_str += "\">";
14181418
}
14191419

1420+
static void AddBoxTohOCRTSV(const PageIterator *it,
1421+
PageIteratorLevel level,
1422+
STRING* hocr_str) {
1423+
int left, top, right, bottom;
1424+
it->BoundingBox(level, &left, &top, &right, &bottom);
1425+
hocr_str->add_str_int("\t", left);
1426+
hocr_str->add_str_int("\t", top);
1427+
hocr_str->add_str_int("\t", right - left + 1);
1428+
hocr_str->add_str_int("\t", bottom - top + 1);
1429+
}
1430+
1431+
1432+
14201433
/**
14211434
* Make an HTML-formatted string with hOCR markup from the internal
14221435
* data structures.
@@ -1641,19 +1654,18 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
16411654
delete[] utf8_str;
16421655
#endif
16431656

1644-
hocr_str.add_str_int(" <div class='ocr_page' id='page_", page_id);
1645-
hocr_str += "' title='image \"";
1646-
if (input_file_) {
1647-
hocr_str += HOcrEscape(input_file_->string());
1648-
} else {
1649-
hocr_str += "unknown";
1650-
}
1651-
hocr_str.add_str_int("\"; bbox ", rect_left_);
1652-
hocr_str.add_str_int(" ", rect_top_);
1653-
hocr_str.add_str_int(" ", rect_width_);
1654-
hocr_str.add_str_int(" ", rect_height_);
1655-
hocr_str.add_str_int("; ppageno ", page_number);
1656-
hocr_str += "'>\n";
1657+
int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;
1658+
1659+
hocr_str.add_str_int("1\t", page_num);
1660+
hocr_str.add_str_int("\t", block_num);
1661+
hocr_str.add_str_int("\t", par_num);
1662+
hocr_str.add_str_int("\t", line_num);
1663+
hocr_str.add_str_int("\t", word_num);
1664+
hocr_str.add_str_int("\t", rect_left_);
1665+
hocr_str.add_str_int("\t", rect_top_);
1666+
hocr_str.add_str_int("\t", rect_width_);
1667+
hocr_str.add_str_int("\t", rect_height_);
1668+
hocr_str += "\t-1\t\n";
16571669

16581670
ResultIterator *res_it = GetIterator();
16591671
while (!res_it->Empty(RIL_BLOCK)) {
@@ -1664,31 +1676,37 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
16641676

16651677
// Open any new block/paragraph/textline.
16661678
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1667-
hocr_str.add_str_int(" <div class='ocr_carea' id='block_", page_id);
1668-
hocr_str.add_str_int("_", bcnt);
1669-
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
1679+
block_num++, par_num = 0, line_num = 0, word_num = 0;
1680+
hocr_str.add_str_int("2\t", page_num);
1681+
hocr_str.add_str_int("\t", block_num);
1682+
hocr_str.add_str_int("\t", par_num);
1683+
hocr_str.add_str_int("\t", line_num);
1684+
hocr_str.add_str_int("\t", word_num);
1685+
AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str);
1686+
hocr_str += "\t-1\t\n";
16701687
}
16711688
if (res_it->IsAtBeginningOf(RIL_PARA)) {
1672-
if (res_it->ParagraphIsLtr()) {
1673-
hocr_str.add_str_int("\n <p class='ocr_par' dir='ltr' id='par_",
1674-
page_id);
1675-
hocr_str.add_str_int("_", pcnt);
1676-
} else {
1677-
hocr_str.add_str_int("\n <p class='ocr_par' dir='rtl' id='par_",
1678-
page_id);
1679-
hocr_str.add_str_int("_", pcnt);
1680-
}
1681-
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
1689+
par_num++, line_num = 0, word_num = 0;
1690+
hocr_str.add_str_int("3\t", page_num);
1691+
hocr_str.add_str_int("\t", block_num);
1692+
hocr_str.add_str_int("\t", par_num);
1693+
hocr_str.add_str_int("\t", line_num);
1694+
hocr_str.add_str_int("\t", word_num);
1695+
AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str);
1696+
hocr_str += "\t-1\t\n";
16821697
}
16831698
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1684-
hocr_str.add_str_int("\n <span class='ocr_line' id='line_", page_id);
1685-
hocr_str.add_str_int("_", lcnt);
1686-
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
1699+
line_num++, word_num = 0;
1700+
hocr_str.add_str_int("4\t", page_num);
1701+
hocr_str.add_str_int("\t", block_num);
1702+
hocr_str.add_str_int("\t", par_num);
1703+
hocr_str.add_str_int("\t", line_num);
1704+
hocr_str.add_str_int("\t", word_num);
1705+
AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str);
1706+
hocr_str += "\t-1\t\n";
16871707
}
16881708

16891709
// Now, process the word...
1690-
hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
1691-
hocr_str.add_str_int("_", wcnt);
16921710
int left, top, right, bottom;
16931711
bool bold, italic, underlined, monospace, serif, smallcaps;
16941712
int pointsize, font_id;
@@ -1697,34 +1715,21 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
16971715
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
16981716
&monospace, &serif, &smallcaps,
16991717
&pointsize, &font_id);
1700-
hocr_str.add_str_int("' title='bbox ", left);
1701-
hocr_str.add_str_int(" ", top);
1702-
hocr_str.add_str_int(" ", right);
1703-
hocr_str.add_str_int(" ", bottom);
1704-
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
1705-
if (font_info) {
1706-
hocr_str += "; x_font ";
1707-
hocr_str += HOcrEscape(font_name);
1708-
hocr_str.add_str_int("; x_fsize ", pointsize);
1709-
}
1710-
hocr_str += "'";
1711-
if (res_it->WordRecognitionLanguage()) {
1712-
hocr_str += " lang='";
1713-
hocr_str += res_it->WordRecognitionLanguage();
1714-
hocr_str += "'";
1715-
}
1716-
switch (res_it->WordDirection()) {
1717-
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
1718-
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
1719-
default: // Do nothing.
1720-
break;
1721-
}
1722-
hocr_str += ">";
1718+
word_num++;
1719+
hocr_str.add_str_int("5\t", page_num);
1720+
hocr_str.add_str_int("\t", block_num);
1721+
hocr_str.add_str_int("\t", par_num);
1722+
hocr_str.add_str_int("\t", line_num);
1723+
hocr_str.add_str_int("\t", word_num);
1724+
hocr_str.add_str_int("\t", left);
1725+
hocr_str.add_str_int("\t", top);
1726+
hocr_str.add_str_int("\t", right - left + 1);
1727+
hocr_str.add_str_int("\t", bottom - top + 1);
1728+
hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
17231729
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
17241730
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
17251731
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
1726-
if (bold) hocr_str += "<strong>";
1727-
if (italic) hocr_str += "<em>";
1732+
hocr_str += "\t";
17281733
do {
17291734
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
17301735
if (grapheme && grapheme[0] != 0) {
@@ -1737,25 +1742,19 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
17371742
delete []grapheme;
17381743
res_it->Next(RIL_SYMBOL);
17391744
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1740-
if (italic) hocr_str += "</em>";
1741-
if (bold) hocr_str += "</strong>";
1742-
hocr_str += "</span> ";
1745+
hocr_str += "\n";
17431746
wcnt++;
17441747
// Close any ending block/paragraph/textline.
17451748
if (last_word_in_line) {
1746-
hocr_str += "\n </span>";
17471749
lcnt++;
17481750
}
17491751
if (last_word_in_para) {
1750-
hocr_str += "\n </p>\n";
17511752
pcnt++;
17521753
}
17531754
if (last_word_in_block) {
1754-
hocr_str += " </div>\n";
17551755
bcnt++;
17561756
}
17571757
}
1758-
hocr_str += " </div>\n";
17591758

17601759
char *ret = new char[hocr_str.length() + 1];
17611760
strcpy(ret, hocr_str.string());

api/renderer.cpp

+5-28
Original file line numberDiff line numberDiff line change
@@ -193,43 +193,20 @@ TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
193193
}
194194

195195
bool TessHOcrTsvRenderer::BeginDocumentHandler() {
196-
AppendString(
197-
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
198-
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
199-
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
200-
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
201-
"lang=\"en\">\n <head>\n <title>\n");
202-
AppendString(title());
203-
AppendString(
204-
"</title>\n"
205-
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
206-
"charset=utf-8\" />\n"
207-
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
208-
"' />\n"
209-
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
210-
" ocr_line ocrx_word");
211-
if (font_info_)
212-
AppendString(
213-
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf");
214-
AppendString(
215-
"'/>\n"
216-
"</head>\n<body>\n");
217-
196+
AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n");
218197
return true;
219198
}
220199

221200
bool TessHOcrTsvRenderer::EndDocumentHandler() {
222-
AppendString(" </body>\n</html>\n");
223-
224201
return true;
225202
}
226203

227204
bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
228-
char* hocr = api->GetHOCRText(imagenum());
229-
if (hocr == NULL) return false;
205+
char* hocrtsv = api->GetHOCRTSVText(imagenum());
206+
if (hocrtsv == NULL) return false;
230207

231-
AppendString(hocr);
232-
delete[] hocr;
208+
AppendString(hocrtsv);
209+
delete[] hocrtsv;
233210

234211
return true;
235212
}

0 commit comments

Comments
 (0)