Skip to content

Commit 48713f7

Browse files
committed
Move code for hOCR renderer to new file
Signed-off-by: Stefan Weil <sw@weilnetz.de>
1 parent e398601 commit 48713f7

File tree

5 files changed

+442
-409
lines changed

5 files changed

+442
-409
lines changed

src/api/Makefile.am

+5-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
3232
if VISIBILITY
3333
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
3434
endif
35-
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp altorenderer.cpp
35+
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
36+
libtesseract_api_la_SOURCES += altorenderer.cpp
37+
libtesseract_api_la_SOURCES += hocrrenderer.cpp
38+
libtesseract_api_la_SOURCES += pdfrenderer.cpp
39+
libtesseract_api_la_SOURCES += renderer.cpp
3640

3741
lib_LTLIBRARIES += libtesseract.la
3842
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS)

src/api/baseapi.cpp

+5-354
Original file line numberDiff line numberDiff line change
@@ -1340,363 +1340,14 @@ char* TessBaseAPI::GetUTF8Text() {
13401340
return result;
13411341
}
13421342

1343-
/**
1344-
* Gets the block orientation at the current iterator position.
1345-
*/
1346-
static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
1347-
tesseract::Orientation orientation;
1348-
tesseract::WritingDirection writing_direction;
1349-
tesseract::TextlineOrder textline_order;
1350-
float deskew_angle;
1351-
it->Orientation(&orientation, &writing_direction, &textline_order,
1352-
&deskew_angle);
1353-
return orientation;
1354-
}
1355-
1356-
/**
1357-
* Fits a line to the baseline at the given level, and appends its coefficients
1358-
* to the hOCR string.
1359-
* NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
1360-
* rotated textlines. For this reason, on textlines that are not upright, this
1361-
* method currently only inserts a 'textangle' property to indicate the rotation
1362-
* direction and does not add any baseline information to the hocr string.
1363-
*/
1364-
static void AddBaselineCoordsTohOCR(const PageIterator *it,
1365-
PageIteratorLevel level,
1366-
STRING* hocr_str) {
1367-
tesseract::Orientation orientation = GetBlockTextOrientation(it);
1368-
if (orientation != ORIENTATION_PAGE_UP) {
1369-
hocr_str->add_str_int("; textangle ", 360 - orientation * 90);
1370-
return;
1371-
}
1372-
1373-
int left, top, right, bottom;
1374-
it->BoundingBox(level, &left, &top, &right, &bottom);
1375-
1376-
// Try to get the baseline coordinates at this level.
1377-
int x1, y1, x2, y2;
1378-
if (!it->Baseline(level, &x1, &y1, &x2, &y2))
1379-
return;
1380-
// Following the description of this field of the hOCR spec, we convert the
1381-
// baseline coordinates so that "the bottom left of the bounding box is the
1382-
// origin".
1383-
x1 -= left;
1384-
x2 -= left;
1385-
y1 -= bottom;
1386-
y2 -= bottom;
1387-
1388-
// Now fit a line through the points so we can extract coefficients for the
1389-
// equation: y = p1 x + p0
1390-
double p1 = 0;
1391-
double p0 = 0;
1392-
if (x1 == x2) {
1393-
// Problem computing the polynomial coefficients.
1394-
return;
1395-
}
1396-
p1 = (y2 - y1) / static_cast<double>(x2 - x1);
1397-
p0 = y1 - static_cast<double>(p1 * x1);
1398-
1399-
hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0);
1400-
hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
1401-
}
1402-
1403-
static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
1404-
int num2) {
1405-
const size_t BUFSIZE = 64;
1406-
char id_buffer[BUFSIZE];
1407-
if (num2 >= 0) {
1408-
snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2);
1409-
} else {
1410-
snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
1411-
}
1412-
id_buffer[BUFSIZE - 1] = '\0';
1413-
*hocr_str += " id='";
1414-
*hocr_str += id_buffer;
1415-
*hocr_str += "'";
1416-
}
1417-
1418-
static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
1419-
int num2, int num3) {
1420-
const size_t BUFSIZE = 64;
1421-
char id_buffer[BUFSIZE];
1422-
snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d_%d", base.c_str(), num1, num2,num3);
1423-
id_buffer[BUFSIZE - 1] = '\0';
1424-
*hocr_str += " id='";
1425-
*hocr_str += id_buffer;
1426-
*hocr_str += "'";
1427-
}
1428-
1429-
static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
1430-
STRING* hocr_str) {
1431-
int left, top, right, bottom;
1432-
it->BoundingBox(level, &left, &top, &right, &bottom);
1433-
// This is the only place we use double quotes instead of single quotes,
1434-
// but it may be too late to change for consistency
1435-
hocr_str->add_str_int(" title=\"bbox ", left);
1436-
hocr_str->add_str_int(" ", top);
1437-
hocr_str->add_str_int(" ", right);
1438-
hocr_str->add_str_int(" ", bottom);
1439-
// Add baseline coordinates & heights for textlines only.
1440-
if (level == RIL_TEXTLINE) {
1441-
AddBaselineCoordsTohOCR(it, level, hocr_str);
1442-
// add custom height measures
1443-
float row_height, descenders, ascenders; // row attributes
1444-
it->RowAttributes(&row_height, &descenders, &ascenders);
1445-
// TODO(rays): Do we want to limit these to a single decimal place?
1446-
hocr_str->add_str_double("; x_size ", row_height);
1447-
hocr_str->add_str_double("; x_descenders ", descenders * -1);
1448-
hocr_str->add_str_double("; x_ascenders ", ascenders);
1449-
}
1450-
*hocr_str += "\">";
1451-
}
1452-
14531343
static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
1454-
STRING* hocr_str) {
1344+
STRING* text) {
14551345
int left, top, right, bottom;
14561346
it->BoundingBox(level, &left, &top, &right, &bottom);
1457-
hocr_str->add_str_int("\t", left);
1458-
hocr_str->add_str_int("\t", top);
1459-
hocr_str->add_str_int("\t", right - left);
1460-
hocr_str->add_str_int("\t", bottom - top);
1461-
}
1462-
1463-
/**
1464-
* Make an HTML-formatted string with hOCR markup from the internal
1465-
* data structures.
1466-
* page_number is 0-based but will appear in the output as 1-based.
1467-
* Image name/input_file_ can be set by SetInputName before calling
1468-
* GetHOCRText
1469-
* STL removed from original patch submission and refactored by rays.
1470-
* Returned string must be freed with the delete [] operator.
1471-
*/
1472-
char* TessBaseAPI::GetHOCRText(int page_number) {
1473-
return GetHOCRText(nullptr, page_number);
1474-
}
1475-
1476-
/**
1477-
* Make an HTML-formatted string with hOCR markup from the internal
1478-
* data structures.
1479-
* page_number is 0-based but will appear in the output as 1-based.
1480-
* Image name/input_file_ can be set by SetInputName before calling
1481-
* GetHOCRText
1482-
* STL removed from original patch submission and refactored by rays.
1483-
* Returned string must be freed with the delete [] operator.
1484-
*/
1485-
char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1486-
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
1487-
return nullptr;
1488-
1489-
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
1490-
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
1491-
bool para_is_ltr = true; // Default direction is LTR
1492-
const char* paragraph_lang = nullptr;
1493-
bool font_info = false;
1494-
GetBoolVariable("hocr_font_info", &font_info);
1495-
1496-
STRING hocr_str("");
1497-
1498-
if (input_file_ == nullptr)
1499-
SetInputName(nullptr);
1500-
1501-
#ifdef _WIN32
1502-
// convert input name from ANSI encoding to utf-8
1503-
int str16_len =
1504-
MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
1505-
wchar_t *uni16_str = new WCHAR[str16_len];
1506-
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
1507-
uni16_str, str16_len);
1508-
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0,
1509-
nullptr, nullptr);
1510-
char *utf8_str = new char[utf8_len];
1511-
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
1512-
utf8_len, nullptr, nullptr);
1513-
*input_file_ = utf8_str;
1514-
delete[] uni16_str;
1515-
delete[] utf8_str;
1516-
#endif
1517-
1518-
hocr_str += " <div class='ocr_page'";
1519-
AddIdTohOCR(&hocr_str, "page", page_id, -1);
1520-
hocr_str += " title='image \"";
1521-
if (input_file_) {
1522-
hocr_str += HOcrEscape(input_file_->string());
1523-
} else {
1524-
hocr_str += "unknown";
1525-
}
1526-
hocr_str.add_str_int("\"; bbox ", rect_left_);
1527-
hocr_str.add_str_int(" ", rect_top_);
1528-
hocr_str.add_str_int(" ", rect_width_);
1529-
hocr_str.add_str_int(" ", rect_height_);
1530-
hocr_str.add_str_int("; ppageno ", page_number);
1531-
hocr_str += "'>\n";
1532-
1533-
ResultIterator *res_it = GetIterator();
1534-
while (!res_it->Empty(RIL_BLOCK)) {
1535-
if (res_it->Empty(RIL_WORD)) {
1536-
res_it->Next(RIL_WORD);
1537-
continue;
1538-
}
1539-
1540-
// Open any new block/paragraph/textline.
1541-
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1542-
para_is_ltr = true; // reset to default direction
1543-
hocr_str += " <div class='ocr_carea'";
1544-
AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
1545-
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
1546-
}
1547-
if (res_it->IsAtBeginningOf(RIL_PARA)) {
1548-
hocr_str += "\n <p class='ocr_par'";
1549-
para_is_ltr = res_it->ParagraphIsLtr();
1550-
if (!para_is_ltr) {
1551-
hocr_str += " dir='rtl'";
1552-
}
1553-
AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
1554-
paragraph_lang = res_it->WordRecognitionLanguage();
1555-
if (paragraph_lang) {
1556-
hocr_str += " lang='";
1557-
hocr_str += paragraph_lang;
1558-
hocr_str += "'";
1559-
}
1560-
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
1561-
}
1562-
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1563-
hocr_str += "\n <span class='ocr_line'";
1564-
AddIdTohOCR(&hocr_str, "line", page_id, lcnt);
1565-
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
1566-
}
1567-
1568-
// Now, process the word...
1569-
std::vector<std::vector<std::pair<const char*, float>>>* confidencemap = nullptr;
1570-
if (tesseract_->lstm_choice_mode) {
1571-
confidencemap = res_it->GetBestLSTMSymbolChoices();
1572-
}
1573-
hocr_str += "\n <span class='ocrx_word'";
1574-
AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
1575-
int left, top, right, bottom;
1576-
bool bold, italic, underlined, monospace, serif, smallcaps;
1577-
int pointsize, font_id;
1578-
const char *font_name;
1579-
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1580-
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
1581-
&monospace, &serif, &smallcaps,
1582-
&pointsize, &font_id);
1583-
hocr_str.add_str_int(" title='bbox ", left);
1584-
hocr_str.add_str_int(" ", top);
1585-
hocr_str.add_str_int(" ", right);
1586-
hocr_str.add_str_int(" ", bottom);
1587-
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
1588-
if (font_info) {
1589-
if (font_name) {
1590-
hocr_str += "; x_font ";
1591-
hocr_str += HOcrEscape(font_name);
1592-
}
1593-
hocr_str.add_str_int("; x_fsize ", pointsize);
1594-
}
1595-
hocr_str += "'";
1596-
const char* lang = res_it->WordRecognitionLanguage();
1597-
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
1598-
hocr_str += " lang='";
1599-
hocr_str += lang;
1600-
hocr_str += "'";
1601-
}
1602-
switch (res_it->WordDirection()) {
1603-
// Only emit direction if different from current paragraph direction
1604-
case DIR_LEFT_TO_RIGHT:
1605-
if (!para_is_ltr) hocr_str += " dir='ltr'";
1606-
break;
1607-
case DIR_RIGHT_TO_LEFT:
1608-
if (para_is_ltr) hocr_str += " dir='rtl'";
1609-
break;
1610-
case DIR_MIX:
1611-
case DIR_NEUTRAL:
1612-
default: // Do nothing.
1613-
break;
1614-
}
1615-
hocr_str += ">";
1616-
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
1617-
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
1618-
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
1619-
if (bold) hocr_str += "<strong>";
1620-
if (italic) hocr_str += "<em>";
1621-
do {
1622-
const std::unique_ptr<const char[]> grapheme(
1623-
res_it->GetUTF8Text(RIL_SYMBOL));
1624-
if (grapheme && grapheme[0] != 0) {
1625-
hocr_str += HOcrEscape(grapheme.get());
1626-
}
1627-
res_it->Next(RIL_SYMBOL);
1628-
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1629-
if (italic) hocr_str += "</em>";
1630-
if (bold) hocr_str += "</strong>";
1631-
// If the lstm choice mode is required it is added here
1632-
if (tesseract_->lstm_choice_mode == 1 && confidencemap != nullptr) {
1633-
for (size_t i = 0; i < confidencemap->size(); i++) {
1634-
hocr_str += "\n <span class='ocrx_cinfo'";
1635-
AddIdTohOCR(&hocr_str, "timestep", page_id, wcnt, tcnt);
1636-
hocr_str += ">";
1637-
std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
1638-
for (std::pair<const char*, float> conf : timestep) {
1639-
hocr_str += "<span class='ocr_glyph'";
1640-
AddIdTohOCR(&hocr_str, "choice", page_id, wcnt, gcnt);
1641-
hocr_str.add_str_int(" title='x_confs ", int(conf.second * 100));
1642-
hocr_str += "'";
1643-
hocr_str += ">";
1644-
hocr_str += conf.first;
1645-
hocr_str += "</span>";
1646-
gcnt++;
1647-
}
1648-
hocr_str += "</span>";
1649-
tcnt++;
1650-
}
1651-
} else if (tesseract_->lstm_choice_mode == 2 && confidencemap != nullptr) {
1652-
for (size_t i = 0; i < confidencemap->size(); i++) {
1653-
std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
1654-
if (timestep.size() > 0) {
1655-
hocr_str += "\n <span class='ocrx_cinfo'";
1656-
AddIdTohOCR(&hocr_str, "lstm_choices", page_id, wcnt, tcnt);
1657-
hocr_str += " chosen='";
1658-
hocr_str += timestep[0].first;
1659-
hocr_str += "'>";
1660-
for (size_t j = 1; j < timestep.size(); j++) {
1661-
hocr_str += "<span class='ocr_glyph'";
1662-
AddIdTohOCR(&hocr_str, "choice", page_id, wcnt, gcnt);
1663-
hocr_str.add_str_int(" title='x_confs ", int(timestep[j].second * 100));
1664-
hocr_str += "'";
1665-
hocr_str += ">";
1666-
hocr_str += timestep[j].first;
1667-
hocr_str += "</span>";
1668-
gcnt++;
1669-
}
1670-
hocr_str += "</span>";
1671-
tcnt++;
1672-
}
1673-
}
1674-
}
1675-
hocr_str += "</span>";
1676-
tcnt = 1;
1677-
gcnt = 1;
1678-
wcnt++;
1679-
// Close any ending block/paragraph/textline.
1680-
if (last_word_in_line) {
1681-
hocr_str += "\n </span>";
1682-
lcnt++;
1683-
}
1684-
if (last_word_in_para) {
1685-
hocr_str += "\n </p>\n";
1686-
pcnt++;
1687-
para_is_ltr = true; // back to default direction
1688-
}
1689-
if (last_word_in_block) {
1690-
hocr_str += " </div>\n";
1691-
bcnt++;
1692-
}
1693-
}
1694-
hocr_str += " </div>\n";
1695-
1696-
char *ret = new char[hocr_str.length() + 1];
1697-
strcpy(ret, hocr_str.string());
1698-
delete res_it;
1699-
return ret;
1347+
text->add_str_int("\t", left);
1348+
text->add_str_int("\t", top);
1349+
text->add_str_int("\t", right - left);
1350+
text->add_str_int("\t", bottom - top);
17001351
}
17011352

17021353
/**

0 commit comments

Comments
 (0)