Skip to content

Commit 068eb4c

Browse files
committed
Add different classes to hOCR output depending on BlockType
These classes are taken from the hOCR specification and seem to map well onto the BlockType values. There are probably more that could be added.
1 parent b9b74a6 commit 068eb4c

File tree

1 file changed

+15
-2
lines changed

1 file changed

+15
-2
lines changed

src/api/hocrrenderer.cpp

+15-2
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,21 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
209209
AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
210210
}
211211
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
212-
hocr_str << "\n <span class='ocr_line'"
213-
<< " id='"
212+
hocr_str << "\n <span class='";
213+
switch (res_it->BlockType()) {
214+
case PT_HEADING_TEXT:
215+
hocr_str << "ocr_header";
216+
break;
217+
case PT_PULLOUT_TEXT:
218+
hocr_str << "ocr_textfloat";
219+
break;
220+
case PT_CAPTION_TEXT:
221+
hocr_str << "ocr_caption";
222+
break;
223+
default:
224+
hocr_str << "ocr_line";
225+
}
226+
hocr_str << "' id='"
214227
<< "line_" << page_id << "_" << lcnt << "'";
215228
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
216229
}

0 commit comments

Comments
 (0)