Skip to content

Commit 754e38d

Browse files
committed
Added the option to get the timesteps separated by the suggested segmentation
Signed-off-by: Noah Metzger <noah.metzger@bib.uni-mannheim.de>
1 parent d2c3309 commit 754e38d

File tree

8 files changed

+98
-15
lines changed

8 files changed

+98
-15
lines changed

src/api/hocrrenderer.cpp

+36-1
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
130130
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
131131
return nullptr;
132132

133-
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
133+
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1;
134134
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
135135
bool para_is_ltr = true; // Default direction is LTR
136136
const char* paragraph_lang = nullptr;
@@ -215,8 +215,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
215215
// Now, process the word...
216216
std::vector<std::vector<std::pair<const char*, float>>>* confidencemap =
217217
nullptr;
218+
std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
219+
symbolMap = nullptr;
218220
if (tesseract_->lstm_choice_mode) {
219221
confidencemap = res_it->GetBestLSTMSymbolChoices();
222+
symbolMap = res_it->GetBestSegmentedLSTMSymbolChoices();
220223
}
221224
hocr_str << "\n <span class='ocrx_word'"
222225
<< " id='"
@@ -324,6 +327,38 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
324327
tcnt++;
325328
}
326329
}
330+
} else if (tesseract_->lstm_choice_mode == 3 && symbolMap != nullptr) {
331+
for (size_t j = 0; j < symbolMap->size(); j++) {
332+
std::vector<std::vector<std::pair<const char*, float>>> timesteps =
333+
(*symbolMap)[j];
334+
hocr_str << "\n <span class='ocr_symbol'"
335+
<< " id='"
336+
<< "symbolstep_" << page_id << "_" << wcnt << "_" << scnt
337+
<< "'>"
338+
<< timesteps[0][0].first;
339+
for (size_t i = 1; i < timesteps.size(); i++) {
340+
hocr_str << "\n <span class='ocrx_cinfo'"
341+
<< " id='"
342+
<< "timestep_" << page_id << "_" << wcnt << "_" << tcnt
343+
<< "'"
344+
<< ">";
345+
std::vector<std::pair<const char*, float>> timestep =
346+
timesteps[i];
347+
for (std::pair<const char*, float> conf : timestep) {
348+
hocr_str << "<span class='ocr_glyph'"
349+
<< " id='"
350+
<< "choice_" << page_id << "_" << wcnt << "_" << gcnt
351+
<< "'"
352+
<< " title='x_confs " << int(conf.second * 100) << "'>"
353+
<< conf.first << "</span>";
354+
gcnt++;
355+
}
356+
hocr_str << "</span>";
357+
tcnt++;
358+
}
359+
hocr_str << "</span>";
360+
scnt++;
361+
}
327362
}
328363
hocr_str << "</span>";
329364
tcnt = 1;

src/ccmain/resultiterator.cpp

+11-1
Original file line numberDiff line numberDiff line change
@@ -605,14 +605,24 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
605605
return result;
606606
}
607607

608-
std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetBestLSTMSymbolChoices() const {
608+
std::vector<std::vector<std::pair<const char*, float>>>*
609+
ResultIterator::GetBestLSTMSymbolChoices() const {
609610
if (it_->word() != nullptr) {
610611
return &it_->word()->timesteps;
611612
} else {
612613
return nullptr;
613614
}
614615
}
615616

617+
std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
618+
ResultIterator::GetBestSegmentedLSTMSymbolChoices() const {
619+
if (it_->word() != nullptr) {
620+
return &it_->word()->symbol_steps;
621+
} else {
622+
return nullptr;
623+
}
624+
}
625+
616626
void ResultIterator::AppendUTF8WordText(STRING *text) const {
617627
if (!it_->word()) return;
618628
ASSERT_HOST(it_->word()->best_choice != nullptr);

src/ccmain/resultiterator.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,10 @@ class TESS_API ResultIterator : public LTRResultIterator {
100100
/**
101101
* Returns the LSTM choices for every LSTM timestep for the current word.
102102
*/
103-
virtual std::vector<std::vector<std::pair<const char*, float>>>* GetBestLSTMSymbolChoices() const;
103+
virtual std::vector<std::vector<std::pair<const char*, float>>>*
104+
GetBestLSTMSymbolChoices() const;
105+
virtual std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
106+
GetBestSegmentedLSTMSymbolChoices() const;
104107

105108
/**
106109
* Return whether the current paragraph's dominant reading direction

src/ccmain/tesseractclass.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,9 @@ Tesseract::Tesseract()
526526
"Allows to include alternative symbols choices in the hOCR output. "
527527
"Valid input values are 0, 1 and 2. 0 is the default value. "
528528
"With 1 the alternative symbol choices per timestep are included. "
529-
"With 2 the alternative symbol choices are accumulated per character.",
529+
"With 2 the alternative symbol choices are accumulated per character."
530+
"With 3 the alternative symbol choices per timestep are included and "
531+
"separated by the suggested segmentation of Tesseract",
530532
this->params()),
531533

532534
backup_config_file_(nullptr),

src/ccmain/tesseractclass.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -1127,7 +1127,9 @@ class Tesseract : public Wordrec {
11271127
"Allows to include alternative symbols choices in the hOCR output. "
11281128
"Valid input values are 0, 1 and 2. 0 is the default value. "
11291129
"With 1 the alternative symbol choices per timestep are included. "
1130-
"With 2 the alternative symbol choices are accumulated per character.");
1130+
"With 2 the alternative symbol choices are accumulated per character."
1131+
"With 3 the alternative symbol choices per timestep are included and "
1132+
"separated by the suggested segmentation of Tesseract");
11311133

11321134
//// ambigsrecog.cpp /////////////////////////////////////////////////////////
11331135
FILE *init_recog_training(const STRING &fname);

src/ccstruct/pageres.h

+2
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,8 @@ class WERD_RES : public ELIST_LINK {
222222
GenericVector<int> blob_gaps;
223223
// Stores the lstm choices of every timestep
224224
std::vector<std::vector<std::pair<const char*, float>>> timesteps;
225+
std::vector<std::vector<std::vector<std::pair<const char*, float>>>>
226+
symbol_steps;
225227
// Ratings matrix contains classifier choices for each classified combination
226228
// of blobs. The dimension is the same as the number of blobs in chopped_word
227229
// and the leading diagonal corresponds to classifier results of the blobs

src/lstm/recodebeam.cpp

+37-9
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <deque>
2626
#include <map>
2727
#include <set>
28+
#include <tuple>
2829
#include <vector>
2930

3031
#include <algorithm>
@@ -187,7 +188,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
187188
GenericVector<int> xcoords;
188189
GenericVector<const RecodeNode*> best_nodes;
189190
GenericVector<const RecodeNode*> second_nodes;
190-
std::deque<std::pair<int,int>> best_choices;
191+
std::deque<std::tuple<int, int, double>> best_choices;
191192
ExtractBestPaths(&best_nodes, &second_nodes);
192193
if (debug) {
193194
DebugPath(unicharset, best_nodes);
@@ -201,13 +202,14 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
201202
int timestepEnd = 0;
202203
// If lstm choice mode 2 or 3 is requested, store the x coordinates of every
203204
// chosen character so that the alternative choices can be matched to it.
204-
if (lstm_choice_mode == 2) {
205+
if (lstm_choice_mode == 2 || lstm_choice_mode == 3) {
205206
ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings,
206207
&xcoords, &best_choices);
207208
if (best_choices.size() > 0) {
208-
current_char = best_choices.front().first;
209-
timestepEnd = best_choices.front().second;
210-
best_choices.pop_front();
209+
current_char = std::get<0>(best_choices.front());
210+
timestepEnd = std::get<1>(best_choices.front());
211+
if(lstm_choice_mode == 2)
212+
best_choices.pop_front();
211213
}
212214
} else {
213215
ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings,
@@ -258,7 +260,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
258260
choice_pairs.push_back(choice);
259261
}
260262
}
261-
if ((best_choices.size() > 0 && i == best_choices.front().second - 1)
263+
if ((best_choices.size() > 0 && i == std::get<1>(best_choices.front()) - 1)
262264
|| i == xcoords[word_end]-1) {
263265
std::map<const char*, float> summed_propabilities;
264266
for (auto it = choice_pairs.begin(); it != choice_pairs.end(); ++it) {
@@ -283,7 +285,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
283285
it->second));
284286
}
285287
if (best_choices.size() > 0) {
286-
current_char = best_choices.front().first;
288+
current_char = std::get<0>(best_choices.front());
287289
best_choices.pop_front();
288290
}
289291
choice_pairs.clear();
@@ -292,6 +294,25 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
292294
}
293295
}
294296
timestepEnd = xcoords[word_end];
297+
} else if (lstm_choice_mode == 3) {
298+
std::vector<std::vector<std::pair<const char*, float>>> currentSymbol;
299+
for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
300+
if (i == std::get<1>(best_choices.front())) {
301+
if (currentSymbol.size() > 0) {
302+
word_res->symbol_steps.push_back(currentSymbol);
303+
currentSymbol.clear();
304+
}
305+
std::vector<std::pair<const char*, float>> choice_Header;
306+
choice_Header.push_back(std::pair<const char*, float>(
307+
unicharset->id_to_unichar_ext(std::get<0>(best_choices.front())),
308+
2.0));
309+
currentSymbol.push_back(choice_Header);
310+
if(best_choices.size()>1) best_choices.pop_front();
311+
}
312+
currentSymbol.push_back(timesteps[i]);
313+
}
314+
word_res->symbol_steps.push_back(currentSymbol);
315+
timestepEnd = xcoords[word_end];
295316
}
296317
for (int i = word_start; i < word_end; ++i) {
297318
BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
@@ -366,7 +387,7 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
366387
const GenericVector<const RecodeNode*>& best_nodes,
367388
GenericVector<int>* unichar_ids, GenericVector<float>* certs,
368389
GenericVector<float>* ratings, GenericVector<int>* xcoords,
369-
std::deque<std::pair<int, int>>* best_choices) {
390+
std::deque<std::tuple<int, int, double>>* best_choices) {
370391
unichar_ids->truncate(0);
371392
certs->truncate(0);
372393
ratings->truncate(0);
@@ -375,6 +396,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
375396
int t = 0;
376397
int width = best_nodes.size();
377398
while (t < width) {
399+
int id;
400+
int tposition;
378401
double certainty = 0.0;
379402
double rating = 0.0;
380403
while (t < width && best_nodes[t]->unichar_id == INVALID_UNICHAR_ID) {
@@ -396,7 +419,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
396419
unichar_ids->push_back(unichar_id);
397420
xcoords->push_back(t);
398421
if (best_choices != nullptr) {
399-
best_choices->push_back(std::pair<int, int>(unichar_id, t));
422+
tposition = t;
423+
id = unichar_id;
400424
}
401425
do {
402426
double cert = best_nodes[t++]->certainty;
@@ -414,6 +438,10 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
414438
if (certainty < certs->back()) certs->back() = certainty;
415439
ratings->back() += rating;
416440
}
441+
if (best_choices != nullptr) {
442+
best_choices->push_back(
443+
std::tuple<int, int, double>(id, tposition, rating));
444+
}
417445
}
418446
xcoords->push_back(width);
419447
}

src/lstm/recodebeam.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "unicharcompress.h"
3030
#include <deque>
3131
#include <set>
32+
#include <tuple>
3233
#include <vector>
3334

3435
namespace tesseract {
@@ -281,7 +282,7 @@ class RecodeBeamSearch {
281282
const GenericVector<const RecodeNode*>& best_nodes,
282283
GenericVector<int>* unichar_ids, GenericVector<float>* certs,
283284
GenericVector<float>* ratings, GenericVector<int>* xcoords,
284-
std::deque<std::pair<int,int>>* best_choices = nullptr);
285+
std::deque<std::tuple<int,int,double>>* best_choices = nullptr);
285286

286287
// Sets up a word with the ratings matrix and fake blobs with boxes in the
287288
// right places.

0 commit comments

Comments
 (0)