Skip to content

Commit ab09b09

Browse files
authored
Merge pull request #2294 from bertsky/lstm-with-char-whitelist
trying to add tessedit_char_whitelist etc. again:
2 parents be617b3 + f80508b commit ab09b09

File tree

6 files changed

+45
-26
lines changed

6 files changed

+45
-26
lines changed

CONTRIBUTING.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,14 @@ your question has been asked (and has been answered) many times before...
6767

6868
## For Developers: Creating a Pull Request
6969

70-
TBD
70+
You should always make sure your changes build and run successfully.
71+
72+
For that, your clone needs to include all submodules (`abseil`, `googletest`, `test`). To get them, either pass `--recurse-submodules` to the initial clone, or later run `git submodule update --init --recursive NAME` for each `NAME`. If `configure` has already created those directories (blocking the clone), remove them first (or run `make distclean`), then clone and reconfigure.
73+
74+
Have a look at [the README](./README.md), the [testing README](./test/testing/README.md), and the [wiki page](https://github.com/tesseract-ocr/tesseract/wiki/Compiling-%E2%80%93-GitInstallation#unit-test-builds) on installation.
75+
76+
In short, after running `configure` from the build directory of your choice, to build the library and CLI, run `make`. To test it, run `make check`. To build the training tools, run `make training`.
77+
78+
As soon as your changes build and the tests succeed, you can publish them. If you have not already done so, please [fork](https://guides.github.com/activities/forking/) tesseract on GitHub (under any account), and push your changes to that fork (in a new branch). Then [submit them as a PR](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork).
79+
80+
Please also keep track of reports from CI (automated build status) and Coverity/LGTM (quality scan). When the indicators show deterioration after your changes, further action may be required to improve them.

src/ccmain/tesseractclass.cpp

+12
Original file line numberDiff line numberDiff line change
@@ -621,11 +621,23 @@ void Tesseract::SetBlackAndWhitelist() {
621621
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
622622
tessedit_char_whitelist.string(),
623623
tessedit_char_unblacklist.string());
624+
if (lstm_recognizer_) {
625+
UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
626+
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
627+
tessedit_char_whitelist.string(),
628+
tessedit_char_unblacklist.string());
629+
}
624630
// Black and white lists should apply to all loaded classifiers.
625631
for (int i = 0; i < sub_langs_.size(); ++i) {
626632
sub_langs_[i]->unicharset.set_black_and_whitelist(
627633
tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
628634
tessedit_char_unblacklist.string());
635+
if (sub_langs_[i]->lstm_recognizer_) {
636+
UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (sub_langs_[i]->lstm_recognizer_->GetUnicharset());
637+
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
638+
tessedit_char_whitelist.string(),
639+
tessedit_char_unblacklist.string());
640+
}
629641
}
630642
}
631643

src/ccutil/unicharset.h

+1
Original file line numberDiff line numberDiff line change
@@ -871,6 +871,7 @@ class UNICHARSET {
871871

872872
// Return the enabled property of the given unichar.
873873
bool get_enabled(UNICHAR_ID unichar_id) const {
874+
ASSERT_HOST(contains_unichar_id(unichar_id));
874875
return unichars[unichar_id].properties.enabled;
875876
}
876877

src/lstm/recodebeam.cpp

+12-7
Original file line numberDiff line numberDiff line change
@@ -521,10 +521,10 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
521521
if (t == 0) {
522522
// The first step can only use singles and initials.
523523
ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2,
524-
dict_ratio, cert_offset, worst_dict_cert, step);
524+
charset, dict_ratio, cert_offset, worst_dict_cert, step);
525525
if (dict_ != nullptr) {
526-
ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs,
527-
TN_TOP2, dict_ratio, cert_offset, worst_dict_cert, step);
526+
ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs, TN_TOP2,
527+
charset, dict_ratio, cert_offset, worst_dict_cert, step);
528528
}
529529
} else {
530530
RecodeBeam* prev = beam_[t - 1];
@@ -556,9 +556,8 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
556556
// best first, but it comes before a lot of the worst, so it is slightly
557557
// more efficient than going forwards.
558558
for (int i = prev->beams_[index].size() - 1; i >= 0; --i) {
559-
ContinueContext(&prev->beams_[index].get(i).data, index, outputs,
560-
top_n, dict_ratio, cert_offset, worst_dict_cert,
561-
step);
559+
ContinueContext(&prev->beams_[index].get(i).data, index, outputs, top_n,
560+
charset, dict_ratio, cert_offset, worst_dict_cert, step);
562561
}
563562
}
564563
for (int index = 0; index < kNumBeams; ++index) {
@@ -585,7 +584,9 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
585584
// choices for which top_n_flags[index] == top_n_flag.
586585
void RecodeBeamSearch::ContinueContext(const RecodeNode* prev, int index,
587586
const float* outputs,
588-
TopNState top_n_flag, double dict_ratio,
587+
TopNState top_n_flag,
588+
const UNICHARSET* charset,
589+
double dict_ratio,
589590
double cert_offset,
590591
double worst_dict_cert,
591592
RecodeBeam* step) {
@@ -648,6 +649,10 @@ void RecodeBeamSearch::ContinueContext(const RecodeNode* prev, int index,
648649
int unichar_id = recoder_.DecodeUnichar(full_code);
649650
// Map the null char to INVALID.
650651
if (length == 0 && code == null_char_) unichar_id = INVALID_UNICHAR_ID;
652+
if (unichar_id != INVALID_UNICHAR_ID &&
653+
charset != nullptr &&
654+
!charset->get_enabled(unichar_id))
655+
continue; // disabled by whitelist/blacklist
651656
ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio,
652657
use_dawgs, NC_ANYTHING, prev, step);
653658
if (top_n_flag == TN_TOP2 && code != null_char_) {

src/lstm/recodebeam.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -312,9 +312,9 @@ class RecodeBeamSearch {
312312
// using the given network outputs to provide scores to the choices. Uses only
313313
// those choices for which top_n_flags[code] == top_n_flag.
314314
void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
315-
TopNState top_n_flag, double dict_ratio,
316-
double cert_offset, double worst_dict_cert,
317-
RecodeBeam* step);
315+
TopNState top_n_flag, const UNICHARSET* unicharset,
316+
double dict_ratio, double cert_offset,
317+
double worst_dict_cert, RecodeBeam* step);
318318
// Continues for a new unichar, using dawg or non-dawg as per flag.
319319
void ContinueUnichar(int code, int unichar_id, float cert,
320320
float worst_dict_cert, float dict_ratio, bool use_dawgs,

src/training/tesstrain_utils.sh

+6-15
Original file line numberDiff line numberDiff line change
@@ -70,23 +70,14 @@ err_exit() {
7070
# if the program file is not found.
7171
# Usage: run_command CMD ARG1 ARG2...
7272
run_command() {
73-
local cmd=$(which $1)
74-
if [[ -z ${cmd} ]]; then
75-
for d in api training; do
76-
cmd=$(which $d/$1)
77-
if [[ ! -z ${cmd} ]]; then
78-
break
79-
fi
80-
done
81-
if [[ -z ${cmd} ]]; then
82-
err_exit "$1 not found"
83-
fi
84-
fi
73+
local cmd
74+
cmd=$(which $1 || \
75+
for d in api training; do
76+
which $d/$1 && break
77+
done) || err_exit "'$1' not found"
8578
shift
8679
tlog "[$(date)] ${cmd} $@"
87-
"${cmd}" "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
88-
# check completion status
89-
if [[ $? -gt 0 ]]; then
80+
if ! "${cmd}" "$@" |& tee -a ${LOG_FILE}; then
9081
err_exit "Program $(basename ${cmd}) failed. Abort."
9182
fi
9283
}

0 commit comments

Comments
 (0)