Skip to content

Commit da03e4e

Browse files
committed
Fixes from pull of cleanups: clang-tidied, reviewed, fixed new bugs, restored needed code that had been deleted. Probably breaks the build, because it includes some of the changes to the UTF-8/UTF-32 conversion
1 parent f5c18f7 commit da03e4e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+1061
-1180
lines changed

api/baseapi.cpp

+11-10
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,11 @@
4141
#include <string.h>
4242
#endif // _WIN32
4343

44+
#include <fstream>
4445
#include <iostream>
45-
#include <string>
4646
#include <iterator>
47-
#include <fstream>
48-
#include <memory> // std::unique_ptr
47+
#include <memory> // std::unique_ptr
48+
#include <string>
4949

5050
#include "allheaders.h"
5151

@@ -1540,7 +1540,8 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
15401540
if (bold) hocr_str += "<strong>";
15411541
if (italic) hocr_str += "<em>";
15421542
do {
1543-
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
1543+
const std::unique_ptr<const char[]> grapheme(
1544+
res_it->GetUTF8Text(RIL_SYMBOL));
15441545
if (grapheme && grapheme[0] != 0) {
15451546
hocr_str += HOcrEscape(grapheme.get());
15461547
}
@@ -1662,7 +1663,8 @@ char* TessBaseAPI::GetTSVText(int page_number) {
16621663
if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
16631664

16641665
do {
1665-
tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1666+
tsv_str +=
1667+
std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
16661668
res_it->Next(RIL_SYMBOL);
16671669
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
16681670
tsv_str += "\n"; // end of row
@@ -1720,16 +1722,16 @@ char* TessBaseAPI::GetBoxText(int page_number) {
17201722
do {
17211723
int left, top, right, bottom;
17221724
if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1723-
const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
1725+
const std::unique_ptr</*non-const*/ char[]> text(
1726+
it->GetUTF8Text(RIL_SYMBOL));
17241727
// Tesseract uses space for recognition failure. Fix to a reject
17251728
// character, kTesseractReject so we don't create illegal box files.
17261729
for (int i = 0; text[i] != '\0'; ++i) {
17271730
if (text[i] == ' ')
17281731
text[i] = kTesseractReject;
17291732
}
17301733
snprintf(result + output_length, total_length - output_length,
1731-
"%s %d %d %d %d %d\n",
1732-
text.get(), left, image_height_ - bottom,
1734+
"%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom,
17331735
right, image_height_ - top, page_number);
17341736
output_length += strlen(result + output_length);
17351737
// Just in case...
@@ -2063,8 +2065,7 @@ void TessBaseAPI::End() {
20632065
delete paragraph_models_;
20642066
paragraph_models_ = NULL;
20652067
}
2066-
if (osd_tesseract_ == tesseract_)
2067-
osd_tesseract_ = nullptr;
2068+
if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr;
20682069
delete tesseract_;
20692070
tesseract_ = nullptr;
20702071
delete osd_tesseract_;

api/pdfrenderer.cpp

+16-19
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
#include "config_auto.h"
2121
#endif
2222

23-
#include <memory> // std::unique_ptr
23+
#include <memory> // std::unique_ptr
2424
#include "allheaders.h"
2525
#include "baseapi.h"
2626
#include "math.h"
@@ -457,13 +457,12 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
457457
STRING pdf_word("");
458458
int pdf_word_len = 0;
459459
do {
460-
const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
460+
const std::unique_ptr<const char[]> grapheme(
461+
res_it->GetUTF8Text(RIL_SYMBOL));
461462
if (grapheme && grapheme[0] != '\0') {
462-
GenericVector<int> unicodes;
463-
UNICHAR::UTF8ToUnicode(grapheme.get(), &unicodes);
463+
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(grapheme.get());
464464
char utf16[kMaxBytesPerCodepoint];
465-
for (int i = 0; i < unicodes.length(); i++) {
466-
int code = unicodes[i];
465+
for (char32 code : unicodes) {
467466
if (CodepointToUtf16be(code, utf16)) {
468467
pdf_word += utf16;
469468
pdf_word_len++;
@@ -566,13 +565,13 @@ bool TessPDFRenderer::BeginDocumentHandler() {
566565

567566
// CIDTOGIDMAP
568567
const int kCIDToGIDMapSize = 2 * (1 << 16);
569-
const std::unique_ptr</*non-const*/ unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
568+
const std::unique_ptr<unsigned char[]> cidtogidmap(
569+
new unsigned char[kCIDToGIDMapSize]);
570570
for (int i = 0; i < kCIDToGIDMapSize; i++) {
571571
cidtogidmap[i] = (i % 2) ? 1 : 0;
572572
}
573573
size_t len;
574-
unsigned char *comp =
575-
zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
574+
unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
576575
n = snprintf(buf, sizeof(buf),
577576
"5 0 obj\n"
578577
"<<\n"
@@ -665,8 +664,8 @@ bool TessPDFRenderer::BeginDocumentHandler() {
665664
fseek(fp, 0, SEEK_END);
666665
long int size = ftell(fp);
667666
fseek(fp, 0, SEEK_SET);
668-
const std::unique_ptr</*non-const*/ char[]> buffer(new char[size]);
669-
if (fread(buffer.get(), 1, size, fp) != static_cast<unsigned long>(size)) {
667+
const std::unique_ptr<char[]> buffer(new char[size]);
668+
if (fread(buffer.get(), 1, size, fp) != static_cast<size_t>(size)) {
670669
fclose(fp);
671670
return false;
672671
}
@@ -879,11 +878,11 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
879878
AppendPDFObject(buf);
880879

881880
// CONTENTS
882-
const std::unique_ptr</*non-const*/ char[]> pdftext(GetPDFTextObjects(api, width, height));
883-
const long pdftext_len = strlen(pdftext.get());
881+
const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
882+
const size_t pdftext_len = strlen(pdftext.get());
884883
size_t len;
885-
unsigned char *comp_pdftext =
886-
zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
884+
unsigned char *comp_pdftext = zlibCompress(
885+
reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
887886
long comp_pdftext_len = len;
888887
n = snprintf(buf, sizeof(buf),
889888
"%ld 0 obj\n"
@@ -960,11 +959,9 @@ bool TessPDFRenderer::EndDocumentHandler() {
960959

961960
// INFO
962961
STRING utf16_title = "FEFF"; // byte_order_marker
963-
GenericVector<int> unicodes;
964-
UNICHAR::UTF8ToUnicode(title(), &unicodes);
962+
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
965963
char utf16[kMaxBytesPerCodepoint];
966-
for (int i = 0; i < unicodes.length(); i++) {
967-
int code = unicodes[i];
964+
for (char32 code : unicodes) {
968965
if (CodepointToUtf16be(code, utf16)) {
969966
utf16_title += utf16;
970967
}

api/renderer.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
#include "config_auto.h"
2020
#endif
2121

22-
#include <memory> // std::unique_ptr
2322
#include <string.h>
23+
#include <memory> // std::unique_ptr
2424
#include "baseapi.h"
2525
#include "genericvector.h"
2626
#include "renderer.h"

api/tesseractmain.cpp

+19-19
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
/**********************************************************************
2-
* File: tesseractmain.cpp (Formerly tessedit.c)
3-
* Description: Main program for merge of tess and editor.
4-
* Author: Ray Smith
5-
* Created: Tue Jan 07 15:21:46 GMT 1992
6-
*
7-
* (C) Copyright 1992, Hewlett-Packard Ltd.
8-
** Licensed under the Apache License, Version 2.0 (the "License");
9-
** you may not use this file except in compliance with the License.
10-
** You may obtain a copy of the License at
11-
** http://www.apache.org/licenses/LICENSE-2.0
12-
** Unless required by applicable law or agreed to in writing, software
13-
** distributed under the License is distributed on an "AS IS" BASIS,
14-
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15-
** See the License for the specific language governing permissions and
16-
** limitations under the License.
17-
*
18-
**********************************************************************/
2+
* File: tesseractmain.cpp (Formerly tessedit.c)
3+
* Description: Main program for merge of tess and editor.
4+
* Author: Ray Smith
5+
* Created: Tue Jan 07 15:21:46 GMT 1992
6+
*
7+
* (C) Copyright 1992, Hewlett-Packard Ltd.
8+
** Licensed under the Apache License, Version 2.0 (the "License");
9+
** you may not use this file except in compliance with the License.
10+
** You may obtain a copy of the License at
11+
** http://www.apache.org/licenses/LICENSE-2.0
12+
** Unless required by applicable law or agreed to in writing, software
13+
** distributed under the License is distributed on an "AS IS" BASIS,
14+
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
** See the License for the specific language governing permissions and
16+
** limitations under the License.
17+
*
18+
**********************************************************************/
1919

2020
// Include automatically generated configuration file if running autoconf
2121
#ifdef HAVE_CONFIG_H
@@ -404,7 +404,7 @@ int main(int argc, char** argv) {
404404
static GenericVector<STRING> vars_vec;
405405
static GenericVector<STRING> vars_values;
406406

407-
#ifdef NDEBUG
407+
#if !defined(DEBUG)
408408
// Disable debugging and informational messages from Leptonica.
409409
setMsgSeverity(L_SEVERITY_ERROR);
410410
#endif
@@ -431,7 +431,7 @@ int main(int argc, char** argv) {
431431
// first TessBaseAPI must be destructed, DawgCache must be the last object.
432432
tesseract::Dict::GlobalDawgCache();
433433

434-
// Avoid memory leak caused by auto variable when exit() is called.
434+
// Avoid memory leak caused by auto variable when return is called.
435435
static tesseract::TessBaseAPI api;
436436

437437
api.SetOutputName(outputbase);

ccmain/control.cpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -1878,11 +1878,11 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
18781878
*
18791879
* Find the modal font and remove from the stats.
18801880
*/
1881-
static void find_modal_font( //good chars in word
1882-
STATS *fonts, //font stats
1883-
inT16 *font_out, //output font
1884-
int8_t *font_count //output count
1885-
) {
1881+
static void find_modal_font( // good chars in word
1882+
STATS* fonts, // font stats
1883+
inT16* font_out, // output font
1884+
int8_t* font_count // output count
1885+
) {
18861886
inT16 font; //font index
18871887
inT32 count; //pile count
18881888

@@ -1999,7 +1999,7 @@ void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
19991999
}
20002000
}
20012001
inT16 doc_font; // modal font
2002-
int8_t doc_font_count; // modal font
2002+
int8_t doc_font_count; // modal font count
20032003
find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
20042004
if (doc_font_count == 0)
20052005
return;

ccmain/docqual.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -511,9 +511,9 @@ BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
511511
int adjusted_len;
512512
int crunch_mode = 0;
513513

514-
if ((word->best_choice->unichar_string().length () == 0) ||
515-
(strspn (word->best_choice->unichar_string().string(), " ") ==
516-
word->best_choice->unichar_string().unsigned_size ()))
514+
if ((word->best_choice->unichar_string().length() == 0) ||
515+
(strspn(word->best_choice->unichar_string().string(), " ") ==
516+
word->best_choice->unichar_string().unsigned_size()))
517517
crunch_mode = 1;
518518
else {
519519
adjusted_len = word->reject_map.length ();

ccmain/equationdetect.cpp

+3-5
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,7 @@ EquationDetect::EquationDetect(const char* equ_datapath,
116116
cps_super_bbox_ = NULL;
117117
}
118118

119-
EquationDetect::~EquationDetect() {
120-
delete(cps_super_bbox_);
121-
}
119+
EquationDetect::~EquationDetect() { delete (cps_super_bbox_); }
122120

123121
void EquationDetect::SetLangTesseract(Tesseract* lang_tesseract) {
124122
lang_tesseract_ = lang_tesseract;
@@ -258,8 +256,8 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(
258256

259257
void EquationDetect::IdentifySpecialText() {
260258
// Set configuration for Tesseract::AdaptiveClassifier.
261-
equ_tesseract_.tess_cn_matching.set_value(true); // turn it on
262-
equ_tesseract_.tess_bn_matching.set_value(false);
259+
equ_tesseract_.tess_cn_matching.set_value(1); // turn it on
260+
equ_tesseract_.tess_bn_matching.set_value(0);
263261

264262
// Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
265263
int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;

ccmain/paragraphs.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
#endif
2222

2323
#include <ctype.h>
24-
#include <memory> // std::unique_ptr
24+
#include <memory> // std::unique_ptr
2525

2626
#include "genericvector.h"
2727
#include "helpers.h"

ccmain/paramsd.cpp

+2-4
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,8 @@ void ParamsEditor::GetPrefixes(const char* s, STRING* level_one,
183183

184184
// Compare two VC objects by their name.
185185
int ParamContent::Compare(const void* v1, const void* v2) {
186-
const ParamContent* one =
187-
*static_cast<const ParamContent* const *>(v1);
188-
const ParamContent* two =
189-
*static_cast<const ParamContent* const *>(v2);
186+
const ParamContent* one = *static_cast<const ParamContent* const*>(v1);
187+
const ParamContent* two = *static_cast<const ParamContent* const*>(v2);
190188
return strcmp(one->GetName(), two->GetName());
191189
}
192190

ccmain/pgedit.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,8 @@ BOOL8 Tesseract::process_cmd_win_event( // UI command semantics
544544
break;
545545

546546
default:
547-
sprintf(msg, "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
547+
snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
548+
cmd_event, new_value);
548549
image_win->AddMessage(msg);
549550
break;
550551
}

ccmain/thresholder.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,8 @@ void ImageThresholder::ThresholdRectToPix(Pix* src_pix,
311311
for (int x = 0; x < rect_width_; ++x) {
312312
bool white_result = true;
313313
for (int ch = 0; ch < num_channels; ++ch) {
314-
int pixel = GET_DATA_BYTE(linedata,
315-
(x + rect_left_) * num_channels + ch);
314+
int pixel =
315+
GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch);
316316
if (hi_values[ch] >= 0 &&
317317
(pixel > thresholds[ch]) == (hi_values[ch] == 0)) {
318318
white_result = false;

ccstruct/boxread.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
206206
// Validate UTF8 by making unichars with it.
207207
int used = 0;
208208
while (used < uch_len) {
209-
UNICHAR ch(uch + used, uch_len - used);
209+
tesseract::UNICHAR ch(uch + used, uch_len - used);
210210
int new_used = ch.utf8_len();
211211
if (new_used == 0) {
212212
tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n",

ccstruct/coutln.cpp

+4-16
Original file line numberDiff line numberDiff line change
@@ -652,22 +652,10 @@ static void ComputeGradient(const l_uint32* data, int wpl,
652652
int x, int y, int width, int height,
653653
ICOORD* gradient) {
654654
const l_uint32* line = data + y * wpl;
655-
int pix_x_y =
656-
x < width && y < height
657-
? GET_DATA_BYTE(line, x)
658-
: 255;
659-
int pix_x_prevy =
660-
x < width && y > 0
661-
? GET_DATA_BYTE(line - wpl, x)
662-
: 255;
663-
int pix_prevx_prevy =
664-
x > 0 && y > 0
665-
? GET_DATA_BYTE(line - wpl, x - 1)
666-
: 255;
667-
int pix_prevx_y =
668-
x > 0 && y < height
669-
? GET_DATA_BYTE(line, x - 1)
670-
: 255;
655+
int pix_x_y = x < width && y < height ? GET_DATA_BYTE(line, x) : 255;
656+
int pix_x_prevy = x < width && y > 0 ? GET_DATA_BYTE(line - wpl, x) : 255;
657+
int pix_prevx_prevy = x > 0 && y > 0 ? GET_DATA_BYTE(line - wpl, x - 1) : 255;
658+
int pix_prevx_y = x > 0 && y < height ? GET_DATA_BYTE(line, x - 1) : 255;
671659
gradient->set_x(pix_x_y + pix_x_prevy - (pix_prevx_y + pix_prevx_prevy));
672660
gradient->set_y(pix_x_prevy + pix_prevx_prevy - (pix_x_y + pix_prevx_y));
673661
}

ccstruct/coutln.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/**********************************************************************
2-
* File: coutln.h (Formerly: coutline.c)
3-
* Description: Code for the C_OUTLINE class.
4-
* Author: Ray Smith
2+
* File: coutln.h (Formerly: coutline.c)
3+
* Description: Code for the C_OUTLINE class.
4+
* Author: Ray Smith
55
* Created: Mon Oct 07 16:01:57 BST 1991
66
*
77
* (C) Copyright 1991, Hewlett-Packard Ltd.

0 commit comments

Comments
 (0)