@@ -809,9 +809,7 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
809
809
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End
810
810
* DetectOS, or anything else that changes the internal PAGE_RES.
811
811
*/
812
- PageIterator* TessBaseAPI::AnalyseLayout () {
813
- return AnalyseLayout (false );
814
- }
812
+ PageIterator* TessBaseAPI::AnalyseLayout () { return AnalyseLayout (false ); }
815
813
816
814
PageIterator* TessBaseAPI::AnalyseLayout (bool merge_similar_words) {
817
815
if (FindLines () == 0 ) {
@@ -1379,8 +1377,9 @@ static void AddBaselineCoordsTohOCR(const PageIterator *it,
1379
1377
hocr_str->add_str_double (" " , round (p0 * 1000.0 ) / 1000.0 );
1380
1378
}
1381
1379
1382
- static void AddIdTohOCR (STRING* hocr_str, const std::string base, int num1, int num2) {
1383
- const unsigned long BUFSIZE = 64 ;
1380
+ static void AddIdTohOCR (STRING* hocr_str, const std::string base, int num1,
1381
+ int num2) {
1382
+ const size_t BUFSIZE = 64 ;
1384
1383
char id_buffer[BUFSIZE];
1385
1384
if (num2 >= 0 ) {
1386
1385
snprintf (id_buffer, BUFSIZE - 1 , " %s_%d_%d" , base.c_str (), num1, num2);
@@ -1393,8 +1392,7 @@ static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int
1393
1392
*hocr_str += " '" ;
1394
1393
}
1395
1394
1396
- static void AddBoxTohOCR (const ResultIterator *it,
1397
- PageIteratorLevel level,
1395
+ static void AddBoxTohOCR (const ResultIterator* it, PageIteratorLevel level,
1398
1396
STRING* hocr_str) {
1399
1397
int left, top, right, bottom;
1400
1398
it->BoundingBox (level, &left, &top, &right, &bottom);
@@ -1410,17 +1408,16 @@ static void AddBoxTohOCR(const ResultIterator *it,
1410
1408
// add custom height measures
1411
1409
float row_height, descenders, ascenders; // row attributes
1412
1410
it->RowAttributes (&row_height, &descenders, &ascenders);
1413
- // TODO: Do we want to limit these to a single decimal place?
1411
+ // TODO(rays) : Do we want to limit these to a single decimal place?
1414
1412
hocr_str->add_str_double (" ; x_size " , row_height);
1415
1413
hocr_str->add_str_double (" ; x_descenders " , descenders * -1 );
1416
1414
hocr_str->add_str_double (" ; x_ascenders " , ascenders);
1417
1415
}
1418
1416
*hocr_str += " \" >" ;
1419
1417
}
1420
1418
1421
- static void AddBoxToTSV (const PageIterator *it,
1422
- PageIteratorLevel level,
1423
- STRING* hocr_str) {
1419
+ static void AddBoxToTSV (const PageIterator* it, PageIteratorLevel level,
1420
+ STRING* hocr_str) {
1424
1421
int left, top, right, bottom;
1425
1422
it->BoundingBox (level, &left, &top, &right, &bottom);
1426
1423
hocr_str->add_str_int (" \t " , left);
@@ -1429,8 +1426,6 @@ static void AddBoxToTSV(const PageIterator *it,
1429
1426
hocr_str->add_str_int (" \t " , bottom - top);
1430
1427
}
1431
1428
1432
-
1433
-
1434
1429
/* *
1435
1430
* Make a HTML-formatted string with hOCR markup from the internal
1436
1431
* data structures.
@@ -1440,7 +1435,7 @@ static void AddBoxToTSV(const PageIterator *it,
1440
1435
* STL removed from original patch submission and refactored by rays.
1441
1436
*/
1442
1437
char * TessBaseAPI::GetHOCRText (int page_number) {
1443
- return GetHOCRText (NULL ,page_number);
1438
+ return GetHOCRText (NULL , page_number);
1444
1439
}
1445
1440
1446
1441
/* *
@@ -1452,13 +1447,12 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
1452
1447
* STL removed from original patch submission and refactored by rays.
1453
1448
*/
1454
1449
char * TessBaseAPI::GetHOCRText (ETEXT_DESC* monitor, int page_number) {
1455
- if (tesseract_ == NULL ||
1456
- (page_res_ == NULL && Recognize (monitor) < 0 ))
1450
+ if (tesseract_ == NULL || (page_res_ == NULL && Recognize (monitor) < 0 ))
1457
1451
return NULL ;
1458
1452
1459
1453
int lcnt = 1 , bcnt = 1 , pcnt = 1 , wcnt = 1 ;
1460
1454
int page_id = page_number + 1 ; // hOCR uses 1-based page numbers.
1461
- bool para_is_ltr = true ; // Default direction is LTR
1455
+ bool para_is_ltr = true ; // Default direction is LTR
1462
1456
const char * paragraph_lang = NULL ;
1463
1457
bool font_info = false ;
1464
1458
GetBoolVariable (" hocr_font_info" , &font_info);
@@ -1470,13 +1464,13 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1470
1464
1471
1465
#ifdef _WIN32
1472
1466
// convert input name from ANSI encoding to utf-8
1473
- int str16_len = MultiByteToWideChar (CP_ACP, 0 , input_file_-> string (), - 1 ,
1474
- NULL , 0 );
1467
+ int str16_len =
1468
+ MultiByteToWideChar (CP_ACP, 0 , input_file_-> string (), - 1 , NULL , 0 );
1475
1469
wchar_t *uni16_str = new WCHAR[str16_len];
1476
1470
str16_len = MultiByteToWideChar (CP_ACP, 0 , input_file_->string (), -1 ,
1477
1471
uni16_str, str16_len);
1478
- int utf8_len = WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, NULL ,
1479
- 0 , NULL , NULL );
1472
+ int utf8_len = WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, NULL , 0 ,
1473
+ NULL , NULL );
1480
1474
char *utf8_str = new char [utf8_len];
1481
1475
WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, utf8_str,
1482
1476
utf8_len, NULL , NULL );
@@ -1509,7 +1503,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1509
1503
1510
1504
// Open any new block/paragraph/textline.
1511
1505
if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
1512
- para_is_ltr = true ; // reset to default direction
1506
+ para_is_ltr = true ; // reset to default direction
1513
1507
hocr_str += " <div class='ocr_carea'" ;
1514
1508
AddIdTohOCR (&hocr_str, " block" , page_id, bcnt);
1515
1509
AddBoxTohOCR (res_it, RIL_BLOCK, &hocr_str);
@@ -1523,9 +1517,9 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1523
1517
AddIdTohOCR (&hocr_str, " par" , page_id, pcnt);
1524
1518
paragraph_lang = res_it->WordRecognitionLanguage ();
1525
1519
if (paragraph_lang) {
1526
- hocr_str += " lang='" ;
1527
- hocr_str += paragraph_lang;
1528
- hocr_str += " '" ;
1520
+ hocr_str += " lang='" ;
1521
+ hocr_str += paragraph_lang;
1522
+ hocr_str += " '" ;
1529
1523
}
1530
1524
AddBoxTohOCR (res_it, RIL_PARA, &hocr_str);
1531
1525
}
@@ -1567,8 +1561,12 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1567
1561
}
1568
1562
switch (res_it->WordDirection ()) {
1569
1563
// Only emit direction if different from current paragraph direction
1570
- case DIR_LEFT_TO_RIGHT: if (!para_is_ltr) hocr_str += " dir='ltr'" ; break ;
1571
- case DIR_RIGHT_TO_LEFT: if (para_is_ltr) hocr_str += " dir='rtl'" ; break ;
1564
+ case DIR_LEFT_TO_RIGHT:
1565
+ if (!para_is_ltr) hocr_str += " dir='ltr'" ;
1566
+ break ;
1567
+ case DIR_RIGHT_TO_LEFT:
1568
+ if (para_is_ltr) hocr_str += " dir='rtl'" ;
1569
+ break ;
1572
1570
case DIR_MIX:
1573
1571
case DIR_NEUTRAL:
1574
1572
default : // Do nothing.
@@ -1600,7 +1598,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1600
1598
if (last_word_in_para) {
1601
1599
hocr_str += " \n </p>\n " ;
1602
1600
pcnt++;
1603
- para_is_ltr = true ; // back to default direction
1601
+ para_is_ltr = true ; // back to default direction
1604
1602
}
1605
1603
if (last_word_in_block) {
1606
1604
hocr_str += " </div>\n " ;
@@ -1620,18 +1618,18 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1620
1618
* page_number is 0-based but will appear in the output as 1-based.
1621
1619
*/
1622
1620
char * TessBaseAPI::GetTSVText (int page_number) {
1623
- if (tesseract_ == NULL ||
1624
- (page_res_ == NULL && Recognize (NULL ) < 0 ))
1621
+ if (tesseract_ == NULL || (page_res_ == NULL && Recognize (NULL ) < 0 ))
1625
1622
return NULL ;
1626
1623
1627
1624
int lcnt = 1 , bcnt = 1 , pcnt = 1 , wcnt = 1 ;
1628
1625
int page_id = page_number + 1 ; // we use 1-based page numbers.
1629
1626
1630
1627
STRING tsv_str (" " );
1631
1628
1632
- int page_num = page_id, block_num = 0 , par_num = 0 , line_num = 0 , word_num = 0 ;
1629
+ int page_num = page_id, block_num = 0 , par_num = 0 , line_num = 0 ,
1630
+ word_num = 0 ;
1633
1631
1634
- tsv_str.add_str_int (" 1\t " , page_num); // level 1 - page
1632
+ tsv_str.add_str_int (" 1\t " , page_num); // level 1 - page
1635
1633
tsv_str.add_str_int (" \t " , block_num);
1636
1634
tsv_str.add_str_int (" \t " , par_num);
1637
1635
tsv_str.add_str_int (" \t " , line_num);
@@ -1642,7 +1640,7 @@ char* TessBaseAPI::GetTSVText(int page_number) {
1642
1640
tsv_str.add_str_int (" \t " , rect_height_);
1643
1641
tsv_str += " \t -1\t\n " ;
1644
1642
1645
- ResultIterator * res_it = GetIterator ();
1643
+ ResultIterator* res_it = GetIterator ();
1646
1644
while (!res_it->Empty (RIL_BLOCK)) {
1647
1645
if (res_it->Empty (RIL_WORD)) {
1648
1646
res_it->Next (RIL_WORD);
@@ -1652,46 +1650,46 @@ char* TessBaseAPI::GetTSVText(int page_number) {
1652
1650
// Add rows for any new block/paragraph/textline.
1653
1651
if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
1654
1652
block_num++, par_num = 0 , line_num = 0 , word_num = 0 ;
1655
- tsv_str.add_str_int (" 2\t " , page_num); // level 2 - block
1653
+ tsv_str.add_str_int (" 2\t " , page_num); // level 2 - block
1656
1654
tsv_str.add_str_int (" \t " , block_num);
1657
1655
tsv_str.add_str_int (" \t " , par_num);
1658
1656
tsv_str.add_str_int (" \t " , line_num);
1659
1657
tsv_str.add_str_int (" \t " , word_num);
1660
1658
AddBoxToTSV (res_it, RIL_BLOCK, &tsv_str);
1661
- tsv_str += " \t -1\t\n " ; // end of row for block
1659
+ tsv_str += " \t -1\t\n " ; // end of row for block
1662
1660
}
1663
1661
if (res_it->IsAtBeginningOf (RIL_PARA)) {
1664
1662
par_num++, line_num = 0 , word_num = 0 ;
1665
- tsv_str.add_str_int (" 3\t " , page_num); // level 3 - paragraph
1663
+ tsv_str.add_str_int (" 3\t " , page_num); // level 3 - paragraph
1666
1664
tsv_str.add_str_int (" \t " , block_num);
1667
1665
tsv_str.add_str_int (" \t " , par_num);
1668
1666
tsv_str.add_str_int (" \t " , line_num);
1669
1667
tsv_str.add_str_int (" \t " , word_num);
1670
1668
AddBoxToTSV (res_it, RIL_PARA, &tsv_str);
1671
- tsv_str += " \t -1\t\n " ; // end of row for para
1669
+ tsv_str += " \t -1\t\n " ; // end of row for para
1672
1670
}
1673
1671
if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
1674
1672
line_num++, word_num = 0 ;
1675
- tsv_str.add_str_int (" 4\t " , page_num); // level 4 - line
1673
+ tsv_str.add_str_int (" 4\t " , page_num); // level 4 - line
1676
1674
tsv_str.add_str_int (" \t " , block_num);
1677
1675
tsv_str.add_str_int (" \t " , par_num);
1678
1676
tsv_str.add_str_int (" \t " , line_num);
1679
1677
tsv_str.add_str_int (" \t " , word_num);
1680
1678
AddBoxToTSV (res_it, RIL_TEXTLINE, &tsv_str);
1681
- tsv_str += " \t -1\t\n " ; // end of row for line
1679
+ tsv_str += " \t -1\t\n " ; // end of row for line
1682
1680
}
1683
1681
1684
1682
// Now, process the word...
1685
1683
int left, top, right, bottom;
1686
1684
bool bold , italic , underlined, monospace, serif, smallcaps;
1687
1685
int pointsize, font_id;
1688
- const char * font_name;
1686
+ const char * font_name;
1689
1687
res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
1690
- font_name = res_it-> WordFontAttributes (& bold , & italic , &underlined,
1691
- &monospace , &serif , &smallcaps ,
1692
- &pointsize, &font_id);
1688
+ font_name =
1689
+ res_it-> WordFontAttributes (& bold , & italic , &underlined , &monospace ,
1690
+ &serif, &smallcaps, &pointsize, &font_id);
1693
1691
word_num++;
1694
- tsv_str.add_str_int (" 5\t " , page_num); // level 5 - word
1692
+ tsv_str.add_str_int (" 5\t " , page_num); // level 5 - word
1695
1693
tsv_str.add_str_int (" \t " , block_num);
1696
1694
tsv_str.add_str_int (" \t " , par_num);
1697
1695
tsv_str.add_str_int (" \t " , line_num);
@@ -1712,11 +1710,11 @@ char* TessBaseAPI::GetTSVText(int page_number) {
1712
1710
tsv_str += res_it->GetUTF8Text (RIL_SYMBOL);
1713
1711
res_it->Next (RIL_SYMBOL);
1714
1712
} while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_WORD));
1715
- tsv_str += " \n " ; // end of row
1713
+ tsv_str += " \n " ; // end of row
1716
1714
wcnt++;
1717
1715
}
1718
1716
1719
- char * ret = new char [tsv_str.length () + 1 ];
1717
+ char * ret = new char [tsv_str.length () + 1 ];
1720
1718
strcpy (ret, tsv_str.string ());
1721
1719
delete res_it;
1722
1720
return ret;
@@ -1760,7 +1758,7 @@ char* TessBaseAPI::GetBoxText(int page_number) {
1760
1758
int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
1761
1759
kMaxBytesPerLine ;
1762
1760
char * result = new char [total_length];
1763
- strcpy ( result, " \0 " ) ;
1761
+ result[ 0 ] = ' \0 ' ;
1764
1762
int output_length = 0 ;
1765
1763
LTRResultIterator* it = GetLTRIterator ();
1766
1764
do {
@@ -1907,17 +1905,17 @@ char* TessBaseAPI::GetUNLVText() {
1907
1905
return result;
1908
1906
}
1909
1907
1910
- /* *
1911
- * The recognized text is returned as a char* which is coded
1912
- * as UTF8 and must be freed with the delete [] operator.
1913
- * page_number is a 0-based page index that will appear in the osd file.
1914
- */
1908
+ /* *
1909
+ * The recognized text is returned as a char* which is coded
1910
+ * as UTF8 and must be freed with the delete [] operator.
1911
+ * page_number is a 0-based page index that will appear in the osd file.
1912
+ */
1915
1913
char * TessBaseAPI::GetOsdText (int page_number) {
1916
1914
OSResults osr;
1917
1915
1918
1916
bool osd = DetectOS (&osr);
1919
1917
if (!osd) {
1920
- return NULL ;
1918
+ return NULL ;
1921
1919
}
1922
1920
1923
1921
int orient_id = osr.best_result .orientation_id ;
@@ -1931,19 +1929,19 @@ char* TessBaseAPI::GetOsdText(int page_number) {
1931
1929
int orient_deg = orient_id * 90 ;
1932
1930
1933
1931
// clockwise rotation needed to make the page upright
1934
- int rotate = OrientationIdToValue (orient_id);
1935
-
1936
- char * osd_buf = new char [ 255 ] ;
1937
- snprintf ( osd_buf, 255 ,
1938
- " Page number: %d \n "
1939
- " Orientation in degrees : %d\n "
1940
- " Rotate : %d\n "
1941
- " Orientation confidence : %.2f \n "
1942
- " Script : %s \n "
1943
- " Script confidence : %.2f \n " ,
1944
- page_number ,
1945
- orient_deg, rotate, orient_conf,
1946
- script_name, script_conf);
1932
+ int rotate = OrientationIdToValue (orient_id);
1933
+
1934
+ const int kOsdBufsize = 255 ;
1935
+ char * osd_buf = new char [ kOsdBufsize ];
1936
+ snprintf (osd_buf, kOsdBufsize ,
1937
+ " Page number : %d\n "
1938
+ " Orientation in degrees : %d\n "
1939
+ " Rotate : %d \n "
1940
+ " Orientation confidence : %.2f \n "
1941
+ " Script: %s \n "
1942
+ " Script confidence: %.2f \n " ,
1943
+ page_number, orient_deg, rotate, orient_conf, script_name ,
1944
+ script_conf);
1947
1945
1948
1946
return osd_buf;
1949
1947
}
@@ -2063,7 +2061,7 @@ void TessBaseAPI::Clear() {
2063
2061
if (thresholder_ != NULL )
2064
2062
thresholder_->Clear ();
2065
2063
ClearResults ();
2066
- SetInputImage (NULL );
2064
+ if (tesseract_ != NULL ) SetInputImage (NULL );
2067
2065
}
2068
2066
2069
2067
/* *
@@ -2767,7 +2765,7 @@ void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob,
2767
2765
INT_FX_RESULT_STRUCT fx_info;
2768
2766
tesseract_->ExtractFeatures (*blob, false , &bl_features,
2769
2767
&cn_features, &fx_info, &outline_counts);
2770
- if (cn_features.size () == 0 || cn_features.size () > MAX_NUM_INT_FEATURES) {
2768
+ if (cn_features.empty () || cn_features.size () > MAX_NUM_INT_FEATURES) {
2771
2769
*num_features = 0 ;
2772
2770
return ; // Feature extraction failed.
2773
2771
}
0 commit comments