@@ -1340,363 +1340,14 @@ char* TessBaseAPI::GetUTF8Text() {
1340
1340
return result;
1341
1341
}
1342
1342
1343
- /* *
1344
- * Gets the block orientation at the current iterator position.
1345
- */
1346
- static tesseract::Orientation GetBlockTextOrientation (const PageIterator *it) {
1347
- tesseract::Orientation orientation;
1348
- tesseract::WritingDirection writing_direction;
1349
- tesseract::TextlineOrder textline_order;
1350
- float deskew_angle;
1351
- it->Orientation (&orientation, &writing_direction, &textline_order,
1352
- &deskew_angle);
1353
- return orientation;
1354
- }
1355
-
1356
- /* *
1357
- * Fits a line to the baseline at the given level, and appends its coefficients
1358
- * to the hOCR string.
1359
- * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
1360
- * rotated textlines. For this reason, on textlines that are not upright, this
1361
- * method currently only inserts a 'textangle' property to indicate the rotation
1362
- * direction and does not add any baseline information to the hocr string.
1363
- */
1364
- static void AddBaselineCoordsTohOCR (const PageIterator *it,
1365
- PageIteratorLevel level,
1366
- STRING* hocr_str) {
1367
- tesseract::Orientation orientation = GetBlockTextOrientation (it);
1368
- if (orientation != ORIENTATION_PAGE_UP) {
1369
- hocr_str->add_str_int (" ; textangle " , 360 - orientation * 90 );
1370
- return ;
1371
- }
1372
-
1373
- int left, top, right, bottom;
1374
- it->BoundingBox (level, &left, &top, &right, &bottom);
1375
-
1376
- // Try to get the baseline coordinates at this level.
1377
- int x1, y1 , x2, y2;
1378
- if (!it->Baseline (level, &x1, &y1 , &x2, &y2))
1379
- return ;
1380
- // Following the description of this field of the hOCR spec, we convert the
1381
- // baseline coordinates so that "the bottom left of the bounding box is the
1382
- // origin".
1383
- x1 -= left;
1384
- x2 -= left;
1385
- y1 -= bottom;
1386
- y2 -= bottom;
1387
-
1388
- // Now fit a line through the points so we can extract coefficients for the
1389
- // equation: y = p1 x + p0
1390
- double p1 = 0 ;
1391
- double p0 = 0 ;
1392
- if (x1 == x2) {
1393
- // Problem computing the polynomial coefficients.
1394
- return ;
1395
- }
1396
- p1 = (y2 - y1 ) / static_cast <double >(x2 - x1);
1397
- p0 = y1 - static_cast <double >(p1 * x1);
1398
-
1399
- hocr_str->add_str_double (" ; baseline " , round (p1 * 1000.0 ) / 1000.0 );
1400
- hocr_str->add_str_double (" " , round (p0 * 1000.0 ) / 1000.0 );
1401
- }
1402
-
1403
- static void AddIdTohOCR (STRING* hocr_str, const std::string base, int num1,
1404
- int num2) {
1405
- const size_t BUFSIZE = 64 ;
1406
- char id_buffer[BUFSIZE];
1407
- if (num2 >= 0 ) {
1408
- snprintf (id_buffer, BUFSIZE - 1 , " %s_%d_%d" , base.c_str (), num1, num2);
1409
- } else {
1410
- snprintf (id_buffer, BUFSIZE - 1 , " %s_%d" , base.c_str (), num1);
1411
- }
1412
- id_buffer[BUFSIZE - 1 ] = ' \0 ' ;
1413
- *hocr_str += " id='" ;
1414
- *hocr_str += id_buffer;
1415
- *hocr_str += " '" ;
1416
- }
1417
-
1418
- static void AddIdTohOCR (STRING* hocr_str, const std::string base, int num1,
1419
- int num2, int num3) {
1420
- const size_t BUFSIZE = 64 ;
1421
- char id_buffer[BUFSIZE];
1422
- snprintf (id_buffer, BUFSIZE - 1 , " %s_%d_%d_%d" , base.c_str (), num1, num2,num3);
1423
- id_buffer[BUFSIZE - 1 ] = ' \0 ' ;
1424
- *hocr_str += " id='" ;
1425
- *hocr_str += id_buffer;
1426
- *hocr_str += " '" ;
1427
- }
1428
-
1429
- static void AddBoxTohOCR (const ResultIterator* it, PageIteratorLevel level,
1430
- STRING* hocr_str) {
1431
- int left, top, right, bottom;
1432
- it->BoundingBox (level, &left, &top, &right, &bottom);
1433
- // This is the only place we use double quotes instead of single quotes,
1434
- // but it may too late to change for consistency
1435
- hocr_str->add_str_int (" title=\" bbox " , left);
1436
- hocr_str->add_str_int (" " , top);
1437
- hocr_str->add_str_int (" " , right);
1438
- hocr_str->add_str_int (" " , bottom);
1439
- // Add baseline coordinates & heights for textlines only.
1440
- if (level == RIL_TEXTLINE) {
1441
- AddBaselineCoordsTohOCR (it, level, hocr_str);
1442
- // add custom height measures
1443
- float row_height, descenders, ascenders; // row attributes
1444
- it->RowAttributes (&row_height, &descenders, &ascenders);
1445
- // TODO(rays): Do we want to limit these to a single decimal place?
1446
- hocr_str->add_str_double (" ; x_size " , row_height);
1447
- hocr_str->add_str_double (" ; x_descenders " , descenders * -1 );
1448
- hocr_str->add_str_double (" ; x_ascenders " , ascenders);
1449
- }
1450
- *hocr_str += " \" >" ;
1451
- }
1452
-
1453
1343
// Appends the bounding box of the element at `level` to the output string
// as four integer fields: left, top, width (right - left) and height
// (bottom - top). Each field is written through add_str_int with the
// " \t " literal as its prefix.
// The "-" and "+" parameter lines below differ only in the name of the
// STRING* output argument (hocr_str vs. text).
static void AddBoxToTSV (const PageIterator* it, PageIteratorLevel level,
1454
- STRING* hocr_str ) {
1344
+ STRING* text ) {
1455
1345
int left, top, right, bottom;
1456
1346
it->BoundingBox (level, &left, &top, &right, &bottom);
1457
// Width and height are derived from the box corners; right and bottom
// themselves are not written.
- hocr_str->add_str_int (" \t " , left);
1458
- hocr_str->add_str_int (" \t " , top);
1459
- hocr_str->add_str_int (" \t " , right - left);
1460
- hocr_str->add_str_int (" \t " , bottom - top);
1461
- }
1462
-
1463
- /* *
1464
- * Make a HTML-formatted string with hOCR markup from the internal
1465
- * data structures.
1466
- * page_number is 0-based but will appear in the output as 1-based.
1467
- * Image name/input_file_ can be set by SetInputName before calling
1468
- * GetHOCRText
1469
- * STL removed from original patch submission and refactored by rays.
1470
- * Returned string must be freed with the delete [] operator.
1471
- */
1472
- char * TessBaseAPI::GetHOCRText (int page_number) {
1473
- return GetHOCRText (nullptr , page_number);
1474
- }
1475
-
1476
- /* *
1477
- * Make a HTML-formatted string with hOCR markup from the internal
1478
- * data structures.
1479
- * page_number is 0-based but will appear in the output as 1-based.
1480
- * Image name/input_file_ can be set by SetInputName before calling
1481
- * GetHOCRText
1482
- * STL removed from original patch submission and refactored by rays.
1483
- * Returned string must be freed with the delete [] operator.
1484
- */
1485
- char * TessBaseAPI::GetHOCRText (ETEXT_DESC* monitor, int page_number) {
1486
- if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize (monitor) < 0 ))
1487
- return nullptr ;
1488
-
1489
- int lcnt = 1 , bcnt = 1 , pcnt = 1 , wcnt = 1 , tcnt = 1 , gcnt = 1 ;
1490
- int page_id = page_number + 1 ; // hOCR uses 1-based page numbers.
1491
- bool para_is_ltr = true ; // Default direction is LTR
1492
- const char * paragraph_lang = nullptr ;
1493
- bool font_info = false ;
1494
- GetBoolVariable (" hocr_font_info" , &font_info);
1495
-
1496
- STRING hocr_str (" " );
1497
-
1498
- if (input_file_ == nullptr )
1499
- SetInputName (nullptr );
1500
-
1501
- #ifdef _WIN32
1502
- // convert input name from ANSI encoding to utf-8
1503
- int str16_len =
1504
- MultiByteToWideChar (CP_ACP, 0 , input_file_->string (), -1 , nullptr , 0 );
1505
- wchar_t *uni16_str = new WCHAR[str16_len];
1506
- str16_len = MultiByteToWideChar (CP_ACP, 0 , input_file_->string (), -1 ,
1507
- uni16_str, str16_len);
1508
- int utf8_len = WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, nullptr , 0 ,
1509
- nullptr , nullptr );
1510
- char *utf8_str = new char [utf8_len];
1511
- WideCharToMultiByte (CP_UTF8, 0 , uni16_str, str16_len, utf8_str,
1512
- utf8_len, nullptr , nullptr );
1513
- *input_file_ = utf8_str;
1514
- delete[] uni16_str;
1515
- delete[] utf8_str;
1516
- #endif
1517
-
1518
- hocr_str += " <div class='ocr_page'" ;
1519
- AddIdTohOCR (&hocr_str, " page" , page_id, -1 );
1520
- hocr_str += " title='image \" " ;
1521
- if (input_file_) {
1522
- hocr_str += HOcrEscape (input_file_->string ());
1523
- } else {
1524
- hocr_str += " unknown" ;
1525
- }
1526
- hocr_str.add_str_int (" \" ; bbox " , rect_left_);
1527
- hocr_str.add_str_int (" " , rect_top_);
1528
- hocr_str.add_str_int (" " , rect_width_);
1529
- hocr_str.add_str_int (" " , rect_height_);
1530
- hocr_str.add_str_int (" ; ppageno " , page_number);
1531
- hocr_str += " '>\n " ;
1532
-
1533
- ResultIterator *res_it = GetIterator ();
1534
- while (!res_it->Empty (RIL_BLOCK)) {
1535
- if (res_it->Empty (RIL_WORD)) {
1536
- res_it->Next (RIL_WORD);
1537
- continue ;
1538
- }
1539
-
1540
- // Open any new block/paragraph/textline.
1541
- if (res_it->IsAtBeginningOf (RIL_BLOCK)) {
1542
- para_is_ltr = true ; // reset to default direction
1543
- hocr_str += " <div class='ocr_carea'" ;
1544
- AddIdTohOCR (&hocr_str, " block" , page_id, bcnt);
1545
- AddBoxTohOCR (res_it, RIL_BLOCK, &hocr_str);
1546
- }
1547
- if (res_it->IsAtBeginningOf (RIL_PARA)) {
1548
- hocr_str += " \n <p class='ocr_par'" ;
1549
- para_is_ltr = res_it->ParagraphIsLtr ();
1550
- if (!para_is_ltr) {
1551
- hocr_str += " dir='rtl'" ;
1552
- }
1553
- AddIdTohOCR (&hocr_str, " par" , page_id, pcnt);
1554
- paragraph_lang = res_it->WordRecognitionLanguage ();
1555
- if (paragraph_lang) {
1556
- hocr_str += " lang='" ;
1557
- hocr_str += paragraph_lang;
1558
- hocr_str += " '" ;
1559
- }
1560
- AddBoxTohOCR (res_it, RIL_PARA, &hocr_str);
1561
- }
1562
- if (res_it->IsAtBeginningOf (RIL_TEXTLINE)) {
1563
- hocr_str += " \n <span class='ocr_line'" ;
1564
- AddIdTohOCR (&hocr_str, " line" , page_id, lcnt);
1565
- AddBoxTohOCR (res_it, RIL_TEXTLINE, &hocr_str);
1566
- }
1567
-
1568
- // Now, process the word...
1569
- std::vector<std::vector<std::pair<const char *, float >>>* confidencemap = nullptr ;
1570
- if (tesseract_->lstm_choice_mode ) {
1571
- confidencemap = res_it->GetBestLSTMSymbolChoices ();
1572
- }
1573
- hocr_str += " \n <span class='ocrx_word'" ;
1574
- AddIdTohOCR (&hocr_str, " word" , page_id, wcnt);
1575
- int left, top, right, bottom;
1576
- bool bold , italic , underlined, monospace, serif, smallcaps;
1577
- int pointsize, font_id;
1578
- const char *font_name;
1579
- res_it->BoundingBox (RIL_WORD, &left, &top, &right, &bottom);
1580
- font_name = res_it->WordFontAttributes (&bold , &italic , &underlined,
1581
- &monospace, &serif, &smallcaps,
1582
- &pointsize, &font_id);
1583
- hocr_str.add_str_int (" title='bbox " , left);
1584
- hocr_str.add_str_int (" " , top);
1585
- hocr_str.add_str_int (" " , right);
1586
- hocr_str.add_str_int (" " , bottom);
1587
- hocr_str.add_str_int (" ; x_wconf " , res_it->Confidence (RIL_WORD));
1588
- if (font_info) {
1589
- if (font_name) {
1590
- hocr_str += " ; x_font " ;
1591
- hocr_str += HOcrEscape (font_name);
1592
- }
1593
- hocr_str.add_str_int (" ; x_fsize " , pointsize);
1594
- }
1595
- hocr_str += " '" ;
1596
- const char * lang = res_it->WordRecognitionLanguage ();
1597
- if (lang && (!paragraph_lang || strcmp (lang, paragraph_lang))) {
1598
- hocr_str += " lang='" ;
1599
- hocr_str += lang;
1600
- hocr_str += " '" ;
1601
- }
1602
- switch (res_it->WordDirection ()) {
1603
- // Only emit direction if different from current paragraph direction
1604
- case DIR_LEFT_TO_RIGHT:
1605
- if (!para_is_ltr) hocr_str += " dir='ltr'" ;
1606
- break ;
1607
- case DIR_RIGHT_TO_LEFT:
1608
- if (para_is_ltr) hocr_str += " dir='rtl'" ;
1609
- break ;
1610
- case DIR_MIX:
1611
- case DIR_NEUTRAL:
1612
- default : // Do nothing.
1613
- break ;
1614
- }
1615
- hocr_str += " >" ;
1616
- bool last_word_in_line = res_it->IsAtFinalElement (RIL_TEXTLINE, RIL_WORD);
1617
- bool last_word_in_para = res_it->IsAtFinalElement (RIL_PARA, RIL_WORD);
1618
- bool last_word_in_block = res_it->IsAtFinalElement (RIL_BLOCK, RIL_WORD);
1619
- if (bold ) hocr_str += " <strong>" ;
1620
- if (italic ) hocr_str += " <em>" ;
1621
- do {
1622
- const std::unique_ptr<const char []> grapheme (
1623
- res_it->GetUTF8Text (RIL_SYMBOL));
1624
- if (grapheme && grapheme[0 ] != 0 ) {
1625
- hocr_str += HOcrEscape (grapheme.get ());
1626
- }
1627
- res_it->Next (RIL_SYMBOL);
1628
- } while (!res_it->Empty (RIL_BLOCK) && !res_it->IsAtBeginningOf (RIL_WORD));
1629
- if (italic ) hocr_str += " </em>" ;
1630
- if (bold ) hocr_str += " </strong>" ;
1631
- // If the lstm choice mode is required it is added here
1632
- if (tesseract_->lstm_choice_mode == 1 && confidencemap != nullptr ) {
1633
- for (size_t i = 0 ; i < confidencemap->size (); i++) {
1634
- hocr_str += " \n <span class='ocrx_cinfo'" ;
1635
- AddIdTohOCR (&hocr_str, " timestep" , page_id, wcnt, tcnt);
1636
- hocr_str += " >" ;
1637
- std::vector<std::pair<const char *, float >> timestep = (*confidencemap)[i];
1638
- for (std::pair<const char *, float > conf : timestep) {
1639
- hocr_str += " <span class='ocr_glyph'" ;
1640
- AddIdTohOCR (&hocr_str, " choice" , page_id, wcnt, gcnt);
1641
- hocr_str.add_str_int (" title='x_confs " , int (conf.second * 100 ));
1642
- hocr_str += " '" ;
1643
- hocr_str += " >" ;
1644
- hocr_str += conf.first ;
1645
- hocr_str += " </span>" ;
1646
- gcnt++;
1647
- }
1648
- hocr_str += " </span>" ;
1649
- tcnt++;
1650
- }
1651
- } else if (tesseract_->lstm_choice_mode == 2 && confidencemap != nullptr ) {
1652
- for (size_t i = 0 ; i < confidencemap->size (); i++) {
1653
- std::vector<std::pair<const char *, float >> timestep = (*confidencemap)[i];
1654
- if (timestep.size () > 0 ) {
1655
- hocr_str += " \n <span class='ocrx_cinfo'" ;
1656
- AddIdTohOCR (&hocr_str, " lstm_choices" , page_id, wcnt, tcnt);
1657
- hocr_str += " chosen='" ;
1658
- hocr_str += timestep[0 ].first ;
1659
- hocr_str += " '>" ;
1660
- for (size_t j = 1 ; j < timestep.size (); j++) {
1661
- hocr_str += " <span class='ocr_glyph'" ;
1662
- AddIdTohOCR (&hocr_str, " choice" , page_id, wcnt, gcnt);
1663
- hocr_str.add_str_int (" title='x_confs " , int (timestep[j].second * 100 ));
1664
- hocr_str += " '" ;
1665
- hocr_str += " >" ;
1666
- hocr_str += timestep[j].first ;
1667
- hocr_str += " </span>" ;
1668
- gcnt++;
1669
- }
1670
- hocr_str += " </span>" ;
1671
- tcnt++;
1672
- }
1673
- }
1674
- }
1675
- hocr_str += " </span>" ;
1676
- tcnt = 1 ;
1677
- gcnt = 1 ;
1678
- wcnt++;
1679
- // Close any ending block/paragraph/textline.
1680
- if (last_word_in_line) {
1681
- hocr_str += " \n </span>" ;
1682
- lcnt++;
1683
- }
1684
- if (last_word_in_para) {
1685
- hocr_str += " \n </p>\n " ;
1686
- pcnt++;
1687
- para_is_ltr = true ; // back to default direction
1688
- }
1689
- if (last_word_in_block) {
1690
- hocr_str += " </div>\n " ;
1691
- bcnt++;
1692
- }
1693
- }
1694
- hocr_str += " </div>\n " ;
1695
-
1696
- char *ret = new char [hocr_str.length () + 1 ];
1697
- strcpy (ret, hocr_str.string ());
1698
- delete res_it;
1699
- return ret;
1347
+ text->add_str_int (" \t " , left);
1348
+ text->add_str_int (" \t " , top);
1349
+ text->add_str_int (" \t " , right - left);
1350
+ text->add_str_int (" \t " , bottom - top);
1700
1351
}
1701
1352
1702
1353
/* *
0 commit comments