Skip to content

Commit effa574

Browse files
committed
Implement invisible text only for PDF
1 parent a979494 commit effa574

File tree

6 files changed

+40
-23
lines changed

6 files changed

+40
-23
lines changed

api/capi.cpp

+3-2
Original file line number | Diff line number | Diff line change
@@ -64,9 +64,10 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outpu
6464
return new TessHOcrRenderer(outputbase, font_info);
6565
}
6666

67-
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir)
67+
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir,
68+
BOOL textonly)
6869
{
69-
return new TessPDFRenderer(outputbase, datadir);
70+
return new TessPDFRenderer(outputbase, datadir, textonly);
7071
}
7172

7273
TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase)

api/pdfrenderer.cpp

+26-15
Original file line number | Diff line number | Diff line change
@@ -178,10 +178,12 @@ const int kCharWidth = 2;
178178
* PDF Renderer interface implementation
179179
**********************************************************************/
180180

181-
TessPDFRenderer::TessPDFRenderer(const char* outputbase, const char *datadir)
181+
TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir,
182+
bool textonly)
182183
: TessResultRenderer(outputbase, "pdf") {
183184
obj_ = 0;
184185
datadir_ = datadir;
186+
textonly_ = textonly;
185187
offsets_.push_back(0);
186188
}
187189

@@ -326,7 +328,11 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
326328
pdf_str.add_str_double("", prec(width));
327329
pdf_str += " 0 0 ";
328330
pdf_str.add_str_double("", prec(height));
329-
pdf_str += " 0 0 cm /Im1 Do Q\n";
331+
pdf_str += " 0 0 cm";
332+
if (!textonly_) {
333+
pdf_str += " /Im1 Do";
334+
}
335+
pdf_str += " Q\n";
330336

331337
int line_x1 = 0;
332338
int line_y1 = 0;
@@ -837,6 +843,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
837843
bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
838844
size_t n;
839845
char buf[kBasicBufSize];
846+
char buf2[kBasicBufSize];
840847
Pix *pix = api->GetInputImage();
841848
char *filename = (char *)api->GetInputName();
842849
int ppi = api->GetSourceYResolution();
@@ -845,6 +852,9 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
845852
double width = pixGetWidth(pix) * 72.0 / ppi;
846853
double height = pixGetHeight(pix) * 72.0 / ppi;
847854

855+
snprintf(buf2, sizeof(buf2), "XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
856+
const char *xobject = (textonly_) ? "" : buf2;
857+
848858
// PAGE
849859
n = snprintf(buf, sizeof(buf),
850860
"%ld 0 obj\n"
@@ -855,19 +865,18 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
855865
" /Contents %ld 0 R\n"
856866
" /Resources\n"
857867
" <<\n"
858-
" /XObject << /Im1 %ld 0 R >>\n"
868+
" %s"
859869
" /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
860870
" /Font << /f-0-0 %ld 0 R >>\n"
861871
" >>\n"
862872
">>\n"
863873
"endobj\n",
864874
obj_,
865-
2L, // Pages object
866-
width,
867-
height,
868-
obj_ + 1, // Contents object
869-
obj_ + 2, // Image object
870-
3L); // Type0 Font
875+
2L, // Pages object
876+
width, height,
877+
obj_ + 1, // Contents object
878+
xobject, // Image object
879+
3L); // Type0 Font
871880
if (n >= sizeof(buf)) return false;
872881
pages_.push_back(obj_);
873882
AppendPDFObject(buf);
@@ -904,13 +913,15 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
904913
objsize += strlen(b2);
905914
AppendPDFObjectDIY(objsize);
906915

907-
char *pdf_object;
908-
if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
909-
return false;
916+
if (!textonly_) {
917+
char *pdf_object = nullptr;
918+
if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
919+
return false;
920+
}
921+
AppendData(pdf_object, objsize);
922+
AppendPDFObjectDIY(objsize);
923+
delete[] pdf_object;
910924
}
911-
AppendData(pdf_object, objsize);
912-
AppendPDFObjectDIY(objsize);
913-
delete[] pdf_object;
914925
return true;
915926
}
916927

api/renderer.h

+4-4
Original file line number | Diff line number | Diff line change
@@ -186,7 +186,7 @@ class TESS_API TessPDFRenderer : public TessResultRenderer {
186186
public:
187187
// datadir is the location of the TESSDATA. We need it because
188188
// we load a custom PDF font from this location.
189-
TessPDFRenderer(const char *outputbase, const char *datadir);
189+
TessPDFRenderer(const char* outputbase, const char* datadir, bool textonly);
190190

191191
protected:
192192
virtual bool BeginDocumentHandler();
@@ -196,20 +196,20 @@ class TESS_API TessPDFRenderer : public TessResultRenderer {
196196
private:
197197
// We don't want to have every image in memory at once,
198198
// so we store some metadata as we go along producing
199-
// PDFs one page at a time. At the end that metadata is
199+
// PDFs one page at a time. At the end, that metadata is
200200
// used to make everything that isn't easily handled in a
201201
// streaming fashion.
202202
long int obj_; // counter for PDF objects
203203
GenericVector<long int> offsets_; // offset of every PDF object in bytes
204204
GenericVector<long int> pages_; // object number for every /Page object
205205
const char *datadir_; // where to find the custom font
206+
bool textonly_; // skip images if set
206207
// Bookkeeping only. DIY = Do It Yourself.
207208
void AppendPDFObjectDIY(size_t objectsize);
208209
// Bookkeeping + emit data.
209210
void AppendPDFObject(const char *data);
210211
// Create the /Contents object for an entire page.
211-
static char* GetPDFTextObjects(TessBaseAPI* api,
212-
double width, double height);
212+
char* GetPDFTextObjects(TessBaseAPI* api, double width, double height);
213213
// Turn an image into a PDF object. Only transcode if we have to.
214214
static bool imageToPDFObj(Pix *pix, char *filename, long int objnum,
215215
char **pdf_object, long int *pdf_object_size);

api/tesseractmain.cpp

+4-2
Original file line number | Diff line number | Diff line change
@@ -348,8 +348,10 @@ void PreloadRenderers(
348348

349349
api->GetBoolVariable("tessedit_create_pdf", &b);
350350
if (b) {
351-
renderers->push_back(
352-
new tesseract::TessPDFRenderer(outputbase, api->GetDatapath()));
351+
bool textonly;
352+
api->GetBoolVariable("textonly_pdf", &textonly);
353+
renderers->push_back(new tesseract::TessPDFRenderer(
354+
outputbase, api->GetDatapath(), textonly));
353355
}
354356

355357
api->GetBoolVariable("tessedit_write_unlv", &b);

ccmain/tesseractclass.cpp

+2
Original file line number | Diff line number | Diff line change
@@ -389,6 +389,8 @@ Tesseract::Tesseract()
389389
this->params()),
390390
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
391391
this->params()),
392+
BOOL_MEMBER(textonly_pdf, false, "Invisible text only for PDF",
393+
this->params()),
392394
STRING_MEMBER(unrecognised_char, "|",
393395
"Output char for unidentified blobs", this->params()),
394396
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),

ccmain/tesseractclass.h

+1
Original file line number | Diff line number | Diff line change
@@ -1026,6 +1026,7 @@ class Tesseract : public Wordrec {
10261026
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
10271027
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
10281028
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
1029+
BOOL_VAR_H(textonly_pdf, false, "Invisible text only for PDF");
10291030
STRING_VAR_H(unrecognised_char, "|",
10301031
"Output char for unidentified blobs");
10311032
INT_VAR_H(suspect_level, 99, "Suspect marker level");

0 commit comments

Comments
 (0)