@@ -19,7 +19,7 @@
 import sys
 from datetime import date
 from tempfile import TemporaryDirectory, mkdtemp
-from pathlib import Path
+import pathlib
 import logging
 import subprocess
 import argparse
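
Note: switching from "from pathlib import Path" to "import pathlib" is what lets the workaround below refer to pathlib.WindowsPath, while every former Path(...) call becomes pathlib.Path(...). A minimal sketch of why the class check is portable (illustrative only, not part of the patch):

    import pathlib

    p = pathlib.Path("some") / "file.tif"
    # The WindowsPath class is defined on every platform, so the isinstance
    # check is portable: True on Windows, False on POSIX where Path() yields
    # a PosixPath.
    print(isinstance(p, pathlib.WindowsPath))
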
@@ -75,8 +75,13 @@ def run_command(cmd, *args, env=None):
         err_exit(f"{cmd} not found")
 
     log.debug(f"Running {cmd}")
-    for arg in args:
+    args = list(args)
+    for idx, arg in enumerate(args):
         log.debug(arg)
+        # Workaround for https://bugs.python.org/issue33617
+        # TypeError: argument of type 'WindowsPath' is not iterable
+        if isinstance(arg, pathlib.WindowsPath):
+            args[idx] = str(arg)
 
     proc = subprocess.run(
         [cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
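
Note on the new workaround: on affected Python builds for Windows, passing a WindowsPath object in the argument list of subprocess.run() raises "TypeError: argument of type 'WindowsPath' is not iterable" (https://bugs.python.org/issue33617). Converting such arguments to str before the call avoids the error and is harmless elsewhere. A minimal standalone sketch of the same idea (the command and paths are placeholders, not taken from the patch):

    import pathlib
    import subprocess

    outbase = pathlib.Path("training") / "eng.Arial.exp0"   # WindowsPath on Windows hosts
    args = ["--outputbase", outbase, "input.txt"]

    # Stringify any WindowsPath arguments before handing them to subprocess,
    # mirroring the loop added in run_command() above.
    args = [str(a) if isinstance(a, pathlib.WindowsPath) else a for a in args]
    subprocess.run(["echo", *args])   # placeholder command
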
@@ -96,11 +101,11 @@ def run_command(cmd, *args, env=None):
 # Used to check required input files and produced output files in each phase.
 # Usage: check_file_readable FILE1 FILE2...
 def check_file_readable(*filenames):
-    if isinstance(filenames, (str, Path)):
+    if isinstance(filenames, (str, pathlib.Path)):
         filenames = [filenames]
     for filename in filenames:
         try:
-            with Path(filename).open():
+            with pathlib.Path(filename).open():
                 pass
         except FileNotFoundError:
             err_exit(f"Required/expected file '{filename}' does not exist")
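
Note: because check_file_readable() is declared with *filenames, the filenames parameter is always a tuple inside the function, so the str/Path isinstance guard is effectively a no-op; callers simply pass one or more paths, as the training phases below do. A hypothetical call mirroring the usage later in the script:

    outbase = pathlib.Path("training") / "eng.Arial.exp0"   # placeholder path
    check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
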
@@ -227,13 +232,13 @@ def parse_flags(argv=None):
     else:
         ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}", dir=ctx.tmp_dir)
     # Location of log file for the whole run.
-    ctx.log_file = Path(ctx.training_dir) / "tesstrain.log"
+    ctx.log_file = pathlib.Path(ctx.training_dir) / "tesstrain.log"
     log.info(f"Log file location: {ctx.log_file}")
 
     def show_tmpdir_location(training_dir):
         # On successful exit we will delete this first; on failure we want to let the user
         # know where the log is
-        if Path(training_dir).exists():
+        if pathlib.Path(training_dir).exists():
             print(f"Temporary files retained at: {training_dir}")
 
     atexit.register(show_tmpdir_location, ctx.training_dir)
@@ -242,27 +247,27 @@ def show_tmpdir_location(training_dir):
     # specified in the command-line.
     if not ctx.training_text:
         ctx.training_text = (
-            Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
+            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
         )
     if not ctx.wordlist_file:
         ctx.wordlist_file = (
-            Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
+            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
         )
 
     ctx.word_bigrams_file = (
-        Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
+        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
     )
     ctx.numbers_file = (
-        Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
+        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
     )
-    ctx.punc_file = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
-    ctx.bigram_freqs_file = Path(ctx.training_text).with_suffix(
+    ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
+    ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
         ".training_text.bigram_freqs"
     )
-    ctx.unigram_freqs_file = Path(ctx.training_text).with_suffix(
+    ctx.unigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
         ".training_text.unigram_freqs"
     )
-    ctx.train_ngrams_file = Path(ctx.training_text).with_suffix(
+    ctx.train_ngrams_file = pathlib.Path(ctx.training_text).with_suffix(
         ".training_text.train_ngrams"
     )
     ctx.generate_dawgs = 1
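
Note: the derived file names are built with Path.with_suffix(), which replaces the final suffix of the training text path. Since a file such as eng.training_text has the suffix ".training_text", swapping in ".training_text.bigram_freqs" yields eng.training_text.bigram_freqs alongside it. A small sketch with placeholder values:

    import pathlib

    training_text = pathlib.Path("langdata/eng/eng.training_text")   # placeholder path
    bigram_freqs = training_text.with_suffix(".training_text.bigram_freqs")
    print(bigram_freqs)   # langdata/eng/eng.training_text.bigram_freqs
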
@@ -278,8 +283,8 @@ def cleanup(ctx):
 
 # Function initializes font config with a unique font cache dir.
 def initialize_fontconfig(ctx):
-    sample_path = Path(ctx.font_config_cache) / "sample_text.txt"
-    Path(sample_path).write_text("Text\n")
+    sample_path = pathlib.Path(ctx.font_config_cache) / "sample_text.txt"
+    pathlib.Path(sample_path).write_text("Text\n")
     log.info(f"Testing font: {ctx.fonts[0]}")
     run_command(
         "text2image",
@@ -296,7 +301,7 @@ def make_fontname(font):
 
 
 def make_outbase(ctx, fontname, exposure):
-    return Path(ctx.training_dir) / f"{ctx.lang_code}.{fontname}.exp{exposure}"
+    return pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.{fontname}.exp{exposure}"
 
 
 # Helper function for phaseI_generate_image. Generates the image for a single
@@ -336,7 +341,7 @@ def generate_font_image(ctx, font, exposure, char_spacing):
 
     check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
 
-    if ctx.extract_font_properties and Path(ctx.train_ngrams_file).exists():
+    if ctx.extract_font_properties and pathlib.Path(ctx.train_ngrams_file).exists():
         log.info(f"Extracting font properties of {font}")
         run_command(
             "text2image",
@@ -362,16 +367,16 @@ def phase_I_generate_image(ctx, par_factor):
     char_spacing = 0.0
 
     for exposure in ctx.exposures:
-        if ctx.extract_font_properties and Path(ctx.bigram_freqs_file).exists():
+        if ctx.extract_font_properties and pathlib.Path(ctx.bigram_freqs_file).exists():
             # Parse .bigram_freqs file and compose a .train_ngrams file with text
             # for tesseract to recognize during training. Take only the ngrams whose
             # combined weight accounts for 95% of all the bigrams in the language.
-            lines = Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
+            lines = pathlib.Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
             records = (line.split(" ") for line in lines)
             p = 0.99
             ngram_frac = p * sum(int(rec[1]) for rec in records if len(rec) >= 2)
 
-            with Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
+            with pathlib.Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
                 cumsum = 0
                 for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
                     if cumsum > ngram_frac:
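
Note: the block above is a cumulative-frequency cutoff: sum all bigram counts, take a fraction p of the total (the comment mentions 95% while the constant in the code is p = 0.99), then keep bigrams in descending order of count until the running total passes the threshold. A standalone sketch of the selection logic with made-up integer counts (file I/O omitted):

    from operator import itemgetter

    records = [("th", 500), ("he", 300), ("in", 150), ("zq", 5)]   # placeholder counts
    p = 0.99
    ngram_frac = p * sum(count for _, count in records)

    cumsum = 0
    kept = []
    for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
        if cumsum > ngram_frac:
            break
        kept.append(bigram)
        cumsum += count
    print(kept)   # the most frequent bigrams covering roughly 99% of the total
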
@@ -408,9 +413,9 @@ def phase_I_generate_image(ctx, par_factor):
 def phase_UP_generate_unicharset(ctx):
     log.info("=== Phase UP: Generating unicharset and unichar properties files ===")
 
-    box_files = Path(ctx.training_dir).glob("*.box")
+    box_files = pathlib.Path(ctx.training_dir).glob("*.box")
 
-    ctx.unicharset_file = Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"
+    ctx.unicharset_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"
 
     run_command(
         "unicharset_extractor",
@@ -422,7 +427,7 @@ def phase_UP_generate_unicharset(ctx):
     )
     check_file_readable(ctx.unicharset_file)
 
-    ctx.xheights_file = Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
+    ctx.xheights_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
     run_command(
         "set_unicharset_properties",
         "-U",
@@ -512,12 +517,12 @@ def phase_UP_generate_unicharset(ctx):
 def phase_E_extract_features(ctx, box_config, ext):
     log.info(f"=== Phase E: Generating {ext} files ===")
 
-    img_files = list(Path(ctx.training_dir).glob("*.exp*.tif"))
+    img_files = list(pathlib.Path(ctx.training_dir).glob("*.exp*.tif"))
     log.debug(img_files)
 
     # Use any available language-specific configs.
     config = ""
-    testconfig = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
+    testconfig = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
     if testconfig.exists():
         config = testconfig
         log.info(f"Using {ctx.lang_code}.config")
@@ -536,7 +541,7 @@ def phase_E_extract_features(ctx, box_config, ext):
                 run_command,
                 "tesseract",
                 img_file,
-                Path(img_file).with_suffix(""),
+                pathlib.Path(img_file).with_suffix(""),
                 *box_config,
                 config,
                 env=tessdata_environ,
@@ -552,7 +557,7 @@ def phase_E_extract_features(ctx, box_config, ext):
                 pbar.update(1)
     # Check that all the output files were produced.
     for img_file in img_files:
-        check_file_readable(Path(img_file.with_suffix("." + ext)))
+        check_file_readable(pathlib.Path(img_file.with_suffix("." + ext)))
 
     return
 
@@ -640,7 +645,7 @@ def phase_E_extract_features(ctx, box_config, ext):
 def make_lstmdata(ctx):
     log.info("=== Constructing LSTM training data ===")
     lang_prefix = f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
-    path_output = Path(ctx.output_dir)
+    path_output = pathlib.Path(ctx.output_dir)
     if not path_output.is_dir():
         log.info(f"Creating new directory {ctx.output_dir}")
         path_output.mkdir(exist_ok=True, parents=True)
@@ -672,7 +677,7 @@ def make_lstmdata(ctx):
     )
 
     def get_file_list():
-        training_path = Path(ctx.training_dir)
+        training_path = pathlib.Path(ctx.training_dir)
         if ctx.save_box_tiff:
             log.info("=== Saving box/tiff pairs for training data ===")
             yield from training_path.glob(f"{ctx.lang_code}*.box")
@@ -686,7 +691,7 @@ def get_file_list():
 
     lstm_list = f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt"
     dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
-    Path(lstm_list).write_text("\n".join(dir_listing))
+    pathlib.Path(lstm_list).write_text("\n".join(dir_listing))
 
 
 # make__traineddata() {
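
Note: the last hunk writes the paths of the generated .lstmf files, one per line, into {output_dir}/{lang_code}.training_files.txt, the listing that the subsequent LSTM training step typically reads to locate its input samples. A minimal sketch with placeholder names:

    import pathlib

    output_dir = pathlib.Path("output")              # stand-in for ctx.output_dir
    lstm_list = output_dir / "eng.training_files.txt"
    dir_listing = (str(p) for p in output_dir.glob("eng.*.lstmf"))
    lstm_list.write_text("\n".join(dir_listing))
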